Python源码示例:sklearn.ensemble.ExtraTreesClassifier()
示例1
def create_model_from_signatures(sig_csv_path, model_out, sig_datatype=np.int32):
    """
    Train a scikit-learn classifier on a class-signature .csv and pickle it.

    The signature file is the kind produced by extract_features_to_csv.

    Parameters
    ----------
    sig_csv_path
        The path to the signatures file.
    model_out
        The location the fitted model is dumped to.
    sig_datatype
        The datatype passed to load_signatures when reading the csv.
        Defaults to int32.

    Notes
    -----
    The model is an ExtraTreesClassifier whose hyperparameters were arrived
    at by tpot; they are spelled out in ``hyperparameters`` below.
    """
    hyperparameters = dict(
        bootstrap=False,
        criterion="gini",
        max_features=0.55,
        min_samples_leaf=2,
        min_samples_split=16,
        n_estimators=100,
        n_jobs=4,
        class_weight='balanced',
    )
    classifier = ens.ExtraTreesClassifier(**hyperparameters)
    training_features, training_labels = load_signatures(sig_csv_path, sig_datatype)
    classifier.fit(training_features, training_labels)
    joblib.dump(classifier, model_out)
示例2
def __init__(self, params):
    """
    Initialise the wrapper and build its warm-startable ExtraTreesClassifier.

    ``params`` may provide ``criterion``, ``max_features``,
    ``min_samples_split`` and ``seed``; each falls back to the default used
    below when absent.
    """
    super(ExtraTreesAlgorithm, self).__init__(params)
    logger.debug("ExtraTreesAlgorithm.__init__")
    self.library_version = sklearn.__version__

    # NOTE(review): `additional` is not defined inside this method;
    # presumably a module-level dict of extra settings - confirm.
    self.trees_in_step = additional.get("trees_in_step", 100)
    self.max_steps = additional.get("max_steps", 50)
    self.early_stopping_rounds = additional.get("early_stopping_rounds", 50)

    estimator_kwargs = {
        "n_estimators": self.trees_in_step,
        "criterion": params.get("criterion", "gini"),
        "max_features": params.get("max_features", 0.6),
        "min_samples_split": params.get("min_samples_split", 30),
        "warm_start": True,
        "n_jobs": -1,
        "random_state": params.get("seed", 1),
    }
    self.model = ExtraTreesClassifier(**estimator_kwargs)
示例3
def run_sklearn():
    """
    Train every configured scikit-learn estimator and print the total time.

    Each ``[name, estimator]`` pair is wrapped in a
    ``jhkaggle.train_sklearn.TrainSKLearn`` and run in turn; the wall-clock
    time for the whole sweep is printed once all of them have finished.
    """
    # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    alg_list = [
        ['lreg', LinearRegression()],
        ['rforest', RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_depth=3)],
        # NOTE(review): every other entry is a regressor; confirm that a
        # classifier (rather than ExtraTreesRegressor) is intended here.
        ['extree', ExtraTreesClassifier(n_estimators=1000, max_depth=2)],
        # base_estimator is left at its default: None already was the default,
        # and the keyword is no longer accepted by recent scikit-learn.
        ['adaboost', AdaBoostRegressor(n_estimators=600, learning_rate=1.0)],
        ['knn', sklearn.neighbors.KNeighborsRegressor(n_neighbors=5)],
    ]

    start_time = time.time()
    for name, alg in alg_list:
        train = jhkaggle.train_sklearn.TrainSKLearn("1", name, alg, False)
        train.run()
        # Drop the reference so the finished trainer can be reclaimed before
        # the next one is built.
        train = None

    elapsed_time = time.time() - start_time
    print("Elapsed time: {}".format(jhkaggle.util.hms_string(elapsed_time)))
示例4
def test_time(pipeline_name, name, path):
    """
    Run one benchmark pipeline on a dataset and print the elapsed time in ms.

    Parameters
    ----------
    pipeline_name
        Which pipeline to build: "LR", "FGS" or "Tree".
    name
        Dataset name; printed and forwarded to ``Benchmark.run_test``.
    path
        Forwarded unchanged to ``Benchmark.run_test``.

    Raises
    ------
    ValueError
        If ``pipeline_name`` is not one of the supported names.
    """
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())
    elif pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())
    elif pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("Unknown pipeline_name: {!r}".format(pipeline_name))

    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)

    starttime = datetime.datetime.now()
    test_benchmark.run_test(pipeline, name, path)
    endtime = datetime.datetime.now()

    # total_seconds() covers the whole interval; the .microseconds attribute
    # is only the sub-second component and under-reports runs longer than 1 s.
    print("Used time: ", (endtime - starttime).total_seconds() * 1000)
    print("")
示例5
def test():
    """
    Download the rcv1 binary training set, fit a feature-selection pipeline
    on a train split and print its score on that same split.

    Side effects: writes ``train.bz2`` and ``train.svm`` to the current
    working directory and requires network access.
    """
    url_zip_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/rcv1_train.binary.bz2'
    urllib.request.urlretrieve(url_zip_train, filename='train.bz2')

    # Context managers close both files even if decompression or decoding
    # fails; the explicit encoding matches the utf-8 decode of the payload.
    with open('train.svm', 'wt', encoding='utf-8') as f_svm, \
            bz2.open('train.bz2', 'rb') as f_zip:
        data = f_zip.read()
        f_svm.write(data.decode('utf-8'))

    X, y = load_svmlight_file('train.svm')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

    pipeline = make_pipeline(FeatureGradientSelector(n_epochs=1, n_features=10), LogisticRegression())
    # Alternative baseline:
    # pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    pipeline.fit(X_train, y_train)

    # NOTE(review): the score is computed on the training split; the held-out
    # X_test / y_test are not used - confirm this is intended.
    print("Pipeline Score: ", pipeline.score(X_train, y_train))
示例6
def __init__(self, options):
    """
    Build the balanced ExtraTrees estimator from ``options['params']``.

    The raw parameters go through ``convert_params`` (integer and string
    fields listed below). When no ``max_depth`` is given, ``max_leaf_nodes``
    defaults to 2000 unless it was supplied as well.
    """
    self.handle_options(options)

    integer_fields = ['random_state', 'n_estimators', 'max_depth',
                      'min_samples_split', 'max_leaf_nodes']
    string_fields = ['max_features', 'criterion']
    raw_params = options.get('params', {})
    out_params = convert_params(raw_params, ints=integer_fields, strs=string_fields)

    depth_is_limited = 'max_depth' in out_params
    if not depth_is_limited:
        out_params.setdefault('max_leaf_nodes', 2000)

    if 'max_features' in out_params:
        requested_features = out_params['max_features']
        out_params['max_features'] = handle_max_features(requested_features)

    self.estimator = _ExtraTreesClassifier(class_weight='balanced', **out_params)
示例7
def __init__(
        self, data_block, predictors=None, cv_folds=10,
        scoring_metric='accuracy', additional_display_metrics=None):
    """
    Set up an ExtraTreesClassifier-backed classification model.

    Parameters
    ----------
    data_block
        Forwarded to ``base_classification.__init__``.
    predictors
        List of predictors; an empty list when omitted.
    cv_folds
        Forwarded to ``base_classification.__init__``. Defaults to 10.
    scoring_metric
        Forwarded to ``base_classification.__init__``. Defaults to 'accuracy'.
    additional_display_metrics
        List of extra metrics; an empty list when omitted.
    """
    # Fresh lists per call: a literal [] default would be one object shared
    # by every instance created without these arguments.
    if predictors is None:
        predictors = []
    if additional_display_metrics is None:
        additional_display_metrics = []

    base_classification.__init__(
        self, alg=ExtraTreesClassifier(), data_block=data_block,
        predictors=predictors, cv_folds=cv_folds,
        scoring_metric=scoring_metric,
        additional_display_metrics=additional_display_metrics)

    self.model_output = pd.Series(self.default_parameters)
    self.model_output['Feature_Importance'] = "-"
    self.model_output['OOB_Score'] = "-"

    # Set parameters to default values:
    self.set_parameters(set_default=True)
示例8
def define_clfs_params(self):
    '''
    Defines all relevant parameters and classes for classifier objects.
    Edit these if you wish to change parameters.

    Populates ``self.clfs`` (short name -> estimator instance) and
    ``self.params`` (short name -> parameter grid for that estimator).
    '''
    # These are the classifiers
    self.clfs = {
        'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
        'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
        # max_depth must be a single integer; a list of candidate depths is
        # not a valid value for the tree and fails when the model is fitted.
        'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
        # The L1 penalty needs a solver that supports it; liblinear handles
        # both the 'l1' and 'l2' values used in the grid below.
        'LR': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),
        'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
        'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
        'NB': GaussianNB(),
        'DT': DecisionTreeClassifier(),
        # NOTE(review): newer scikit-learn releases spell this loss
        # 'log_loss' - confirm against the installed version.
        'SGD': SGDClassifier(loss='log', penalty='l2'),
        'KNN': KNeighborsClassifier(n_neighbors=3),
    }
    # These are the parameters which will be run through
    self.params = {
        'RF': {'n_estimators': [1, 10, 100, 1000],
               'max_depth': [10, 15, 20, 30, 40, 50, 60, 70, 100],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10],
               'random_state': [1]},
        'LR': {'penalty': ['l1', 'l2'],
               'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
               'random_state': [1]},
        'SGD': {'loss': ['log'],
                'penalty': ['l2', 'l1', 'elasticnet'],
                'random_state': [1]},
        'ET': {'n_estimators': [1, 10, 100, 1000],
               'criterion': ['gini', 'entropy'],
               'max_depth': [1, 3, 5, 10, 15],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10],
               'random_state': [1]},
        'AB': {'algorithm': ['SAMME', 'SAMME.R'],
               'n_estimators': [1, 10, 100, 1000],
               'random_state': [1]},
        'GB': {'n_estimators': [1, 10, 100, 1000],
               'learning_rate': [0.001, 0.01, 0.05, 0.1, 0.5],
               'subsample': [0.1, 0.5, 1.0],
               'max_depth': [1, 3, 5, 10, 20, 50, 100],
               'random_state': [1]},
        'NB': {},
        'DT': {'criterion': ['gini', 'entropy'],
               'max_depth': [1, 2, 15, 20, 30, 40, 50],
               'max_features': ['sqrt', 'log2'],
               'min_samples_split': [2, 5, 10],
               'random_state': [1]},
        'SVM': {'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
                'kernel': ['linear'],
                'random_state': [1]},
        'KNN': {'n_neighbors': [1, 5, 10, 25, 50, 100],
                'weights': ['uniform', 'distance'],
                'algorithm': ['auto', 'ball_tree', 'kd_tree']},
    }
示例9
def test_change_algorithms():
    """Algorithms of a MALSS classifier can be listed, removed and added."""

    def check_names(model, expected_names):
        # Compare the leading algorithm names position by position.
        algorithms = model.get_algorithms()
        for position, expected in enumerate(expected_names):
            assert algorithms[position][0] == expected

    X, y = make_classification(n_samples=1000,
                               n_features=10,
                               n_classes=2,
                               n_clusters_per_class=1,
                               random_state=0)
    X = pd.DataFrame(X)
    y = pd.Series(y)

    cls = MALSS('classification')
    cls.fit(X, y, algorithm_selection_only=True)
    check_names(cls, ['Support Vector Machine (RBF Kernel)',
                      'Random Forest',
                      'Logistic Regression',
                      'Decision Tree',
                      'k-Nearest Neighbors'])

    cls.remove_algorithm(0)
    cls.remove_algorithm()
    check_names(cls, ['Random Forest',
                      'Logistic Regression',
                      'Decision Tree'])

    from sklearn.ensemble import ExtraTreesClassifier as ET
    search_grid = [{'n_estimators': [10, 30, 50],
                    'max_depth': [3, 5, None],
                    'max_features': [0.3, 0.6, 'auto']}]
    cls.add_algorithm(ET(n_jobs=3), search_grid, 'Extremely Randomized Trees')
    check_names(cls, ['Random Forest',
                      'Logistic Regression',
                      'Decision Tree',
                      'Extremely Randomized Trees'])
示例10
def test_min_impurity_split():
    """min_impurity_split given to a forest must reach each of its trees."""
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    forest_classes = (RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor)

    for forest_class in forest_classes:
        unfitted = forest_class(min_impurity_split=0.1)
        # Fitting is expected to emit the deprecation warning; the helper
        # returns whatever fit() returns.
        fitted = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                      unfitted.fit, X, y)
        for member in fitted.estimators_:
            assert_equal(member.min_impurity_split, 0.1)
示例11
def test_min_impurity_decrease():
    """min_impurity_decrease given to a forest must reach each of its trees."""
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)
    forest_classes = (RandomForestClassifier, RandomForestRegressor,
                      ExtraTreesClassifier, ExtraTreesRegressor)

    for forest_class in forest_classes:
        forest = forest_class(min_impurity_decrease=0.1)
        forest.fit(X, y)
        # Only the hand-over of the parameter is verified here; the tree
        # tests cover what the parameter actually does.
        for member in forest.estimators_:
            assert_equal(member.min_impurity_decrease, 0.1)
示例12
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    """
    Fit an ExtraTrees model on ``X`` / ``y`` and record it on the instance.

    A classifier is used when ``self.num_classes >= 2`` (labels are encoded
    with a LabelEncoder fitted on ``self.labels``); otherwise a regressor is
    used. Missing values are replaced per column before fitting, and the
    replacement values are kept in ``self.min``.

    ``sample_weight``, ``eval_set``, ``sample_weight_eval_set`` and
    ``kwargs`` are accepted but not used in this body.

    NOTE(review): ``X`` is presumably a datatable Frame (``names``,
    ``min1``, ``to_numpy``, ``dt.f`` are used) - confirm.
    """
    orig_cols = list(X.names)
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        model = ExtraTreesClassifier(**self.params)
    else:
        model = ExtraTreesRegressor(**self.params)
    # Replace missing values with a value smaller than all observed values
    self.min = dict()
    for col in X.names:
        XX = X[:, col]
        self.min[col] = XX.min1()
        if self.min[col] is None or np.isnan(self.min[col]):
            # No usable minimum for this column: fall back to a fixed sentinel
            self.min[col] = -1e10
        else:
            # One below the observed minimum
            self.min[col] -= 1
        # The result is not assigned, so replace() presumably works in place
        XX.replace(None, self.min[col])
        X[:, col] = XX
        # Sanity check: no missing values may remain in this column
        assert X[dt.isna(dt.f[col]), col].nrows == 0
    X = X.to_numpy()
    model.fit(X, y)
    importances = np.array(model.feature_importances_)
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances.tolist(),
                              iterations=self.params['n_estimators'])
示例13
def random_forest(train_vecs,y_train,test_vecs,y_test):
    """Fit a random forest, dump it with joblib and return its test score."""
    forest_options = dict(n_estimators=10, max_depth=10, min_samples_split=2,
                          n_jobs=1, random_state=0)
    classifier = RandomForestClassifier(**forest_options)
    classifier.fit(train_vecs, y_train)

    # `storedpaths` is defined outside this function and used as a prefix.
    model_file = storedpaths + 'model_randomforest.pkl'
    joblib.dump(classifier, model_file)

    return classifier.score(test_vecs, y_test)
# 训练 ExtraTreesClassifier 分类算法
示例14
def extract_tree(train_vecs,y_train,test_vecs,y_test):
    """Fit an ExtraTrees classifier, dump it with joblib and return its test score."""
    tree_options = dict(n_estimators=10, max_depth=10, min_samples_split=2,
                        n_jobs=1, random_state=0)
    classifier = ExtraTreesClassifier(**tree_options)
    classifier.fit(train_vecs, y_train)

    # `storedpaths` is defined outside this function and used as a prefix.
    model_file = storedpaths + 'model_extracttree.pkl'
    joblib.dump(classifier, model_file)

    return classifier.score(test_vecs, y_test)
# 训练 GBDT 分类算法
示例15
def __init__(self, X, label_words):
    """Create the label encoder and fit the classifier on ``X``.

    ``label_words`` is turned into training targets by
    ``self.encode_labels``.
    """
    self.le = preprocessing.LabelEncoder()
    self.clf = ExtraTreesClassifier(
        n_estimators=100,
        max_depth=16,
        random_state=0,
    )
    encoded_labels = self.encode_labels(label_words)
    samples = np.asarray(X)
    self.clf.fit(samples, encoded_labels)
示例16
def learn(x, y, test_x):
    """
    Fit an ExtraTreesClassifier configured from ``variables`` and predict.

    The hyperparameters in use are printed on a single line after fitting.

    Parameters
    ----------
    x, y
        Training samples and targets, passed to ``fit``.
    test_x
        Samples to predict.

    Returns
    -------
    tuple
        ``(prediction_list, prediction_list_prob)`` from ``predict`` and
        ``predict_proba`` respectively.
    """
    clf = ExtraTreesClassifier(n_jobs=-1,
                               n_estimators=variables.n_estimators_et,
                               max_depth=variables.max_depth_et, random_state=0,
                               min_samples_split=variables.min_samples_split_et,
                               min_samples_leaf=variables.min_samples_leaf_et,
                               max_features=variables.max_feature_et,
                               max_leaf_nodes=variables.max_leaf_nodes_et,
                               criterion=variables.criterion_et,
                               min_impurity_split=variables.min_impurity_split_et,
                               class_weight=variables.cw_et).fit(x, y)

    # One print() call with a pre-joined string yields the same single
    # space-separated line as the former Python 2 print statements, and is
    # valid syntax on both Python 2 and Python 3.
    settings = [
        ("n_estimators=", variables.n_estimators_et),
        ("max_depth=", variables.max_depth_et),
        ("min_samples_split=", variables.min_samples_split_et),
        ("min_samples_leaf=", variables.min_samples_leaf_et),
        ("max_features=", variables.max_feature_et),
        ("max_leaf_nodes=", variables.max_leaf_nodes_et),
        ("criterion=", variables.criterion_et),
        ("min_impurity_split=", variables.min_impurity_split_et),
        ("class_weight=", variables.cw_et),
    ]
    print(" ".join("{} {}".format(label, value) for label, value in settings))

    prediction_list = clf.predict(test_x)
    prediction_list_prob = clf.predict_proba(test_x)
    return prediction_list, prediction_list_prob
示例17
def run_sklearn():
    """
    Train every configured scikit-learn classifier in turn.

    Each ``[name, estimator]`` pair is wrapped in a
    ``jhkaggle.train_sklearn.TrainSKLearn`` and run.
    """
    # https://www.analyticsvidhya.com/blog/2015/06/tuning-random-forest-model/
    alg_list = [
        ['rforest', RandomForestClassifier(n_estimators=1000, n_jobs=-1, verbose=1, max_depth=3)],
        ['extree', ExtraTreesClassifier(n_estimators=1000, max_depth=3, n_jobs=-1)],
        # base_estimator is left at its default: None already was the default,
        # and the keyword is no longer accepted by recent scikit-learn.
        ['adaboost', AdaBoostClassifier(n_estimators=600, learning_rate=1.0)],
        ['knn', sklearn.neighbors.KNeighborsClassifier(n_neighbors=5, n_jobs=-1)],
    ]

    for name, alg in alg_list:
        train = jhkaggle.train_sklearn.TrainSKLearn("1", name, alg, False)
        train.run()
        # Drop the reference so the finished trainer can be reclaimed before
        # the next one is built.
        train = None
示例18
def test_memory(pipeline_name, name, path):
    """
    Run one benchmark pipeline on a dataset.

    Parameters
    ----------
    pipeline_name
        Which pipeline to build: "LR", "FGS" or "Tree".
    name
        Dataset name; printed and forwarded to ``Benchmark.run_test``.
    path
        Forwarded unchanged to ``Benchmark.run_test``.

    Raises
    ------
    ValueError
        If ``pipeline_name`` is not one of the supported names.
    """
    if pipeline_name == "LR":
        pipeline = make_pipeline(LogisticRegression())
    elif pipeline_name == "FGS":
        pipeline = make_pipeline(FeatureGradientSelector(), LogisticRegression())
    elif pipeline_name == "Tree":
        pipeline = make_pipeline(SelectFromModel(ExtraTreesClassifier(n_estimators=50)), LogisticRegression())
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("Unknown pipeline_name: {!r}".format(pipeline_name))

    test_benchmark = Benchmark()
    print("Dataset:\t", name)
    print("Pipeline:\t", pipeline_name)
    test_benchmark.run_test(pipeline, name, path)
    print("")
示例19
def test_FS2(self):
    """Run the shared ``doTest`` check on FS2 with ExtraTreesClassifier as eval_algo."""
    from sklearn.ensemble import ExtraTreesClassifier

    fs2_settings = dict(
        cols_ids_must_keep=[1],
        additional_col_count_to_keep=3,
        ptype='classification',
        eval_algo=ExtraTreesClassifier,
    )
    trainable = lale.lib.autoai_libs.FS2(**fs2_settings)
    self.doTest(trainable, **self._iris)
示例20
def build_model(self):
    """Return a new ExtraTreesClassifier built from ``self.params``."""
    hyperparameters = self.params
    return ExtraTreesClassifier(**hyperparameters)
示例21
def build_model(self):
    """Create an ExtraTreesClassifier using the keyword arguments in ``self.params``."""
    model = ExtraTreesClassifier(**self.params)
    return model
示例22
def importance(self):
    """
    Plot importance of features based on ExtraTreesClassifier.

    The training frame is taken from ``Base.train_n`` after calling
    ``Base.data_n()``; the ``Base.target`` column is the label and all
    remaining columns are the features.
    """
    Base.data_n()
    training_frame = Base.train_n
    labels = training_frame[Base.target].copy()
    feature_frame = training_frame.drop([Base.target], axis=1)

    forest = ExtraTreesClassifier()
    forest.fit(feature_frame, labels)

    self._plot_importance(feature_frame.columns, forest.feature_importances_)
示例23
def initialize(context):
    """
    Populate ``context`` with the model, tunable parameters, scheduled
    callbacks and bookkeeping state.

    NOTE(review): ``set_symbol_lookup_date``, ``schedule_function``,
    ``date_rules``, ``time_rules`` and ``symbols`` are not defined here;
    presumably they are provided by the hosting backtest platform - confirm.
    """
    # Set before symbols() is called further down
    set_symbol_lookup_date('2012-01-01')
    # Parameters to be changed
    context.model = ExtraTreesClassifier(n_estimators=300)
    context.lookback = 14
    context.history_range = 1000
    context.beta_coefficient = 0.0
    context.percentage_change = 0.025
    context.maximum_leverage = 2.0
    context.number_of_stocks = 150
    context.maximum_pe_ratio = 8
    context.maximum_market_cap = 0.1e9
    context.starting_probability = 0.5
    # End of parameters
    # create_model and rebalance are registered for month start, trade for
    # every day; all three use the market-open time rule.
    schedule_function(create_model, date_rules.month_start(), time_rules.market_open())
    schedule_function(rebalance, date_rules.month_start(), time_rules.market_open())
    schedule_function(trade, date_rules.every_day(), time_rules.market_open())
    # Bookkeeping state, starting out empty / at neutral values
    context.algorithm_returns = []
    context.longs = []
    context.shorts = []
    context.training_stocks = symbols('SPY')
    context.trading_stocks = []
    context.beta = 1.0
    context.beta_list = []
    context.completed = False
示例24
def feature_importances(X,y):
    """
    Fit an ExtraTrees forest, print the top-ranked features and plot them all.

    Up to the 80 most important features are printed, followed by the sum of
    their importances. A bar chart of every feature's importance (with the
    per-tree standard deviation as error bars) is then shown.
    """
    # The output is not stable between runs because of the randomness
    # (random_state is None).
    # Example input with 3 informative features:
    # X, y = make_classification(n_samples=1000,n_features=10,n_informative=3,n_redundant=0,n_repeated=0,n_classes=2,n_state=0,shuffle=False)
    # Build a forest and compute the feature importances
    from sklearn.ensemble import ExtraTreesClassifier
    forest = ExtraTreesClassifier(n_estimators=25, criterion='entropy', random_state=None)
    forest.fit(X, y)
    importances = forest.feature_importances_
    std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
    # Feature indices ordered from most to least important
    indices = np.argsort(importances)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    sum1 = 0.0
    # Cap at the number of features so data sets with fewer than 80 columns
    # do not run past the end of `indices`.
    top_n = min(80, len(indices))
    for f in range(top_n):
        print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
        sum1 = sum1 + importances[indices[f]]
    print(sum1)

    # Plot the feature importances of the forest
    x_len = range(len(importances))
    plt.figure()
    plt.title("Feature importances")
    plt.bar(x_len, importances[indices], color="r", yerr=std[indices], align="center")
    plt.xticks(x_len, indices)
    plt.xlim([-1, max(x_len) + 1])
    plt.show()
######################################READ DATA####################################################
示例25
def tree_based_feature_selection(features, clases, nombres_features_ordenadas):
    """Fit a 1000-tree ExtraTreesClassifier and hand its feature importances
    to ``imprimir_importancias`` together with the feature names."""
    print("Realizando tree-based feature selection")
    selector = ExtraTreesClassifier(n_estimators=1000)
    selector.fit(features, clases)
    importancias = selector.feature_importances_
    imprimir_importancias(importancias, "Tree-based feature selection", nombres_features_ordenadas)
示例26
def test_objectmapper(self):
    """Every estimator reached through ``df.ensemble`` must be the very same
    object as the attribute of that name on ``ensemble``."""
    df = pdml.ModelFrame([])
    estimator_names = (
        'AdaBoostClassifier',
        'AdaBoostRegressor',
        'BaggingClassifier',
        'BaggingRegressor',
        'ExtraTreesClassifier',
        'ExtraTreesRegressor',
        'GradientBoostingClassifier',
        'GradientBoostingRegressor',
        'IsolationForest',
        'RandomForestClassifier',
        'RandomTreesEmbedding',
        'RandomForestRegressor',
        'VotingClassifier',
    )
    for estimator_name in estimator_names:
        self.assertIs(getattr(df.ensemble, estimator_name),
                      getattr(ensemble, estimator_name))
示例27
def GetKFeatures(filename, method='RFE',kbest=30,alpha=0.01, reduceMatrix = True):
    '''
    Select the K best features of a data set and print their names.

    The data is loaded with ``load_data(filename)`` and ranked with
    ``SelectKBest(k=kbest)``.

    NOTE(review): only K-best selection is implemented in this body.
    ``method`` and ``alpha`` are accepted but never used, although earlier
    documentation listed 'RFE', 'RFECV', 'RandomizedLogisticRegression',
    'K-best' and 'ExtraTreesClassifier' as available methods - confirm
    whether those are still planned.

    If reduceMatrix is true, the csv at ``filename`` is read again with
    pandas (first column as index), restricted to the selected columns,
    saved to "REDUCED_Feat.csv" and returned. Otherwise None is returned.
    '''
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels

    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]

    # NOTE(review): X itself is never reduced, so this prints the same shape
    # as above despite the message - confirm the intended output.
    print('X After K filter:',X.shape)
    print("K_featnames: %s" %(K_featnames))

    if reduceMatrix:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df
    return None
#WORKS! But unreadable with too many features!
示例28
def __init__(self, X, label_words):
    """Fit the classifier on ``X`` and pickle it to ``clf.pkl``.

    ``label_words`` is turned into training targets by
    ``self.encode_labels``.
    """
    self.le = preprocessing.LabelEncoder()
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html
    self.clf = ExtraTreesClassifier(
        n_estimators=100,
        max_depth=16,
        random_state=0,
    )
    encoded_labels = self.encode_labels(label_words)
    samples = np.asarray(X)
    self.clf.fit(samples, encoded_labels)

    # Store the trained image classifier model
    with open('clf.pkl', 'wb') as model_file:
        pickle.dump(self.clf, model_file)
示例29
def fit(self, X_train, y_train, X_val, y_val):
    """
    Fit an ExtraTreesClassifier and report train / validation metrics.

    Columns that are NaN in every training row are dropped from both sets
    (the mask is kept in ``self.all_nan``); remaining non-finite entries are
    replaced via ``np.nan_to_num``. With more than two classes the model is
    first fitted with 8 trees and then grown to the configured
    ``n_estimators`` using warm start.

    Returns
    -------
    dict
        Keys ``train_acc``, ``train_balanced_acc``, ``val_acc``,
        ``val_balanced_acc``, ``val_preds`` (validation probabilities as a
        list) and ``labels`` (``y_val`` as a list).
    """
    results = dict()

    self.all_nan = np.all(np.isnan(X_train), axis=0)
    X_train = X_train[:, ~self.all_nan]
    X_val = X_val[:, ~self.all_nan]

    X_train = np.nan_to_num(X_train)
    X_val = np.nan_to_num(X_val)

    self.config["warm_start"] = False
    self.num_classes = len(np.unique(y_train))

    if self.num_classes > 2:
        print("==> Using warmstarting for multiclass")
        final_n_estimators = self.config["n_estimators"]
        self.config["warm_start"] = True
        # Start from a copy with 8 trees so the configured n_estimators is
        # not overwritten; otherwise a later fit() would read 8 as the
        # target size.
        initial_config = dict(self.config, n_estimators=8)
        self.model = ExtraTreesClassifier(**initial_config)
        self.model.fit(X_train, y_train)
        # Grow the already-fitted forest up to the requested size
        self.model.n_estimators = final_n_estimators
        self.model.fit(X_train, y_train)
    else:
        self.model = ExtraTreesClassifier(**self.config)
        self.model.fit(X_train, y_train)

    pred_val_probas = self.model.predict_proba(X_val)

    pred_train = self.model.predict(X_train)
    pred_val = self.model.predict(X_val)

    results["train_acc"] = metrics.accuracy_score(y_train, pred_train)
    results["train_balanced_acc"] = metrics.balanced_accuracy_score(y_train, pred_train)
    results["val_acc"] = metrics.accuracy_score(y_val, pred_val)
    results["val_balanced_acc"] = metrics.balanced_accuracy_score(y_val, pred_val)
    results["val_preds"] = pred_val_probas.tolist()
    results["labels"] = y_val.tolist()

    return results
示例30
def test_min_impurity_split():
    """Each tree of a fitted forest must carry the forest's min_impurity_split."""
    # Regression test for #8006
    X, y = datasets.make_hastie_10_2(n_samples=100, random_state=1)

    for ensemble_type in (RandomForestClassifier, RandomForestRegressor,
                          ExtraTreesClassifier, ExtraTreesRegressor):
        candidate = ensemble_type(min_impurity_split=0.1)
        # fit() is run through the warning check, which passes back its result
        candidate = assert_warns_message(DeprecationWarning, "min_impurity_decrease",
                                         candidate.fit, X, y)
        for sub_tree in candidate.estimators_:
            assert_equal(sub_tree.min_impurity_split, 0.1)