Python source code examples: sklearn.datasets.make_classification()
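
Before the individual examples, a minimal usage sketch (the parameter values are illustrative and not taken from any of the examples below): make_classification generates a random n-class classification problem and returns a feature matrix X of shape (n_samples, n_features) together with a label vector y of shape (n_samples,).

from sklearn.datasets import make_classification

# Generate a synthetic binary classification problem: 100 samples and
# 20 features, of which 2 are informative and 2 redundant (the defaults).
X, y = make_classification(n_samples=100, n_features=20,
                           n_informative=2, n_redundant=2,
                           n_classes=2, random_state=0)
print(X.shape)  # (100, 20)
print(y.shape)  # (100,)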

Example 1
def test_plot_estimator_and_lightgbm(tmpdir):
    pytest.importorskip('graphviz')
    lightgbm = pytest.importorskip('lightgbm')
    from pygbm.plotting import plot_tree

    n_classes = 3
    X, y = make_classification(n_samples=150, n_classes=n_classes,
                               n_features=5, n_informative=3, n_redundant=0,
                               random_state=0)

    n_trees = 3
    est_pygbm = GradientBoostingClassifier(max_iter=n_trees,
                                           n_iter_no_change=None)
    est_pygbm.fit(X, y)
    est_lightgbm = lightgbm.LGBMClassifier(n_estimators=n_trees)
    est_lightgbm.fit(X, y)

    n_total_trees = n_trees * n_classes
    for i in range(n_total_trees):
        filename = tmpdir.join('plot_mixed_predictors.pdf')
        plot_tree(est_pygbm, est_lightgbm=est_lightgbm, tree_index=i,
                  view=False, filename=filename)
        assert filename.exists() 
Example 2
def make_classification_df(n_samples: int = 1024,
                           n_num_features: int = 20,
                           n_cat_features: int = 0,
                           class_sep: float = 1.0,
                           n_classes: int = 2,
                           feature_name: str = 'col_{}',
                           target_name: str = 'target',
                           random_state: int = 0,
                           id_column: str = None) -> Tuple[pd.DataFrame, pd.Series]:
    np.random.seed(random_state)
    X, y = make_classification(n_samples=n_samples, n_features=n_num_features, class_sep=class_sep,
                               random_state=random_state, n_classes=n_classes, n_informative=max(n_classes, 2))

    X = pd.DataFrame(X, columns=[feature_name.format(i) for i in range(n_num_features)])
    y = pd.Series(y, name=target_name)

    if id_column is not None:
        X[id_column] = range(n_samples)

    for i in range(n_cat_features):
        X['cat_{}'.format(i)] = \
            pd.Series(np.random.choice(['A', 'B', None], size=n_samples)).astype('category')

    return X, y 
Example 3
def test_cv_lgbm():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    pred_oof, pred_test, scores, importance = cross_validate(models, X_train, y_train, X_test, cv=5,
                                                             eval_func=roc_auc_score,
                                                             fit_params={'early_stopping_rounds': 200})

    print(scores)
    assert len(scores) == 5 + 1
    assert scores[-1] >= 0.85  # overall roc_auc
    assert roc_auc_score(y_train, pred_oof) == scores[-1]
    assert roc_auc_score(y_test, pred_test) >= 0.85  # test roc_auc
    assert roc_auc_score(y, models[0].predict_proba(X)[:, 1]) >= 0.85  # make sure models are trained
    assert len(importance) == 5
    assert list(importance[0].columns) == ['feature', 'importance']
    assert len(importance[0]) == 20 
Example 4
def test_cv_partial_evaluate():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    model = RidgeClassifier(alpha=1.0)

    n = 0

    def _fold_count(*args):
        nonlocal n
        n += 1

    cv = Take(2, KFold(5))

    pred_oof, pred_test, scores, _ = cross_validate(model, X_train, y_train, X_test, cv=cv, eval_func=roc_auc_score,
                                                    on_each_fold=_fold_count)

    assert len(scores) == 2 + 1
    assert scores[-1] >= 0.8  # overall auc
    assert n == 2 
Example 5
def test_fit_params_callback():
    X, y = make_classification(n_samples=1024, n_features=20, class_sep=0.98, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

    models = [LGBMClassifier(n_estimators=300) for _ in range(5)]

    sample_weights = np.random.randint(1, 10, size=len(X_train))
    sample_weights = sample_weights / sample_weights.sum()

    def fit_params(n: int, train_index: List[int], valid_index: List[int]):
        return {
            'early_stopping_rounds': 100,
            'sample_weight': list(sample_weights[train_index]),
            'eval_sample_weight': [list(sample_weights[valid_index])]
        }

    result_w_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                     eval_func=roc_auc_score, fit_params=fit_params)

    result_wo_weight = cross_validate(models, X_train, y_train, X_test, cv=5,
                                      eval_func=roc_auc_score, fit_params={'early_stopping_rounds': 50})

    assert result_w_weight.scores[-1] != result_wo_weight.scores[-1] 
Example 6
def test_label_spreading_closed_form():
    n_classes = 2
    X, y = make_classification(n_classes=n_classes, n_samples=200,
                               random_state=0)
    y[::3] = -1
    clf = label_propagation.LabelSpreading().fit(X, y)
    # adopting notation from Zhou et al. (2004):
    S = clf._build_graph()
    Y = np.zeros((len(y), n_classes + 1))
    Y[np.arange(len(y)), y] = 1
    Y = Y[:, :-1]
    for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
        expected = np.dot(np.linalg.inv(np.eye(len(S)) - alpha * S), Y)
        expected /= expected.sum(axis=1)[:, np.newaxis]
        clf = label_propagation.LabelSpreading(max_iter=10000, alpha=alpha)
        clf.fit(X, y)
        assert_array_almost_equal(expected, clf.label_distributions_, 4) 
Example 7
def test_importances():
    # Check variable importances.
    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=1)

    for alg in ['SAMME', 'SAMME.R']:
        clf = AdaBoostClassifier(algorithm=alg)

        clf.fit(X, y)
        importances = clf.feature_importances_

        assert_equal(importances.shape[0], 10)
        assert_equal((importances[:3, np.newaxis] >= importances[3:]).all(),
                     True) 
Example 8
def test_importances_gini_equal_mse():
    # Check that gini is equivalent to mse for a binary output variable

    X, y = datasets.make_classification(n_samples=2000,
                                        n_features=10,
                                        n_informative=3,
                                        n_redundant=0,
                                        n_repeated=0,
                                        shuffle=False,
                                        random_state=0)

    # The gini index and the mean square error (variance) might differ due
    # to numerical instability. Since these instabilities mainly occur at
    # high tree depth, we restrict the maximal depth.
    clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                                 random_state=0).fit(X, y)
    reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                                random_state=0).fit(X, y)

    assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
    assert_array_equal(clf.tree_.feature, reg.tree_.feature)
    assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
    assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
    assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples) 
Example 9
def test_mean_variance_illegal_axis():
    X, _ = make_classification(5, 4, random_state=0)
    # Sparsify the array a little bit
    X[0, 0] = 0
    X[2, 1] = 0
    X[4, 3] = 0
    X_csr = sp.csr_matrix(X)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=2)
    assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1)

    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2,
                  last_mean=None, last_var=None, last_n=None)
    assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1,
                  last_mean=None, last_var=None, last_n=None) 
Example 10
def test_max_features_tiebreak():
    # Test that max_features can break ties among feature importances
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    max_features = X.shape[1]

    feature_importances = np.array([4, 4, 4, 4, 3, 3, 3, 2, 2, 1])
    for n_features in range(1, max_features + 1):
        transformer = SelectFromModel(
            FixedImportanceEstimator(feature_importances),
            max_features=n_features,
            threshold=-np.inf)
        X_new = transformer.fit_transform(X, y)
        selected_feature_indices = np.where(transformer._get_support_mask())[0]
        assert_array_equal(selected_feature_indices, np.arange(n_features))
        assert X_new.shape[1] == n_features 
Example 11
def test_threshold_and_max_features():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)
    est = RandomForestClassifier(n_estimators=50, random_state=0)

    transformer1 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=-np.inf)
    X_new1 = transformer1.fit_transform(X, y)

    transformer2 = SelectFromModel(estimator=est, threshold=0.04)
    X_new2 = transformer2.fit_transform(X, y)

    transformer3 = SelectFromModel(estimator=est, max_features=3,
                                   threshold=0.04)
    X_new3 = transformer3.fit_transform(X, y)
    assert X_new3.shape[1] == min(X_new1.shape[1], X_new2.shape[1])
    selected_indices = transformer3.transform(
        np.arange(X.shape[1])[np.newaxis, :])
    assert_allclose(X_new3, X[:, selected_indices[0]]) 
Example 12
def test_feature_importances():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0)

    est = RandomForestClassifier(n_estimators=50, random_state=0)
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        transformer = SelectFromModel(estimator=est, threshold=threshold)
        transformer.fit(X, y)
        assert hasattr(transformer.estimator_, 'feature_importances_')

        X_new = transformer.transform(X)
        assert_less(X_new.shape[1], X.shape[1])
        importances = transformer.estimator_.feature_importances_

        feature_mask = np.abs(importances) > func(importances)
        assert_array_almost_equal(X_new, X[:, feature_mask]) 
Example 13
def test_2d_coef():
    X, y = datasets.make_classification(
        n_samples=1000, n_features=10, n_informative=3, n_redundant=0,
        n_repeated=0, shuffle=False, random_state=0, n_classes=4)

    est = LogisticRegression()
    for threshold, func in zip(["mean", "median"], [np.mean, np.median]):
        for order in [1, 2, np.inf]:
            # Fit SelectFromModel on a multi-class problem
            transformer = SelectFromModel(estimator=LogisticRegression(),
                                          threshold=threshold,
                                          norm_order=order)
            transformer.fit(X, y)
            assert hasattr(transformer.estimator_, 'coef_')
            X_new = transformer.transform(X)
            assert_less(X_new.shape[1], X.shape[1])

            # Manually check that the norm is correctly performed
            est.fit(X, y)
            importances = np.linalg.norm(est.coef_, axis=0, ord=order)
            feature_mask = importances > func(importances)
            assert_array_almost_equal(X_new, X[:, feature_mask]) 
Example 14
def test_weight():
    # Test class weights
    clf = svm.SVC(gamma='scale', class_weight={1: 0.1})
    # we give a small weight to class 1
    clf.fit(X, Y)
    # so all predicted values belong to class 2
    assert_array_almost_equal(clf.predict(X), [2] * 6)

    X_, y_ = make_classification(n_samples=200, n_features=10,
                                 weights=[0.833, 0.167], random_state=2)

    for clf in (linear_model.LogisticRegression(),
                svm.LinearSVC(random_state=0), svm.SVC(gamma="scale")):
        clf.set_params(class_weight={0: .1, 1: 10})
        clf.fit(X_[:100], y_[:100])
        y_pred = clf.predict(X_[100:])
        assert f1_score(y_[100:], y_pred) > .3 
Example 15
def test_cross_val_score_predict_groups():
    # Check that the ValueError raised when groups is None propagates to
    # cross_val_score and cross_val_predict, and that groups is correctly
    # passed to the cv object
    X, y = make_classification(n_samples=20, n_classes=2, random_state=0)

    clf = SVC(kernel="linear")

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_score, estimator=clf, X=X, y=y, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             cross_val_predict, estimator=clf, X=X, y=y, cv=cv) 
Example 16
def test_cross_val_predict_unbalanced():
    X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                               n_informative=2, n_clusters_per_class=1,
                               random_state=1)
    # Change the first sample to a new class
    y[0] = 2
    clf = LogisticRegression(random_state=1)
    cv = StratifiedKFold(n_splits=2, random_state=1)
    train, test = list(cv.split(X, y))
    yhat_proba = cross_val_predict(clf, X, y, cv=cv, method="predict_proba")
    assert y[test[0]][0] == 2  # sanity check for further assertions
    assert np.all(yhat_proba[test[0]][:, 2] == 0)
    assert np.all(yhat_proba[test[0]][:, 0:1] > 0)
    assert np.all(yhat_proba[test[1]] > 0)
    assert_array_almost_equal(yhat_proba.sum(axis=1), np.ones(y.shape),
                              decimal=12) 
Example 17
def test_learning_curve_verbose():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockImprovingEstimator(20)

    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        train_sizes, train_scores, test_scores = \
            learning_curve(estimator, X, y, cv=3, verbose=1)
    finally:
        out = sys.stdout.getvalue()
        sys.stdout.close()
        sys.stdout = old_stdout

    assert("[learning_curve]" in out) 
Example 18
def test_learning_curve_incremental_learning_unsupervised():
    X, _ = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    estimator = MockIncrementalImprovingEstimator(20)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y=None, cv=3, exploit_incremental_learning=True,
        train_sizes=np.linspace(0.1, 1.0, 10))
    assert_array_equal(train_sizes, np.linspace(2, 20, 10))
    assert_array_almost_equal(train_scores.mean(axis=1),
                              np.linspace(1.9, 1.0, 10))
    assert_array_almost_equal(test_scores.mean(axis=1),
                              np.linspace(0.1, 1.0, 10))
Example 19
def test_learning_curve_batch_and_incremental_learning_are_equal():
    X, y = make_classification(n_samples=30, n_features=1, n_informative=1,
                               n_redundant=0, n_classes=2,
                               n_clusters_per_class=1, random_state=0)
    train_sizes = np.linspace(0.2, 1.0, 5)
    estimator = PassiveAggressiveClassifier(max_iter=1, tol=None,
                                            shuffle=False)

    train_sizes_inc, train_scores_inc, test_scores_inc = \
        learning_curve(
            estimator, X, y, train_sizes=train_sizes,
            cv=3, exploit_incremental_learning=True)
    train_sizes_batch, train_scores_batch, test_scores_batch = \
        learning_curve(
            estimator, X, y, cv=3, train_sizes=train_sizes,
            exploit_incremental_learning=False)

    assert_array_equal(train_sizes_inc, train_sizes_batch)
    assert_array_almost_equal(train_scores_inc.mean(axis=1),
                              train_scores_batch.mean(axis=1))
    assert_array_almost_equal(test_scores_inc.mean(axis=1),
                              test_scores_batch.mean(axis=1)) 
Example 20
def test_grid_search_groups():
    # Check that the ValueError raised when groups is None propagates to
    # GridSearchCV, and that groups is correctly passed to the cv object
    rng = np.random.RandomState(0)

    X, y = make_classification(n_samples=15, n_classes=2, random_state=0)
    groups = rng.randint(0, 3, 15)

    clf = LinearSVC(random_state=0)
    grid = {'C': [1]}

    group_cvs = [LeaveOneGroupOut(), LeavePGroupsOut(2), GroupKFold(),
                 GroupShuffleSplit()]
    for cv in group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        assert_raise_message(ValueError,
                             "The 'groups' parameter should not be None.",
                             gs.fit, X, y)
        gs.fit(X, y, groups=groups)

    non_group_cvs = [StratifiedKFold(), StratifiedShuffleSplit()]
    for cv in non_group_cvs:
        gs = GridSearchCV(clf, grid, cv=cv)
        # Should not raise an error
        gs.fit(X, y) 
Example 21
def test_grid_search_sparse():
    # Test that grid search works with both dense and sparse matrices
    X_, y_ = make_classification(n_samples=200, n_features=100, random_state=0)

    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180], y_[:180])
    y_pred = cv.predict(X_[180:])
    C = cv.best_estimator_.C

    X_ = sp.csr_matrix(X_)
    clf = LinearSVC()
    cv = GridSearchCV(clf, {'C': [0.1, 1.0]})
    cv.fit(X_[:180].tocoo(), y_[:180])
    y_pred2 = cv.predict(X_[180:])
    C2 = cv.best_estimator_.C

    assert np.mean(y_pred == y_pred2) >= .9
    assert_equal(C, C2) 
Example 22
def test_refit_callable_invalid_type():
    """
    Test that the implementation catches the error when 'best_index_'
    returns an invalid result.
    """
    def refit_callable_invalid_type(cv_results):
        """
        A dummy function whose returned 'best_index_' is not an integer.
        """
        return None

    X, y = make_classification(n_samples=100, n_features=4,
                               random_state=42)

    clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.1, 1]},
                       scoring='precision', refit=refit_callable_invalid_type,
                       cv=5)
    with pytest.raises(TypeError,
                       match='best_index_ returned is not an integer'):
        clf.fit(X, y) 
Example 23
def test_refit_callable_out_bound(out_bound_value, search_cv):
    """
    Test that the implementation catches the error when 'best_index_'
    returns an out-of-bounds result.
    """
    def refit_callable_out_bound(cv_results):
        """
        A dummy function whose returned 'best_index_' is out of bounds.
        """
        return out_bound_value

    X, y = make_classification(n_samples=100, n_features=4,
                               random_state=42)

    clf = search_cv(LinearSVC(random_state=42), {'C': [0.1, 1]},
                    scoring='precision', refit=refit_callable_out_bound, cv=5)
    with pytest.raises(IndexError, match='best_index_ index out of range'):
        clf.fit(X, y) 
Example 24
def test_refit_callable_multi_metric():
    """
    Test refit=callable in a multiple-metric evaluation setting
    """
    def refit_callable(cv_results):
        """
        A dummy function testing the `refit=callable` interface.
        Returns the index of the model with the lowest
        `mean_test_prec`.
        """
        assert 'mean_test_prec' in cv_results
        return cv_results['mean_test_prec'].argmin()

    X, y = make_classification(n_samples=100, n_features=4,
                               random_state=42)
    scoring = {'Accuracy': make_scorer(accuracy_score), 'prec': 'precision'}
    clf = GridSearchCV(LinearSVC(random_state=42), {'C': [0.01, 0.1, 1]},
                       scoring=scoring, refit=refit_callable, cv=5)
    clf.fit(X, y)

    assert clf.best_index_ == 0
    # Ensure `best_score_` is disabled when using `refit=callable`
    assert not hasattr(clf, 'best_score_') 
Example 25
def test_grid_search_cv_results_multimetric():
    X, y = make_classification(n_samples=50, n_features=4, random_state=42)

    n_splits = 3
    params = [dict(kernel=['rbf', ], C=[1, 10], gamma=[0.1, 1]),
              dict(kernel=['poly', ], degree=[1, 2])]

    for iid in (False, True):
        grid_searches = []
        for scoring in ({'accuracy': make_scorer(accuracy_score),
                         'recall': make_scorer(recall_score)},
                        'accuracy', 'recall'):
            grid_search = GridSearchCV(SVC(gamma='scale'), cv=n_splits,
                                       iid=iid, param_grid=params,
                                       scoring=scoring, refit=False)
            grid_search.fit(X, y)
            assert_equal(grid_search.iid, iid)
            grid_searches.append(grid_search)

        compare_cv_results_multimetric_with_single(*grid_searches, iid=iid) 
Example 26
def test_sample_weight():
    n_samples = 100
    X, y = make_classification(n_samples=2 * n_samples, n_features=6,
                               random_state=42)

    sample_weight = np.random.RandomState(seed=42).uniform(size=len(y))
    X_train, y_train, sw_train = \
        X[:n_samples], y[:n_samples], sample_weight[:n_samples]
    X_test = X[n_samples:]

    for method in ['sigmoid', 'isotonic']:
        base_estimator = LinearSVC(random_state=42)
        calibrated_clf = CalibratedClassifierCV(base_estimator, method=method)
        calibrated_clf.fit(X_train, y_train, sample_weight=sw_train)
        probs_with_sw = calibrated_clf.predict_proba(X_test)

        # As the weights are used for the calibration, they should still
        # yield different predictions
        calibrated_clf.fit(X_train, y_train)
        probs_without_sw = calibrated_clf.predict_proba(X_test)

        diff = np.linalg.norm(probs_with_sw - probs_without_sw)
        assert_greater(diff, 0.1) 
Example 27
def test_l1_ratio():
    # Test if l1 ratio extremes match L1 and L2 penalty settings.
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=100, n_informative=20,
                                        random_state=1234)

    # test if elasticnet with l1_ratio near 1 gives same result as pure l1
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None,
                           max_iter=6, l1_ratio=0.9999999999,
                           random_state=42).fit(X, y)
    est_l1 = SGDClassifier(alpha=0.001, penalty='l1', max_iter=6,
                           random_state=42, tol=None).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l1.coef_)

    # test if elasticnet with l1_ratio near 0 gives same result as pure l2
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None,
                           max_iter=6, l1_ratio=0.0000000001,
                           random_state=42).fit(X, y)
    est_l2 = SGDClassifier(alpha=0.001, penalty='l2', max_iter=6,
                           random_state=42, tol=None).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l2.coef_) 
Example 28
def test_sag_classifier_raises_error(solver):
    # Following #13316, the error handling behavior changed in Cython SAG.
    # This is simply a non-regression test to make sure numerical errors are
    # properly raised.

    # Train a classifier on a simple problem
    rng = np.random.RandomState(42)
    X, y = make_classification(random_state=rng)
    clf = LogisticRegression(solver=solver, random_state=rng, warm_start=True)
    clf.fit(X, y)

    # Trigger a numerical error by:
    # - corrupting the fitted coefficients of the classifier
    # - fit it again starting from its current state thanks to warm_start
    clf.coef_[:] = np.nan

    with pytest.raises(ValueError, match="Floating-point under-/overflow"):
        clf.fit(X, y) 
Example 29
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): An iterator that yields train/test splits of
            ``X`` and ``y`` for cross-validation.
    """
    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type == 'iris':
        X, y = datasets.load_iris(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'mnist':
        X, y = datasets.load_digits(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'breast_cancer':
        X, y = datasets.load_breast_cancer(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2, random_state=8).split(X, y)
    elif data_type == 'boston':
        X, y = datasets.load_boston(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    elif data_type == 'diabetes':
        X, y = datasets.load_diabetes(return_X_y=True)
        splits = model_selection.KFold(n_splits=2, random_state=8).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits 
Example 30
def test_early_stopping_loss():
    # Make sure that when scoring is None, early stopping is done w.r.t. the
    # loss. Using scoring='neg_log_loss' and scoring=None should be
    # equivalent since the loss is precisely the negative log-likelihood
    n_samples = int(1e3)
    max_iter = 100
    n_iter_no_change = 5

    X, y = make_classification(n_samples, random_state=0)

    clf_scoring = GradientBoostingClassifier(max_iter=max_iter,
                                             scoring='neg_log_loss',
                                             validation_split=.1,
                                             n_iter_no_change=n_iter_no_change,
                                             tol=1e-4,
                                             verbose=1,
                                             random_state=0)
    clf_scoring.fit(X, y)

    clf_loss = GradientBoostingClassifier(max_iter=max_iter,
                                          scoring=None,
                                          validation_split=.1,
                                          n_iter_no_change=n_iter_no_change,
                                          tol=1e-4,
                                          verbose=1,
                                          random_state=0)
    clf_loss.fit(X, y)

    assert n_iter_no_change < clf_loss.n_iter_ < max_iter
    assert clf_loss.n_iter_ == clf_scoring.n_iter_