Python源码示例:sklearn.datasets.make_multilabel_classification()

示例1
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)

    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap) 
示例2
def test_multilabel_sample_weight_invariance(name):
    # multilabel indicator
    random_state = check_random_state(0)
    _, ya = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=0, n_samples=100,
                                           allow_unlabeled=False)
    _, yb = make_multilabel_classification(n_features=1, n_classes=20,
                                           random_state=1, n_samples=100,
                                           allow_unlabeled=False)
    y_true = np.vstack([ya, yb])
    y_pred = np.vstack([ya, ya])
    y_score = random_state.randint(1, 4, size=y_true.shape)

    metric = ALL_METRICS[name]
    if name in THRESHOLDED_METRICS:
        check_sample_weight_invariance(name, metric, y_true, y_score)
    else:
        check_sample_weight_invariance(name, metric, y_true, y_pred) 
示例3
def test_multilabel_classification():
    # Test that multi-label classification works as expected.
    # test fit method
    X, y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
                        max_iter=150, random_state=0, activation='logistic',
                        learning_rate_init=0.2)
    mlp.fit(X, y)
    assert_greater(mlp.score(X, y), 0.97)

    # test partial fit method
    mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=50, max_iter=150,
                        random_state=0, activation='logistic', alpha=1e-5,
                        learning_rate_init=0.2)
    for i in range(100):
        mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
    assert_greater(mlp.score(X, y), 0.9)

    # Make sure early stopping still work now that spliting is stratified by
    # default (it is disabled for multilabel classification)
    mlp = MLPClassifier(early_stopping=True)
    mlp.fit(X, y).predict(X) 
示例4
def test_predict_proba_multilabel():
    # Test that predict_proba works as expected for multilabel.
    # Multilabel should not use softmax which makes probabilities sum to 1
    X, Y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    n_samples, n_classes = Y.shape

    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
                        random_state=0)
    clf.fit(X, Y)
    y_proba = clf.predict_proba(X)

    assert_equal(y_proba.shape, (n_samples, n_classes))
    assert_array_equal(y_proba > 0.5, Y)

    y_log_proba = clf.predict_log_proba(X)
    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert_greater((y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1), 1e-10)
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba)) 
示例5
def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert clf.multilabel_
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec,
                            decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall,
                            decimal=2) 
示例6
def test_make_multilabel_classification_return_indicator():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=25, n_features=20,
                                              n_classes=3, random_state=0,
                                              allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        assert np.all(np.sum(Y, axis=0) > min_length)

    # Also test return_distributions and return_indicator with True
    X2, Y2, p_c, p_w_c = make_multilabel_classification(
        n_samples=25, n_features=20, n_classes=3, random_state=0,
        allow_unlabeled=allow_unlabeled, return_distributions=True)

    assert_array_almost_equal(X, X2)
    assert_array_equal(Y, Y2)
    assert_equal(p_c.shape, (3,))
    assert_almost_equal(p_c.sum(), 1)
    assert_equal(p_w_c.shape, (20, 3))
    assert_almost_equal(p_w_c.sum(axis=0), [1] * 3) 
示例7
def test_sparse_multilabel_targets(n_neighbors, n_jobs):
    X, y_dense = make_multilabel_classification(random_state=123)
    thresh = 80

    knn = KNeighborsClassifier(n_neighbors=n_neighbors,
                               n_jobs=n_jobs, )
    assert not issparse(y_dense)
    knn.fit(X[:thresh], y_dense[:thresh])
    y_pred = knn.predict(X[thresh:])

    y_sparse = csr_matrix(y_dense)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors,
                               n_jobs=n_jobs,)
    assert issparse(y_sparse)
    knn.fit(X[:thresh], y_sparse[:thresh])
    y_pred_sparse = knn.predict(X[thresh:, :])

    # Test array equality
    np.testing.assert_array_equal(y_pred, y_pred_sparse.toarray()) 
示例8
def generate_classification(self, num_classes, num_features, num_samples, test_split=0.1, seed=0):
        """Generate a classification task
        
        Arguments:
            num_classes {int} -- Number of classes
            num_features {int} -- Number of features
            num_samples {int} -- Number of samples
        
        Keyword Arguments:
            test_split {float} -- Size of test split (default: {0.1})
            seed {int} -- A random seed (default: {0})
        """
        #X, Y = make_classification(n_samples=800, n_features=num_feats, n_classes=num_classes, n_informative=4)
        X, y = make_multilabel_classification(
            n_samples=num_samples, n_features=num_features, n_classes=num_classes, n_labels=0.01,
            length=50, allow_unlabeled=False, sparse=False, return_indicator='dense',
            return_distributions=False, random_state=seed
        )
        Y = np.argmax(y, axis=1)
        self.categorical_features = [False] * num_features
        self.problem_type = ProblemType.FeatureClassification
        self.X, self.Y = X, Y
        self._split_data(test_split, seed) 
示例9
def check_alternative_lrap_implementation(lrap_score, n_classes=5,
                                          n_samples=20, random_state=0):
    _, y_true = make_multilabel_classification(n_features=1,
                                               allow_unlabeled=False,
                                               random_state=random_state,
                                               n_classes=n_classes,
                                               n_samples=n_samples)

    # Score with ties
    y_score = sparse_random_matrix(n_components=y_true.shape[0],
                                   n_features=y_true.shape[1],
                                   random_state=random_state)

    if hasattr(y_score, "toarray"):
        y_score = y_score.toarray()
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap)

    # Uniform score
    random_state = check_random_state(random_state)
    y_score = random_state.uniform(size=(n_samples, n_classes))
    score_lrap = label_ranking_average_precision_score(y_true, y_score)
    score_my_lrap = _my_lrap(y_true, y_score)
    assert_almost_equal(score_lrap, score_my_lrap) 
示例10
def test_multilabel_classification():
    # Test that multi-label classification works as expected.
    # test fit method
    X, y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=50, alpha=1e-5,
                        max_iter=150, random_state=0, activation='logistic',
                        learning_rate_init=0.2)
    mlp.fit(X, y)
    assert_equal(mlp.score(X, y), 1)

    # test partial fit method
    mlp = MLPClassifier(solver='sgd', hidden_layer_sizes=50, max_iter=150,
                        random_state=0, activation='logistic', alpha=1e-5,
                        learning_rate_init=0.2)
    for i in range(100):
        mlp.partial_fit(X, y, classes=[0, 1, 2, 3, 4])
    assert_greater(mlp.score(X, y), 0.9) 
示例11
def test_predict_proba_multilabel():
    # Test that predict_proba works as expected for multilabel.
    # Multilabel should not use softmax which makes probabilities sum to 1
    X, Y = make_multilabel_classification(n_samples=50, random_state=0,
                                          return_indicator=True)
    n_samples, n_classes = Y.shape

    clf = MLPClassifier(solver='lbfgs', hidden_layer_sizes=30,
                        random_state=0)
    clf.fit(X, Y)
    y_proba = clf.predict_proba(X)

    assert_equal(y_proba.shape, (n_samples, n_classes))
    assert_array_equal(y_proba > 0.5, Y)

    y_log_proba = clf.predict_log_proba(X)
    proba_max = y_proba.argmax(axis=1)
    proba_log_max = y_log_proba.argmax(axis=1)

    assert_greater((y_proba.sum(1) - 1).dot(y_proba.sum(1) - 1), 1e-10)
    assert_array_equal(proba_max, proba_log_max)
    assert_array_equal(y_log_proba, np.log(y_proba)) 
示例12
def test_ovr_multilabel_dataset():
    base_clf = MultinomialNB(alpha=1)
    for au, prec, recall in zip((True, False), (0.51, 0.66), (0.51, 0.80)):
        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=2,
                                                       length=50,
                                                       allow_unlabeled=au,
                                                       random_state=0)
        X_train, Y_train = X[:80], Y[:80]
        X_test, Y_test = X[80:], Y[80:]
        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        assert_true(clf.multilabel_)
        assert_almost_equal(precision_score(Y_test, Y_pred, average="micro"),
                            prec,
                            decimal=2)
        assert_almost_equal(recall_score(Y_test, Y_pred, average="micro"),
                            recall,
                            decimal=2) 
示例13
def test_make_multilabel_classification_return_indicator():
    for allow_unlabeled, min_length in zip((True, False), (0, 1)):
        X, Y = make_multilabel_classification(n_samples=25, n_features=20,
                                              n_classes=3, random_state=0,
                                              allow_unlabeled=allow_unlabeled)
        assert_equal(X.shape, (25, 20), "X shape mismatch")
        assert_equal(Y.shape, (25, 3), "Y shape mismatch")
        assert_true(np.all(np.sum(Y, axis=0) > min_length))

    # Also test return_distributions and return_indicator with True
    X2, Y2, p_c, p_w_c = make_multilabel_classification(
        n_samples=25, n_features=20, n_classes=3, random_state=0,
        allow_unlabeled=allow_unlabeled, return_distributions=True)

    assert_array_equal(X, X2)
    assert_array_equal(Y, Y2)
    assert_equal(p_c.shape, (3,))
    assert_almost_equal(p_c.sum(), 1)
    assert_equal(p_w_c.shape, (20, 3))
    assert_almost_equal(p_w_c.sum(axis=0), [1] * 3) 
示例14
def setup_mlc_dataset(self):
        X, Y = datasets.make_multilabel_classification(
                n_features=5, random_state=1126)
        return Dataset(X, Y) 
示例15
def setUp(self):
        X, Y = datasets.make_multilabel_classification(random_state=1126)
        self.X_train, self.X_test, self.Y_train, self.Y_test = \
            train_test_split(X, Y, test_size=0.3, random_state=1126) 
示例16
def split_train_test(test_size):
    # choose a dataset with unbalanced class instances
    data = make_multilabel_classification(
        n_samples=300, n_classes=10, allow_unlabeled=False)
    X = StandardScaler().fit_transform(data[0])
    Y = data[1]

    X_trn, X_tst, Y_trn, Y_tst = train_test_split(X, Y, test_size=test_size)

    trn_ds = Dataset(X_trn, Y_trn[:5].tolist() + [None] * (len(Y_trn) - 5))
    tst_ds = Dataset(X_tst, Y_tst.tolist())

    fully_labeled_trn_ds = Dataset(X_trn, Y_trn)

    return trn_ds, tst_ds, fully_labeled_trn_ds 
示例17
def _prepare_for_use(self):
        self._random_state = check_random_state(self.random_state)
        self.X, self.y = make_multilabel_classification(n_samples=self.n_samples,
                                                        n_features=self.n_features,
                                                        n_classes=self.n_targets,
                                                        n_labels=self.n_labels,
                                                        random_state=self._random_state)
        self.target_names = ["target_" + str(i) for i in range(self.n_targets)]
        self.feature_names = ["att_num_" + str(i) for i in range(self.n_num_features)]
        self.target_values = np.unique(self.y).tolist() if self.n_targets == 1 else \
            [np.unique(self.y[:, i]).tolist() for i in range(self.n_targets)] 
示例18
def test_multilabel():
    """Check if error is raised for multilabel classification."""
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          random_state=123)
    clf = OneVsRestClassifier(SVC(kernel='linear'))

    eclf = VotingClassifier(estimators=[('ovr', clf)], voting='hard')

    try:
        eclf.fit(X, y)
    except NotImplementedError:
        return 
示例19
def test_sparse_input(EstimatorClass, sparse_matrix):
    y, X = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50,
                                                   n_features=1,
                                                   n_classes=20)
    y = y[:, 0]

    check_sparse_input(EstimatorClass, X, sparse_matrix(X), y) 
示例20
def test_random_hasher_sparse_data():
    X, y = datasets.make_multilabel_classification(random_state=0)
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X_transformed = hasher.fit_transform(X)
    X_transformed_sparse = hasher.fit_transform(csc_matrix(X))
    assert_array_equal(X_transformed_sparse.toarray(), X_transformed.toarray()) 
示例21
def test_sparse_input(name, sparse_matrix):
    X, y = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50)

    check_sparse_input(name, X, sparse_matrix(X), y) 
示例22
def test_presort_sparse():
    ests = (DecisionTreeClassifier(presort=True),
            DecisionTreeRegressor(presort=True))
    sparse_matrices = (csr_matrix, csc_matrix, coo_matrix)

    y, X = datasets.make_multilabel_classification(random_state=0,
                                                   n_samples=50,
                                                   n_features=1,
                                                   n_classes=20)
    y = y[:, 0]

    for est, sparse_matrix in product(ests, sparse_matrices):
        check_presort_sparse(est, sparse_matrix(X), y) 
示例23
def test_multilabel_representation_invariance():
    # Generate some data
    n_classes = 4
    n_samples = 50

    _, y1 = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                           random_state=0, n_samples=n_samples,
                                           allow_unlabeled=True)
    _, y2 = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                           random_state=1, n_samples=n_samples,
                                           allow_unlabeled=True)

    # To make sure at least one empty label is present
    y1 = np.vstack([y1, [[0] * n_classes]])
    y2 = np.vstack([y2, [[0] * n_classes]])

    y1_sparse_indicator = sp.coo_matrix(y1)
    y2_sparse_indicator = sp.coo_matrix(y2)

    for name in MULTILABELS_METRICS:
        metric = ALL_METRICS[name]

        # XXX cruel hack to work with partial functions
        if isinstance(metric, partial):
            metric.__module__ = 'tmp'
            metric.__name__ = name

        measure = metric(y1, y2)

        # Check representation invariance
        assert_allclose(metric(y1_sparse_indicator, y2_sparse_indicator),
                        measure,
                        err_msg="%s failed representation invariance between "
                                "dense and sparse indicator formats." % name) 
示例24
def test_normalize_option_multilabel_classification():
    # Test in the multilabel case
    n_classes = 4
    n_samples = 100

    # for both random_state 0 and 1, y_true and y_pred has at least one
    # unlabelled entry
    _, y_true = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=0,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)
    _, y_pred = make_multilabel_classification(n_features=1,
                                               n_classes=n_classes,
                                               random_state=1,
                                               allow_unlabeled=True,
                                               n_samples=n_samples)

    # To make sure at least one empty label is present
    y_true += [0]*n_classes
    y_pred += [0]*n_classes

    for name in METRICS_WITH_NORMALIZE_OPTION:
        metrics = ALL_METRICS[name]
        measure = metrics(y_true, y_pred, normalize=True)
        assert_array_less(-1.0 * measure, 0,
                          err_msg="We failed to test correctly the normalize "
                                  "option")
        assert_allclose(metrics(y_true, y_pred, normalize=False) / n_samples,
                        measure, err_msg="Failed with %s" % name) 
示例25
def test_averaging_multilabel(name):
    n_samples, n_classes = 40, 5
    _, y = make_multilabel_classification(n_features=1, n_classes=n_classes,
                                          random_state=5, n_samples=n_samples,
                                          allow_unlabeled=False)
    y_true = y[:20]
    y_pred = y[20:]
    y_score = check_random_state(0).normal(size=(20, n_classes))
    y_true_binarize = y_true
    y_pred_binarize = y_pred

    check_averaging(name, y_true, y_true_binarize,
                    y_pred, y_pred_binarize, y_score) 
示例26
def test_thresholded_scorers_multilabel_indicator_data():
    # Test that the scorer work with multilabel-indicator format
    # for multilabel and multi-output multi-class classifier
    X, y = make_multilabel_classification(allow_unlabeled=False,
                                          random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Multi-output multi-class predict_proba
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    y_proba = clf.predict_proba(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p[:, -1] for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multi-output multi-class decision_function
    # TODO Is there any yet?
    clf = DecisionTreeClassifier()
    clf.fit(X_train, y_train)
    clf._predict_proba = clf.predict_proba
    clf.predict_proba = None
    clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]

    y_proba = clf.decision_function(X_test)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
    assert_almost_equal(score1, score2)

    # Multilabel predict_proba
    clf = OneVsRestClassifier(DecisionTreeClassifier())
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.predict_proba(X_test))
    assert_almost_equal(score1, score2)

    # Multilabel decision function
    clf = OneVsRestClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    score1 = get_scorer('roc_auc')(clf, X_test, y_test)
    score2 = roc_auc_score(y_test, clf.decision_function(X_test))
    assert_almost_equal(score1, score2) 
示例27
def test_cross_val_predict_sparse_prediction():
    # check that cross_val_predict gives same result for sparse and dense input
    X, y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=False,
                                          return_indicator=True,
                                          random_state=1)
    X_sparse = csr_matrix(X)
    y_sparse = csr_matrix(y)
    classif = OneVsRestClassifier(SVC(kernel='linear'))
    preds = cross_val_predict(classif, X, y, cv=10)
    preds_sparse = cross_val_predict(classif, X_sparse, y_sparse, cv=10)
    preds_sparse = preds_sparse.toarray()
    assert_array_almost_equal(preds_sparse, preds) 
示例28
def test_cross_val_predict_with_method_multilabel_rf():
    # The RandomForest allows multiple classes in each label.
    # Output of predict_proba is a list of outputs of predict_proba
    # for each individual label.
    n_classes = 4
    X, y = make_multilabel_classification(n_samples=100, n_labels=3,
                                          n_classes=n_classes, n_features=5,
                                          random_state=42)
    y[:, 0] += y[:, 1]  # Put three classes in the first column
    for method in ['predict_proba', 'predict_log_proba', 'decision_function']:
        est = RFWithDecisionFunction(n_estimators=5, random_state=0)
        with warnings.catch_warnings():
            # Suppress "RuntimeWarning: divide by zero encountered in log"
            warnings.simplefilter('ignore')
            check_cross_val_predict_multilabel(est, X, y, method=method) 
示例29
def test_ovr_fit_predict_sparse():
    for sparse in [sp.csr_matrix, sp.csc_matrix, sp.coo_matrix, sp.dok_matrix,
                   sp.lil_matrix]:
        base_clf = MultinomialNB(alpha=1)

        X, Y = datasets.make_multilabel_classification(n_samples=100,
                                                       n_features=20,
                                                       n_classes=5,
                                                       n_labels=3,
                                                       length=50,
                                                       allow_unlabeled=True,
                                                       random_state=0)

        X_train, Y_train = X[:80], Y[:80]
        X_test = X[80:]

        clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)
        Y_pred = clf.predict(X_test)

        clf_sprs = OneVsRestClassifier(base_clf).fit(X_train, sparse(Y_train))
        Y_pred_sprs = clf_sprs.predict(X_test)

        assert clf.multilabel_
        assert sp.issparse(Y_pred_sprs)
        assert_array_equal(Y_pred_sprs.toarray(), Y_pred)

        # Test predict_proba
        Y_proba = clf_sprs.predict_proba(X_test)

        # predict assigns a label if the probability that the
        # sample has the label is greater than 0.5.
        pred = Y_proba > .5
        assert_array_equal(pred, Y_pred_sprs.toarray())

        # Test decision_function
        clf = svm.SVC(gamma="scale")
        clf_sprs = OneVsRestClassifier(clf).fit(X_train, sparse(Y_train))
        dec_pred = (clf_sprs.decision_function(X_test) > 0).astype(int)
        assert_array_equal(dec_pred, clf_sprs.predict(X_test).toarray()) 
示例30
def test_ridge_classifier_no_support_multilabel():
    X, y = make_multilabel_classification(n_samples=10, random_state=0)
    assert_raises(ValueError, RidgeClassifier().fit, X, y)