Python源码示例:sklearn.ensemble.IsolationForest()

示例1
def __init__(self, hybrid=False, n_estimators=100, max_samples='auto', contamination=0.1, n_jobs=-1, seed=None,
             **kwargs):
    """Set up the Isolation Forest wrapper and its underlying estimator.

    The hyperparameters are stored on the instance and forwarded to
    ``IsolationForest``; ``seed`` is passed as ``random_state`` and any extra
    keyword arguments are handed through unchanged.
    """
    # Keep the hyperparameters on the instance.
    self.n_estimators = n_estimators
    self.max_samples = max_samples
    self.contamination = contamination
    self.n_jobs = n_jobs
    self.seed = seed

    self.model = IsolationForest(
        n_estimators=n_estimators,
        max_samples=max_samples,
        contamination=contamination,
        n_jobs=n_jobs,
        random_state=seed,
        **kwargs
    )

    # Hybrid mode: ``ae_net`` is the slot for an autoencoder network.
    self.hybrid = hybrid
    self.ae_net = None

    # Result slots, all initialised to None.
    self.results = dict.fromkeys(
        ('train_time', 'test_time', 'test_auc', 'test_scores'))
示例2
def test_iforest_sparse():
    """Sparse and dense inputs must yield identical IsolationForest predictions."""
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    param_grid = ParameterGrid({"max_samples": [0.5, 1.0],
                                "bootstrap": [True, False]})

    def predict_after_fit(train, test, params):
        # Same estimator count and seed for both data representations.
        forest = IsolationForest(n_estimators=10, random_state=1, **params)
        return forest.fit(train).predict(test)

    for to_sparse in (csc_matrix, csr_matrix):
        train_sparse = to_sparse(X_train)
        test_sparse = to_sparse(X_test)

        for params in param_grid:
            from_sparse = predict_after_fit(train_sparse, test_sparse, params)
            from_dense = predict_after_fit(X_train, X_test, params)
            assert_array_equal(from_sparse, from_dense)
示例3
def test_iforest_performance():
    """Check that IsolationForest ranks uniform outliers above inliers.

    The model is fitted on the first 100 of 120 normally distributed samples
    and scored on the 20 held-out samples plus 20 uniformly drawn outliers.
    The ROC AUC of the resulting anomaly scores must exceed 0.98.
    """
    # Generate train/test data: the first 100 samples are used for training,
    # the remaining 20 serve as normal test observations.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = - clf.decision_function(X_test)

    # check that the anomaly scores separate outliers from inliers
    # (ROC AUC above 0.98)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
示例4
def test_deprecation():
    """Check the deprecation warnings emitted by IsolationForest."""
    X = [[0.0], [1.0]]
    clf = IsolationForest()

    # Fitting with default settings must emit both FutureWarning messages.
    future_messages = (
        'default contamination parameter 0.1 will change '
        'in version 0.22 to "auto"',
        'behaviour="old" is deprecated and will be removed '
        'in version 0.22',
    )
    for message in future_messages:
        assert_warns_message(FutureWarning, message, clf.fit, X)

    # Reading ``threshold_`` on a fitted estimator must warn as well.
    fitted = IsolationForest().fit(X)
    assert_warns_message(DeprecationWarning,
                         "threshold_ attribute is deprecated in 0.20 and will"
                         " be removed in 0.22.",
                         getattr, fitted, "threshold_")
示例5
def sample_hyps_iso_forest(nest, contam, boot):
    """Build an unfitted IsolationForest from the given hyperparameters.

    :param nest: value passed as ``n_estimators``.
    :param contam: value passed as ``contamination``.
    :param boot: value passed as ``bootstrap``.
    :return: An IsolationForest object with specified hyperparameters, used to detect anomaly.
    """
    # The remaining settings are fixed: 'auto' sample size, all features
    # (max_features=1.0), n_jobs=-1 and verbosity 0.
    return IsolationForest(
        n_estimators=nest,
        max_samples='auto',
        contamination=contam,
        max_features=1.0,
        bootstrap=boot,
        n_jobs=-1,
        verbose=0,
    )
示例6
def run_isolation_forest(features, id_list, fraction_of_outliers=.3):
    """Return the ids whose Isolation Forest score falls below the cut-off.

    The forest is fitted on ``features`` with a fixed seed (1984) and every
    sample is scored with ``decision_function``. The cut-off is the
    ``100 * fraction_of_outliers`` percentile of those scores, and
    ``id_list`` is indexed with the resulting boolean mask.
    """
    detector = IsolationForest(
        max_samples=features.shape[0],
        contamination=fraction_of_outliers,
        random_state=np.random.RandomState(1984),
    )
    detector.fit(features)
    scores = detector.decision_function(features)

    cutoff = stats.scoreatpercentile(scores, 100 * fraction_of_outliers)
    is_outlier = scores < cutoff
    return id_list[is_outlier]
示例7
def test_iforest_sparse():
    """Compare IsolationForest predictions on sparse versus dense input."""
    rng = check_random_state(0)
    X_train, X_test, _, _ = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)
    settings_grid = ParameterGrid({"max_samples": [0.5, 1.0],
                                   "bootstrap": [True, False]})

    for convert in (csc_matrix, csr_matrix):
        sparse_train = convert(X_train)
        sparse_test = convert(X_test)

        for settings in settings_grid:
            # Model fitted and evaluated on the sparse representation
            sparse_model = IsolationForest(
                n_estimators=10, random_state=1, **settings)
            sparse_pred = sparse_model.fit(sparse_train).predict(sparse_test)

            # Model fitted and evaluated on the dense representation
            dense_model = IsolationForest(
                n_estimators=10, random_state=1, **settings)
            dense_pred = dense_model.fit(X_train).predict(X_test)

            assert_array_equal(sparse_pred, dense_pred)
示例8
def test_iforest_error():
    """Check that invalid ``max_samples`` values are rejected or warned about."""
    X = iris.data

    def assert_fit_raises(max_samples):
        # Fitting with this ``max_samples`` value must raise ValueError.
        assert_raises(ValueError,
                      IsolationForest(max_samples=max_samples).fit, X)

    # Test max_samples
    for invalid in (-1, 0.0, 2.0):
        assert_fit_raises(invalid)

    # The dataset has less than 256 samples, explicitly setting
    # max_samples > n_samples should result in a warning. If not set
    # explicitly there should be no warning
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         IsolationForest(max_samples=1000).fit, X)
    for silent in ('auto', np.int64(2)):
        assert_no_warnings(IsolationForest(max_samples=silent).fit, X)

    for invalid in ('foobar', 1.5):
        assert_fit_raises(invalid)
示例9
def test_iforest_performance():
    """Check that IsolationForest ranks uniform outliers above inliers.

    The model is fitted on the first 100 of 120 normally distributed samples
    and scored on the 20 held-out samples plus 20 uniformly drawn outliers.
    The ROC AUC of the resulting anomaly scores must exceed 0.98.
    """
    # Generate train/test data: the first 100 samples are used for training,
    # the remaining 20 serve as normal test observations.
    rng = check_random_state(2)
    X = 0.3 * rng.randn(120, 2)
    X_train = X[:100]

    # Generate some abnormal novel observations
    X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))
    X_test = np.r_[X[100:], X_outliers]
    y_test = np.array([0] * 20 + [1] * 20)

    # fit the model
    clf = IsolationForest(max_samples=100, random_state=rng).fit(X_train)

    # predict scores (the lower, the more normal)
    y_pred = - clf.decision_function(X_test)

    # check that the anomaly scores separate outliers from inliers
    # (ROC AUC above 0.98)
    assert_greater(roc_auc_score(y_test, y_pred), 0.98)
示例10
def __init__(self, _id, _config):
    """Initialise the base class and store the configured sample count."""
    super(IsolationForest, self).__init__(_id, _config)
    # 'nb_samples' is coerced to int before being stored.
    nb_samples = _config['nb_samples']
    self._nb_samples = int(nb_samples)
示例11
def get_default_config():
    """Return the default configuration: module name and sample count."""
    return dict(module=IsolationForest.__name__, nb_samples=N_SAMPLES)
示例12
def _get_best_detector(self, train):
    """Fit a default-configured ``ensemble.IsolationForest`` on ``train``.

    Returns the fitted detector.
    """
    forest = ensemble.IsolationForest()
    forest.fit(train)
    return forest
示例13
def setUp(self):
    """Create the IsolationForest module under test with a fake config."""
    super(TestIsolationForest, self).setUp()
    fake_config = {"module": "fake", "nb_samples": 1000}
    self.if_sml = isolation_forest.IsolationForest("fakeid", fake_config)
示例14
def test_learn_structure(self):
    """``learn_structure`` must return an ``ensemble.IsolationForest``."""
    samples = self.get_testing_data()
    learned = self.if_sml.learn_structure(samples)
    self.assertIsInstance(learned, ensemble.IsolationForest)
示例15
def test_iforest():
    """Smoke-test fit/predict over a grid of IsolationForest parameters."""
    X_train = np.array([[0, 1], [1, 2]])
    X_test = np.array([[2, 1], [1, 1]])

    param_grid = ParameterGrid({"n_estimators": [3],
                                "max_samples": [0.5, 1.0, 3],
                                "bootstrap": [True, False]})

    # Only successful completion matters here; warnings are suppressed.
    with ignore_warnings():
        for params in param_grid:
            forest = IsolationForest(random_state=rng, **params)
            forest.fit(X_train).predict(X_test)
示例16
def test_iforest_error():
    """Test that it gives proper exception on deficient input.

    Covers invalid ``max_samples`` values, the warning issued when
    ``max_samples`` exceeds the number of samples, a feature-count mismatch
    at predict time, and access to ``threshold_`` with ``behaviour='new'``.
    """
    # Local import: ``pytest.warns(None)`` is deprecated in pytest 7 and
    # removed in pytest 8, so warnings are recorded with the standard library.
    import warnings

    X = iris.data

    def fit_user_warnings(max_samples):
        # Fit with the given ``max_samples`` and return the recorded
        # warnings whose category is a UserWarning subclass.
        with warnings.catch_warnings(record=True) as record:
            warnings.simplefilter("always")
            IsolationForest(max_samples=max_samples).fit(X)
        return [each for each in record
                if issubclass(each.category, UserWarning)]

    # Test max_samples
    assert_raises(ValueError,
                  IsolationForest(max_samples=-1).fit, X)
    assert_raises(ValueError,
                  IsolationForest(max_samples=0.0).fit, X)
    assert_raises(ValueError,
                  IsolationForest(max_samples=2.0).fit, X)
    # The dataset has less than 256 samples, explicitly setting
    # max_samples > n_samples should result in a warning. If not set
    # explicitly there should be no warning
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         IsolationForest(max_samples=1000).fit, X)
    # note that assert_no_warnings does not apply since it enables a
    # PendingDeprecationWarning triggered by scipy.sparse's use of
    # np.matrix. See issue #11251.
    assert len(fit_user_warnings('auto')) == 0
    assert len(fit_user_warnings(np.int64(2))) == 0

    assert_raises(ValueError, IsolationForest(max_samples='foobar').fit, X)
    assert_raises(ValueError, IsolationForest(max_samples=1.5).fit, X)

    # test X_test n_features match X_train one:
    assert_raises(ValueError, IsolationForest().fit(X).predict, X[:, 1:])

    # test threshold_ attribute error when behaviour is not old:
    msg = "threshold_ attribute does not exist when behaviour != 'old'"
    assert_raises_regex(AttributeError, msg, getattr,
                        IsolationForest(behaviour='new'), 'threshold_')
示例17
def test_recalculate_max_depth():
    """Check max_depth recalculation when max_samples is reset to n_samples"""
    X = iris.data
    forest = IsolationForest().fit(X)
    # Every tree is expected to be capped at ceil(log2(n_samples)).
    expected_depth = int(np.ceil(np.log2(X.shape[0])))
    for tree in forest.estimators_:
        assert_equal(tree.max_depth, expected_depth)
示例18
def test_max_samples_attribute():
    """Check the fitted ``max_samples_`` value for several ``max_samples``."""
    X = iris.data
    n_samples = X.shape[0]

    # Default setting: expected to equal the number of samples here.
    forest = IsolationForest().fit(X)
    assert_equal(forest.max_samples_, n_samples)

    # A value above n_samples must warn; n_samples is expected afterwards.
    forest = IsolationForest(max_samples=500)
    assert_warns_message(UserWarning,
                         "max_samples will be set to n_samples for estimation",
                         forest.fit, X)
    assert_equal(forest.max_samples_, n_samples)

    # A float is expected to act as a fraction of n_samples.
    forest = IsolationForest(max_samples=0.4).fit(X)
    assert_equal(forest.max_samples_, 0.4 * n_samples)
示例19
def test_iforest_works(contamination):
    """Check that the two outliers of a toy data set are detected."""
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    forest = IsolationForest(
        behaviour="new", random_state=rng, contamination=contamination
    )
    forest.fit(X)
    anomaly_score = -forest.decision_function(X)
    labels = forest.predict(X)

    # assert detect outliers: both outliers must score above every inlier
    inlier_scores = anomaly_score[:-2]
    outlier_scores = anomaly_score[-2:]
    assert_greater(np.min(outlier_scores), np.max(inlier_scores))
    assert_array_equal(labels, 6 * [1] + 2 * [-1])
示例20
def test_max_samples_consistency():
    """``max_samples_`` and ``_max_samples`` must agree after fitting."""
    # Make sure validated max_samples in iforest and BaseBagging are identical
    fitted = IsolationForest().fit(iris.data)
    assert_equal(fitted.max_samples_, fitted._max_samples)
示例21
def test_iforest_subsampled_features():
    """Fit and predict with ``max_features=0.8`` without raising."""
    # It tests non-regression for #5732 which failed at predict.
    rng = check_random_state(0)
    X_train, X_test, y_train, _ = train_test_split(
        boston.data[:50], boston.target[:50], random_state=rng)

    forest = IsolationForest(max_features=0.8)
    forest.fit(X_train, y_train)
    forest.predict(X_test)
示例22
def test_score_samples():
    """Check ``score_samples`` against ``decision_function + offset_``."""
    X_train = [[1, 1], [1, 2], [2, 1]]
    sample = [[2., 2.]]
    clf1 = IsolationForest(contamination=0.1).fit(X_train)
    clf2 = IsolationForest().fit(X_train)

    # For each estimator, score_samples == decision_function + offset_.
    for clf in (clf1, clf2):
        assert_array_equal(clf.score_samples(sample),
                           clf.decision_function(sample) + clf.offset_)

    # Both estimators must give the same score_samples value.
    assert_array_equal(clf1.score_samples(sample),
                       clf2.score_samples(sample))
示例23
def test_behaviour_param():
    """``behaviour='old'`` and ``'new'`` must give equal decision values."""
    X_train = [[1, 1], [1, 2], [2, 1]]
    sample = [[2., 2.]]
    old_style = IsolationForest(behaviour='old').fit(X_train)
    new_style = IsolationForest(behaviour='new',
                                contamination='auto').fit(X_train)
    assert_array_equal(old_style.decision_function(sample),
                       new_style.decision_function(sample))


# mock get_chunk_n_rows to actually test more than one chunk (here one
# chunk = 3 rows):
示例24
def build_model(self, outlier_ratio=0.1, n_estimators=100, max_samples='auto'):
    """Create an unfitted IsolationForest and store it as ``self.model``.

    ``outlier_ratio`` is passed as ``contamination``; ``behaviour`` is
    always set to 'new'.
    """
    forest_options = {
        'contamination': outlier_ratio,
        'n_estimators': n_estimators,
        'max_samples': max_samples,
        'behaviour': 'new',
    }
    self.model = IsolationForest(**forest_options)
示例25
def _fit(self, X):
    """Fit an IsolationForest on ``X`` using this object's hyperparameters.

    The result of ``fit`` is stored as ``self.estimator_``; returns ``self``.
    """
    forest = IsolationForest(
        behaviour='new',
        bootstrap=self.bootstrap,
        contamination=self.contamination,
        max_features=self.max_features,
        max_samples=self.max_samples,
        n_estimators=self.n_estimators,
        n_jobs=self.n_jobs,
        random_state=self.random_state,
    )
    self.estimator_ = forest.fit(X)

    return self
示例26
def fit(self):
    """Fit an IsolationForest on ``self.X`` and keep it as ``self.ift``."""
    forest = IsolationForest(
        n_estimators=self.nestimators,
        contamination=self.contamination,
    )
    forest.fit(self.X)
    # Stored only after fitting has succeeded.
    self.ift = forest
示例27
def fit(self, X, y=None, sample_weight=None):
    """Fit the wrapped IsolationForest and mirror its fitted attributes.

    A new forest is built from this object's hyperparameters, stored as
    ``self.ifor`` and fitted on ``X``. Its ``estimators_`` and
    ``estimators_features_`` are copied onto this object and
    ``self.updated`` is reset to False.
    """
    hyperparameter_names = (
        'n_estimators', 'max_samples', 'contamination', 'max_features',
        'bootstrap', 'n_jobs', 'random_state', 'verbose',
    )
    forest_kwargs = {name: getattr(self, name)
                     for name in hyperparameter_names}

    self.ifor = IsolationForest(**forest_kwargs)
    self.ifor.fit(X, y, sample_weight)

    self.estimators_ = self.ifor.estimators_
    self.estimators_features_ = self.ifor.estimators_features_
    self.updated = False
示例28
def _multiview_fit(self, X, y, feature_partitions, n_estimators_view):
    """Fit one isolation forest per feature view and collect wrapped trees.

    ``feature_partitions`` gives the number of consecutive columns of ``X``
    in each view and ``n_estimators_view`` the number of trees to build for
    it. Returns a list with one entry per view, each a list of
    ``IForestMultiviewTree`` objects (empty when the view's tree count is
    not positive).
    """
    n_features = X.shape[1]

    logger.debug("IForestMultiview n_estimators_view: %s" % str(list(n_estimators_view)))

    estimators_group = []
    view_start = 0
    for view_width, view_n_trees in zip(feature_partitions, n_estimators_view):
        view_end = view_start + view_width
        X_view = X[:, view_start:view_end]

        view_estimators = []
        if view_n_trees > 0:
            # Build the isolation forest on this view's feature subset only.
            view_forest = IsolationForest(
                n_estimators=view_n_trees,
                max_samples=self.max_samples,
                contamination=self.contamination,
                max_features=self.max_features,
                bootstrap=self.bootstrap,
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose,
            )
            view_forest.fit(X_view, y, sample_weight=None)

            for fitted_tree in view_forest.estimators_:
                # The IsolationForest trees contain read-only properties, so
                # their contents are copied into the custom tree structure,
                # which can be modified if needed.
                mv_tree = IForestMultiviewTree(n_features=n_features,
                                               ifor_tree=fitted_tree.tree_)

                # Shift the node feature indexes by the view's column offset.
                mv_tree.tree_.feature += view_start

                view_estimators.append(mv_tree)

        estimators_group.append(view_estimators)
        view_start = view_end

    return estimators_group
示例29
def get_iso_model(x, y, opts):
    """Fit an IsolationForest on ``x`` and return it with its scores.

    ``y`` is accepted but not used. The forest uses 100 trees, 256 samples
    per tree, a contamination of 0.1 and ``opts.randseed`` as random state.
    Returns ``(model, scores)`` where ``scores`` is the decision function of
    ``x`` reshaped to a single column.
    """
    iso_model = IsolationForest(
        n_estimators=100,
        max_samples=256,
        contamination=0.1,
        random_state=opts.randseed,
    )
    iso_model.fit(x)
    scores = iso_model.decision_function(x)
    return iso_model, np.reshape(scores, (-1, 1))
示例30
def __init__(self,options):
    """Validate the user-supplied parameters and build the estimator.

    ``options`` is passed to ``self.handle_options`` and its optional
    'params' entry is converted with ``convert_params``. The 'anomaly_score'
    parameter (default True) is removed from the converted parameters and
    stored as ``self.return_scores``; the rest are forwarded to
    ``_IsolationForest``.

    Raises:
        RuntimeError: if ``n_estimators`` is not positive, if
            ``max_samples`` or ``max_features`` is outside (0.0, 1.0], or
            if ``contamination`` is outside (0.0, 0.5].
    """
    self.handle_options(options)
    out_params = convert_params(
        options.get('params',{}),
        ints = ['n_estimators','n_jobs','random_state','verbose'],
        floats = ['max_samples','contamination','max_features'],
        bools = ['bootstrap']
        )
    self.return_scores = out_params.pop('anomaly_score', True)

    # whitelist n_estimators > 0
    if 'n_estimators' in out_params and out_params['n_estimators']<=0:
        msg = 'Invalid value error: n_estimators must be greater than 0 and an integer, but found n_estimators="{}".'
        raise RuntimeError(msg.format(out_params['n_estimators']))

    # whitelist max_samples in (0.0, 1.0]. It is requested as a float above.
    # The previous test (`< 0 and > 1`) could never be true, so invalid
    # values were silently passed on to the estimator.
    if 'max_samples' in out_params and not (0.0 < out_params['max_samples'] <= 1.0):
        msg = (
            'Invalid value error: Valid values for max_samples are in (0.0, 1.0], '
            'but found max_samples="{}".'
        )
        raise RuntimeError(msg.format(out_params['max_samples']))

    # whitelist contamination should be in (0.0, 0.5] as error raised by sklearn for values out of range
    if 'contamination' in out_params and not (0.0 < out_params['contamination'] <= 0.5):
        msg = (
            'Invalid value error: Valid values for contamination are in (0.0, 0.5], '
            'but found contamination="{}".'
        )
        raise RuntimeError(msg.format(out_params['contamination']))

    # whitelist max_features in (0.0, 1.0]; same always-false test fixed here.
    if 'max_features' in out_params and not (0.0 < out_params['max_features'] <= 1.0):
        msg = (
            'Invalid value error: Valid values for max_features are in (0.0, 1.0], '
            'but found max_features="{}".'
        )
        raise RuntimeError(msg.format(out_params['max_features']))

    self.estimator = _IsolationForest(**out_params)