Python source code examples: sklearn.preprocessing.Normalizer()
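For quick reference: Normalizer rescales each sample (row) independently to unit norm ('l1', 'l2', or 'max'); it is stateless, so fit() learns nothing from the data. A minimal sketch (array values chosen purely for illustration):

import numpy as np
from sklearn.preprocessing import Normalizer

# Each row is scaled to unit L2 norm; because the transformer is stateless,
# fit_transform(X) and transform(X) give the same result.
X = np.array([[3.0, 4.0], [1.0, 1.0]])
print(Normalizer(norm='l2').fit_transform(X))
# [[0.6        0.8       ]
#  [0.70710678 0.70710678]]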

Example 1
def test_make_column_transformer_kwargs():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']),
                                 n_jobs=3, remainder='drop',
                                 sparse_threshold=0.5)
    assert_equal(ct.transformers, make_column_transformer(
        (scaler, 'first'), (norm, ['second'])).transformers)
    assert_equal(ct.n_jobs, 3)
    assert_equal(ct.remainder, 'drop')
    assert_equal(ct.sparse_threshold, 0.5)
    # invalid keyword parameters should raise an error message
    assert_raise_message(
        TypeError,
        'Unknown keyword arguments: "transformer_weights"',
        make_column_transformer, (scaler, 'first'), (norm, ['second']),
        transformer_weights={'pca': 10, 'Transf': 1}
    ) 
Example 2
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.

    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model 
Example 3
def test_boston_OHE_pipeline(self):
        data = load_boston()

        for categorical_features in [[3], [8], [3, 8], [8, 3]]:
            # Put it in a pipeline so that we can test whether the output dimension
            # handling is correct.

            model = Pipeline(
                [
                    ("OHE", OneHotEncoder(categorical_features=categorical_features)),
                    ("Normalizer", Normalizer()),
                ]
            )

            model.fit(data.data.copy(), data.target)

            # Convert the model
            spec = sklearn.convert(model, data.feature_names, "out").get_spec()

            input_data = [dict(zip(data.feature_names, row)) for row in data.data]
            output_data = [{"out": row} for row in model.transform(data.data.copy())]

            result = evaluate_transformer(spec, input_data, output_data)

            assert result["num_errors"] == 0 
Example 4
def test_random(self):
        # Generate some random data
        X = _np.random.random(size=(50, 3))

        for param in ("l1", "l2", "max"):
            cur_model = Normalizer(norm=param)

            output = cur_model.fit_transform(X)

            spec = converter.convert(cur_model, ["a", "b", "c"], "out")

            evaluate_transformer(
                spec,
                [dict(zip(["a", "b", "c"], row)) for row in X],
                [{"out": row} for row in output],
            ) 
Example 5
def test_within_pipeline():
    pytest.importorskip('cv2')
    pytest.importorskip('sklearn')
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import Normalizer
    stim = join(get_test_data_path(), 'image', 'apple.jpg')
    graph = Graph([BrightnessExtractor(), SharpnessExtractor()])
    trans = PliersTransformer(graph)
    normalizer = Normalizer()
    pipeline = Pipeline([('pliers', trans), ('normalizer', normalizer)])
    res = pipeline.fit_transform(stim)
    assert res.shape == (1, 2)
    assert np.isclose(res[0][0], 0.66393, 1e-5)
    assert np.isclose(res[0][1], 0.74780, 1e-5)
    meta = trans.metadata_
    assert 'onset' in meta.columns
    assert meta['class'][0] == 'ImageStim' 
Example 6
def __init__(self, source_model: mx.mod.Module, feature_layer_names, context_function=mx.context.cpu, num_devices=1,
                 max_function_evaluations=100, apply_l2_norm=False):
        # Call base class constructor with parameters required for meta-models
        super().__init__(source_model, feature_layer_names, context_function, num_devices)
        self.max_function_evaluations = max_function_evaluations
        self.apply_l2_norm = apply_l2_norm

        # Mean of features to use for normalization. Computed in training phase.
        # Used to normalize features in training and in prediction.
        self.feature_mean = None

        # Optimizer to use for training GP model
        self.optimizer = 'lbfgs'

        # Number of inducing points to use for sparse GP
        self.NUM_INDUCING_SPARSE_GP = 100

        # Normalizer to use when apply_l2_norm flag is set
        self.l2_normalizer = Normalizer(norm='l2') 
Example 7
def test_kneighbors_with_or_without_self_hit(LSH: callable, metric, n_jobs, verbose):
    X, y = make_classification(random_state=234)
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose)
    lsh.fit(X, y)
    neigh_dist, neigh_ind = lsh.kneighbors(return_distance=True)
    neigh_dist_self, neigh_ind_self = lsh.kneighbors(X, return_distance=True)

    ind_only = lsh.kneighbors(return_distance=False)
    ind_only_self = lsh.kneighbors(X, return_distance=False)

    assert_array_equal(neigh_ind, ind_only)
    assert_array_equal(neigh_ind_self, ind_only_self)

    assert (neigh_ind - neigh_ind_self).mean() <= 0.01, 'More than 1% of neighbors mismatch'
    assert ((neigh_dist - neigh_dist_self) < 0.0001).mean() <= 0.01, \
        'Not almost equal to 4 decimals in more than 1% of neighbor slots'
Example 8
def test_radius_neighbors_with_or_without_self_hit(LSH, metric, n_jobs, verbose):
    X, y = make_classification()
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric=metric, n_jobs=n_jobs, verbose=verbose)
    lsh.fit(X, y)
    radius = lsh.kneighbors(n_candidates=3)[0][:, 2].max()
    neigh_dist, neigh_ind = lsh.radius_neighbors(return_distance=True, radius=radius)
    neigh_dist_self, neigh_ind_self = lsh.radius_neighbors(X, return_distance=True, radius=radius)

    ind_only = lsh.radius_neighbors(return_distance=False, radius=radius)
    ind_only_self = lsh.radius_neighbors(X, return_distance=False, radius=radius)

    assert len(neigh_ind) == len(neigh_ind_self) == len(neigh_dist) == len(neigh_dist_self)
    for i in range(len(neigh_ind)):
        assert_array_equal(neigh_ind[i], ind_only[i])
        assert_array_equal(neigh_ind_self[i], ind_only_self[i])

        assert_array_equal(neigh_ind[i][:3],
                           neigh_ind_self[i][1:4])
        assert_array_almost_equal(neigh_dist[i][:3],
                                  neigh_dist_self[i][1:4]) 
Example 9
def test_squared_euclidean_same_neighbors_as_euclidean(LSH):
    X, y = make_classification(random_state=234)
    X = Normalizer().fit_transform(X)
    lsh = LSH(metric='minkowski')
    lsh.fit(X, y)
    neigh_dist_eucl, neigh_ind_eucl = lsh.kneighbors()

    lsh_sq = LSH(metric='sqeuclidean')
    lsh_sq.fit(X, y)
    neigh_dist_sqeucl, neigh_ind_sqeucl = lsh_sq.kneighbors()

    assert_array_equal(neigh_ind_eucl, neigh_ind_sqeucl)
    assert_array_almost_equal(neigh_dist_eucl ** 2, neigh_dist_sqeucl)

    if LSH in LSH_WITH_RADIUS:
        radius = neigh_dist_eucl[:, 2].max()
        rad_dist_eucl, rad_ind_eucl = lsh.radius_neighbors(radius=radius)
        rad_dist_sqeucl, rad_ind_sqeucl = lsh_sq.radius_neighbors(radius=radius**2)
        for i in range(len(rad_ind_eucl)):
            assert_array_equal(rad_ind_eucl[i], rad_ind_sqeucl[i])
            assert_array_almost_equal(rad_dist_eucl[i] ** 2, rad_dist_sqeucl[i]) 
Example 10
def nbow_model(task, embeddings, word2idx):
    if task == "clf":
        algo = LogisticRegression(C=0.6, random_state=0,
                                  class_weight='balanced')
    elif task == "reg":
        algo = SVR(kernel='linear', C=0.6)
    else:
        raise ValueError("invalid task!")

    embeddings_features = NBOWVectorizer(aggregation=["mean"],
                                         embeddings=embeddings,
                                         word2idx=word2idx,
                                         stopwords=False)

    model = Pipeline([
        ('embeddings-feats', embeddings_features),
        ('normalizer', Normalizer(norm='l2')),
        ('clf', algo)
    ])

    return model 
Example 11
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
        self.assertIs(df.preprocessing.FunctionTransformer,
                      pp.FunctionTransformer)
        self.assertIs(df.preprocessing.Imputer, pp.Imputer)
        self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
        self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
        self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
        self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
        self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
        self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
        self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
        self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
        self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
        self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
        self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler) 
Example 12
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """

        # Wrap the specified tokenizer
        t = Tokenizer(tokenizer())

        if hash:
            vectr = HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t)
        else:
            vectr = CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t, min_df=min_df, max_df=max_df)

        args = [
            ('vectorizer', vectr),
            ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
            ('normalizer', Normalizer(copy=False))
        ]

        self.pipeline = Pipeline(args)
        self.trained = False 
Example 13
def __init__(self):
        super().__init__()
        self.normalizer = Normalizer() 
Example 14
def train(self, training_data_X, training_data_Y):
        self.normalizer = Normalizer()
        self.svc = svm.SVC(gamma=0.001, C=100.)
        normalised_training_data_X = self.normalizer.fit_transform(training_data_X)
        self.svc.fit(normalised_training_data_X, training_data_Y) 
Example 15
def data_cleaning_formatting(X):
    # Basic cleaning
    X = X.ffill()    # forward-fill missing values first
    X = X.fillna(0)  # then fill any remaining gaps with 0

    # Encode data
    X = encode_data(X)
    X = Normalizer().fit_transform(X)
    return X 
Example 16
def test_make_column_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']))
    names, transformers, columns = zip(*ct.transformers)
    assert_equal(names, ("standardscaler", "normalizer"))
    assert_equal(transformers, (scaler, norm))
    assert_equal(columns, ('first', ['second']))

    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer(([0], norm))
    ct2 = make_column_transformer((norm, [0]))
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    assert_almost_equal(ct1.fit_transform(X_array),
                        ct2.fit_transform(X_array))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('first', 'drop'))

    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        make_column_transformer(('passthrough', 'passthrough'),
                                ('first', 'drop')) 
Example 17
def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    # XXX remove in v0.22
    with pytest.warns(DeprecationWarning,
                      match='`make_column_transformer` now expects'):
        ct1 = make_column_transformer((X_df.columns, norm))
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df),
                        ct2.fit_transform(X_df)) 
Example 18
def test_make_column_transformer_remainder_transformer():
    scaler = StandardScaler()
    norm = Normalizer()
    remainder = StandardScaler()
    ct = make_column_transformer((scaler, 'first'), (norm, ['second']),
                                 remainder=remainder)
    assert ct.remainder == remainder 
Example 19
def test():
    parser = argparse.ArgumentParser()

    parser.add_argument("File")

    args = parser.parse_args()

    info = fh.get_function_information(args.File)
    #info = fh.get_arg_funcs(args.File)

    info = trim_funcs(info, args.File)

    vect, func_sparse = funcs_to_sparse(info)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    #svd = TruncatedSVD(random_state=2)
    svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    func_sparse = svd.fit_transform(func_sparse)

    scores = []
    clust_count = []
    for x in range(2, 20):
        result = KMeans(n_clusters=x, random_state=2).fit(func_sparse)

        score = silhouette_score(func_sparse, result.labels_, metric="cosine")
        scores.append(score)
        clust_count.append(x)

        print("Clusters {:<3} | Silhoette Score : {}".format(x, score))

    plt.plot(clust_count, scores)
    plt.xlabel("Cluster Centroid Count")
    plt.ylabel("Silhoette Score")
    plt.grid = True
    plt.show()

    pass 
Example 20
def single_cluster(all_functions, centroid_count=2):
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    print("Clusters {:<3} | Silhoette Score : {}".format(
        centroid_count, score))

    return result.labels_ 
Example 21
def get_cosine_dist(all_functions):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    return cosine_distances(func_sparse, func_sparse) 
Example 22
def get_single_cluster(all_functions, centroid_count=2):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    #print("Clusters {:<3} | Silhoette Score : {}".format(centroid_count, score))
    return_dict['count'] = centroid_count
    return_dict['score'] = score
    return_dict['labels'] = result.labels_

    return return_dict 
Example 23
def get_normalizer(self, x_train):
        from sklearn.preprocessing import Normalizer
        return Normalizer().fit(x_train)
Example 24
def test_boston(self):
        from sklearn.datasets import load_boston

        scikit_data = load_boston()
        scikit_model = Normalizer(norm="l2").fit(scikit_data.data)

        spec = converter.convert(scikit_model, scikit_data.feature_names, "out")

        input_data = [
            dict(zip(scikit_data.feature_names, row)) for row in scikit_data.data
        ]

        output_data = [{"out": row} for row in scikit_model.transform(scikit_data.data)]

        evaluate_transformer(spec, input_data, output_data) 
Example 25
def get_normalized_vectors(vectors, norm='l1', progress=progress, pad_zeros=True):
    progress.update('Normalizing vectors using "%s" norm' % norm)
    vectors = np.array(vectors, dtype=np.float64)
    if pad_zeros:
        vectors += 0.0000001
    normalizer = preprocessing.Normalizer(norm=norm)
    return normalizer.fit_transform(vectors) 
Example 26
def __init__(self, norm='l2'):
        super(BagNormalizer, self).__init__(Normalizer(norm)) 
Example 27
def __init__(self, **kwargs):
        super().__init__()
        self.estimator = prep.Normalizer() 
Example 28
def train_lsa(corpus,n_topics, max_df=0.95, min_df=2,cleaning=clearstring,stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)): corpus[i] = cleaning(corpus[i])
    tfidf_vectorizer = TfidfVectorizer(max_df = max_df, min_df = min_df, stop_words = stop_words)
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    tfidf = Normalizer().fit_transform(tfidf)
    lsa = TruncatedSVD(n_topics).fit(tfidf)
    return TOPIC(tfidf_features,lsa) 
Example 29
def train_lsa(corpus, n_topics, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tfidf_vectorizer = TfidfVectorizer(
        max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    tfidf = Normalizer().fit_transform(tfidf)
    lsa = TruncatedSVD(n_topics).fit(tfidf)
    return TOPIC(tfidf_features, lsa) 
Example 30
def normalize(train_inputs, non_train_inputs):
    normalizer = Normalizer()
    train_inputs[train_inputs.columns] = normalizer.fit_transform(train_inputs.values)
    non_train_inputs[train_inputs.columns] = normalizer.transform(non_train_inputs.values)
    return train_inputs, non_train_inputs