Python Code Examples: sklearn.decomposition.TruncatedSVD()
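
As a primer before the examples, here is a minimal, self-contained sketch of the core TruncatedSVD API; the sparse matrix and the component count are illustrative assumptions, not taken from any example below.

# A minimal TruncatedSVD sketch; the matrix and n_components are illustrative.
import numpy as np
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(42)
X = sparse.random(100, 50, density=0.1, random_state=rng)  # stands in for a tf-idf matrix

svd = TruncatedSVD(n_components=5, random_state=42)
X_reduced = svd.fit_transform(X)                # shape: (100, 5)

print(X_reduced.shape)
print(svd.explained_variance_ratio_.sum())      # fraction of variance retained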

Example 1
def fit_transform(self, documents):
        # Vectorizer will be False if the pipeline hasn't been fit yet;
        # trigger fit_transform and save the vectorizer and lexicon.
        if self.vectorizer is False:
            self.lexicon = self.pipeline.fit_transform(documents)
            self.vect = self.pipeline.named_steps['tfidf']
            self.knn = self.pipeline.named_steps['knn']
            self.save()
        # If there's a stored vectorizer and prefitted lexicon,
        # use them instead.
        else:
            self.vect = self.vectorizer
            self.knn = Pipeline([
                ('svd', TruncatedSVD(n_components=100)),
                ('knn', KNNTransformer(k=self.k, algorithm='ball_tree'))
            ])
            self.knn.fit_transform(self.lexicon) 
Example 2
def optimize(self):
        """
        Learning an embedding.
        """
        print("\nOptimization started.\n")
        self.embeddings = []
        for step in tqdm(range(self.args.order)):
            target_matrix = self._create_target_matrix()

            svd = TruncatedSVD(n_components=self.args.dimensions,
                               n_iter=self.args.iterations,
                               random_state=self.args.seed)

            svd.fit(target_matrix)
            embedding = svd.transform(target_matrix)
            self.embeddings.append(embedding) 
Example 3
def test_resolve_embeddings(self):
        tdm = self.corpus.get_unigram_corpus().select(ClassPercentageCompactor(term_count=1))
        embeddings_resolver = EmbeddingsResolver(tdm)
        # embeddings = TruncatedSVD(n_components=20).fit_transform(tdm.get_term_doc_mat().T).T
        # embeddings_resolver.set_embeddings(embeddings)
        embeddings_resolver = embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
        if self.assertRaisesRegex:
            with self.assertRaisesRegex(Exception,
                                        "You have already set embeddings by running set_embeddings or set_embeddings_model."):
                embeddings_resolver.set_embeddings_model(None)
        embeddings_resolver = EmbeddingsResolver(tdm)

        embeddings_resolver = embeddings_resolver.set_embeddings_model(MockWord2Vec(tdm.get_terms()))
        if self.assertRaisesRegex:
            with self.assertRaisesRegex(Exception,
                                        "You have already set embeddings by running set_embeddings or set_embeddings_model."):
                embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
        c, axes = embeddings_resolver.project_embeddings(projection_model=TruncatedSVD(3))
        self.assertIsInstance(c, ParsedCorpus)
        self.assertEqual(axes.to_dict(), pd.DataFrame(index=['speak'], data={'x': [0.,], 'y':[0.,]}).to_dict()) 
Example 4
def test_selective_tsvd():
    original = X
    cols = [original.columns[0], original.columns[1]]  # Only perform on first two columns...
    compare_cols = np.array(
        original[['petal length (cm)', 'petal width (cm)']].to_numpy())  # should be the same as the trans cols

    transformer = SelectiveTruncatedSVD(cols=cols, n_components=1).fit(original)
    transformed = transformer.transform(original)

    untouched_cols = np.array(transformed[['petal length (cm)', 'petal width (cm)']].to_numpy())
    assert_array_almost_equal(compare_cols, untouched_cols)
    assert 'Concept1' in transformed.columns
    assert transformed.shape[1] == 3
    assert isinstance(transformer.get_decomposition(), TruncatedSVD)
    assert SelectiveTruncatedSVD().get_decomposition() is None  # default None

    # test the selective mixin
    assert isinstance(transformer.cols, list) 
Example 5
def transform(self):
        # ngrams
        obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
        target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
        # cooccurrence ngrams
        cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=1)
        X = tfidf.fit_transform(cooc_terms)
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)


# 2nd in CrowdFlower (preprocessing_mikhail.py) 
Example 6
def transform(self):
        ## get common vocabulary
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## target tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim,
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim 
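
The shared-vocabulary LSA similarity pattern in Example 6 can be sketched with plain sklearn utilities; the corpora and dimensions below are illustrative, and cosine_similarity stands in for the project's own _init_word_ngram_tfidf and dist_utils._cosine_sim helpers.

# Illustrative sketch of the shared-vocabulary LSA similarity pattern.
import numpy as np
import scipy.sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

obs = ["red shoes", "blue hat"]           # illustrative corpora
target = ["red sneakers", "green hat"]

tfidf = TfidfVectorizer().fit(obs + target)   # shared vocabulary across both corpora
X_obs, X_target = tfidf.transform(obs), tfidf.transform(target)

svd = TruncatedSVD(n_components=2, random_state=0)
svd.fit(scipy.sparse.vstack((X_obs, X_target)))
U, V = svd.transform(X_obs), svd.transform(X_target)

# Row-wise cosine similarity between matched obs/target pairs.
sim = np.array([cosine_similarity(u[None], v[None])[0, 0] for u, v in zip(U, V)])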
Example 7
def _reduce_dimensions(self, X):
        """
        Using Truncated SVD.

        Arg types:
            * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

        Return types:
            * **X** *(Numpy array)* - The reduced feature matrix of nodes.
        """
        svd = TruncatedSVD(n_components=self.reduction_dimensions,
                           n_iter=self.svd_iterations,
                           random_state=self.seed)
        svd.fit(X)
        X = svd.transform(X)
        return X 
Example 8
def _create_reduced_features(self, X):
        """
        Creating a dense reduced node feature matrix.

        Arg types:
            * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

        Return types:
            * **T** *(Numpy array)* - The reduced feature matrix of nodes.
        """
        svd = TruncatedSVD(n_components=self.reduction_dimensions,
                           n_iter=self.svd_iterations,
                           random_state=self.seed)
        svd.fit(X)
        T = svd.transform(X)
        return T.T 
Example 9
def fit_truncatedSVD(data):
    '''
        Fit the model with truncated SVD principal components
    '''
    # keyword parameters for the truncated SVD
    kwrd_params = {
        'algorithm': 'randomized', 
        'n_components': 5, 
        'n_iter': 5,
        'random_state': 42, 
        'tol': 0.0
    }

    # reduce the data
    reduced = reduceDimensions(cd.TruncatedSVD, 
        data, **kwrd_params)

    # prepare the data for the classifier
    data_l = prepare_data(data, reduced, 
        kwrd_params['n_components'])

    # fit the model
    class_fit_predict_print(data_l)

Example 10
def test_random_hasher():
    # Test random forest hashing on the circles dataset;
    # make sure that it is linearly separable
    # even after being projected to two SVD dimensions.
    # Note: not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.) 
Example 11
def test_truncated_svd_eq_pca():
    # TruncatedSVD should be equal to PCA on centered data

    X_c = X - X.mean(axis=0)

    params = dict(n_components=10, random_state=42)

    svd = TruncatedSVD(algorithm='arpack', **params)
    pca = PCA(svd_solver='arpack', **params)

    Xt_svd = svd.fit_transform(X_c)
    Xt_pca = pca.fit_transform(X_c)

    assert_allclose(Xt_svd, Xt_pca, rtol=1e-9)
    assert_allclose(pca.mean_, 0, atol=1e-9)
    assert_allclose(svd.components_, pca.components_) 
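
The test above leans on a property of TruncatedSVD worth stating explicitly: unlike PCA, it does not center the data before decomposing, so the two only coincide once the input is pre-centered. A minimal sketch of the contrast, on illustrative random data:

# Illustrative: TruncatedSVD vs. PCA with and without centering.
import numpy as np
from sklearn.decomposition import PCA, TruncatedSVD

rng = np.random.RandomState(0)
X = rng.rand(30, 8) + 5.0            # deliberately far from zero mean

svd_raw = TruncatedSVD(n_components=2, random_state=0).fit_transform(X)
pca_raw = PCA(n_components=2, random_state=0).fit_transform(X)
# On uncentered data the two generally disagree...
print(np.allclose(np.abs(svd_raw), np.abs(pca_raw)))   # typically False

Xc = X - X.mean(axis=0)
svd_c = TruncatedSVD(n_components=2, algorithm='arpack', random_state=0).fit_transform(Xc)
pca_c = PCA(n_components=2, svd_solver='arpack', random_state=0).fit_transform(Xc)
# ...but agree (up to per-component sign) once the data is centered.
print(np.allclose(np.abs(svd_c), np.abs(pca_c)))       # True, within numerical tolerance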
Example 12
def dim_reduction_method(self):
        """
        select dimensionality reduction method
        """
        if self.dim_reduction=='pca':
            return PCA()
        elif self.dim_reduction=='factor-analysis':
            return FactorAnalysis()
        elif self.dim_reduction=='fast-ica':
            return FastICA()
        elif self.dim_reduction=='kernel-pca':
            return KernelPCA()
        elif self.dim_reduction=='sparse-pca':
            return SparsePCA()
        elif self.dim_reduction=='truncated-svd':
            return TruncatedSVD()
        elif self.dim_reduction is not None:
            raise ValueError('%s is not a supported dimensionality reduction method. Valid inputs are: \
                             "pca","factor-analysis","fast-ica","kernel-pca","sparse-pca","truncated-svd".'
                             %(self.dim_reduction))
Example 13
def plot_z_run(z_run, label):
    f1, ax1 = plt.subplots(2, 1)

    # First fit a PCA (approximated here with TruncatedSVD)
    PCA_model = TruncatedSVD(n_components=3).fit(z_run)
    z_run_reduced = PCA_model.transform(z_run)
    ax1[0].scatter(z_run_reduced[:, 0], z_run_reduced[:, 1], c=label, marker='*', linewidths=0)
    ax1[0].set_title('PCA on z_run')

    # Then fit a t-SNE
    tSNE_model = TSNE(verbose=2, perplexity=80, min_grad_norm=1E-12, n_iter=3000)
    z_run_tsne = tSNE_model.fit_transform(z_run)
    ax1[1].scatter(z_run_tsne[:, 0], z_run_tsne[:, 1], c=label, marker='*', linewidths=0)
    ax1[1].set_title('tSNE on z_run')

    plt.show()
    return 
Example 14
def __init__(self, k=3, **kwargs):
        self.k = k
        self.pipeline = Pipeline([
            ('norm', TextNormalizer(minimum=10, maximum=100)),
            ('tfidf', TfidfVectorizer()),
            ('knn', Pipeline([
                ('svd', TruncatedSVD(n_components=100)),
                ('model', KNNTransformer(k=self.k, algorithm='ball_tree'))
            ]))
        ])

        self.lex_path = "lexicon.pkl"
        self.vect_path = "vect.pkl"
        self.vectorizer = False
        self.lexicon = None
        self.load() 
Example 15
def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics
        To use Latent Semantic Analysis, set estimator to 'LSA',
        To use Non-Negative Matrix Factorization, set estimator to 'NMF',
        otherwise, defaults to Latent Dirichlet Allocation ('LDA').
        """
        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            self.estimator = LatentDirichletAllocation(n_components=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('tfidf', CountVectorizer(tokenizer=identity,
                                      preprocessor=None, lowercase=False)),
            ('model', self.estimator)
        ]) 
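
A hypothetical usage sketch for the topic-model class above: the snippet does not show the class name, so TopicModel below is an assumed stand-in, and the corpus is illustrative.

# Hypothetical usage; "TopicModel" stands in for the unnamed class above,
# and the documents are illustrative.
lsa = TopicModel(n_topics=20, estimator='LSA')   # estimator becomes TruncatedSVD(n_components=20)
docs = ["first example document", "second example document"]
doc_topics = lsa.model.fit_transform(docs)       # rows: documents, columns: LSA topics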
Example 16
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps) 
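
A hedged usage sketch for create_pipeline; the classifier choice and the training-data names are illustrative assumptions.

from sklearn.linear_model import LogisticRegression

# Illustrative: build a classification pipeline with SVD reduction enabled.
model = create_pipeline(LogisticRegression(), reduction=True)
model.fit(train_docs, train_labels)    # train_docs / train_labels assumed to exist
predictions = model.predict(test_docs)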
Example 17
def reduce_dimensionality(X, n_features):
        """
        Apply PCA or SVD to reduce dimension to n_features.
        :param X:
        :param n_features:
        :return:
        """
        # Initialize reduction method: PCA or SVD
        # reducer = PCA(n_components=n_features)
        reducer = TruncatedSVD(n_components=n_features)
        # Fit and transform data to n_features-dimensional space
        reducer.fit(X)
        X = reducer.transform(X)
        logging.debug("Reduced number of features to {0}".format(n_features))
        logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
        return X 
Example 18
def do_tfidf_feature(df, tfidf):
    n_components = 30
    svd = TruncatedSVD(
        n_components=n_components, algorithm="arpack", random_state=2019
    )

    col_tfidf = tfidf.transform(df["col"])

    feature_names = tfidf.get_feature_names_out()

    col_svd = svd.fit_transform(col_tfidf)

    best_features = [
        feature_names[i] + "i" for i in svd.components_[0].argsort()[::-1]
    ]
    ret_df = pd.DataFrame(col_svd, columns=best_features[:n_components])
    return ret_df 
Example 19
def __init__(self, **kwargs):
        super().__init__()
        self.truncated_svd = decomposition.TruncatedSVD(**kwargs) 
Example 20
def create_spectral_features(args, positive_edges, negative_edges, node_count):
    """
    Creating spectral node features using the train dataset edges.
    :param args: Arguments object.
    :param positive_edges: Positive edges list.
    :param negative_edges: Negative edges list.
    :param node_count: Number of nodes.
    :return X: Node features.
    """
    p_edges = positive_edges + [[edge[1], edge[0]] for edge in positive_edges]
    n_edges = negative_edges + [[edge[1], edge[0]] for edge in negative_edges]
    train_edges = p_edges + n_edges
    index_1 = [edge[0] for edge in train_edges]
    index_2 = [edge[1] for edge in train_edges]
    values = [1]*len(p_edges) + [-1]*len(n_edges)
    shaping = (node_count, node_count)
    signed_A = sparse.csr_matrix(sparse.coo_matrix((values, (index_1, index_2)),
                                                   shape=shaping,
                                                   dtype=np.float32))

    svd = TruncatedSVD(n_components=args.reduction_dimensions,
                       n_iter=args.reduction_iterations,
                       random_state=args.seed)
    svd.fit(signed_A)
    X = svd.components_.T
    return X 
Example 21
def create_spectral_features(self, pos_edge_index, neg_edge_index,
                                 num_nodes=None):
        r"""Creates :obj:`in_channels` spectral node features based on
        positive and negative edges.

        Args:
            pos_edge_index (LongTensor): The positive edge indices.
            neg_edge_index (LongTensor): The negative edge indices.
            num_nodes (int, optional): The number of nodes, *i.e.*
                :obj:`max_val + 1` of :attr:`pos_edge_index` and
                :attr:`neg_edge_index`. (default: :obj:`None`)
        """

        edge_index = torch.cat([pos_edge_index, neg_edge_index], dim=1)
        N = edge_index.max().item() + 1 if num_nodes is None else num_nodes
        edge_index = edge_index.to(torch.device('cpu'))

        pos_val = torch.full((pos_edge_index.size(1), ), 2, dtype=torch.float)
        neg_val = torch.full((neg_edge_index.size(1), ), 0, dtype=torch.float)
        val = torch.cat([pos_val, neg_val], dim=0)

        row, col = edge_index
        edge_index = torch.cat([edge_index, torch.stack([col, row])], dim=1)
        val = torch.cat([val, val], dim=0)

        edge_index, val = coalesce(edge_index, val, N, N)
        val = val - 1

        # Borrowed from:
        # https://github.com/benedekrozemberczki/SGCN/blob/master/src/utils.py
        edge_index = edge_index.detach().numpy()
        val = val.detach().numpy()
        A = scipy.sparse.coo_matrix((val, edge_index), shape=(N, N))
        svd = TruncatedSVD(n_components=self.in_channels, n_iter=128)
        svd.fit(A)
        x = svd.components_.T
        return torch.from_numpy(x).to(torch.float).to(pos_edge_index.device) 
Example 22
def train(self, x, y, textual_evidence=None):

        if self.TEXT:
            x = textual_evidence

            x = [str(i) for i in x]  # make sure every item is a string

        self.train_x = x
        self.train_y = y

        # VECTORIZATION
        print("vectorization..")
        # X = np.concatenate([train['triples_prep'], test['triples_prep']])

        self.count_vect = CountVectorizer().fit(x)
        x = self.count_vect.transform(x)

        self.tf_transformer = TfidfTransformer().fit(x)
        x = self.tf_transformer.transform(x)

        self.svd = TruncatedSVD(n_components=self.N_COMPONENTS).fit(x)
        x = self.svd.transform(x)

        # CLUSTERING
        print("clustering..")
        self.neigh = NearestNeighbors(n_neighbors=self.K, radius=self.RADIUS)
        self.neigh.fit(x)  # fit on the training set only
Example 23
def fit_base_SVD_model(self):
        """
        Reducing the dimensionality with SVD in the 1st step.
        """
        self.P = self.P.dot(self.X)
        self.model = TruncatedSVD(n_components=self.args.dimensions,
                                  n_iter=70,
                                  random_state=42)

        self.P = self.model.fit_transform(self.P)
Example 24
def fit(self, X, y=None):
        """Fit the transformer.

        Parameters
        ----------

        X : Pandas ``DataFrame``, shape=(n_samples, n_features)
            The Pandas frame to fit. The frame will only
            be fit on the prescribed ``cols`` (see ``__init__``) or
            all of them if ``cols`` is None. Furthermore, ``X`` will
            not be altered in the process of the fit.

        y : None
            Passthrough for ``sklearn.pipeline.Pipeline``. Even
            if explicitly set, will not change behavior of ``fit``.

        Returns
        -------

        self
        """
        # check on state of X and cols
        X, self.cols = validate_is_pd(X, self.cols)
        cols = _cols_if_none(X, self.cols)

        # raises if the column names don't exist:
        self.svd_ = TruncatedSVD(
            n_components=self.n_components,
            algorithm=self.algorithm,
            n_iter=self.n_iter).fit(X[cols].to_numpy())

        return self 
Example 25
def get_decomposition(self):
        """Overridden from the :class:``skutil.decomposition.decompose._BaseSelectiveDecomposer`` class,
        this method returns the internal decomposition class: 
        ``sklearn.decomposition.TruncatedSVD``

        Returns
        -------
        self.svd_ : ``sklearn.decomposition.TruncatedSVD``
            The fitted internal decomposition class
        """
        return self.svd_ if hasattr(self, 'svd_') else None 
Example 26
def transform(self):
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        X = tfidf.fit_transform(self.obs_corpus)
        svd = TruncatedSVD(n_components=self.svd_dim,
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X) 
Example 27
def transform(self):
        tfidf = self._init_char_ngram_tfidf(self.ngram)
        X = tfidf.fit_transform(self.obs_corpus)
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)


# ------------------------ Cooccurrence LSA -------------------------------
# 1st in CrowdFlower 
Example 28
def transform(self):
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=self.ngram)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        X_target = tfidf.fit_transform(self.target_corpus)
        X_tfidf = scipy.sparse.hstack([X_obs, X_target]).tocsr()
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        X_svd = svd.fit_transform(X_tfidf)
        return X_svd


# -------------------------------- TSNE ------------------------------------------
# 2nd in CrowdFlower (preprocessing_mikhail.py) 
Example 29
def _create_single_embedding(self, target_matrix):
        """
        Fitting a single SVD embedding of a PMI matrix.
        """
        svd = TruncatedSVD(n_components=self.dimensions,
                           n_iter=self.iterations,
                           random_state=self.seed)
        svd.fit(target_matrix)
        embedding = svd.transform(target_matrix)
        self._embeddings.append(embedding) 
Example 30
def _create_embedding(self, target_matrix):
        """
        Fitting a truncated SVD embedding of a PMI matrix.
        """
        svd = TruncatedSVD(n_components=self.dimensions,
                           n_iter=self.iterations,
                           random_state=self.seed)
        svd.fit(target_matrix)
        embedding = svd.transform(target_matrix)
        return embedding