
def test_equal_similarities_and_preferences():
    """Exercise ``_equal_similarities_and_preferences`` on both regimes.

    The helper is expected to answer True only when the similarities are
    all equal *and* the preferences are all equal.
    """
    # Three points with differing pairwise distances: the answer is False
    # whatever the preference looks like.
    points = np.array([[0, 0], [1, 1], [-2, -2]])
    similarities = -euclidean_distances(points, squared=True)
    for preference in (np.array(0), np.array([0, 0]), np.array([0, 1])):
        assert not _equal_similarities_and_preferences(similarities,
                                                       preference)

    # Two points: equal distances.
    points = np.array([[0, 0], [1, 1]])
    similarities = -euclidean_distances(points, squared=True)

    # Preferences that differ still give False ...
    assert not _equal_similarities_and_preferences(similarities,
                                                   np.array([0, 1]))

    # ... identical preferences (as a vector or as a scalar) give True.
    for preference in (np.array([0, 0]), np.array(0)):
        assert _equal_similarities_and_preferences(similarities, preference)
def predict(self, X):
        """Predict labels with a nearest-neighbour lookup.

        A reference implementation of a prediction for a classifier.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The label for each sample is the label of the closest sample
            seen during fit.
        """
        # Check that fit has been called
        check_is_fitted(self, ['X_', 'y_'])

        # Input validation
        X = check_array(X)

        # For every row of X, the index of the nearest stored sample.
        closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
        return self.y_[closest]
def fit_transform(self, X, y=None, init=None):
        """Fit the data from X, and returns the embedded coordinates.

        Parameters
        ----------
        X : array, shape=[n_samples, n_features], or [n_samples, n_samples]
                if dissimilarity='precomputed'
            Input data.

        y : object, optional
            Not used by this method.

        init : {None or ndarray, shape (n_samples,)}, optional
            If None, randomly chooses the initial configuration
            if ndarray, initialize the SMACOF algorithm with this array.

        Returns
        -------
        embedding
            The first value returned by ``smacof_p``; it is also stored as
            ``self.embedding_``. ``self.dissimilarity_matrix_``,
            ``self.stress_`` and ``self.n_iter_`` are set as well.

        Raises
        ------
        ValueError
            If ``self.dissimilarity`` is neither ``"precomputed"`` nor
            ``"euclidean"``.
        """
        X = check_array(X)
        if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
            # NOTE(review): the end of this message was cut off in the
            # source; completed to match scikit-learn's MDS warning - confirm.
            warnings.warn("The MDS API has changed. ``fit`` now constructs an"
                          " dissimilarity matrix from data. To use a custom "
                          "dissimilarity matrix, set "
                          "``dissimilarity='precomputed'``.")

        if self.dissimilarity == "precomputed":
            self.dissimilarity_matrix_ = X
        elif self.dissimilarity == "euclidean":
            self.dissimilarity_matrix_ = euclidean_distances(X)
        else:
            # Without this ``else`` the euclidean branch raised right after
            # computing its matrix.
            raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
                             " Got %s instead" % str(self.dissimilarity))

        # NOTE(review): the tail of this call was missing. ``return_n_iter``
        # is reconstructed from the three-value unpacking (it mirrors
        # scikit-learn's ``smacof``) - confirm against ``smacof_p``.
        self.embedding_, self.stress_, self.n_iter_ = smacof_p(
            self.dissimilarity_matrix_, self.n_uq, metric=self.metric,
            n_components=self.n_components, init=init, n_init=self.n_init,
            n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
            eps=self.eps, random_state=self.random_state,
            return_n_iter=True)

        return self.embedding_
def _optimize(self, X, y):
        # One pass over the rows of X. For each sample the loop tracks the
        # smallest Euclidean distance to a prototype whose entry in
        # ``self.c_w_`` equals the sample's label, and (per the comments
        # below) the smallest distance to a prototype of another class.
        # ``_update_prototype`` is called for both when the wrong-class
        # prototype is the closer one.
        #
        # NOTE(review): several lines of this method appear to have been lost
        # (see the inline notes); it does not parse as it stands.
        nb_prototypes = self.c_w_.size

        n_data, n_dim = X.shape
        # View the flat prototype storage as one row per prototype.
        prototypes = self.w_.reshape(nb_prototypes, n_dim)

        for i in range(n_data):
            xi = X[i]
            c_xi = int(y[i])
            best_euclid_corr = np.inf
            best_euclid_incorr = np.inf

            # find nearest correct and nearest wrong prototype
            for j in range(prototypes.shape[0]):
                if self.c_w_[j] == c_xi:
                    # NOTE(review): the second argument is incomplete - the
                    # receiver of ``.reshape`` and the end of the call are
                    # missing; presumably ``prototypes[j]`` reshaped to a
                    # single row - confirm.
                    eucl_dis = euclidean_distances(xi.reshape(1, xi.size),
                                                   .reshape(1, prototypes[j]
                    if eucl_dis < best_euclid_corr:
                        best_euclid_corr = eucl_dis
                        corr_index = j
                    # NOTE(review): an ``else:`` for the wrong-class case
                    # presumably belongs before the next statement - confirm.
                    eucl_dis = euclidean_distances(xi.reshape(1, xi.size),
                                                   .reshape(1, prototypes[j]
                    if eucl_dis < best_euclid_incorr:
                        best_euclid_incorr = eucl_dis
                        incorr_index = j

            # Update nearest wrong prototype and nearest correct prototype
            # if correct prototype isn't the nearest
            if best_euclid_incorr < best_euclid_corr:
                # NOTE(review): both calls are cut off after ``xi=xi,``; the
                # remaining arguments are not visible here.
                self._update_prototype(j=corr_index, c_xi=c_xi, xi=xi,
                self._update_prototype(j=incorr_index, c_xi=c_xi, xi=xi,
def predict(self, x):
        """Predict clusters for one sample.

        Parameters
        ----------
        x : ndarray
            Samples to predict. Indexed as ``x[:, features]``, so a 2-D
            array is expected.

        Returns
        -------
        label : int
            Predicted cluster (``None`` when ``self.clusters`` is 0).
        """
        # Find the closest cluster to samples.
        # To do it, project x to the appropriate subspace, find the distance
        # to the mean value and normalise by the variance.
        min_score = None
        closest = None
        for i in range(self.clusters):
            projection = x[:, self.features_[i]]
            norm = euclidean_distances(projection, self.means_[i])
            score = norm / self.vars_[i]
            if min_score is None or score < min_score:
                min_score = score
                closest = i
        return closest
def wmdistance(sent1_embs, sent2_embs):
    """Return the negated, length-normalised nearest-neighbour distance sum.

    For every vector in ``sent1_embs`` the Euclidean distance to its closest
    vector in ``sent2_embs`` is added up; the total is negated and divided
    by the combined number of entries of both inputs.

    Parameters
    ----------
    sent1_embs, sent2_embs : sized iterables of ``(token, vector)`` pairs
        Only the vectors are used. Each vector must support
        ``reshape(1, -1)``.

    Returns
    -------
    float
        ``-sum_of_minimum_distances / (len(sent1_embs) + len(sent2_embs))``.
        When ``sent2_embs`` is empty every minimum stays at
        ``sys.float_info.max``.

    Raises
    ------
    ZeroDivisionError
        If both inputs are empty.
    """
    wmd = 0.0
    for _, x in sent1_embs:
        # The row view of x does not depend on y: build it once per token.
        x = x.reshape(1, -1)
        min_dist = sys.float_info.max
        for _, y in sent2_embs:
            # euclidean_distances returns a (1, 1) matrix here; take the
            # scalar so the final float() never sees a non-0-d array, a
            # conversion recent NumPy releases deprecate.
            distance = euclidean_distances(x, y.reshape(1, -1))[0, 0]
            if distance < min_dist:
                min_dist = distance
        wmd += min_dist
    return -float(wmd) / (len(sent1_embs) + len(sent2_embs))
# Note that this breaks the symmetry, so the result is not a distance anymore.
# To overcome this, we compute the average of the score in both directions:
# (weightedWMD(a, b) + weightedWMD(b, a)) / 2
def weighted_wmdistance(sent1_embs, sent2_embs, idfs, mean):
    """Return the negated, length-normalised weighted nearest-distance sum.

    Works like ``wmdistance``, except that every distance measured from a
    token of ``sent1_embs`` is multiplied by that token's weight before the
    minimum over ``sent2_embs`` is taken.

    Parameters
    ----------
    sent1_embs, sent2_embs : sized iterables of ``(token, vector)`` pairs
        Each vector must support ``reshape(1, -1)``.
    idfs : mapping
        Token -> weight.
    mean : float
        Weight used for tokens that are missing from ``idfs``.

    Returns
    -------
    float
        ``-sum_of_minimum_scores / (len(sent1_embs) + len(sent2_embs))``.

    Raises
    ------
    ZeroDivisionError
        If both inputs are empty.
    """
    wmd = 0.0
    for token1, x in sent1_embs:
        # euclidean_distances needs 2-D input; reshape exactly as the
        # unweighted variant does, once per token.
        x = x.reshape(1, -1)
        min_dist = sys.float_info.max
        weight = idfs[token1] if token1 in idfs else mean
        for _, y in sent2_embs:
            # The former debug prints called ``x.shape()``; ``shape`` is a
            # tuple, so they raised TypeError on the first pair. Removed.
            # [0, 0] extracts the scalar from the (1, 1) result.
            score = weight * euclidean_distances(x, y.reshape(1, -1))[0, 0]
            if score < min_dist:
                min_dist = score
        wmd += min_dist
    return -float(wmd) / (len(sent1_embs) + len(sent2_embs))
def test_shuffle_equal(verbose):
    """The score must be the same with and without ``shuffle_equal``."""
    # This data set should not contain any equal distances, so shuffling
    # them should make no difference.
    samples, _ = make_classification(random_state=12354)
    distances = euclidean_distances(samples)

    scores = {}
    for flag in (True, False):
        hubness = Hubness(metric='precomputed', shuffle_equal=flag,
                          verbose=verbose)
        scores[flag] = hubness.fit(distances).score()

    assert scores[False] == scores[True]
def test_sparse_equal_dense(verbose, shuffle_equal):
    """Dense and CSR precomputed distances must give the same score."""
    X, _ = make_classification()
    dist_dense = euclidean_distances(X)
    dist_sparse = csr_matrix(dist_dense)

    # NOTE(review): in the source the constructor call was cut off after
    # ``metric=`` and no ``fit`` calls were present, which left both
    # parameters and both matrices unused. The lines below are reconstructed
    # from that and from the Hubness usage in ``test_shuffle_equal`` -
    # confirm against the upstream test.
    hub = Hubness(metric='precomputed',
                  shuffle_equal=shuffle_equal,
                  verbose=verbose)
    hub.fit(dist_dense)
    skew_dense = hub.score(has_self_distances=True)

    hub.fit(dist_sparse)
    skew_sparse = hub.score(has_self_distances=True)

    np.testing.assert_almost_equal(skew_dense, skew_sparse)
def test_sparse_equal_dense_if_variable_hits_per_row(shuffle_equal):
    """Compare dense and sparse scores when rows store different counts.

    The same entries are set to 999 in the dense matrix and to 0 in the
    copy that is converted to CSR, so rows 0 and 1 of the sparse matrix hold
    fewer explicit values than the remaining rows.
    """
    X, _ = make_classification(random_state=123)
    dist = euclidean_distances(X)
    dist[0, 1:3] = 999
    dist[1:3, 0] = 999
    dist[1, 1:5] = 999
    dist[1:5, 1] = 999
    sparse = dist.copy()
    sparse[0, 1:3] = 0
    sparse[1:3, 0] = 0
    sparse[1, 1:5] = 0
    sparse[1:5, 1] = 0
    sparse = csr_matrix(sparse)

    # NOTE(review): both constructor calls were cut off after ``metric=``
    # and the ``fit`` calls were missing, leaving ``shuffle_equal``, ``dist``
    # and ``sparse`` unused. Reconstructed from those names - confirm
    # against the upstream test.
    hub = Hubness(metric='precomputed',
                  shuffle_equal=shuffle_equal)
    hub.fit(dist)
    skew_dense = hub.score(has_self_distances=True)

    hub = Hubness(metric='precomputed',
                  shuffle_equal=shuffle_equal)
    hub.fit(sparse)
    skew_sparse = hub.score(has_self_distances=True)

    np.testing.assert_almost_equal(skew_dense, skew_sparse, decimal=2)
def test_hubness_against_distance(has_self_distances):
    """Test hubness class against distance-based methods."""

    X = np.random.rand(100, 50)
    D = euclidean_distances(X)
    verbose = 1

    # NOTE(review): both constructor calls were cut off after ``metric=``
    # and the ``fit`` calls were missing. ``verbose=verbose``, ``fit(D)``
    # and ``fit(X)`` are reconstructed from the otherwise unused names.
    # Further constructor arguments (for example ones that enable
    # ``k_neighbors`` / ``k_occurrence``) may have been lost - confirm
    # against the upstream test.
    hub = Hubness(k=10, metric='precomputed',
                  verbose=verbose)
    hub.fit(D)
    skew_d = hub.score(has_self_distances=has_self_distances)
    neigh_d = hub.k_neighbors
    occ_d = hub.k_occurrence

    hub = Hubness(k=10, metric='euclidean',
                  verbose=verbose)
    hub.fit(X)
    skew_v = hub.score(X if not has_self_distances else None)
    neigh_v = hub.k_neighbors
    occ_v = hub.k_occurrence

    np.testing.assert_allclose(skew_d, skew_v, atol=1e-7)
    np.testing.assert_array_equal(neigh_d, neigh_v)
    np.testing.assert_array_equal(occ_d, occ_v)
def __call__(self, track,  slice=None):
        """Return 2-D MDS coordinates of the samples joined with factors.

        TPM values are read from ``sailfish_transcripts``, pivoted to a
        transcript x sample table, and the pairwise Euclidean distances
        between samples are embedded with metric MDS.

        Parameters
        ----------
        track, slice
            Not used by this method.

        Returns
        -------
        pandas.DataFrame
            The MDS positions (columns ``MD1``, ``MD2`` and ``sample``)
            merged with the ``factors`` table on the sample id, indexed by
            ``factor``.
        """
        # NOTE(review): an earlier comment mentioned removing a WHERE clause
        # that filtered header rows; the query selects every row.
        statement = (
            "SELECT transcript_id, TPM, sample_id FROM sailfish_transcripts")

        # fetch data
        df = pd.DataFrame.from_dict(self.getAll(statement))

        # One row per transcript, one column per sample. Keyword arguments
        # are required: DataFrame.pivot is keyword-only from pandas 2.0.
        df = df.pivot(index='transcript_id', columns='sample_id')['TPM']

        # calculate dissimilarities between samples (rows after transpose)
        dissimilarities = euclidean_distances(df.transpose())

        # run MDS
        mds = manifold.MDS(n_components=2, max_iter=3000,
                           eps=1e-9, dissimilarity="precomputed", n_jobs=1)
        # NOTE(review): the right-hand side of this assignment was missing.
        # ``fit`` on the distance matrix is reconstructed from the otherwise
        # unused matrix and the ``embedding_`` read below - confirm.
        mds = mds.fit(dissimilarities)
        pos = pd.DataFrame(mds.embedding_)

        pos.columns = ["MD1", "MD2"]
        # Embedding rows follow the sample order of the pivoted columns.
        pos['sample'] = df.columns

        factors_df = self.getDataFrame(
            "SELECT * FROM factors WHERE factor != 'genome'")

        merged_df = pd.merge(pos, factors_df,
                             left_on="sample", right_on="sample_id")
        return merged_df.reset_index().set_index("factor")
def test_dissimilarity_precomputed_euclidean(data):
    """Euclidean and precomputed-Euclidean MVMDS must agree up to sign."""
    # NOTE(review): the loop body was missing in the source, so the list
    # stayed empty and the function did not parse. Building one Euclidean
    # distance matrix per view is reconstructed from the test's name and
    # the 'precomputed' estimator below - confirm.
    test_views = [euclidean_distances(view) for view in data['samp_views']]

    mvmds1 = MVMDS(dissimilarity='euclidean')
    mvmds2 = MVMDS(dissimilarity='precomputed')

    fit1 = mvmds1.fit_transform(data['samp_views'])
    fit2 = mvmds2.fit_transform(test_views)

    # Absolute values are compared, so the sign of a component is ignored.
    np.testing.assert_almost_equal(np.abs(fit2), np.abs(fit1))
def testEuclideanDistancesExecution(self):
        """Compare the tensor ``euclidean_distances`` with scikit-learn's.

        Runs on dense and on sparse inputs and covers: plain X/Y distances,
        caller-supplied norms, squared distances on float32 data, and the
        X-only form.
        """
        dense_raw_x = np.random.rand(30, 10)
        dense_raw_y = np.random.rand(40, 10)
        sparse_raw_x = SparseNDArray(sps.random(30, 10, density=0.5, format='csr'))
        sparse_raw_y = SparseNDArray(sps.random(40, 10, density=0.5, format='csr'))

        for raw_x, raw_y in [(dense_raw_x, dense_raw_y),
                             (sparse_raw_x, sparse_raw_y)]:
            x = mt.tensor(raw_x, chunk_size=9)
            y = mt.tensor(raw_y, chunk_size=7)

            distance = euclidean_distances(x, y)

            result = self.executor.execute_tensor(distance, concat=True)[0]
            expected = sk_euclidean_distances(raw_x, Y=raw_y)
            np.testing.assert_almost_equal(result, expected)

            # Caller-supplied norms: row sums, fed identically to both
            # implementations.
            # NOTE(review): both calls below were cut off after
            # ``X_norm_squared=...,``. ``Y_norm_squared`` is reconstructed
            # from the otherwise unused ``y_norm`` / ``y_raw_norm`` - confirm.
            x_norm = x.sum(axis=1)[..., np.newaxis]
            y_norm = y.sum(axis=1)[np.newaxis, ...]
            distance = euclidean_distances(x, y, X_norm_squared=x_norm,
                                           Y_norm_squared=y_norm)
            x_raw_norm = raw_x.sum(axis=1)[..., np.newaxis]
            y_raw_norm = raw_y.sum(axis=1)[np.newaxis, ...]

            result = self.executor.execute_tensor(distance, concat=True)[0]
            expected = sk_euclidean_distances(raw_x, raw_y, X_norm_squared=x_raw_norm,
                                              Y_norm_squared=y_raw_norm)
            np.testing.assert_almost_equal(result, expected)

            # Squared distances on float32 inputs (checked to 6 decimals).
            x_sq = (x ** 2).astype(np.float32)
            y_sq = (y ** 2).astype(np.float32)

            distance = euclidean_distances(x_sq, y_sq, squared=True)

            x_raw_sq = (raw_x ** 2).astype(np.float32)
            y_raw_sq = (raw_y ** 2).astype(np.float32)

            result = self.executor.execute_tensor(distance, concat=True)[0]
            expected = sk_euclidean_distances(x_raw_sq, y_raw_sq, squared=True)
            np.testing.assert_almost_equal(result, expected, decimal=6)

            # test x is y
            distance = euclidean_distances(x)

            result = self.executor.execute_tensor(distance, concat=True)[0]
            expected = sk_euclidean_distances(raw_x)

            np.testing.assert_almost_equal(result, expected)
def fit(self, Xs):
        """Compute the common components of the double-centered views.

        Calculates dimensionally reduced components by inputting the
        Euclidean distances of each view, double centering them, and using
        the _commonpcs function to find common components between views.
        Works similarly to traditional, single-view Multidimensional Scaling.

        Parameters
        ----------
        Xs : list of array-likes or numpy.ndarray
            - Xs length: n_views
            - Xs[i] shape: (n_samples, n_features_i)

            With ``dissimilarity='precomputed'`` every view is used as a
            distance matrix and must be square.

        Returns
        -------
        self
            The estimator, with ``components_`` set.

        Raises
        ------
        ValueError
            If a precomputed view is not square, or if ``dissimilarity`` is
            neither ``'euclidean'`` nor ``'precomputed'``.

        Notes
        -----
        ``self.n_components`` is lowered to ``len(Xs[0])`` (with a warning)
        when it exceeds that value.
        """

        def double_center(distances):
            # Classical MDS centering: B = -1/2 * J @ D**2 @ J
            squared = np.power(np.array(distances), 2)
            n = len(distances)
            J = np.eye(n) - (1/n)*np.ones(distances.shape)
            return -(1/2) * J @ squared @ J

        if (self.n_components) > len(Xs[0]):
            self.n_components = len(Xs[0])
            warnings.warn('The number of components you have requested is '
                          + 'greater than the number of samples in the '
                          + 'dataset. ' + str(self.n_components)
                          + ' components were computed instead.')

        Xs = check_Xs(Xs, multiview=True)

        mat = np.ones(shape=(len(Xs), len(Xs[0]), len(Xs[0])))

        # Double centering each view as in single-view MDS
        if self.dissimilarity == 'euclidean':
            for i, view in enumerate(Xs):
                mat[i] = double_center(euclidean_distances(view))

        # If user wants to input special distance matrix
        elif self.dissimilarity == 'precomputed':
            for i, view in enumerate(Xs):
                if view.shape[0] != view.shape[1]:
                    raise ValueError('The input distance matrix must be '
                                     + 'a square matrix')
                # Previously this sat after the ``raise`` inside the ``if``
                # and was unreachable.
                mat[i] = double_center(view)

        else:
            # Previously raised unconditionally at the end of the
            # 'precomputed' branch.
            raise ValueError('The parameter `dissimilarity` must be one of \
                {`euclidean`, `precomputed`}')

        self.components_ = self._commonpcs(mat)

        return self