Python source code examples: sklearn.metrics.pairwise_distances()

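sklearn.metrics.pairwise_distances computes a distance matrix between the rows of one array (X vs. X) or between two arrays (X vs. Y). Before the collected examples, a minimal sketch of the basic call (toy values chosen for illustration):

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.array([[0.0, 0.0], [3.0, 4.0]])
Y = np.array([[3.0, 0.0]])
D_XX = pairwise_distances(X, metric='euclidean')     # (2, 2), zero diagonal
D_XY = pairwise_distances(X, Y, metric='euclidean')  # (2, 1)
print(D_XX[0, 1])    # 5.0
print(D_XY.ravel())  # [3. 4.]
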
Example 1
def test_lof_precomputed(random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    rng = np.random.RandomState(random_state)
    X = rng.random_sample((10, 4))
    Y = rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    # As a feature matrix (n_samples by n_features)
    lof_X = neighbors.LocalOutlierFactor(n_neighbors=3, novelty=True)
    lof_X.fit(X)
    pred_X_X = lof_X._predict()
    pred_X_Y = lof_X.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_D = neighbors.LocalOutlierFactor(n_neighbors=3, algorithm='brute',
                                         metric='precomputed', novelty=True)
    lof_D.fit(DXX)
    pred_D_X = lof_D._predict()
    pred_D_Y = lof_D.predict(DYX)

    assert_array_almost_equal(pred_X_X, pred_D_X)
    assert_array_almost_equal(pred_X_Y, pred_D_Y) 
Example 2
def test_simple_example():
    """Test on a simple example.

    Puts four points in the input space where points with opposite labels are
    next to each other. After the transform, samples from the same class
    should be next to each other.

    """
    X = np.array([[0, 0], [0, 1], [2, 0], [2, 1]])
    y = np.array([1, 0, 1, 0])
    nca = NeighborhoodComponentsAnalysis(n_components=2, init='identity',
                                         random_state=42)
    nca.fit(X, y)
    X_t = nca.transform(X)
    assert_array_equal(pairwise_distances(X_t).argsort()[:, 1],
                       np.array([2, 3, 0, 1])) 
Example 3
def compute_heterogeneity(data, k, centroids, cluster_assignment):

    heterogeneity = 0.0
    for i in range(k):

        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
        member_data_points = data[cluster_assignment == i, :]

        if member_data_points.shape[0] > 0:  # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
            distances = pairwise_distances(
                member_data_points, [centroids[i]], metric="euclidean"
            )
            squared_distances = distances ** 2
            heterogeneity += np.sum(squared_distances)

    return heterogeneity 
Example 4
def get_similarities(query_feats, para_features, top=10, combine_feat_scores="mul"):
    """
    Get similarities based on multiple independent queries that are then combined using combine_feat_scores
    :param query_feats: Multiple vectorized text queries
    :param para_features: Multiple vectorized text paragraphs that will be scored against the queries
    :param top: Top N facts to keep
    :param combine_feat_scores: The way for combining the multiple scores
    :return: Ranked fact ids with scores List[tuple(id, weight)]
    """
    scores_per_feat = [pairwise_distances(q_feat, para_features, "cosine").ravel() for q_feat in query_feats]  # this is distance - low is better!!!
    comb_func = comb_funcs[combine_feat_scores]

    smoothing_val = 0.000001
    max_val = pow((1 + smoothing_val), 2)
    dists = scores_per_feat[0] + smoothing_val
    if len(scores_per_feat) > 1:
        for i in range(1, len(scores_per_feat)):
            dists = comb_func(scores_per_feat[i] + smoothing_val, dists)
    sorted_ix = np.argsort(dists).tolist()  # ascending (lowest first); ties keep the earlier paragraph

    return [[i, (max_val - dists[i]) / max_val] for i in sorted_ix][:top] 
Example 5
def combine_similarities(scores_per_feat, top=10, combine_feat_scores="mul"):
    """
    Get similarities based on multiple independent queries that are then combined using combine_feat_scores
    :param query_feats: Multiple vectorized text queries
    :param para_features: Multiple vectorized text paragraphs that will be scored against the queries
    :param top: Top N facts to keep
    :param combine_feat_scores: The way for combining the multiple scores
    :return: Ranked fact ids with scores List[tuple(id, weight)]
    """
    # scores_per_feat = [pairwise_distances(q_feat, para_features, "cosine").ravel() for q_feat in query_feats]  # this is distance - low is better!!!
    comb_func = comb_funcs[combine_feat_scores]

    smoothing_val = 0.000001
    max_val = pow((1 + smoothing_val), 2)
    dists = scores_per_feat[0] + smoothing_val
    if len(scores_per_feat) > 1:
        for i in range(1, len(scores_per_feat)):
            dists = comb_func(scores_per_feat[i] + smoothing_val, dists)
    sorted_ix = np.argsort(dists).tolist()  # ascending (lowest first); ties keep the earlier paragraph

    max_val = max(np.max(dists), 1)
    return [[i, (max_val - dists[i]) / max_val] for i in sorted_ix][:top] 
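For illustration, a minimal driver for combine_similarities; the comb_funcs lookup table is not shown in the source, so the dict below is an assumed stand-in mapping names to NumPy binary operations:

import numpy as np

comb_funcs = {"mul": np.multiply, "add": np.add}  # assumed stand-in, not from the source

scores = [np.array([0.2, 0.5, 0.1]), np.array([0.3, 0.4, 0.2])]
print(combine_similarities(scores, top=2, combine_feat_scores="mul"))
# paragraph 2 ranks first (lowest combined distance), then paragraph 0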
Example 6
def spatial_check(metric):
    dist_matrix = pairwise_distances(spatial_data, metric=metric)
    # scipy is bad sometimes
    if metric == "braycurtis":
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
    if metric in ("cosine", "correlation"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0
    dist_function = dist.named_distances[metric]
    test_matrix = np.array(
        [
            [
                dist_function(spatial_data[i], spatial_data[j])
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match for metric {}".format(metric),
    ) 
Example 7
def binary_check(metric):
    dist_matrix = pairwise_distances(binary_data, metric=metric)
    if metric in ("jaccard", "dice", "sokalsneath", "yule"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
    if metric in ("kulsinski", "russellrao"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0
    dist_function = dist.named_distances[metric]
    test_matrix = np.array(
        [
            [
                dist_function(binary_data[i], binary_data[j])
                for j in range(binary_data.shape[0])
            ]
            for i in range(binary_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match for metric {}".format(metric),
    ) 
Example 8
def test_seuclidean():
    v = np.abs(np.random.randn(spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="seuclidean", V=v)
    test_matrix = np.array(
        [
            [
                dist.standardised_euclidean(spatial_data[i], spatial_data[j], v)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match for metric seuclidean",
    ) 
Example 9
def test_mahalanobis():
    v = np.cov(np.transpose(spatial_data))
    dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    test_matrix = np.array(
        [
            [
                dist.mahalanobis(spatial_data[i], spatial_data[j], v)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match for metric mahalanobis",
    ) 
Example 10
def sparse_spatial_check(metric, sparse_spatial_data):
    # Check that metric is supported for this test, otherwise, fail!
    assert (
        metric in spdist.sparse_named_distances
    ), f"{metric} not supported for sparse data"
    dist_matrix = pairwise_distances(sparse_spatial_data.todense(), metric=metric)

    if metric in ("braycurtis", "dice", "sokalsneath", "yule"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0

    if metric in ("cosine", "correlation", "kulsinski", "russellrao"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0

    run_test_sparse_metric(metric, sparse_spatial_data, dist_matrix) 
Example 11
def sparse_binary_check(metric, sparse_binary_data):
    # Check that metric is supported for this test, otherwise, fail!
    assert (
        metric in spdist.sparse_named_distances
    ), f"{metric} not supported for sparse data"
    dist_matrix = pairwise_distances(sparse_binary_data.todense(), metric=metric)
    if metric in ("jaccard", "dice", "sokalsneath", "yule"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 0.0

    if metric in ("kulsinski", "russellrao"):
        dist_matrix[np.where(~np.isfinite(dist_matrix))] = 1.0
        # And because distance between all zero vectors should be zero
        dist_matrix[10, 11] = 0.0
        dist_matrix[11, 10] = 0.0

    run_test_sparse_metric(metric, sparse_binary_data, dist_matrix)


# --------------------
# Spatial Metric Tests
# -------------------- 
Example 12
def test_weighted_minkowski(spatial_data):
    v = np.abs(np.random.randn(spatial_data.shape[1]))
    dist_matrix = pairwise_distances(spatial_data, metric="wminkowski", w=v, p=3)
    test_matrix = np.array(
        [
            [
                dist.weighted_minkowski(spatial_data[i], spatial_data[j], v, p=3)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match for metric weighted_minkowski",
    ) 
Example 13
def test_mahalanobis(spatial_data):
    v = np.cov(np.transpose(spatial_data))
    dist_matrix = pairwise_distances(spatial_data, metric="mahalanobis", VI=v)
    test_matrix = np.array(
        [
            [
                dist.mahalanobis(spatial_data[i], spatial_data[j], v)
                for j in range(spatial_data.shape[0])
            ]
            for i in range(spatial_data.shape[0])
        ]
    )
    assert_array_almost_equal(
        test_matrix,
        dist_matrix,
        err_msg="Distances don't match for metric mahalanobis",
    ) 
Example 14
def gaussian(x, workers=None):
    """Default medial gaussian kernel similarity calculation"""
    l1 = pairwise_distances(X=x, metric="l1", n_jobs=workers)
    n = l1.shape[0]
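    # The as_strided view below walks the flattened n x n matrix so each row
    # starts on a diagonal entry; slicing off column 0 then drops the diagonal,
    # leaving only the off-diagonal pairwise distances for the median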
    med = np.median(
        np.lib.stride_tricks.as_strided(
            l1, (n - 1, n + 1), (l1.itemsize * (n + 1), l1.itemsize)
        )[:, 1:]
    )
    # prevents division by zero when used on label vectors
    med = med if med else 1
    gamma = 1.0 / (2 * (med ** 2))
    return rbf_kernel(x, gamma=gamma)


# p-value computation 
Example 15
def _compute_isc(data, metric='median'):
    ''' Helper function to compute intersubject correlation from an observations-by-subjects array.

        Args:
            data: (pd.DataFrame, np.array) observations by subjects; isc is computed across subjects
            metric: (str) summary statistic across subject pairs ['mean','median']

        Returns:
            isc: (float) intersubject correlation coefficient

    '''

    from nltools.data import Adjacency

    similarity = Adjacency(1 - pairwise_distances(data.T, metric='correlation'),
                           matrix_type='similarity')
    if metric == 'mean':
        isc = np.tanh(similarity.r_to_z().mean())
    elif metric == 'median':
        isc = similarity.median()
    else:
        raise ValueError("metric must be one of ['mean','median']")
    return isc
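A quick, hypothetical call of the helper above, assuming nltools is installed (random data, so the coefficient is expected to be near zero):

import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(50, 5)  # 50 observations from 5 subjects
print(_compute_isc(data, metric='median'))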
Example 16
def test_lof_precomputed(algorithm, random_state=42):
    """Tests LOF with a distance matrix."""
    # Note: smaller samples may result in spurious test success
    local_rng = np.random.RandomState(random_state)
    X = local_rng.random_sample((10, 4))
    Y = local_rng.random_sample((3, 4))
    DXX = metrics.pairwise_distances(X, metric='euclidean')
    DYX = metrics.pairwise_distances(Y, X, metric='euclidean')
    # As a feature matrix (n_samples by n_features)
    lof_X = neighbors.LocalOutlierFactor(n_neighbors=3,
                                         algorithm=algorithm,
                                         novelty=True)
    lof_X.fit(X)
    pred_X_X = lof_X._predict()
    pred_X_Y = lof_X.predict(Y)

    # As a dense distance matrix (n_samples by n_samples)
    lof_D = neighbors.LocalOutlierFactor(n_neighbors=3, algorithm='brute',
                                         metric='precomputed', novelty=True)
    lof_D.fit(DXX)
    pred_D_X = lof_D._predict()
    pred_D_Y = lof_D.predict(DYX)

    assert_array_almost_equal(pred_X_X, pred_D_X)
    assert_array_almost_equal(pred_X_Y, pred_D_Y) 
Example 17
def test_precomputed_cross_validation():
    # Ensure array is split correctly
    rng = np.random.RandomState(0)
    X = rng.rand(20, 2)
    D = pairwise_distances(X, metric='euclidean')
    y = rng.randint(3, size=20)
    for Est in (neighbors.KNeighborsClassifier,
                neighbors.RadiusNeighborsClassifier,
                neighbors.KNeighborsRegressor,
                neighbors.RadiusNeighborsRegressor):
        metric_score = cross_val_score(Est(algorithm_params={'n_candidates': 5}), X, y)
        precomp_score = cross_val_score(Est(metric='precomputed',
                                            algorithm_params={'n_candidates': 5},
                                            ),
                                        D, y)
        assert_array_equal(metric_score, precomp_score) 
Example 18
def pairwise_distances(
    X: ArrayLike,
    Y: ArrayLike,
    metric: Union[str, Callable[[ArrayLike, ArrayLike], float]] = "euclidean",
    n_jobs: Optional[int] = None,
    **kwargs: Any
):
    if isinstance(Y, da.Array):
        raise TypeError("`Y` must be a numpy array")
    chunks = (X.chunks[0], (len(Y),))
    return X.map_blocks(
        metrics.pairwise_distances,
        Y,
        dtype=float,
        chunks=chunks,
        metric=metric,
        **kwargs
    ) 
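A usage sketch for this Dask wrapper, assuming the module-level imports it relies on (numpy, dask.array as da, sklearn.metrics as metrics, and the typing names); shapes are hypothetical:

import numpy as np
import dask.array as da

X = da.from_array(np.random.rand(100, 4), chunks=(25, 4))  # rows chunked
Y = np.random.rand(10, 4)                                  # in-memory numpy array
D = pairwise_distances(X, Y, metric='euclidean')           # lazy (100, 10) result
print(D.compute().shape)  # (100, 10)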
Example 19
def update_distances(self, cluster_centers, only_new=True, reset_dist=False):
    """Update min distances given cluster centers.

    Args:
      cluster_centers: indices of cluster centers
      only_new: only calculate distance for newly selected points and update
        min_distances.
      reset_dist: whether to reset min_distances.
    """

    if reset_dist:
      self.min_distances = None
    if only_new:
      cluster_centers = [d for d in cluster_centers
                         if d not in self.already_selected]
    if cluster_centers:
      # Update min_distances for all examples given new cluster center.
      x = self.features[cluster_centers]
      dist = pairwise_distances(self.features, x, metric=self.metric)

      if self.min_distances is None:
        self.min_distances = np.min(dist, axis=1).reshape(-1,1)
      else:
        self.min_distances = np.minimum(self.min_distances, dist) 
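One way to exercise this method outside its class is a hypothetical stub object supplying the attributes the method reads (features, metric, min_distances, already_selected); the stub is illustrative only:

import numpy as np
from sklearn.metrics import pairwise_distances

class _Stub:
    def __init__(self, features):
        self.features = features
        self.metric = 'euclidean'
        self.min_distances = None
        self.already_selected = []

_Stub.update_distances = update_distances  # attach the function above as a method
s = _Stub(np.random.rand(8, 3))
s.update_distances([0])       # point 0 becomes the first cluster center
print(s.min_distances.shape)  # (8, 1): each point's distance to its nearest center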
Example 20
def _max_representative_samples(self, image_features, candidate_image_features, selection_count):
    all_distances = pairwise_distances(image_features, candidate_image_features, metric='euclidean')
    selected_sample_indices = []
    print('Finding max representative candidates..')
    minimum_distances = np.ones((len(image_features))) * float('inf')
    # Greedy selection: in each round, pick the candidate that most reduces the
    # summed distance from every image to its nearest selected candidate.
    for _ in tqdm(range(selection_count)):
        current_best_score = float("-inf")
        current_best_idx = None
        current_minimum_distances = None
        for i in range(len(candidate_image_features)):
            if i not in selected_sample_indices:
                selected_sample_indices.append(i)
                tmp_distances = np.minimum(minimum_distances, all_distances[:, i])
                tmp_score = np.sum(tmp_distances) * -1
                if tmp_score > current_best_score:
                    current_best_score = tmp_score
                    current_minimum_distances = tmp_distances
                    current_best_idx = i
                selected_sample_indices.pop()
        selected_sample_indices.append(current_best_idx)
        minimum_distances = current_minimum_distances
    return selected_sample_indices
Example 21
def assign_clusters(data, centroids):
    
    # Compute distances between each data point and the set of centroids:
    # Fill in the blank (RHS only)
    distances_from_centroids = pairwise_distances(data, centroids, metric='euclidean')
        
    # Compute cluster assignments for each data point:
    # Fill in the blank (RHS only)
    cluster_assignment = np.argmin(distances_from_centroids, axis=1)
    
    return cluster_assignment


# **Checkpoint**. For the last time, let us check if Step 1 was implemented correctly. With rows 0, 2, 4, and 6 of `tf_idf` as an initial set of centroids, we assign cluster labels to rows 0, 10, 20, ..., and 90 of `tf_idf`. The resulting cluster labels should be `[0, 1, 1, 0, 0, 2, 0, 2, 2, 1]`.

Example 22
def compute_heterogeneity(data, k, centroids, cluster_assignment):
    
    heterogeneity = 0.0
    for i in range(k):
        
        # Select all data points that belong to cluster i. Fill in the blank (RHS only)
        member_data_points = data[cluster_assignment==i, :]
        
        if member_data_points.shape[0] > 0: # check if i-th cluster is non-empty
            # Compute distances from centroid to data points (RHS only)
            distances = pairwise_distances(member_data_points, [centroids[i]], metric='euclidean')
            squared_distances = distances**2
            heterogeneity += np.sum(squared_distances)
        
    return heterogeneity


# Let's compute the cluster heterogeneity for the 2-cluster example we've been considering based on our current cluster assignments and centroids.

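As a self-contained stand-in for that notebook step, a toy run of the two helpers above (hypothetical points, not the notebook's tf_idf data):

import numpy as np
from sklearn.metrics import pairwise_distances

data = np.array([[0.0, 0.0], [0.2, 0.1], [5.0, 5.0], [5.1, 4.9]])
centroids = data[[0, 2]]                       # rows 0 and 2 as initial centroids
assignment = assign_clusters(data, centroids)  # from Example 21
print(compute_heterogeneity(data, 2, centroids, assignment))  # ~0.07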
Example 23
def computer_perf(self, instances):
    X = instances.features.get_values()
    labels = instances.ground_truth.get_labels()
    # For unsupervised projection methods,
    # the performance is always computed with the labels
    # (not the families).
    if hasattr(self.projection.conf, 'multiclass'):
        if self.projection.conf.multiclass:
            labels = instances.ground_truth.get_families()
    unique_labels, label_inds = np.unique(labels, return_inverse=True)
    ratio = 0
    for li in range(len(unique_labels)):
        Xc = X[label_inds == li]
        Xnc = X[label_inds != li]
        ratio += pairwise_distances(Xc).mean() / \
            pairwise_distances(Xc, Xnc).mean()
    self.class_separation = ratio / len(unique_labels)
Example 24
def compute_distances_to_neighborhood(self, to_explain_X, samples_X):
    distances = pairwise_distances(to_explain_X, samples_X, metric='euclidean')
    # distance to self will always appear in the first column
    distances[:, 0] = np.zeros(distances.shape[0])
    return distances
Example 25
def _main(args):
    with open(args.results_file, 'rb') as f:
        results = pickle.load(f)

    features, preds, labels, filenames = results
    distances = pairwise_distances(features, metric='cosine', n_jobs=-1)
    filenames = np.array([_get_only_filename(item) for item in filenames])

    avg_top_one = []
    results = []
    for i, row in enumerate(distances):
        sorted_similarity = np.argsort(row)[1:]
        gt = labels[i]
        top_one = labels[sorted_similarity[0]]
        avg_top_one.append([[top_one, ].count(gt)])

        tmp = []
        tmp.append(filenames[i])
        query_results = filenames[sorted_similarity[:args.num_results]]
        _ = [tmp.append(item) for item in query_results]
        results.append(tmp)

    tmp = []
    tmp.append('Query')
    if args.num_results is None:
        args.num_results = len(features) - 1
    _ = [tmp.append('R{}'.format(i + 1)) for i in range(args.num_results)]

    results = sorted(results, key=lambda e:e[0])
    dframe = DataFrame(results, columns=tmp)
    dframe.to_csv(args.output_file, index=False)
    print('Precision@1: {}'.format(np.average(avg_top_one)))

    return 
Example 26
def _calc_mean_pd(data, f):
    return (pairwise_distances(data[f], metric='cosine').sum() / 2) / (f.sum() ** 2 - f.sum()) 
Example 27
def test_silhouette():
    # Tests the Silhouette Coefficient.
    dataset = datasets.load_iris()
    X_dense = dataset.data
    X_csr = csr_matrix(X_dense)
    X_dok = sp.dok_matrix(X_dense)
    X_lil = sp.lil_matrix(X_dense)
    y = dataset.target

    for X in [X_dense, X_csr, X_dok, X_lil]:
        D = pairwise_distances(X, metric='euclidean')
        # Given that the actual labels are used, we can assume that S would be
        # positive.
        score_precomputed = silhouette_score(D, y, metric='precomputed')
        assert_greater(score_precomputed, 0)
        # Test without calculating D
        score_euclidean = silhouette_score(X, y, metric='euclidean')
        assert score_precomputed == pytest.approx(score_euclidean)

        if X is X_dense:
            score_dense_without_sampling = score_precomputed
        else:
            assert score_euclidean == pytest.approx(score_dense_without_sampling)

        # Test with sampling
        score_precomputed = silhouette_score(D, y, metric='precomputed',
                                             sample_size=int(X.shape[0] / 2),
                                             random_state=0)
        score_euclidean = silhouette_score(X, y, metric='euclidean',
                                           sample_size=int(X.shape[0] / 2),
                                           random_state=0)
        assert_greater(score_precomputed, 0)
        assert_greater(score_euclidean, 0)
        assert score_euclidean == pytest.approx(score_precomputed)

        if X is X_dense:
            score_dense_with_sampling = score_precomputed
        else:
            assert score_euclidean == pytest.approx(score_dense_with_sampling)
Example 28
def test_spectral_unknown_mode():
    # Test that SpectralClustering fails with an unknown mode set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, eigen_solver="<unknown>") 
Example 29
def test_spectral_unknown_assign_labels():
    # Test that SpectralClustering fails with an unknown assign_labels set.
    centers = np.array([
        [0., 0., 0.],
        [10., 10., 10.],
        [20., 20., 20.],
    ])
    X, true_labels = make_blobs(n_samples=100, centers=centers,
                                cluster_std=1., random_state=42)
    D = pairwise_distances(X)  # Distance matrix
    S = np.max(D) - D  # Similarity matrix
    S = sparse.coo_matrix(S)
    assert_raises(ValueError, spectral_clustering, S, n_clusters=2,
                  random_state=0, assign_labels="<unknown>")