Python source examples: sklearn.metrics.euclidean_distances()
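This page collects usage examples of sklearn.metrics.euclidean_distances (re-exported from sklearn.metrics.pairwise), which returns the matrix of pairwise Euclidean distances between the rows of X and Y, or between the rows of X alone when Y is omitted. As a quick orientation, a minimal sketch:

import numpy as np
from sklearn.metrics import euclidean_distances

X = np.array([[0, 0], [3, 4]])
D = euclidean_distances(X)                 # 2x2 symmetric matrix; D[0, 1] == 5.0
D2 = euclidean_distances(X, squared=True)  # squared distances; D2[0, 1] == 25.0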
Example 1
def test_equal_similarities_and_preferences():
# Unequal distances
X = np.array([[0, 0], [1, 1], [-2, -2]])
S = -euclidean_distances(X, squared=True)
assert not _equal_similarities_and_preferences(S, np.array(0))
assert not _equal_similarities_and_preferences(S, np.array([0, 0]))
assert not _equal_similarities_and_preferences(S, np.array([0, 1]))
# Equal distances
X = np.array([[0, 0], [1, 1]])
S = -euclidean_distances(X, squared=True)
# Different preferences
assert not _equal_similarities_and_preferences(S, np.array([0, 1]))
# Same preferences
assert _equal_similarities_and_preferences(S, np.array([0, 0]))
assert _equal_similarities_and_preferences(S, np.array(0))
Example 2
def predict(self, X):
""" A reference implementation of a prediction for a classifier.
Parameters
----------
X : array-like, shape (n_samples, n_features)
The input samples.
Returns
-------
y : ndarray, shape (n_samples,)
The label for each sample is the label of the closest sample
seen during fit.
"""
        # Check that fit has been called
check_is_fitted(self, ['X_', 'y_'])
# Input validation
X = check_array(X)
closest = np.argmin(euclidean_distances(X, self.X_), axis=1)
return self.y_[closest]
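For context, a minimal sketch of how a classifier with this predict is used; TemplateClassifier is a hypothetical name standing in for whichever estimator defines the method, assuming its fit stores the training data as X_ and y_:

import numpy as np

clf = TemplateClassifier()  # hypothetical estimator exposing the predict above
clf.fit(np.array([[0., 0.], [1., 1.]]), np.array([0, 1]))
clf.predict(np.array([[0.9, 1.1]]))  # -> array([1]): label of the closest training sample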
Example 3
def fit_transform(self, X, y=None, init=None):
"""
        Fit the data from X and return the embedded coordinates.
Parameters
----------
X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \
if dissimilarity='precomputed'
Input data.
        init : None or ndarray, shape (n_samples, n_components), optional
            If None, randomly chooses the initial configuration;
            if ndarray, initializes the SMACOF algorithm with this array.
"""
X = check_array(X)
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
warnings.warn("The MDS API has changed. ``fit`` now constructs an"
" dissimilarity matrix from data. To use a custom "
"dissimilarity matrix, set "
"``dissimilarity=precomputed``.")
if self.dissimilarity == "precomputed":
self.dissimilarity_matrix_ = X
elif self.dissimilarity == "euclidean":
self.dissimilarity_matrix_ = euclidean_distances(X)
else:
raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
" Got %s instead" % str(self.dissimilarity))
self.embedding_, self.stress_, self.n_iter_ = smacof_p(
self.dissimilarity_matrix_, self.n_uq, metric=self.metric,
n_components=self.n_components, init=init, n_init=self.n_init,
n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
eps=self.eps, random_state=self.random_state,
return_n_iter=True)
return self.embedding_
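Given the branching above, fitting on raw coordinates with dissimilarity='euclidean' hands SMACOF the same matrix as fitting on precomputed distances; a sketch of that equivalence, with MDSVariant as a hypothetical name for the class defining this fit_transform:

D = euclidean_distances(X)
emb_euc = MDSVariant(dissimilarity='euclidean', random_state=0).fit_transform(X)
emb_pre = MDSVariant(dissimilarity='precomputed', random_state=0).fit_transform(D)
# Both calls optimize the identical dissimilarity matrix, so with the same
# random_state the embeddings should coincide.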
Example 4
def _optimize(self, X, y):
nb_prototypes = self.c_w_.size
n_data, n_dim = X.shape
prototypes = self.w_.reshape(nb_prototypes, n_dim)
for i in range(n_data):
xi = X[i]
c_xi = int(y[i])
best_euclid_corr = np.inf
best_euclid_incorr = np.inf
# find nearest correct and nearest wrong prototype
            for j in range(prototypes.shape[0]):
                eucl_dis = euclidean_distances(xi.reshape(1, -1),
                                               prototypes[j].reshape(1, -1))
                if self.c_w_[j] == c_xi:
                    if eucl_dis < best_euclid_corr:
                        best_euclid_corr = eucl_dis
                        corr_index = j
                elif eucl_dis < best_euclid_incorr:
                    best_euclid_incorr = eucl_dis
                    incorr_index = j
# Update nearest wrong prototype and nearest correct prototype
# if correct prototype isn't the nearest
if best_euclid_incorr < best_euclid_corr:
self._update_prototype(j=corr_index, c_xi=c_xi, xi=xi,
prototypes=prototypes)
self._update_prototype(j=incorr_index, c_xi=c_xi, xi=xi,
prototypes=prototypes)
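Because euclidean_distances accepts whole matrices, the inner loop over prototypes can be collapsed into one call; a sketch of the same nearest-correct/nearest-wrong search under that assumption:

d = euclidean_distances(xi.reshape(1, -1), prototypes).ravel()
same = self.c_w_ == c_xi                    # mask of prototypes with xi's class
corr_index = np.flatnonzero(same)[np.argmin(d[same])]
incorr_index = np.flatnonzero(~same)[np.argmin(d[~same])]
best_euclid_corr = d[corr_index]
best_euclid_incorr = d[incorr_index]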
Example 5
def test_random_projection_embedding_quality():
data, _ = make_sparse_random_data(8, 5000, 15000)
eps = 0.2
original_distances = euclidean_distances(data, squared=True)
original_distances = original_distances.ravel()
non_identical = original_distances != 0.0
# remove 0 distances to avoid division by 0
original_distances = original_distances[non_identical]
for RandomProjection in all_RandomProjection:
rp = RandomProjection(n_components='auto', eps=eps, random_state=0)
projected = rp.fit_transform(data)
projected_distances = euclidean_distances(projected, squared=True)
projected_distances = projected_distances.ravel()
# remove 0 distances to avoid division by 0
projected_distances = projected_distances[non_identical]
distances_ratio = projected_distances / original_distances
# check that the automatically tuned values for the density respect the
# contract for eps: pairwise distances are preserved according to the
# Johnson-Lindenstrauss lemma
assert_less(distances_ratio.max(), 1 + eps)
assert_less(1 - eps, distances_ratio.min())
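The eps contract asserted here comes from the Johnson-Lindenstrauss lemma; scikit-learn exposes the corresponding dimension bound, so the n_components chosen by 'auto' can be inspected directly:

from sklearn.random_projection import johnson_lindenstrauss_min_dim

# Minimum n_components guaranteeing eps-distortion for the 8 samples above
n_min = johnson_lindenstrauss_min_dim(n_samples=8, eps=0.2)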
Example 6
def test_affinity_propagation():
# Affinity Propagation algorithm
# Compute similarities
S = -euclidean_distances(X, squared=True)
preference = np.median(S) * 10
# Compute Affinity Propagation
cluster_centers_indices, labels = affinity_propagation(
S, preference=preference)
n_clusters_ = len(cluster_centers_indices)
assert_equal(n_clusters, n_clusters_)
af = AffinityPropagation(preference=preference, affinity="precomputed")
labels_precomputed = af.fit(S).labels_
af = AffinityPropagation(preference=preference, verbose=True)
labels = af.fit(X).labels_
assert_array_equal(labels, labels_precomputed)
cluster_centers_indices = af.cluster_centers_indices_
n_clusters_ = len(cluster_centers_indices)
assert_equal(np.unique(labels).size, n_clusters_)
assert_equal(n_clusters, n_clusters_)
# Test also with no copy
_, labels_no_copy = affinity_propagation(S, preference=preference,
copy=False)
assert_array_equal(labels, labels_no_copy)
# Test input validation
assert_raises(ValueError, affinity_propagation, S[:, :-1])
assert_raises(ValueError, affinity_propagation, S, damping=0)
af = AffinityPropagation(affinity="unknown")
assert_raises(ValueError, af.fit, X)
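A note on the preference being exercised above: with preference=None, AffinityPropagation defaults to the median of the input similarities, and lower (more negative) preferences produce fewer exemplars:

S = -euclidean_distances(X, squared=True)
default_preference = np.median(S)  # AffinityPropagation's default when preference=None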
Example 7
def test_affinity_propagation_equal_mutual_similarities():
X = np.array([[-1, 1], [1, -1]])
S = -euclidean_distances(X, squared=True)
# setting preference > similarity
cluster_center_indices, labels = assert_warns_message(
UserWarning, "mutually equal", affinity_propagation, S, preference=0)
# expect every sample to become an exemplar
assert_array_equal([0, 1], cluster_center_indices)
assert_array_equal([0, 1], labels)
# setting preference < similarity
cluster_center_indices, labels = assert_warns_message(
UserWarning, "mutually equal", affinity_propagation, S, preference=-10)
# expect one cluster, with arbitrary (first) sample as exemplar
assert_array_equal([0], cluster_center_indices)
assert_array_equal([0, 0], labels)
# setting different preferences
cluster_center_indices, labels = assert_no_warnings(
affinity_propagation, S, preference=[-20, -10])
# expect one cluster, with highest-preference sample as exemplar
assert_array_equal([1], cluster_center_indices)
assert_array_equal([0, 0], labels)
Example 8
def predict(self, x):
"""
        Predict the cluster for one sample
        Parameters
        ----------
        x : ndarray
            Sample to predict
Returns
-------
label: int
Predicted cluster
"""
        # Find the closest cluster: project x onto the cluster's feature
        # subspace, compute the distance to the cluster mean, and normalize
        # by the cluster variance.
min_score = None
closest = None
for i in range(self.clusters):
projection = x[:, self.features_[i]]
norm = euclidean_distances(projection, self.means_[i])
score = norm / self.vars_[i]
if min_score is None or score < min_score:
min_score = score
closest = i
return closest
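One caveat here: euclidean_distances requires 2-D inputs, so this predict assumes both the projected sample and self.means_[i] are already row matrices; a flat sample would need reshaping first, e.g.:

x = np.asarray(x)
if x.ndim == 1:
    x = x.reshape(1, -1)  # euclidean_distances rejects 1-D arrays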
Example 9
def wmdistance(sent1_embs, sent2_embs):
wmd = 0.0
    for _, x in sent1_embs:
        min_dist = sys.float_info.max
        x = x.reshape(1, -1)
        for _, y in sent2_embs:
            distance = euclidean_distances(x, y.reshape(1, -1))
if distance < min_dist:
min_dist = distance
wmd += min_dist
    return -float(wmd) / (len(sent1_embs) + len(sent2_embs))
# Note that this breaks symmetry, so the result is no longer a true distance.
# To overcome this, compute the average of the score in both directions:
# (weightedWMD(a, b) + weightedWMD(b, a)) / 2
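Following that comment, a minimal sketch of the symmetrized score, reusing the function above:

def symmetric_wmdistance(sent1_embs, sent2_embs):
    # Average both directions to restore symmetry
    return (wmdistance(sent1_embs, sent2_embs)
            + wmdistance(sent2_embs, sent1_embs)) / 2.0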
Example 10
def weighted_wmdistance(sent1_embs, sent2_embs, idfs, mean):
wmd = 0.0
for token1, x in sent1_embs:
min_dist = sys.float_info.max
weight = idfs[token1] if token1 in idfs else mean
for _, y in sent2_embs:
            score = weight * euclidean_distances(x.reshape(1, -1),
                                                 y.reshape(1, -1))
if score < min_dist:
min_dist = score
wmd += min_dist
    return -float(wmd) / (len(sent1_embs) + len(sent2_embs))
Example 11
def test_shuffle_equal(verbose):
# for this data set there shouldn't be any equal distances,
# and shuffle should make no difference
X, _ = make_classification(random_state=12354)
dist = euclidean_distances(X)
skew_shuffle, skew_no_shuffle = \
[Hubness(metric='precomputed', shuffle_equal=v, verbose=verbose)
.fit(dist).score() for v in [True, False]]
assert skew_no_shuffle == skew_shuffle
Example 12
def test_sparse_equal_dense(verbose, shuffle_equal):
X, _ = make_classification()
dist_dense = euclidean_distances(X)
dist_sparse = csr_matrix(dist_dense)
hub = Hubness(metric='precomputed',
shuffle_equal=shuffle_equal,
verbose=verbose)
hub.fit(dist_dense)
skew_dense = hub.score(has_self_distances=True)
hub.fit(dist_sparse)
skew_sparse = hub.score(has_self_distances=True)
np.testing.assert_almost_equal(skew_dense, skew_sparse)
Example 13
def test_sparse_equal_dense_if_variable_hits_per_row(shuffle_equal):
X, _ = make_classification(random_state=123)
dist = euclidean_distances(X)
dist[0, 1:3] = 999
dist[1:3, 0] = 999
dist[1, 1:5] = 999
dist[1:5, 1] = 999
sparse = dist.copy()
sparse[0, 1:3] = 0
sparse[1:3, 0] = 0
sparse[1, 1:5] = 0
sparse[1:5, 1] = 0
sparse = csr_matrix(sparse)
hub = Hubness(metric='precomputed',
shuffle_equal=shuffle_equal,
random_state=123)
hub.fit(dist)
skew_dense = hub.score(has_self_distances=True)
hub = Hubness(metric='precomputed',
shuffle_equal=shuffle_equal,
random_state=123)
hub.fit(sparse)
skew_sparse = hub.score(has_self_distances=True)
np.testing.assert_almost_equal(skew_dense, skew_sparse, decimal=2)
Example 14
def test_hubness_against_distance(has_self_distances):
"""Test hubness class against distance-based methods."""
np.random.seed(123)
X = np.random.rand(100, 50)
D = euclidean_distances(X)
verbose = 1
hub = Hubness(k=10, metric='precomputed',
store_k_occurrence=True,
store_k_neighbors=True,
)
hub.fit(D)
skew_d = hub.score(has_self_distances=has_self_distances)
neigh_d = hub.k_neighbors
occ_d = hub.k_occurrence
hub = Hubness(k=10, metric='euclidean',
store_k_neighbors=True,
store_k_occurrence=True,
verbose=verbose)
hub.fit(X)
skew_v = hub.score(X if not has_self_distances else None)
neigh_v = hub.k_neighbors
occ_v = hub.k_occurrence
np.testing.assert_allclose(skew_d, skew_v, atol=1e-7)
np.testing.assert_array_equal(neigh_d, neigh_v)
np.testing.assert_array_equal(occ_d, occ_v)
Example 15
def fit_transform(self, X, y=None, init=None):
"""
        Fit the data from X and return the embedded coordinates.
Parameters
----------
X : array, shape=[n_samples, n_features], or [n_samples, n_samples] \
if dissimilarity='precomputed'
Input data.
        init : None or ndarray, shape (n_samples, n_components), optional
            If None, randomly chooses the initial configuration;
            if ndarray, initializes the SMACOF algorithm with this array.
"""
X = check_array(X)
if X.shape[0] == X.shape[1] and self.dissimilarity != "precomputed":
warnings.warn("The MDS API has changed. ``fit`` now constructs an"
" dissimilarity matrix from data. To use a custom "
"dissimilarity matrix, set "
"``dissimilarity=precomputed``.")
if self.dissimilarity == "precomputed":
self.dissimilarity_matrix_ = X
elif self.dissimilarity == "euclidean":
self.dissimilarity_matrix_ = euclidean_distances(X)
else:
raise ValueError("Proximity must be 'precomputed' or 'euclidean'."
" Got %s instead" % str(self.dissimilarity))
self.embedding_, self.stress_, self.n_iter_ = _smacof_w(
self.dissimilarity_matrix_, self.n_uq, self.uq_weight, metric=self.metric,
n_components=self.n_components, init=init, n_init=self.n_init,
n_jobs=self.n_jobs, max_iter=self.max_iter, verbose=self.verbose,
eps=self.eps, random_state=self.random_state,
return_n_iter=True)
return self.embedding_
Example 16
def __call__(self, track, slice=None):
# remove WHERE when table cleaned up to remove header rows
statement = (
"SELECT transcript_id, TPM, sample_id FROM sailfish_transcripts")
# fetch data
df = pd.DataFrame.from_dict(self.getAll(statement))
df = df.pivot('transcript_id', 'sample_id')['TPM']
        # calculate pairwise distances (passed below as precomputed dissimilarities)
similarities = euclidean_distances(df.transpose())
# run MDS
mds = manifold.MDS(n_components=2, max_iter=3000,
eps=1e-9, dissimilarity="precomputed", n_jobs=1)
mds = mds.fit(similarities)
pos = pd.DataFrame(mds.embedding_)
pos.columns = ["MD1", "MD2"]
pos['sample'] = df.columns
factors_df = self.getDataFrame(
"SELECT * FROM factors WHERE factor != 'genome'")
merged_df = pd.merge(pos, factors_df,
left_on="sample", right_on="sample_id")
return merged_df.reset_index().set_index("factor")
Example 17
def test_dissimilarity_precomputed_euclidean(data):
test_views = []
for i in data['samp_views']:
test_views.append(euclidean_distances(i))
mvmds1 = MVMDS(dissimilarity='euclidean')
mvmds2 = MVMDS(dissimilarity='precomputed')
fit1 = mvmds1.fit_transform(data['samp_views'])
fit2 = mvmds2.fit_transform(test_views)
np.testing.assert_almost_equal(np.abs(fit2), np.abs(fit1))
Example 18
def testEuclideanDistancesExecution(self):
dense_raw_x = np.random.rand(30, 10)
dense_raw_y = np.random.rand(40, 10)
sparse_raw_x = SparseNDArray(sps.random(30, 10, density=0.5, format='csr'))
sparse_raw_y = SparseNDArray(sps.random(40, 10, density=0.5, format='csr'))
for raw_x, raw_y in [(dense_raw_x, dense_raw_y),
(sparse_raw_x, sparse_raw_y)]:
x = mt.tensor(raw_x, chunk_size=9)
y = mt.tensor(raw_y, chunk_size=7)
distance = euclidean_distances(x, y)
result = self.executor.execute_tensor(distance, concat=True)[0]
expected = sk_euclidean_distances(raw_x, Y=raw_y)
np.testing.assert_almost_equal(result, expected)
x_norm = x.sum(axis=1)[..., np.newaxis]
y_norm = y.sum(axis=1)[np.newaxis, ...]
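            # NOTE: these are plain row sums, not squared row norms; the same
            # values are passed to the sklearn reference below, so both
            # implementations agree even though the result is not a true
            # Euclidean distance.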
distance = euclidean_distances(x, y, X_norm_squared=x_norm,
Y_norm_squared=y_norm)
x_raw_norm = raw_x.sum(axis=1)[..., np.newaxis]
y_raw_norm = raw_y.sum(axis=1)[np.newaxis, ...]
result = self.executor.execute_tensor(distance, concat=True)[0]
expected = sk_euclidean_distances(raw_x, raw_y, X_norm_squared=x_raw_norm,
Y_norm_squared=y_raw_norm)
np.testing.assert_almost_equal(result, expected)
x_sq = (x ** 2).astype(np.float32)
y_sq = (y ** 2).astype(np.float32)
distance = euclidean_distances(x_sq, y_sq, squared=True)
x_raw_sq = (raw_x ** 2).astype(np.float32)
y_raw_sq = (raw_y ** 2).astype(np.float32)
result = self.executor.execute_tensor(distance, concat=True)[0]
expected = sk_euclidean_distances(x_raw_sq, y_raw_sq, squared=True)
np.testing.assert_almost_equal(result, expected, decimal=6)
# test x is y
distance = euclidean_distances(x)
result = self.executor.execute_tensor(distance, concat=True)[0]
expected = sk_euclidean_distances(raw_x)
np.testing.assert_almost_equal(result, expected)
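For reference, X_norm_squared/Y_norm_squared are a performance hook: when the true squared row norms are supplied, euclidean_distances reuses them instead of recomputing, and the result matches the plain call. A sketch against the dense arrays above:

x_norm = (dense_raw_x ** 2).sum(axis=1)[:, np.newaxis]
y_norm = (dense_raw_y ** 2).sum(axis=1)[np.newaxis, :]
d = sk_euclidean_distances(dense_raw_x, dense_raw_y,
                           X_norm_squared=x_norm, Y_norm_squared=y_norm)
np.testing.assert_almost_equal(d, sk_euclidean_distances(dense_raw_x, dense_raw_y))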
Example 19
def fit(self, Xs):
"""
Calculates dimensionally reduced components by inputting the Euclidean
distances of each view, double centering them, and using the _commonpcs
function to find common components between views. Works similarly to
traditional, single-view Multidimensional Scaling.
Parameters
----------
Xs: list of array-likes or numpy.ndarray
- Xs length: n_views
- Xs[i] shape: (n_samples, n_features_i)
"""
        if self.n_components > len(Xs[0]):
self.n_components = len(Xs[0])
warnings.warn('The number of components you have requested is '
+ 'greater than the number of samples in the '
+ 'dataset. ' + str(self.n_components)
+ ' components were computed instead.')
Xs = check_Xs(Xs, multiview=True)
mat = np.ones(shape=(len(Xs), len(Xs[0]), len(Xs[0])))
# Double centering each view as in single-view MDS
if (self.dissimilarity == 'euclidean'):
for i in np.arange(len(Xs)):
view = euclidean_distances(Xs[i])
view_squared = np.power(np.array(view), 2)
J = np.eye(len(view)) - (1/len(view))*np.ones(view.shape)
B = -(1/2) * J @ view_squared @ J
mat[i] = B
# If user wants to input special distance matrix
elif (self.dissimilarity == 'precomputed'):
for i in np.arange(len(Xs)):
if (Xs[i].shape[0] != Xs[i].shape[1]):
raise ValueError('The input distance matrix must be '
+ 'a square matrix')
else:
view = Xs[i]
view_squared = np.power(np.array(view), 2)
J = np.eye(len(view)) - (1/len(view))*np.ones(view.shape)
B = -(1/2) * J @ view_squared @ J
mat[i] = B
else:
            raise ValueError('The parameter `dissimilarity` must be one of '
                             '{`euclidean`, `precomputed`}')
self.components_ = self._commonpcs(mat)
return self
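As a quick numerical check of the double-centering step above: for Euclidean data, B = -1/2 * J @ D2 @ J recovers the Gram matrix of the centered view, which is the classical-MDS identity this fit relies on. A self-contained sketch:

import numpy as np
from sklearn.metrics import euclidean_distances

X = np.random.rand(6, 3)
Xc = X - X.mean(axis=0)                    # center the view
D2 = euclidean_distances(X, squared=True)  # squared pairwise distances
n = len(X)
J = np.eye(n) - np.ones((n, n)) / n        # centering matrix
B = -0.5 * J @ D2 @ J
np.testing.assert_allclose(B, Xc @ Xc.T, atol=1e-12)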