Python源码示例:sklearn.utils.validation._num_samples()
示例1
def check_cv_coverage(cv, X, y, groups, expected_n_splits=None):
    """Assert that the test folds produced by ``cv`` cover every sample.

    Every ``(train, test)`` pair yielded by ``cv.split`` is validated with
    ``check_valid_split``; the number of pairs is compared with the expected
    split count, and the union of all test folds is compared with
    ``set(range(n_samples))``.

    Parameters
    ----------
    cv : object
        Must expose ``get_n_splits(X, y, groups)`` and ``split(X, y, groups)``.
    X, y, groups : object
        Forwarded unchanged to ``cv``.
    expected_n_splits : int or None
        When given, ``cv.get_n_splits`` must report exactly this value.
        When ``None``, the value reported by ``cv`` becomes the reference.
    """
    n_samples = _num_samples(X)

    reported_n_splits = cv.get_n_splits(X, y, groups)
    if expected_n_splits is None:
        expected_n_splits = reported_n_splits
    else:
        assert_equal(reported_n_splits, expected_n_splits)

    # Walk over all splits, validating each one and pooling the test indices.
    n_iterations = 0
    pooled_test_indices = set()
    for train_idx, test_idx in cv.split(X, y, groups):
        check_valid_split(train_idx, test_idx, n_samples=n_samples)
        pooled_test_indices.update(test_idx)
        n_iterations += 1

    # The splitter must yield as many splits as announced ...
    assert_equal(n_iterations, expected_n_splits)
    # ... and the pooled test indices must span the whole dataset.
    if n_samples is not None:
        assert_equal(pooled_test_indices, set(range(n_samples)))
示例2
def transform(self, y):
    """Encode labels against the fitted ``classes_``.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Target values.

    Returns
    -------
    y : array-like of shape [n_samples]
        The second item returned by ``_encode`` for ``y``; an empty array
        when ``y`` holds no samples.
    """
    check_is_fitted(self, 'classes_')
    y = column_or_1d(y, warn=True)

    if _num_samples(y) != 0:
        _uniques, encoded = _encode(y, uniques=self.classes_, encode=True)
        return encoded

    # Nothing to encode: an empty input maps to an empty output.
    return np.array([])
示例3
def inverse_transform(self, y):
    """Map encoded labels back to the entries of ``classes_``.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    y : numpy array of shape [n_samples]
        ``self.classes_`` indexed by ``y``; an empty array when ``y`` holds
        no samples.

    Raises
    ------
    ValueError
        If ``y`` contains a value outside ``range(len(self.classes_))``.
    """
    check_is_fitted(self, 'classes_')
    y = column_or_1d(y, warn=True)

    # Nothing to decode: an empty input maps to an empty output.
    if _num_samples(y) == 0:
        return np.array([])

    valid_codes = np.arange(len(self.classes_))
    unseen = np.setdiff1d(y, valid_codes)
    if len(unseen) > 0:
        raise ValueError(
            "y contains previously unseen labels: %s" % str(unseen))

    codes = np.asarray(y)
    return self.classes_[codes]
示例4
def fit(self, y):
    """Fit the label binarizer on ``y``.

    Records the target type, whether the input was sparse, and the unique
    labels found in ``y``.

    Parameters
    ----------
    y : array of shape [n_samples,] or [n_samples, n_classes]
        Target values. The 2-d matrix should only contain 0 and 1,
        represents multilabel classification.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If the target type contains ``'multioutput'`` or ``y`` has no
        samples.
    """
    target_type = type_of_target(y)
    self.y_type_ = target_type

    if 'multioutput' in target_type:
        raise ValueError("Multioutput target data is not supported with "
                         "label binarization")

    n_samples = _num_samples(y)
    if n_samples == 0:
        raise ValueError('y has 0 samples: %r' % y)

    self.sparse_input_ = sp.issparse(y)
    self.classes_ = unique_labels(y)
    return self
示例5
def predict(self, x):
    """Predict by picking, per sample, the estimator with the top score.

    Each estimator in ``self.estimators`` scores ``x`` through its
    ``decision_function``; for every sample the index of the estimator with
    the largest score is kept (on ties the later estimator wins) and used
    to index ``self.classes``.

    Args:
        x (numpy.ndarray): NxD array

    Returns:
        numpy.ndarray: predicted labels, Nx1 array
    """
    n_samples = _num_samples(x)

    # Running maximum of the scores and the estimator index that produced it.
    best_scores = np.full(n_samples, -np.inf, dtype=float)
    best_estimator = np.zeros(n_samples, dtype=int)

    for index, estimator in enumerate(self.estimators):
        scores = np.ravel(estimator.decision_function(x))
        np.maximum(best_scores, scores, out=best_scores)
        # Wherever this estimator now holds the maximum, record its index.
        best_estimator[best_scores == scores] = index

    return self.classes[np.array(best_estimator.T)]
示例6
def get_n_splits(self, X, y=None, groups=None):
    """Return the number of splitting iterations in the cross-validator.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : object
        Only forwarded to the validity check.
    groups : object
        Only forwarded to the validity check.

    Returns
    -------
    n_splits : int
        Computed from ``n_samples``, ``self.p``, ``self.gap_before`` and
        ``self.gap_after``.
    """
    self.__check_validity(X, y, groups)

    n_samples = _num_samples(X)
    p = self.p
    head_count = n_samples - self.gap_after - p

    if head_count >= self.gap_before + 1:
        return n_samples - p + 1

    # Otherwise sum the two partial counts, each clipped at zero.
    tail_count = n_samples - p - self.gap_before
    return max(head_count, 0) + max(tail_count, 0)
示例7
def _do_n_samples(dsk, token, Xs, n_splits):
    """Register ``_num_samples`` tasks in ``dsk`` and return one key per X.

    For each distinct element of ``Xs`` and each split index ``n`` in
    ``range(n_splits)``, the task ``(_num_samples, x + (n,))`` is stored in
    ``dsk`` under the key ``("n_samples-" + token, m, n)``, where ``m``
    counts distinct elements in order of first appearance. Repeated
    elements reuse the key of their first occurrence.

    Parameters
    ----------
    dsk : mutable mapping
        Task graph, updated in place.
    token : str
        Suffix for the task name.
    Xs : iterable
        Hashable items supporting ``x + (n,)`` (presumably tuple keys;
        verify against the caller).
    n_splits : int
        Number of split indices to register per distinct item.

    Returns
    -------
    list of tuple
        One ``(name, m)`` pair per element of ``Xs``, in input order.
    """
    prefix = "n_samples-" + token
    key_by_x = {}
    keys = []

    for x in Xs:
        key = key_by_x.get(x)
        if key is None:
            # First occurrence: its position among distinct items so far.
            position = len(key_by_x)
            for split in range(n_splits):
                dsk[prefix, position, split] = (_num_samples, x + (split,))
            key = (prefix, position)
            key_by_x[x] = key
        keys.append(key)

    return keys
示例8
def check_cv_coverage(cv, X, y, groups, expected_n_splits=None):
    """Check that ``cv`` yields the expected splits and covers all samples.

    Each ``(train, test)`` pair from ``cv.split`` goes through
    ``check_valid_split``. Afterwards the number of pairs must equal the
    expected split count, and the set of all test indices must equal
    ``set(range(n_samples))``.

    Parameters
    ----------
    cv : object
        Must expose ``get_n_splits(X, y, groups)`` and ``split(X, y, groups)``.
    X, y, groups : object
        Forwarded unchanged to ``cv``.
    expected_n_splits : int or None
        Required value of ``cv.get_n_splits`` when given; when ``None`` the
        value reported by ``cv`` is used as the reference instead.
    """
    n_samples = _num_samples(X)

    announced = cv.get_n_splits(X, y, groups)
    if expected_n_splits is None:
        expected_n_splits = announced
    else:
        assert_equal(announced, expected_n_splits)

    split_count = 0
    test_union = set()
    for train_part, test_part in cv.split(X, y, groups):
        check_valid_split(train_part, test_part, n_samples=n_samples)
        test_union.update(test_part)
        split_count += 1

    # As many splits as announced must have been produced.
    assert_equal(split_count, expected_n_splits)
    # The accumulated test samples must cover the whole dataset.
    if n_samples is not None:
        assert_equal(test_union, set(range(n_samples)))
示例9
def test_check_sample_weight():
    """Check ``_check_sample_weight`` when no sample weights are supplied.

    The returned weights must have as many samples as ``X``, sum to the
    number of samples of ``X``, and share the dtype of ``X``.
    """
    from sklearn.cluster.k_means_ import _check_sample_weight

    weights = _check_sample_weight(X, None)
    n_samples = _num_samples(X)

    assert_equal(n_samples, _num_samples(weights))
    assert_almost_equal(weights.sum(), n_samples)
    assert_equal(X.dtype, weights.dtype)
示例10
def transform(self, y):
    """Transform labels to normalized encoding.

    If ``self.fill_unseen_labels`` is ``True``, labels flagged as unknown by
    ``_encode_check_unknown`` are encoded with
    ``self.fill_encoded_label_value``; when that attribute is ``None`` the
    number of classes is used instead. Otherwise the encoding is delegated
    to ``_encode``.

    Parameters
    ----------
    y : array-like of shape [n_samples]
        Label values.

    Returns
    -------
    y_encoded : array-like of shape [n_samples]
        Encoded label values; an empty array when ``y`` holds no samples.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)

    # transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])

    if not self.fill_unseen_labels:
        _, y_encoded = _encode(y, uniques=self.classes_, encode=True)
        return y_encoded

    _, mask = _encode_check_unknown(y, self.classes_, return_mask=True)
    # NOTE(review): searchsorted presumes ``classes_`` is sorted -- confirm.
    y_encoded = np.searchsorted(self.classes_, y)

    # Compare against None rather than relying on truthiness, so that an
    # explicitly configured fill value of 0 is honoured instead of being
    # silently replaced by the number of classes.
    # NOTE(review): assumes the "unset" sentinel is None -- confirm in __init__.
    fill_encoded_label_value = self.fill_encoded_label_value
    if fill_encoded_label_value is None:
        fill_encoded_label_value = len(self.classes_)

    # Positions where the mask is False are overwritten with the fill value.
    y_encoded[~mask] = fill_encoded_label_value
    return y_encoded
示例11
def inverse_transform(self, y):
    """Transform labels back to original encoding.

    If ``self.fill_unseen_labels`` is ``True``, codes outside
    ``range(len(self.classes_))`` are decoded as ``self.fill_label_value``;
    otherwise such codes raise.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Encoded label values. Non-integer input is converted to integers
        by way of ``float``.

    Returns
    -------
    y_decoded : list
        Label values, one per element of ``y``. An empty numpy array is
        returned when ``y`` holds no samples.

    Raises
    ------
    ValueError
        If ``y`` cannot be converted to integers, or if it contains unseen
        codes while ``self.fill_unseen_labels`` is falsy.
    """
    check_is_fitted(self, "classes_")
    y = column_or_1d(y, warn=True)

    if y.dtype.kind not in ("i", "u"):
        try:
            # Builtin float/int replace the ``np.float``/``np.int`` aliases,
            # which were removed from NumPy and raise AttributeError there.
            y = y.astype(float).astype(int)
        except ValueError as exc:
            raise ValueError(
                "`y` contains values not convertible to integer.") from exc

    # inverse transform of empty array is empty array
    if _num_samples(y) == 0:
        return np.array([])

    n_classes = len(self.classes_)
    diff = np.setdiff1d(y, np.arange(n_classes))
    # ``diff`` is an ndarray: test its size explicitly, because the truth
    # value of an array with several elements is ambiguous and raises.
    if diff.size and not self.fill_unseen_labels:
        raise ValueError("y contains previously unseen labels: %s" % str(diff))

    # ``y`` is integer-typed here, so a bounds check is equivalent to
    # membership in ``arange(n_classes)`` without scanning it per element.
    y_decoded = [
        self.classes_[idx] if 0 <= idx < n_classes else self.fill_label_value
        for idx in y
    ]
    return y_decoded
示例12
def _iter_train_indices(self, X=None, y=None, groups=None):
    """Generate integer indices corresponding to training sets.

    The test indices from ``_iter_test_indices(X, y, groups)`` and the
    number of samples in ``X`` are handed to ``__complement_indices``,
    whose result is returned as is.
    """
    test_indices = self._iter_test_indices(X, y, groups)
    n_samples = _num_samples(X)
    return self.__complement_indices(test_indices, n_samples)
示例13
def _iter_train_masks(self, X=None, y=None, groups=None):
    """Generate boolean masks corresponding to training sets.

    The training indices from ``_iter_train_indices(X, y, groups)`` and the
    number of samples in ``X`` are handed to
    ``GapCrossValidator.__indices_to_masks``, whose result is returned as is.
    """
    train_indices = self._iter_train_indices(X, y, groups)
    n_samples = _num_samples(X)
    return GapCrossValidator.__indices_to_masks(train_indices, n_samples)
示例14
def _iter_test_indices(self, X, y=None, groups=None):
    """Yield the index array of each test window of length ``self.p``.

    After the validity check, every yielded array is
    ``np.arange(start, start + self.p)``. The admissible start positions
    are either one contiguous range or two separate ranges, depending on
    how ``n_samples``, ``self.p``, ``self.gap_before`` and
    ``self.gap_after`` compare.
    """
    self.__check_validity(X, y, groups)
    n_samples = _num_samples(X)
    gap_before, gap_after = self.gap_before, self.gap_after

    last_start = n_samples - self.p
    if last_start - gap_after >= gap_before + 1:
        start_ranges = (range(last_start + 1),)
    else:
        start_ranges = (
            range(last_start - gap_after),
            range(gap_before + 1, last_start + 1),
        )

    for starts in start_ranges:
        for start in starts:
            yield np.arange(start, start + self.p)
示例15
def __check_validity(self, X, y=None, groups=None):
    """Validate ``X`` against ``self.p`` and the configured gaps.

    Raises
    ------
    ValueError
        If ``X`` is ``None``, or if both
        ``n_samples - gap_after - p <= 0`` and
        ``n_samples - p <= gap_before`` hold.

    Warns
    -----
    Warning
        ``SINGLETON_WARNING`` is issued when
        ``n_samples - gap_after - p <= gap_before + 1``.
    """
    if X is None:
        raise ValueError("The 'X' parameter should not be None.")

    n_samples = _num_samples(X)
    gap_before, gap_after = self.gap_before, self.gap_after

    slack_total = n_samples - self.p
    slack_after = slack_total - gap_after

    if slack_after <= 0 and slack_total <= gap_before:
        raise ValueError("Not enough training samples available.")

    if slack_after <= gap_before + 1:
        warnings.warn(SINGLETON_WARNING, Warning)
示例16
def _iter_indices(self, X, y, groups=None):
    """Yield ``(train, test)`` index arrays for ``self.n_splits`` splits.

    ``y`` is validated, cast to boolean and required to be of target type
    ``'multilabel-indicator'``. For each split the rows are shuffled with
    the random state, passed to ``IterativeStratification`` together with
    the train/test proportions, and the resulting fold assignment is mapped
    back to the original row order; rows assigned to fold ``1`` form the
    test set.

    Raises
    ------
    ValueError
        If the target type of ``y`` is not ``'multilabel-indicator'``.
    """
    n_samples = _num_samples(X)
    y = check_array(y, ensure_2d=False, dtype=None)
    y = np.asarray(y, dtype=bool)

    target_type = type_of_target(y)
    if target_type != 'multilabel-indicator':
        raise ValueError(
            'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                target_type))

    n_train, n_test = _validate_shuffle_split(
        n_samples, self.test_size, self.train_size)

    # From here on the row count of ``y`` is the sample count in use.
    n_samples = y.shape[0]
    rng = check_random_state(self.random_state)
    labels = y.copy()
    proportions = np.array([n_train, n_test]) / (n_train + n_test)

    for _ in range(self.n_splits):
        permutation = np.arange(n_samples)
        rng.shuffle(permutation)
        shuffled_folds = IterativeStratification(
            labels=labels[permutation], r=proportions, random_state=rng)
        # argsort of the permutation undoes the shuffle.
        is_test = shuffled_folds[np.argsort(permutation)] == 1
        yield np.where(~is_test)[0], np.where(is_test)[0]
示例17
def _num_samples(X):
    """Return the number of samples in ``X`` as a concrete value.

    Delegates to ``sk_validation._num_samples``. When that result is itself
    a dask collection (the dask dataframe case), it is computed before
    being returned.
    """
    count = sk_validation._num_samples(X)
    return count.compute() if dask.is_dask_collection(count) else count
示例18
def predict(self, T):
    """Return ``self.classes_[0]`` once for every sample in ``T``.

    When ``self.check_X`` is set, it is first called on ``T`` and its
    result is asserted to be truthy.

    Parameters
    ----------
    T : object accepted by ``_num_samples``
        Input whose sample count determines the output length.

    Returns
    -------
    ``self.classes_`` indexed by an all-zero integer array of length
    ``_num_samples(T)``.
    """
    if self.check_X is not None:
        assert self.check_X(T)
    # Builtin ``int`` replaces the ``np.int`` alias, which was removed from
    # NumPy and raises AttributeError on current releases.
    return self.classes_[np.zeros(_num_samples(T), dtype=int)]
示例19
def split(self, X, y=None, groups=None):
    """Generate indices to split data into training and test set.

    Test windows of ``test_size`` samples are taken from the end of the
    data, one per split. Each training set ends ``self.gap_size`` samples
    before its test window, and is truncated to its last
    ``self.max_train_size`` samples when that attribute is set and smaller
    than the training end.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape (n_samples,)
        Only passed through ``indexable``.
    groups : array-like, with shape (n_samples,)
        Only passed through ``indexable``.

    Yields
    ------
    train : ndarray
        The training set indices for that split.
    test : ndarray
        The testing set indices for that split.

    Raises
    ------
    ValueError
        If there are more folds than samples, or if the splits, test size
        and gap do not leave any sample available.
    """
    X, y, groups = indexable(X, y, groups)
    n_samples = _num_samples(X)
    n_splits = self.n_splits
    n_folds = n_splits + 1
    gap_size = self.gap_size
    # Fall back to an even share per fold when no test size is configured.
    test_size = self.test_size or n_samples // n_folds

    # Make sure we have enough samples for the given split parameters
    if n_folds > n_samples:
        raise ValueError(
            ("Cannot have number of folds ={0} greater"
             " than the number of samples: {1}.").format(n_folds,
                                                         n_samples))
    if n_samples - gap_size - (test_size * n_splits) <= 0:
        raise ValueError(
            ("Too many splits ={0} for number of samples"
             " ={1} with test_size ={2} and gap_size ={3}."
             "").format(n_splits, n_samples, test_size, gap_size))

    indices = np.arange(n_samples)
    first_test_start = n_samples - n_splits * test_size

    for test_start in range(first_test_start, n_samples, test_size):
        train_end = test_start - gap_size
        max_train_size = self.max_train_size
        if max_train_size and max_train_size < train_end:
            train_begin = train_end - max_train_size
        else:
            # A ``None`` start slices from the beginning of ``indices``.
            train_begin = None
        test_end = test_start + test_size
        yield indices[train_begin:train_end], indices[test_start:test_end]
示例20
def _fit_and_score(estimator, Z, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):
    """Fit ``estimator`` on ``Z[train]`` and score it on ``Z[test]``.

    Parameters
    ----------
    estimator : object
        Must expose ``set_params`` and ``fit``.
    Z : indexable
        Data, indexed with ``train`` and ``test``.
    scorer : object
        Forwarded to ``_score``.
    train, test : index
        Selectors applied to ``Z``.
    verbose : int
        Above 1, progress lines are printed; above 2, the test score is
        included in them.
    parameters : dict or None
        Passed to ``estimator.set_params`` when not ``None``.
    fit_params : dict or None
        Extra keyword arguments for ``estimator.fit``.
    return_train_score : bool
        Whether the score on ``Z[train]`` is computed and returned.
    return_parameters : bool
        Whether ``parameters`` is appended to the result.
    error_score : 'raise' or number
        ``'raise'`` re-raises a failure of ``fit``; a number is used as the
        score instead, with a ``FitFailedWarning``.

    Returns
    -------
    list
        ``[train_score]`` (only if requested), then ``test_score``, the
        sample count of ``Z[test]``, the elapsed time, and ``parameters``
        (only if requested).

    Raises
    ------
    ValueError
        If ``fit`` fails and ``error_score`` is neither ``'raise'`` nor a
        number.
    """
    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = ', '.join('%s=%s' % (key, value)
                            for key, value in parameters.items())
        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))

    if fit_params is None:
        fit_params = {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train = Z[train]
    Z_test = Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        if not isinstance(error_score, numbers.Number):
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
        # A numeric error_score stands in for every score of this partition.
        test_score = error_score
        if return_train_score:
            train_score = error_score
        warnings.warn("Classifier fit failed. The score on this train-test"
                      " partition for these parameters will be set to %f. "
                      "Details: \n%r" % (error_score, e), FitFailedWarning)
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))

    result = []
    if return_train_score:
        result.append(train_score)
    result += [test_score, _num_samples(Z_test), scoring_time]
    if return_parameters:
        result.append(parameters)
    return result