Python source code examples: sklearn.utils.validation._num_samples()
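
All of the examples below use _num_samples, a private scikit-learn helper that returns the number of samples in an array-like: the first dimension of anything that has a shape, otherwise its len(). A minimal sketch of that behavior (the import path is a private module, so it may move between versions):

import numpy as np
from sklearn.utils.validation import _num_samples

print(_num_samples(np.zeros((5, 3))))  # 5: first dimension of an ndarray
print(_num_samples([[0, 1], [2, 3]]))  # 2: len() of a plain sequence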

Example 1
def check_cv_coverage(cv, X, y, groups, expected_n_splits=None):
    n_samples = _num_samples(X)
    # Check that all the samples appear at least once in a test fold
    if expected_n_splits is not None:
        assert_equal(cv.get_n_splits(X, y, groups), expected_n_splits)
    else:
        expected_n_splits = cv.get_n_splits(X, y, groups)

    collected_test_samples = set()
    iterations = 0
    for train, test in cv.split(X, y, groups):
        check_valid_split(train, test, n_samples=n_samples)
        iterations += 1
        collected_test_samples.update(test)

    # Check that the accumulated test samples cover the whole dataset
    assert_equal(iterations, expected_n_splits)
    if n_samples is not None:
        assert_equal(collected_test_samples, set(range(n_samples))) 
Example 2
def transform(self, y):
        """Transform labels to normalized encoding.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)
        # transform of empty array is empty array
        if _num_samples(y) == 0:
            return np.array([])

        _, y = _encode(y, uniques=self.classes_, encode=True)
        return y 
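
The _num_samples(y) == 0 guard above is what makes an empty transform round-trip cleanly. A quick check against the public LabelEncoder, whose transform is essentially the method shown:

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder().fit(["a", "b", "c"])
print(le.transform(["b", "a"]))  # [1 0]
print(le.transform([]))          # []  -- the empty-array short circuit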
Example 3
def inverse_transform(self, y):
        """Transform labels back to original encoding.

        Parameters
        ----------
        y : numpy array of shape [n_samples]
            Target values.

        Returns
        -------
        y : numpy array of shape [n_samples]
        """
        check_is_fitted(self, 'classes_')
        y = column_or_1d(y, warn=True)
        # inverse transform of empty array is empty array
        if _num_samples(y) == 0:
            return np.array([])

        diff = np.setdiff1d(y, np.arange(len(self.classes_)))
        if len(diff):
            raise ValueError(
                    "y contains previously unseen labels: %s" % str(diff))
        y = np.asarray(y)
        return self.classes_[y] 
Example 4
def fit(self, y):
        """Fit label binarizer

        Parameters
        ----------
        y : array of shape [n_samples,] or [n_samples, n_classes]
            Target values. The 2-d matrix should only contain 0 and 1,
            representing multilabel classification.

        Returns
        -------
        self : returns an instance of self.
        """
        self.y_type_ = type_of_target(y)
        if 'multioutput' in self.y_type_:
            raise ValueError("Multioutput target data is not supported with "
                             "label binarization")
        if _num_samples(y) == 0:
            raise ValueError('y has 0 samples: %r' % y)

        self.sparse_input_ = sp.issparse(y)
        self.classes_ = unique_labels(y)
        return self 
Example 5
def predict(self, x):
        """
        Applying multiple estimators for prediction.

        Args:
            x (numpy.ndarray): NxD array
        Returns:
            numpy.ndarray: predicted labels, Nx1 array
        """
        n_samples = _num_samples(x)
        maxima = np.empty(n_samples, dtype=float)
        maxima.fill(-np.inf)
        argmaxima = np.zeros(n_samples, dtype=int)
        for i, e in enumerate(self.estimators):
            pred = np.ravel(e.decision_function(x))
            np.maximum(maxima, pred, out=maxima)
            argmaxima[maxima == pred] = i
        return self.classes[np.array(argmaxima.T)] 
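
The loop above is a running argmax over estimators: maxima keeps the best decision value seen so far for each sample, and argmaxima records which estimator produced it. The same pattern in isolation, with hypothetical per-estimator scores:

import numpy as np

scores = [np.array([0.1, 0.9]), np.array([0.5, 0.2])]  # one row per estimator
maxima = np.full(2, -np.inf)
argmaxima = np.zeros(2, dtype=int)
for i, pred in enumerate(scores):
    np.maximum(maxima, pred, out=maxima)
    argmaxima[maxima == pred] = i
print(argmaxima)  # [1 0]: estimator 1 wins sample 0, estimator 0 wins sample 1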
Example 6
def get_n_splits(self, X, y=None, groups=None):
        """Returns the number of splitting iterations in the cross-validator

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        y : object
            Always ignored, exists for compatibility.

        groups : object
            Always ignored, exists for compatibility.
        """
        self.__check_validity(X, y, groups)
        n_samples = _num_samples(X)
        gap_before, gap_after = self.gap_before, self.gap_after
        if n_samples - gap_after - self.p >= gap_before + 1:
            n_splits = n_samples - self.p + 1
        else:
            n_splits = max(n_samples - gap_after - self.p, 0)
            n_splits += max(n_samples - self.p - gap_before, 0)
        return n_splits 
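
The split-count logic above can be restated as a small pure function for a quick sanity check (the function name is illustrative, not part of the original class):

def n_splits_for(n_samples, p, gap_before, gap_after):
    # Mirrors get_n_splits above: one contiguous run of test windows when
    # there is room for a training block, two truncated runs otherwise.
    if n_samples - gap_after - p >= gap_before + 1:
        return n_samples - p + 1
    return (max(n_samples - gap_after - p, 0)
            + max(n_samples - p - gap_before, 0))

print(n_splits_for(10, 2, 1, 1))  # 9 == 10 - 2 + 1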
Example 7
def _do_n_samples(dsk, token, Xs, n_splits):
    name = "n_samples-" + token
    n_samples = []
    n_samples_append = n_samples.append
    seen = {}
    m = 0
    for x in Xs:
        if x in seen:
            n_samples_append(seen[x])
        else:
            for n in range(n_splits):
                dsk[name, m, n] = (_num_samples, x + (n,))
            n_samples_append((name, m))
            seen[x] = (name, m)
            m += 1
    return n_samples 
Example 8
def test_check_sample_weight():
    from sklearn.cluster.k_means_ import _check_sample_weight
    sample_weight = None
    checked_sample_weight = _check_sample_weight(X, sample_weight)
    assert_equal(_num_samples(X), _num_samples(checked_sample_weight))
    assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
    assert_equal(X.dtype, checked_sample_weight.dtype) 
Example 9
def transform(self, y):
        """Transform labels to normalized encoding.

        If ``self.fill_unseen_labels`` is ``True``, use ``self.fill_encoded_label_value`` for unseen values.
        Seen labels are encoded with values between 0 and n_classes-1. Unseen labels are encoded with
        ``self.fill_encoded_label_value``, which defaults to n_classes.

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Label values.

        Returns
        -------
        y_encoded : array-like of shape [n_samples]
                    Encoded label values.
        """
        check_is_fitted(self, "classes_")
        y = column_or_1d(y, warn=True)

        # transform of empty array is empty array
        if _num_samples(y) == 0:
            return np.array([])

        if self.fill_unseen_labels:
            _, mask = _encode_check_unknown(y, self.classes_, return_mask=True)
            y_encoded = np.searchsorted(self.classes_, y)
            fill_encoded_label_value = self.fill_encoded_label_value or len(self.classes_)
            y_encoded[~mask] = fill_encoded_label_value
        else:
            _, y_encoded = _encode(y, uniques=self.classes_, encode=True)

        return y_encoded 
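
The unseen-label branch above leans on two facts: np.searchsorted assigns some index to every value, even an unseen one (classes_ holds the sorted uniques from fit), and the mask from _encode_check_unknown says which of those indices to overwrite. The searchsorted half in isolation:

import numpy as np

classes = np.array(["a", "b", "d"])
y = np.array(["a", "c", "d"])        # "c" was never seen during fit
print(np.searchsorted(classes, y))   # [0 2 2] -- "c" lands on an arbitrary slot
                                     # until the mask overwrites it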
Example 10
def inverse_transform(self, y):
        """Transform labels back to original encoding.

        If ``self.fill_unseen_labels`` is ``True``, use ``self.fill_label_value`` for unseen values.

        Parameters
        ----------
        y : numpy array of shape [n_samples]
            Encoded label values.

        Returns
        -------
        y_decoded : numpy array of shape [n_samples]
                    Label values.
        """
        check_is_fitted(self, "classes_")
        y = column_or_1d(y, warn=True)

        if y.dtype.kind not in ("i", "u"):
            try:
                # np.float / np.int were removed from NumPy; the builtins behave the same here
                y = y.astype(float).astype(int)
            except ValueError:
                raise ValueError("`y` contains values not convertible to integer.")

        # inverse transform of empty array is empty array
        if _num_samples(y) == 0:
            return np.array([])

        labels = np.arange(len(self.classes_))
        diff = np.setdiff1d(y, labels)

        # diff is an ndarray; testing its truthiness raises for more than one element
        if diff.size and not self.fill_unseen_labels:
            raise ValueError("y contains previously unseen labels: %s" % str(diff))

        y_decoded = np.asarray(
            [self.classes_[idx] if idx in labels else self.fill_label_value for idx in y])
        return y_decoded
Example 11
def _iter_train_indices(self, X=None, y=None, groups=None):
        """Generates integer indices corresponding to training sets.

        By default, delegates to _iter_test_indices(X, y, groups)
        """
        return self.__complement_indices(
                self._iter_test_indices(X, y, groups), _num_samples(X)) 
Example 12
def _iter_train_masks(self, X=None, y=None, groups=None):
        """Generates boolean masks corresponding to training sets.

        By default, delegates to _iter_train_indices(X, y, groups)
        """
        return GapCrossValidator.__indices_to_masks(
                self._iter_train_indices(X, y, groups), _num_samples(X)) 
Example 13
def _iter_test_indices(self, X, y=None, groups=None):
        self.__check_validity(X, y, groups)
        n_samples = _num_samples(X)
        gap_before, gap_after = self.gap_before, self.gap_after
        if n_samples - gap_after - self.p >= gap_before + 1:
            for i in range(n_samples - self.p + 1):
                yield np.arange(i, i + self.p)
        else:
            for i in range(n_samples - gap_after - self.p):
                yield np.arange(i, i + self.p)
            for i in range(gap_before + 1, n_samples - self.p + 1):
                yield np.arange(i, i + self.p) 
Example 14
def __check_validity(self, X, y=None, groups=None):
        if X is None:
            raise ValueError("The 'X' parameter should not be None.")
        n_samples = _num_samples(X)
        gap_before, gap_after = self.gap_before, self.gap_after
        if (0 >= n_samples - gap_after - self.p and
                gap_before >= n_samples - self.p):
            raise ValueError("Not enough training samples available.")
        if n_samples - gap_after - self.p <= gap_before + 1:
            warnings.warn(SINGLETON_WARNING, Warning) 
Example 15
def _iter_indices(self, X, y, groups=None):
        n_samples = _num_samples(X)
        y = check_array(y, ensure_2d=False, dtype=None)
        y = np.asarray(y, dtype=bool)
        type_of_target_y = type_of_target(y)

        if type_of_target_y != 'multilabel-indicator':
            raise ValueError(
                'Supported target type is: multilabel-indicator. Got {!r} instead.'.format(
                    type_of_target_y))

        n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
                                                  self.train_size)

        n_samples = y.shape[0]
        rng = check_random_state(self.random_state)
        y_orig = y.copy()

        r = np.array([n_train, n_test]) / (n_train + n_test)

        for _ in range(self.n_splits):
            indices = np.arange(n_samples)
            rng.shuffle(indices)
            y = y_orig[indices]

            test_folds = IterativeStratification(labels=y, r=r, random_state=rng)

            test_idx = test_folds[np.argsort(indices)] == 1
            test = np.where(test_idx)[0]
            train = np.where(~test_idx)[0]

            yield train, test 
Example 16
def _num_samples(X):
    result = sk_validation._num_samples(X)
    if dask.is_dask_collection(result):
        # dask dataframe
        result = result.compute()
    return result 
Example 17
def predict(self, T):
        if self.check_X is not None:
            assert self.check_X(T)
        return self.classes_[np.zeros(_num_samples(T), dtype=int)]
Example 18
def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like, with shape (n_samples,)
            Always ignored, exists for compatibility.
        Yields
        ------
        train : ndarray
            The training set indices for that split.
        test : ndarray
            The testing set indices for that split.
        """
        X, y, groups = indexable(X, y, groups)
        n_samples = _num_samples(X)
        n_splits = self.n_splits
        n_folds = n_splits + 1
        gap_size = self.gap_size
        test_size = self.test_size if self.test_size else n_samples // n_folds

        # Make sure we have enough samples for the given split parameters
        if n_folds > n_samples:
            raise ValueError(
                ("Cannot have number of folds={0} greater"
                 " than the number of samples: {1}.").format(n_folds,
                                                             n_samples))
        if n_samples - gap_size - (test_size * n_splits) <= 0:
            raise ValueError(
                ("Too many splits={0} for number of samples"
                 "={1} with test_size={2} and gap_size={3}."
                 "").format(n_splits, n_samples, test_size, gap_size))

        indices = np.arange(n_samples)
        test_starts = range(n_samples - n_splits * test_size,
                            n_samples, test_size)

        for test_start in test_starts:
            train_end = test_start - gap_size
            if self.max_train_size and self.max_train_size < train_end:
                yield (indices[train_end - self.max_train_size:train_end],
                       indices[test_start:test_start + test_size])
            else:
                yield (indices[:train_end],
                       indices[test_start:test_start + test_size]) 
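
A worked instance of the index arithmetic above, with hypothetical sizes: for n_samples=10 and n_splits=3 the default test_size is 10 // 4 = 2, so the test windows start at 4, 6, and 8, and gap_size=1 trims one sample off the end of each training slice:

import numpy as np

n_samples, n_splits, test_size, gap_size = 10, 3, 2, 1
indices = np.arange(n_samples)
for test_start in range(n_samples - n_splits * test_size, n_samples, test_size):
    train_end = test_start - gap_size
    print(indices[:train_end], indices[test_start:test_start + test_size])
# [0 1 2] [4 5]
# [0 1 2 3 4] [6 7]
# [0 1 2 3 4 5 6] [8 9]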
Example 19
def _fit_and_score(estimator, Z, scorer, train, test, verbose,
                   parameters, fit_params, return_train_score=False,
                   return_parameters=False, error_score='raise'):

    if verbose > 1:
        if parameters is None:
            msg = "no parameters to be set"
        else:
            msg = '%s' % (', '.join('%s=%s' % (k, v)
                                    for k, v in list(parameters.items())))
        print(("[CV] %s %s" % (msg, (64 - len(msg)) * '.')))

    fit_params = fit_params if fit_params is not None else {}

    if parameters is not None:
        estimator.set_params(**parameters)

    start_time = time.time()

    Z_train = Z[train]
    Z_test = Z[test]

    try:
        estimator.fit(Z_train, **fit_params)
    except Exception as e:
        if error_score == 'raise':
            raise
        elif isinstance(error_score, numbers.Number):
            test_score = error_score
            if return_train_score:
                train_score = error_score
            warnings.warn("Classifier fit failed. The score on this train-test"
                          " partition for these parameters will be set to %f. "
                          "Details: \n%r" % (error_score, e), FitFailedWarning)
        else:
            raise ValueError("error_score must be the string 'raise' or a"
                             " numeric value. (Hint: if using 'raise', please"
                             " make sure that it has been spelled correctly.)"
                             )
    else:
        test_score = _score(estimator, Z_test, scorer)
        if return_train_score:
            train_score = _score(estimator, Z_train, scorer)

    scoring_time = time.time() - start_time

    if verbose > 2:
        msg += ", score=%f" % test_score
    if verbose > 1:
        end_msg = "%s -%s" % (msg, logger.short_format_time(scoring_time))
        print(("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg)))

    ret = [train_score] if return_train_score else []
    ret.extend([test_score, _num_samples(Z_test), scoring_time])
    if return_parameters:
        ret.append(parameters)
    return ret