Python源码示例:sklearn.datasets.load_diabetes()

示例1
def test_lasso_cv_with_some_model_selection():
    """Smoke test: ``LassoCV`` accepts a splitter object as its ``cv`` argument.

    A ``StandardScaler`` -> ``LassoCV`` pipeline is fitted on the diabetes
    data with a 5-split ``StratifiedKFold``. There are no assertions; the
    test passes as long as fitting does not raise.
    """
    from sklearn import datasets
    from sklearn.linear_model import LassoCV
    from sklearn.model_selection import StratifiedKFold
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    data = datasets.load_diabetes()
    features, target = data.data, data.target

    splitter = StratifiedKFold(n_splits=5)
    model = make_pipeline(StandardScaler(), LassoCV(cv=splitter))
    model.fit(features, target)
示例2
def test_lasso_path(self):
        """Check ``ModelFrame.linear_model.lasso_path`` against ``lm.lasso_path``.

        The default call must return three items whose second element is a
        ``ModelFrame`` indexed by the data columns. The ``return_models=True``
        call must return a tuple of the same length as the reference, with
        element-wise matching contents.
        """
        diabetes = datasets.load_diabetes()
        df = pdml.ModelFrame(diabetes)

        result = df.linear_model.lasso_path()
        expected = lm.lasso_path(diabetes.data, diabetes.target)

        self.assertEqual(len(result), 3)
        tm.assert_numpy_array_equal(result[0], expected[0])
        self.assertIsInstance(result[1], pdml.ModelFrame)
        tm.assert_index_equal(result[1].index, df.data.columns)
        self.assert_numpy_array_almost_equal(result[1].values, expected[1])
        self.assert_numpy_array_almost_equal(result[2], expected[2])

        result = df.linear_model.lasso_path(return_models=True)
        expected = lm.lasso_path(diabetes.data, diabetes.target, return_models=True)
        self.assertEqual(len(result), len(expected))
        self.assertIsInstance(result, tuple)
        # Compare against the reference output. The previous assertions compared
        # ``result`` with itself and therefore could never fail.
        tm.assert_numpy_array_equal(result[0], expected[0])
        # The second element may or may not be wrapped in a frame in this mode;
        # unwrap via ``.values`` when available so both cases are compared as
        # plain arrays.
        self.assert_numpy_array_almost_equal(
            getattr(result[1], 'values', result[1]), expected[1])
        self.assert_numpy_array_almost_equal(result[2], expected[2])
示例3
def test_LassoCV(self, criterion):
        """Compare ``LassoLarsIC`` fitted via ``ModelFrame`` with plain sklearn.

        Note: despite the test's name, the estimator exercised here is
        ``LassoLarsIC`` built with the given ``criterion``.
        """
        diabetes = datasets.load_diabetes()

        # Reference: estimator fitted directly on the normalized raw arrays.
        features = pp.normalize(diabetes.data)
        target = diabetes.target
        reference = lm.LassoLarsIC(criterion=criterion)
        reference.fit(features, target)

        # Same steps routed through the ModelFrame accessors.
        df = pdml.ModelFrame(diabetes)
        df.data = df.data.pp.normalize()
        wrapped = df.lm.LassoLarsIC(criterion=criterion)
        df.fit(wrapped)

        self.assertAlmostEqual(reference.alpha_, wrapped.alpha_)

        expected = reference.predict(features)
        predicted = df.predict(wrapped)
        self.assertIsInstance(predicted, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(predicted.values, expected)
示例4
def test_MixedLM(self):
        """Fit ``MixedLM`` through ``StatsModelsRegressor`` and check the result type.

        Only the first 100 samples are used, split into two groups of 50.
        """
        import statsmodels.regression.mixed_linear_model as mlm

        diabetes = datasets.load_diabetes()
        n_rows = 100
        data = diabetes.data[:n_rows, :]
        target = diabetes.target[:n_rows]
        groups = np.array([0] * 50 + [1] * 50)

        for name in ['MixedLM']:
            klass = getattr(sm, name)
            estimator = base.StatsModelsRegressor(klass, groups=groups)
            fitted = estimator.fit(data, target)

            # Prediction is intentionally not compared against statsmodels
            # here: the original author noted that calling
            # ``estimator.predict`` ends in NotImplementedError.
            self.assertIsInstance(fitted, mlm.MixedLMResultsWrapper)
示例5
def test_pipeline(self):
        """``StatsModelsRegressor`` must work as the final step of a Pipeline.

        For each statsmodels model class, a ``SelectKBest`` -> regressor
        pipeline is fitted and its predictions are compared with running the
        same feature selection and statsmodels fit by hand.
        """
        from sklearn.feature_selection import SelectKBest, f_regression
        from sklearn.pipeline import Pipeline

        diabetes = datasets.load_diabetes()
        features, target = diabetes.data, diabetes.target

        for name in ('OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM'):
            klass = getattr(sm, name)

            steps = [
                ('selector', SelectKBest(f_regression, k=5)),
                ('reg', base.StatsModelsRegressor(klass)),
            ]
            estimator = Pipeline(steps)
            estimator.fit(features, target)
            result = estimator.predict(features)

            # Manual equivalent of the pipeline above.
            selected = SelectKBest(f_regression, k=5).fit_transform(features, target)
            expected = klass(target, selected).fit().predict(selected)
            self.assert_numpy_array_almost_equal(result, expected)
示例6
def _timeseries_generated_data(self):
        """Build an ordered train/test split of the diabetes data with a date index.

        The features go into a DataFrame whose columns are named by their
        position (as strings). An artificial time axis is attached as the
        index: a ``pd.date_range`` starting at 1980-01-01 with ``freq='MS'``,
        stored under a column name of ``"Date"`` plus a random UUID. The
        leading 80% of the rows form the training set, the remainder the
        test set (no shuffling).

        Returns:
            tuple: ``(X_train, X_test, y_train, y_test, time_column_name)``.
        """
        features, target = datasets.load_diabetes(return_X_y=True)
        n_rows, n_cols = features.shape
        frame = pd.DataFrame(features, columns=[str(i) for i in range(n_cols)])

        # Attach an arbitrary time axis and use it as the index.
        time_column_name = "Date" + str(uuid.uuid4())
        frame[time_column_name] = pd.date_range('1980-01-01', periods=n_rows, freq='MS')
        frame.set_index([time_column_name], inplace=True)

        # Positional split: everything before the cutoff is training data.
        test_frac = 0.2
        cutoff = int(np.floor((1.0 - test_frac) * n_rows))

        return (
            frame.iloc[:cutoff],
            frame.iloc[cutoff:],
            target[:cutoff],
            target[cutoff:],
            time_column_name,
        )
示例7
def main():
    """Fit a linear regression on one diabetes feature and print its metrics.

    Only feature column 2 is used. The last 20 samples are held out for
    testing. The coefficients, the mean squared error on the held-out data
    and the value of ``regr.score`` on the held-out data are printed.
    """
    diabetes = datasets.load_diabetes()
    # Keep a 2-D (n_samples, 1) array containing only column 2.
    single_feature = diabetes.data[:, np.newaxis, 2]
    target = diabetes.target

    n_test = 20
    x_train, x_test = single_feature[:-n_test], single_feature[-n_test:]
    y_train, y_test = target[:-n_test], target[-n_test:]

    regr = linear_model.LinearRegression()
    regr.fit(x_train, y_train)

    print('Coefficients: \n', regr.coef_)

    residuals = regr.predict(x_test) - y_test
    print("Mean squared error: %.2f" % np.mean(residuals ** 2))

    print('Variance score: %.2f' % regr.score(x_test, y_test))
示例8
def test_svr():
    """Test Support Vector Regression on the diabetes data.

    Each regressor is fitted and must reach a training score above 0.02.
    Afterwards ``SVR`` and ``LinearSVR`` are fitted on a constant target as
    a non-regression check.
    """
    diabetes = datasets.load_diabetes()
    features, target = diabetes.data, diabetes.target

    regressors = [
        svm.NuSVR(kernel='linear', nu=.4, C=1.0),
        svm.NuSVR(kernel='linear', nu=.4, C=10.),
        svm.SVR(kernel='linear', C=10.),
        svm.LinearSVR(C=10.),
        svm.LinearSVR(C=10.),
    ]
    for regressor in regressors:
        regressor.fit(features, target)
        assert_greater(regressor.score(features, target), 0.02)

    # Non-regression test: previously, BaseLibSVM would check that
    # len(np.unique(y)) < 2, which must only be done for SVC.
    n_samples = len(features)
    svm.SVR().fit(features, np.ones(n_samples))
    svm.LinearSVR().fit(features, np.ones(n_samples))
示例9
def test_bayesian_on_diabetes():
    """Test BayesianRidge on diabetes (currently always skipped).

    The function raises ``SkipTest`` unconditionally as its first statement,
    so the checks below it never run. They are kept to show the intended
    test: scores should increase at each iteration.
    """
    raise SkipTest("XFailed Test")

    diabetes = datasets.load_diabetes()
    all_X, all_y = diabetes.data, diabetes.target

    clf = BayesianRidge(compute_score=True)

    # First case: the full data (more samples than features).
    # Second case: only the first five rows (more features than samples).
    for X, y in ((all_X, all_y), (all_X[:5, :], all_y[:5])):
        clf.fit(X, y)
        # Scores must be increasing at each iteration.
        assert_array_equal(np.diff(clf.scores_) > 0, True)
示例10
def test_xgb_regressor(self):
        """Convert a fitted ``XGBRegressor`` and dump data and models.

        The regressor is trained on half of the diabetes data
        (``random_state=42``); the other half, cast to float32, is passed to
        ``dump_data_and_model`` together with the original and converted
        models.
        """
        diabetes = load_diabetes()
        x_train, x_test, y_train, _ = train_test_split(
            diabetes.data, diabetes.target, test_size=0.5, random_state=42)

        regressor = XGBRegressor()
        regressor.fit(x_train, y_train)

        input_spec = [('input', FloatTensorType(shape=['None', 'None']))]
        conv_model = convert_xgboost(regressor, initial_types=input_spec)
        self.assertTrue(conv_model is not None)

        dump_data_and_model(
            x_test.astype("float32"),
            regressor,
            conv_model,
            basename="SklearnXGBRegressor-Dec3",
            allow_failure="StrictVersion("
            "onnx.__version__)"
            "< StrictVersion('1.3.0')",
        )
示例11
def test_h2o_regressor(self):
        """Convert H2O gradient boosting models for several distributions.

        For every distribution a GBM (7 trees, max depth 5) is turned into a
        MOJO, converted, checked to be non-None and handed to
        ``dump_data_and_model`` together with the test frame.
        """
        diabetes = load_diabetes()
        train, test = _train_test_split_as_frames(diabetes.data, diabetes.target)

        for distribution in ("auto", "gaussian", "huber", "laplace", "quantile"):
            gbm = H2OGradientBoostingEstimator(
                ntrees=7, max_depth=5, distribution=distribution)
            mojo_path = _make_mojo(gbm, train)

            onnx_model = _convert_mojo(mojo_path)
            self.assertIsNot(onnx_model, None)

            dump_data_and_model(
                test,
                H2OMojoWrapper(mojo_path),
                onnx_model,
                basename="H2OReg-Dec4",
                allow_failure="StrictVersion("
                              "onnx.__version__)"
                              "< StrictVersion('1.3.0')",
            )
示例12
def get_sample_dataset(dataset_properties):
    """Returns sample dataset

    Args:
        dataset_properties (dict): Dictionary corresponding to the properties of the dataset
            used to verify the estimator and metric generators. It must contain a
            ``'type'`` key. For type ``'multiclass'`` every other entry is forwarded as a
            keyword argument to ``datasets.make_classification``.

    Returns:
        X (array-like): Features array

        y (array-like): Labels array

        splits (iterator): This is an iterator that returns train test splits for
            cross-validation purposes on ``X`` and ``y``.

    Raises:
        KeyError: If ``dataset_properties`` has no ``'type'`` key.
        exceptions.UserError: If the type is unknown, or if generating/splitting the
            ``'multiclass'`` dataset fails for any reason.
    """
    # Dataset type -> name of the loader in ``sklearn.datasets``. Loaders are looked up
    # by name only when requested, so a loader missing from the installed scikit-learn
    # affects only its own dataset type.
    stratified_loaders = {
        'iris': 'load_iris',
        'mnist': 'load_digits',  # the 'mnist' type is served by the digits dataset
        'breast_cancer': 'load_breast_cancer',
    }
    plain_loaders = {
        'boston': 'load_boston',
        'diabetes': 'load_diabetes',
    }

    kwargs = dataset_properties.copy()
    data_type = kwargs.pop('type')

    # The splitters are created without ``random_state``. ``shuffle`` is left at its
    # default (False), so a seed never influenced the folds, and recent scikit-learn
    # releases raise ValueError when ``random_state`` is set while ``shuffle`` is False.
    if data_type == 'multiclass':
        try:
            X, y = datasets.make_classification(random_state=8, **kwargs)
            splits = model_selection.StratifiedKFold(n_splits=2).split(X, y)
        except Exception as e:
            raise exceptions.UserError(repr(e))
    elif data_type in stratified_loaders:
        loader = getattr(datasets, stratified_loaders[data_type])
        X, y = loader(return_X_y=True)
        splits = model_selection.StratifiedKFold(n_splits=2).split(X, y)
    elif data_type in plain_loaders:
        loader = getattr(datasets, plain_loaders[data_type])
        X, y = loader(return_X_y=True)
        splits = model_selection.KFold(n_splits=2).split(X)
    else:
        raise exceptions.UserError('Unknown dataset type {}'.format(dataset_properties['type']))
    return X, y, splits
示例13
def test_regression_scorers():
    """The ``'r2'`` scorer must agree with ``r2_score`` on the same predictions."""
    diabetes = load_diabetes()
    X_train, X_test, y_train, y_test = train_test_split(
        diabetes.data, diabetes.target, random_state=0)

    ridge = Ridge()
    ridge.fit(X_train, y_train)

    r2_scorer = get_scorer('r2')
    via_scorer = r2_scorer(ridge, X_test, y_test)
    via_metric = r2_score(y_test, ridge.predict(X_test))
    assert_almost_equal(via_scorer, via_metric)
示例14
def test_svr():
    """Test Support Vector Regression on the diabetes data.

    Each regressor is fitted and must reach a training score above 0.02.
    Afterwards ``SVR(gamma='scale')`` and ``LinearSVR`` are fitted on a
    constant target as a non-regression check.
    """
    diabetes = datasets.load_diabetes()
    features, target = diabetes.data, diabetes.target

    regressors = [
        svm.NuSVR(kernel='linear', nu=.4, C=1.0),
        svm.NuSVR(kernel='linear', nu=.4, C=10.),
        svm.SVR(kernel='linear', C=10.),
        svm.LinearSVR(C=10.),
        svm.LinearSVR(C=10.),
    ]
    for regressor in regressors:
        regressor.fit(features, target)
        assert_greater(regressor.score(features, target), 0.02)

    # Non-regression test: previously, BaseLibSVM would check that
    # len(np.unique(y)) < 2, which must only be done for SVC.
    n_samples = len(features)
    svm.SVR(gamma='scale').fit(features, np.ones(n_samples))
    svm.LinearSVR().fit(features, np.ones(n_samples))
示例15
def test_linearsvr():
    """Check that ``LinearSVR`` and ``SVR(kernel='linear')`` give comparable results.

    Both are fitted on the diabetes data with ``C=1e3``; the norms of their
    coefficient vectors and their training scores are compared loosely.
    """
    diabetes = datasets.load_diabetes()
    features, target = diabetes.data, diabetes.target

    linear_model_ = svm.LinearSVR(C=1e3).fit(features, target)
    linear_score = linear_model_.score(features, target)

    kernel_model = svm.SVR(kernel='linear', C=1e3).fit(features, target)
    kernel_score = kernel_model.score(features, target)

    linear_norm = np.linalg.norm(linear_model_.coef_)
    kernel_norm = np.linalg.norm(kernel_model.coef_)
    assert_allclose(linear_norm, kernel_norm, rtol=1, atol=0.0001)
    assert_almost_equal(linear_score, kernel_score, decimal=2)
示例16
def test_linearsvr_fit_sampleweight():
    """Check how ``LinearSVR`` handles ``sample_weight``.

    Two properties are tested on the diabetes data:

    1. Fitting with unit weights gives results comparable to fitting
       without weights.
    2. Fitting with integer weights scores comparably to fitting on data
       in which every sample is repeated as many times as its weight.
    """
    diabetes = datasets.load_diabetes()
    features, target = diabetes.data, diabetes.target
    n_samples = len(target)

    # --- 1. unit weights vs. no weights -------------------------------------
    ones = np.ones(n_samples)
    unit_weighted = svm.LinearSVR(C=1e3).fit(features, target,
                                             sample_weight=ones)
    unit_weighted_score = unit_weighted.score(features, target)

    unweighted = svm.LinearSVR(C=1e3).fit(features, target)
    unweighted_score = unweighted.score(features, target)

    assert_allclose(np.linalg.norm(unit_weighted.coef_),
                    np.linalg.norm(unweighted.coef_),
                    rtol=1, atol=0.0001)
    assert_almost_equal(unit_weighted_score, unweighted_score, decimal=2)

    # --- 2. integer weights vs. repeated samples ----------------------------
    # fit(X) == fit([X1, X2, X3], sample_weight=[n1, n2, n3]) where
    # X is X1 repeated n1 times, X2 repeated n2 times and so forth.
    rng = check_random_state(0)
    repeats = rng.randint(0, 10, n_samples)

    by_weight = svm.LinearSVR(C=1e3).fit(features, target,
                                         sample_weight=repeats)
    by_weight_score = by_weight.score(features, target,
                                      sample_weight=repeats)

    features_repeated = np.repeat(features, repeats, axis=0)
    target_repeated = np.repeat(target, repeats, axis=0)
    by_repeat = svm.LinearSVR(C=1e3).fit(features_repeated, target_repeated)
    by_repeat_score = by_repeat.score(features_repeated, target_repeated)

    assert_almost_equal(by_weight_score, by_repeat_score, decimal=2)
示例17
def setUp(self):
        """Load the diabetes dataset and store a list of ten label strings.

        The labels are presumably the dataset's feature names, in column
        order; confirm against the tests that consume ``self._labels``.
        """
        self._data = datasets.load_diabetes()
        self._labels = [
            'age', 'sex', 'bmi', 'bp',
            's1', 's2', 's3', 's4', 's5', 's6',
        ]
示例18
def test_replicability():
    """Make sure running fit twice in a row finds the same parameters."""
    diabetes = load_diabetes()

    # Shuffle the rows with a fixed seed so the sample order is reproducible.
    order = np.arange(diabetes.data.shape[0])
    np.random.RandomState(0).shuffle(order)
    features = diabetes.data[order]
    target = diabetes.target[order]

    clf = MLPRegressor(keep_prob=0.9, random_state=42, n_epochs=100)

    # Just predict on the training set, for simplicity.
    first_run = clf.fit(features, target).predict(features)
    second_run = clf.fit(features, target).predict(features)
    assert_array_almost_equal(first_run, second_run)
示例19
def test_partial_fit():
    """Repeated ``partial_fit`` calls should yield predictions correlated with the target.

    After 30 passes over the diabetes data, the Pearson correlation between
    predictions and targets must exceed 0.5.
    """
    data = load_diabetes()
    X, y = data['data'], data['target']

    clf = MLPRegressor(n_epochs=1)
    n_passes = 30
    for _ in range(n_passes):
        clf.partial_fit(X, y)

    correlation = pearsonr(clf.predict(X), y)[0]
    assert correlation > 0.5
示例20
def test_embedding_default():
    """Make sure the embedding works by default.

    With default settings, ``transform`` must return 256 columns.
    """
    data = load_diabetes()
    X, y = data['data'], data['target']

    clf = MLPRegressor(n_epochs=1)
    clf.fit(X, y)

    embedding = clf.transform(X)
    assert embedding.shape[1] == 256
示例21
def test_embedding_no_layers():
    """Make sure the embedding works with no layers.

    With an empty ``hidden_units`` list, ``transform`` must return a single
    column.
    """
    data = load_diabetes()
    X, y = data['data'], data['target']

    clf = MLPRegressor(n_epochs=1, hidden_units=[])
    clf.fit(X, y)

    embedding = clf.transform(X)
    assert embedding.shape[1] == 1
示例22
def test_embedding_specific_layer():
    """Make sure the embedding can be taken from a specific layer.

    With hidden units ``(256, 8, 256)`` and ``transform_layer_index=1``,
    ``transform`` must return 8 columns.
    """
    data = load_diabetes()
    X, y = data['data'], data['target']

    settings = dict(
        n_epochs=1,
        hidden_units=(256, 8, 256),
        transform_layer_index=1,
    )
    clf = MLPRegressor(**settings)
    clf.fit(X, y)

    embedding = clf.transform(X)
    assert embedding.shape[1] == 8
示例23
def create_sample_data_csv(file_name: str = "diabetes.csv",
                           for_scoring: bool = False):
    """Write the diabetes dataset to a CSV file.

    Args:
        file_name: Path of the CSV file to write.
        for_scoring: When False (the default) the target is included as a
            ``Y`` column; when True only the feature columns are written.
    """
    sample_data = load_diabetes()
    frame = pd.DataFrame(
        data=sample_data.data,
        columns=sample_data.feature_names,
    )

    include_target = not for_scoring
    if include_target:
        frame['Y'] = sample_data.target

    # Original author's note: hard code to diabetes so we fail fast if the
    # project has been bootstrapped.
    frame.to_csv(file_name, index=False)
示例24
def get_data(n_clients):
    """
    Import the dataset via sklearn, shuffle and split train/test.

    A column of ones is appended to the features to emulate an intercept.
    After shuffling, 50 randomly chosen samples become the holdout test set.
    The remaining training samples are cut into `n_clients` consecutive,
    equally sized chunks; samples left over after the last full chunk are
    not assigned to any client.

    Return training, target lists for `n_clients` and a holdout test set
    """
    print("Loading data")
    diabetes = load_diabetes()
    targets = diabetes.target
    # Add constant to emulate intercept
    features = np.c_[diabetes.data, np.ones(diabetes.data.shape[0])]
    n_samples = features.shape[0]

    # The features are already preprocessed, so just shuffle.
    shuffled = np.random.permutation(n_samples)
    features, targets = features[shuffled, :], targets[shuffled]

    # Select the test samples at random; everything else is training data.
    test_size = 50
    test_idx = np.random.choice(n_samples, size=test_size, replace=False)
    is_train = np.ones(n_samples, dtype=bool)
    is_train[test_idx] = False
    X_test, y_test = features[test_idx, :], targets[test_idx]
    X_train, y_train = features[is_train, :], targets[is_train]

    # Split train among multiple clients.
    # The selection is not at random. We simulate the fact that each client
    # sees a potentially very different sample of patients.
    step = int(X_train.shape[0] / n_clients)
    bounds = [(step * c, step * (c + 1)) for c in range(n_clients)]
    X = [X_train[start:stop, :] for start, stop in bounds]
    y = [y_train[start:stop] for start, stop in bounds]

    return X, y, X_test, y_test
示例25
def _fit_model_pca(model):
    """Fit ``model`` on the training features of a fixed diabetes split.

    The data is split 80/20 with ``random_state=42``. ``model.fit`` is
    called with the training features only (no target).

    Returns:
        tuple: the fitted ``model`` and the test features cast to float32.
    """
    data = load_diabetes()
    parts = train_test_split(
        data.data, data.target, test_size=0.2, random_state=42)
    X_train, X_test = parts[0], parts[1]

    model.fit(X_train)
    return model, X_test.astype(np.float32)
示例26
def test_pipeline(self):
        """Train a ``GradientDescent`` network as the last step of a Pipeline.

        The target is min-max scaled for training; predictions are mapped
        back through the same scaler, rounded, and the resulting RMSLE must
        stay below 0.5.
        """
        dataset = datasets.load_diabetes()
        target_scaler = preprocessing.MinMaxScaler()
        scaled_target = target_scaler.fit_transform(
            dataset.target.reshape(-1, 1))

        x_train, x_test, y_train, y_test = train_test_split(
            asfloat(dataset.data),
            asfloat(scaled_target),
            test_size=0.15
        )

        architecture = [
            layers.Input(10),
            layers.Sigmoid(25),
            layers.Sigmoid(1),
        ]
        network = algorithms.GradientDescent(
            network=architecture,
            batch_size=None,
            show_epoch=100,
            verbose=False,
        )
        steps = [
            ('min_max_scaler', preprocessing.MinMaxScaler()),
            ('gd', network),
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(x_train, y_train, gd__epochs=50)
        y_predict = pipeline.predict(x_test)

        # Compare in the original target scale.
        actual = target_scaler.inverse_transform(y_test)
        predicted = target_scaler.inverse_transform(y_predict).round()
        error = self.eval(objectives.rmsle(actual, predicted))
        self.assertGreater(0.5, error)
示例27
def test_grid_search(self):
        """Run ``RandomizedSearchCV`` over the ``std`` parameter of a GRNN.

        A GRNN with ``std=0.5`` is scored first on a 70/30 split. A
        randomized search over ``std`` (10 iterations, 3 folds) is then run
        on the full data, and the smallest mean test score is checked.
        """
        def scorer(network, X, y):
            # RMSLE between the first prediction column and the target.
            predicted = asfloat(network.predict(X))
            return self.eval(objectives.rmsle(predicted[:, 0], asfloat(y)))

        dataset = datasets.load_diabetes()
        x_train, x_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.3
        )

        grnnet = algorithms.GRNN(std=0.5, verbose=False)
        grnnet.train(x_train, y_train)
        baseline_error = scorer(grnnet, x_test, y_test)

        self.assertAlmostEqual(0.513, baseline_error, places=3)

        std_candidates = np.arange(1e-2, 0.1, 1e-4)
        random_search = model_selection.RandomizedSearchCV(
            grnnet,
            param_distributions={'std': std_candidates},
            n_iter=10,
            scoring=scorer,
            random_state=self.random_seed,
            cv=3,
        )
        random_search.fit(dataset.data, dataset.target)

        # The scorer returns an error, so the best candidate has the minimum.
        mean_scores = random_search.cv_results_['mean_test_score']
        best_score = min(mean_scores)
        self.assertAlmostEqual(0.4266, best_score, places=3)
示例28
def test_transfrom_method(self):
        """``GRNN.transform`` must return the same values as ``GRNN.predict``."""
        dataset = datasets.load_diabetes()
        features, target = dataset.data, dataset.target

        grnnet = algorithms.GRNN(std=0.5, verbose=False)
        grnnet.train(features, target)

        predicted = grnnet.predict(features)
        transformed = grnnet.transform(features)

        np.testing.assert_array_almost_equal(predicted, transformed)
示例29
def test_pandas_for_bp(self):
        """Train ``GradientDescent`` on data that went through a pandas DataFrame.

        Inputs and target are min-max scaled inside the DataFrame. After 50
        epochs, predictions are mapped back through the target scaler,
        rounded, and the resulting RMSLE must stay below 0.5.
        """
        dataset = datasets.load_diabetes()
        target = dataset.target.reshape(-1, 1)

        input_scaler = preprocessing.MinMaxScaler()
        target_scaler = preprocessing.MinMaxScaler()

        n_features = dataset.data.shape[1]
        input_columns = ['column_' + str(i) for i in range(n_features)]

        frame = pd.DataFrame(dataset.data, columns=input_columns)
        frame['target'] = target_scaler.fit_transform(target)
        frame[input_columns] = input_scaler.fit_transform(
            frame[input_columns]
        )

        x_train, x_test, y_train, y_test = train_test_split(
            asfloat(frame[input_columns]),
            asfloat(frame['target']),
            test_size=0.15
        )

        architecture = [
            layers.Input(10),
            layers.Sigmoid(30),
            layers.Sigmoid(1),
        ]
        bpnet = algorithms.GradientDescent(architecture, batch_size=None)
        bpnet.train(x_train, y_train, epochs=50)

        # Column vectors are needed for the scaler's inverse transform.
        y_predict = bpnet.predict(x_test).reshape(-1, 1)
        y_test = y_test.reshape(-1, 1)

        actual = target_scaler.inverse_transform(y_test)
        predicted = target_scaler.inverse_transform(y_predict).round()
        error = self.eval(objectives.rmsle(actual, predicted))
        self.assertGreater(0.5, error)
示例30
def test_simple_grnn(self):
        """Basic GRNN regression check plus guards against input aliasing.

        Besides the expected mean absolute error, the test verifies that
        training and predicting leave the input arrays untouched, and that
        zeroing ``x_train`` after training does not change later predictions.
        """
        dataset = datasets.load_diabetes()
        x_train, x_test, y_train, y_test = train_test_split(
            dataset.data, dataset.target, test_size=0.3
        )

        # Snapshots taken before the network sees the arrays.
        x_train_snapshot = x_train.copy()
        x_test_snapshot = x_test.copy()
        y_train_snapshot = y_train.copy()

        grnnet = algorithms.GRNN(std=0.1, verbose=False)
        grnnet.train(x_train, y_train)
        first_prediction = grnnet.predict(x_test)
        error = metrics.mean_absolute_error(first_prediction, y_test)

        saved_prediction = first_prediction.copy()
        self.assertAlmostEqual(error, 46.3358, places=4)

        # Test problem with variable links: inputs must be unchanged.
        np.testing.assert_array_equal(x_train, x_train_snapshot)
        np.testing.assert_array_equal(x_test, x_test_snapshot)
        np.testing.assert_array_equal(y_train, y_train_snapshot)

        # Overwrite the caller's training array in place; predictions must
        # not be affected by it.
        x_train[:, :] = 0
        second_prediction = grnnet.predict(x_test)

        np.testing.assert_array_almost_equal(second_prediction, saved_prediction)
        self.assertPickledNetwork(grnnet, x_test)