Python源码示例:sklearn.preprocessing.Imputer()

示例1
def setUpClass(self):
        """
        Set up the unit test by loading the dataset and training a model.
        """
        from sklearn.datasets import load_boston

        scikit_data = load_boston()
        scikit_model = Imputer(strategy="most_frequent", axis=0)
        scikit_data["data"][1, 8] = np.NaN

        input_data = scikit_data["data"][:, 8].reshape(-1, 1)
        scikit_model.fit(input_data, scikit_data["target"])

        # Save the data and the model
        self.scikit_data = scikit_data
        self.scikit_model = scikit_model 
示例2
def readFile(inpath):
    if os.path.isfile(inpath):
        dataset = genfromtxt(open(inpath,'r'), delimiter=',', dtype='f8')[0:] 
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)# fill in the missing values with the mean of each column
        transformedData = imp.fit_transform(dataset)
        rmvedCols = imp.statistics_
        idxRmved = np.where(np.isnan(rmvedCols))#take the indices of the nan columns
        nanTarget = dataset.shape[1]-1 in idxRmved[0]#check if the target is a nan column
        if nanTarget:
            raise ValueError("The target variable contains only nan values or inf")
    else:
        raise ValueError("File does not exist")    
    return transformedData
    
#parameters: vector 'target' which is the target variable
#returns: the dataset which includes the previous values of the target 
示例3
def test_imputer_float_inputs(self):
        model = Imputer(missing_values="NaN", strategy="mean", axis=0)
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)

        model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                     [("input", FloatTensorType([None, 2]))])
        self.assertTrue(model_onnx.graph.node is not None)

        # should contain only node
        self.assertEqual(len(model_onnx.graph.node), 1)

        # last node should contain the Imputer
        outputs = model_onnx.graph.output
        self.assertEqual(len(outputs), 1)
        self.assertEqual(outputs[0].type.tensor_type.shape.dim[-1].dim_value,
                         2)
        dump_data_and_model(
            np.array(data, dtype=np.float32),
            model,
            model_onnx,
            basename="SklearnImputerMeanFloat32",
        ) 
示例4
def test_simple_imputer_float_inputs(self):
        model = SimpleImputer(strategy="mean", fill_value="nan")
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)

        model_onnx = convert_sklearn(
            model,
            "scikit-learn simple imputer",
            [("input", FloatTensorType([None, 2]))],
            target_opset=TARGET_OPSET)
        self.assertTrue(model_onnx.graph.node is not None)

        # should contain only node
        self.assertEqual(len(model_onnx.graph.node), 1)

        # last node should contain the Imputer
        outputs = model_onnx.graph.output
        self.assertEqual(len(outputs), 1)
        self.assertEqual(
            outputs[0].type.tensor_type.shape.dim[-1].dim_value, 2)
        dump_data_and_model(
            np.array(data, dtype=np.float32),
            model, model_onnx,
            basename="SklearnSimpleImputerMeanFloat32") 
示例5
def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.preprocessing.Binarizer, pp.Binarizer)
        self.assertIs(df.preprocessing.FunctionTransformer,
                      pp.FunctionTransformer)
        self.assertIs(df.preprocessing.Imputer, pp.Imputer)
        self.assertIs(df.preprocessing.KernelCenterer, pp.KernelCenterer)
        self.assertIs(df.preprocessing.LabelBinarizer, pp.LabelBinarizer)
        self.assertIs(df.preprocessing.LabelEncoder, pp.LabelEncoder)
        self.assertIs(df.preprocessing.MultiLabelBinarizer, pp.MultiLabelBinarizer)
        self.assertIs(df.preprocessing.MaxAbsScaler, pp.MaxAbsScaler)
        self.assertIs(df.preprocessing.MinMaxScaler, pp.MinMaxScaler)
        self.assertIs(df.preprocessing.Normalizer, pp.Normalizer)
        self.assertIs(df.preprocessing.OneHotEncoder, pp.OneHotEncoder)
        self.assertIs(df.preprocessing.PolynomialFeatures, pp.PolynomialFeatures)
        self.assertIs(df.preprocessing.RobustScaler, pp.RobustScaler)
        self.assertIs(df.preprocessing.StandardScaler, pp.StandardScaler) 
示例6
def test_transform_1d_frame_int(self):
        arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
        idx = pd.Index('a b c d e f g h i'.split(' '))
        df = pdml.ModelFrame(arr, index=idx, columns=['X'])
        self.assertEqual(len(df.columns), 1)

        # reshape arr to 2d
        arr = arr.reshape(-1, 1)

        if pd.compat.PY3:
            models = ['Binarizer', 'Imputer', 'StandardScaler']
            # MinMaxScalar raises TypeError in ufunc
        else:
            models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']

        for model in models:
            mod1 = getattr(df.preprocessing, model)()
            mod2 = getattr(pp, model)()

            self._assert_transform(df, arr, mod1, mod2)

            mod1 = getattr(df.preprocessing, model)()
            mod2 = getattr(pp, model)()
            self._assert_fit_transform(df, arr, mod1, mod2) 
示例7
def test_Imputer(self):
        arr = np.array([1, np.nan, 3, 2])
        s = pdml.ModelSeries(arr)

        mod1 = s.pp.Imputer(axis=0)
        s.fit(mod1)
        result = s.transform(mod1)

        expected = np.array([1, 2, 3, 2])

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected)

        mod1 = s.pp.Imputer(axis=0)
        result = s.fit_transform(mod1)

        self.assertIsInstance(result, pdml.ModelSeries)
        self.assert_numpy_array_almost_equal(result.values, expected) 
示例8
def FeatureCombination(Df,s='',num_feature=2): 
    feature_set = []
    for c in Df.columns:
        if c.startswith(s): feature_set.append(c)
    print('combining', len(feature_set), 'features')
    data = Df[feature_set].values

    for c in Df.columns:
        if Df[c].dtype == 'object':
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(Df[c].values))
            Df[c] = lbl.transform(list(Df[c].values))
            
    imp = preprocessing.Imputer()
    data = imp.fit_transform(data)
    data = preprocessing.scale(data)
    pca = PCA(num_feature)
    pca.fit(data)
    print('explained_variance_ratio_:', pca.explained_variance_ratio_)
    trans = pca.transform(data)
    for i in range(0,num_feature):
        Df[s+'_%d'%(i+1)] = trans[:,i]
    Df.drop(feature_set,1,inplace=True)
    return Df 
示例9
def test_imputer(self):
        try:
            model = Imputer(missing_values='NaN', strategy='mean', axis=0)
        except TypeError:
            model = Imputer(missing_values=np.nan, strategy='mean')
            model.axis = 0
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)
        from onnxmltools.convert.coreml.convert import convert
        import coremltools  # noqa
        try:
            model_coreml = coremltools.converters.sklearn.convert(model)
        except ValueError as e:
            if 'not supported' in str(e):
                # Python 2.7 + scikit-learn 0.22
                return
        model_onnx = convert(model_coreml.get_spec())
        self.assertTrue(model_onnx is not None)
        dump_data_and_model(np.array(data, dtype=np.float32),
                            model, model_onnx, basename="CmlImputerMeanFloat32") 
示例10
def impute_data(self,x):
        """Imputes data set containing Nan values"""
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
        return imp.fit_transform(x) 
示例11
def __init__(self, params, dataset):
        """Initializes a UMAPTransformer object.

        Args:
            params (Namespace): Contains parameters used to instantiate the transformer.
            dataset (Dataset): Dataset used to "train" the projection mapping.
        """

        # TODO: decide whether to make n_epochs a parameter
        #default_n_epochs = None
        default_n_epochs = 500

        if params.prediction_type == 'classification':
            target_metric = 'categorical'
        else:
            target_metric = 'l2'
        self.scaler = RobustScaler()
        # Use Imputer to replace missing values (NaNs) with means for each column
        self.imputer = Imputer()
        scaled_X = self.scaler.fit_transform(self.imputer.fit_transform(dataset.X))
        self.mapper = umap.UMAP(n_neighbors=params.umap_neighbors, 
                                n_components=params.umap_dim,
                                metric=params.umap_metric,
                                target_metric=target_metric,
                                target_weight=params.umap_targ_wt,
                                min_dist=params.umap_min_dist,
                                n_epochs=default_n_epochs)
        # TODO: How to deal with multitask data?
        self.mapper.fit(scaled_X, y=dataset.y.flatten())

    # **************************************************************************************** 
示例12
def get_clf_pipeline():
    clf = models.DefaultClassifier(
            GradientBoostingClassifier(
                       loss='deviance', learning_rate=0.01, n_estimators=3000,
                       subsample=0.6, min_samples_split=12, min_samples_leaf=12,
                       max_depth=6, random_state=1357, verbose=0)
           )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps) 
示例13
def get_reg_pipeline():
    clf = models.PartialRegressor(
            GradientBoostingRegressor(loss='ls', learning_rate=0.0075, n_estimators=5000,
                 subsample=0.5, min_samples_split=20, min_samples_leaf=20, max_leaf_nodes=30,
                 random_state=9753, verbose=0)
            )
    steps = [('features', models.FeatureSelector()),
             ('Impute', Imputer(strategy='median')),
             ('scaler', StandardScaler()),
             ('clf', clf)]
    return Pipeline(steps) 
示例14
def __init__(self, max_iter=10, initial_strategy='mean', tol=1e-3, f_model="RandomForest"):
        self.max_iter = max_iter
        self.initial_strategy = initial_strategy
        self.initial_imputer = Imputer(strategy=initial_strategy)
        self.tol = tol
        self.f_model = f_model 
示例15
def data_handlemissing(dataframe, pipeline):
    try:
        if pipeline['options']['type'] == "dropcolumns":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=1, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=1, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=1, thresh=thresh, inplace=True)
        elif pipeline['options']['type'] == "droprows":
            thresh = pipeline['options']['thresh']
            if thresh == -1:
                dataframe.dropna(axis=0, how="all", inplace=True)
            elif thresh == 0:
                dataframe.dropna(axis=0, how="any", inplace=True)
            elif thresh > 0:
                dataframe.dropna(axis=0, thresh=thresh)
        elif pipeline['options']['type'] == "fillmissing":
            strategy = pipeline['options']['strategy']
            imp = Imputer(missing_values='NaN', strategy=strategy, axis=0)
            array = imp.fit_transform(dataframe.values)
            dataframe = pandas.DataFrame(array, columns = dataframe.columns)

        return dataframe
    except Exception as e:
        raise Exception("data_handlemissing: " + str(e)) 
示例16
def test_conversion_boston(self):

        from sklearn.datasets import load_boston

        scikit_data = load_boston()

        sh = scikit_data.data.shape

        rn.seed(0)
        missing_value_indices = [
            (rn.randint(sh[0]), rn.randint(sh[1])) for k in range(sh[0])
        ]

        for strategy in ["mean", "median", "most_frequent"]:
            for missing_value in [0, "NaN", -999]:

                X = np.array(scikit_data.data).copy()

                for i, j in missing_value_indices:
                    X[i, j] = missing_value

                model = Imputer(missing_values=missing_value, strategy=strategy)
                model = model.fit(X)

                tr_X = model.transform(X.copy())

                spec = converter.convert(model, scikit_data.feature_names, "out")

                input_data = [dict(zip(scikit_data.feature_names, row)) for row in X]

                output_data = [{"out": row} for row in tr_X]

                result = evaluate_transformer(spec, input_data, output_data)

                assert result["num_errors"] == 0 
示例17
def test_conversion_bad_inputs(self):
        # Error on converting an untrained model
        with self.assertRaises(Exception):
            model = Imputer()
            spec = converter.convert(model, "data", "out")

        # Check the expected class during covnersion.
        with self.assertRaises(Exception):
            from sklearn.linear_model import LinearRegression

            model = LinearRegression()
            spec = converter.convert(model, "data", "out") 
示例18
def __init__(self):
        self.supports_output = True
        self.default_transformation_pipeline = [Imputer(strategy='mean'), StandardScaler()] 
示例19
def createAuto(target):
    win=13 # window size, how many previous values we take of the target (here 12 because the range goes from 1-12 without the 13)
    dataAuto = np.empty((len(target),win-1))
    for i in range(1,win):
        dataAuto[:,i-1] = shift2(target, i)
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
    transformedDataAuto = imp.fit_transform(dataAuto)           
    X_auto = transformedDataAuto
    return X_auto  
    
#parameters: 'X' the predictors, 'y' the target, 'cvFolds' number of folds, 'estimator' machine learning algorithm 
#returns: the R squared for each fold 
示例20
def fit(self, X, y=None):
        self.imp = Imputer(strategy=self.strategy)
        self.imp.fit(X)
        self.statistics_ = pd.Series(self.imp.statistics_, index=X.columns)
        return self 
示例21
def train_model(titanic_data_path, model_output_path):
    print('Loading the data...')
    try:
        with tf.gfile.Open(titanic_data_path, 'r') as data_file:
            train_df = pd.read_csv(data_file)
            print('Number of samples: {}'.format(train_df.shape[0]))

            target_name = 'Survived'
            feature_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']

            print('Preparing the features...')
            train_features = train_df[feature_names].copy()
            train_features['Age'] = Imputer().fit_transform(train_features['Age'].values.reshape(-1, 1))
            embarked = train_features['Embarked']
            train_features['Embarked'] = embarked.fillna(embarked.mode()[0])
            train_features = pd.get_dummies(train_features)
            train_target = train_df[target_name]

            print('Training the model...')
            parameters = {'max_depth': [2, 3, 4, 5, 6, 7], 'n_estimators': [50, 100, 150, 200]}
            gsc = GridSearchCV(GradientBoostingClassifier(), parameters, n_jobs=-1, cv=5)
            gsc.fit(train_features, train_target)
            print('Best Hyper Parameters: {}'.format(gsc.best_params_))
            print('Accuracy: {}'.format(gsc.best_score_))

            with tf.gfile.Open(model_output_path, 'wb') as model_file:
                joblib.dump(gsc.best_estimator_, model_file, protocol=1)
    except Exception as e:
        print('Error: {}'.format(e)) 
示例22
def test_model_imputer(self):
        model = Imputer(missing_values="NaN", strategy="mean", axis=0)
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)
        # The conversion works but internally scikit-learn converts
        # everything into float before looking into missing values.
        # There is no nan integer. The runtime is not tested
        # in this case.
        model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                     [("input", Int64TensorType([None, 2]))])
        self.assertTrue(model_onnx is not None) 
示例23
def test_imputer_int_inputs(self):
        model = Imputer(missing_values="NaN", strategy="mean", axis=0)
        data = [[1, 2], [np.nan, 3], [7, 6]]
        model.fit(data)
        model_onnx = convert_sklearn(model, "scikit-learn imputer",
                                     [("input", Int64TensorType([None, 2]))])
        self.assertEqual(len(model_onnx.graph.node), 2)

        # Last node should be Imputer
        outputs = model_onnx.graph.output
        self.assertEqual(len(outputs), 1)
        self.assertEqual(outputs[0].type.tensor_type.shape.dim[-1].dim_value,
                         2) 
示例24
def test_transform_series_int(self):
        arr = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
        s = pdml.ModelSeries(arr, index='a b c d e f g h i'.split(' '))

        # reshape arr to 2d
        arr = arr.reshape(-1, 1)

        if pd.compat.PY3:
            models = ['Binarizer', 'Imputer', 'StandardScaler']
            # MinMaxScalar raises TypeError in ufunc
        else:
            models = ['Binarizer', 'Imputer', 'StandardScaler', 'MinMaxScaler']

        for model in models:
            mod1 = getattr(s.preprocessing, model)()
            mod2 = getattr(pp, model)()
            s.fit(mod1)
            mod2.fit(arr)

            result = s.transform(mod1)
            expected = mod2.transform(arr).flatten()

            self.assertIsInstance(result, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(result.values, expected)

            mod1 = getattr(s.preprocessing, model)()
            mod2 = getattr(pp, model)()

            result = s.fit_transform(mod1)
            expected = mod2.fit_transform(arr).flatten()

            self.assertIsInstance(result, pdml.ModelSeries)
            self.assert_numpy_array_almost_equal(result.values, expected) 
示例25
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    #imputer = Imputer(strategy='mean', axis=0)
    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df 
示例26
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    df = df.dropna(axis=1, how='all')

    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)

    df = pd.DataFrame(mat, columns=df.columns)

    return df 
示例27
def impute_and_scale(df, scaling='std', imputing='mean', dropna='all'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """

    if dropna:
        df = df.dropna(axis=1, how=dropna)
    else:
        empty_cols = df.columns[df.notnull().sum() == 0]
        df[empty_cols] = 0

    if imputing is None or imputing.lower() == 'none':
        mat = df.values
    else:
        imputer = Imputer(strategy=imputing)
        mat = imputer.fit_transform(df)

    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)

    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()

    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)

    return df 
示例28
def impute_and_scale_array(mat, scaling=None):
    """ Impute missing values with mean and scale data included in numpy array.

        Parameters
        ----------
        mat : numpy array
            Array to scale
        scaling : string
            String describing type of scaling to apply.
            Options recognized: 'maxabs', 'minmax', 'std'.
            'maxabs' : scales data to range [-1 to 1].
            'minmax' : scales data to range [-1 to 1].
            'std'    : scales data to normal variable with mean 0 and standard deviation 1.
            (Default: None, no scaling).

        Return
        ----------
        Returns the numpy array imputed with the mean value of the \
        column and scaled by the method specified. If no scaling method is specified, \
        it returns the imputed numpy array.
    """
    
#    imputer = Imputer(strategy='mean', axis=0, copy=False)
#    imputer = SimpleImputer(strategy='mean', copy=False)
    # Next line is from conditional import. axis=0 is default
    # in old version so it is not necessary.
    imputer = Imputer(strategy='mean', copy=False)
    imputer.fit_transform(mat)
    
    return scale_array(mat, scaling) 
示例29
def test_permutation_test_score_allow_nans():
    # Check that permutation_test_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    permutation_test_score(p, X, y, cv=5) 
示例30
def test_cross_val_score_allow_nans():
    # Check that cross_val_score allows input data with NaNs
    X = np.arange(200, dtype=np.float64).reshape(10, -1)
    X[2, :] = np.nan
    y = np.repeat([0, 1], X.shape[0] / 2)
    p = Pipeline([
        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
        ('classifier', MockClassifier()),
    ])
    cross_val_score(p, X, y, cv=5)