機器學習sklearn(五): 資料處理(二)缺失值處理
來源 https://www.cnblogs.com/B-Hanan/articles/12774433.html
1 單變數缺失
import numpy as np from sklearn.impute import SimpleImputer
class SimpleImputer(_BaseImputer)
Imputation transformer for completing missing values.
missing_values(缺失值型別) : number, string, np.nan (default) or None
The placeholder for the missing values. All occurrences of missing_values
will be imputed.
strategy : string, default='mean'
The imputation strategy.
If "mean", then replace missing values using the mean along each column. Can only be used with numeric data.
If "median", then replace missing values using the median along each column. Can only be used with numeric data.
If "most_frequent", then replace missing using the most frequent value along each column. Can be used with strings or numeric data.
If "constant", then replace missing values with fill_value. Can be used with strings or numeric data. (New in version 0.20: strategy="constant" for fixed value imputation.)
fill_value : string or numerical value, default=None
When strategy == "constant", fill_value is used to replace all occurrences of missing_values. If left to the default, fill_value will be 0 when imputing numerical data and "missing_value" for strings or object data types.
imp=SimpleImputer(missing_values=np.nan,strategy='mean') imp.fit([[1,2],[np.nan,3],[7,6]])
SimpleImputer(add_indicator=False, copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0)
##SimpleImputer類支援稀疏矩陣 import scipy.sparse as sp X=sp.csc_matrix([[1,2],[0,-1],[8,4]]) imp=SimpleImputer(missing_values=-1,strategy='mean') imp.fit(X)
SimpleImputer(add_indicator=False, copy=True, fill_value=None, missing_values=-1, strategy='mean', verbose=0)
X_test=sp.csc_matrix([[-1,2],[6,-1],[7,6]]) print(imp.transform(X_test))
(0, 0) 3.0 (1, 0) 6.0 (2, 0) 7.0 (0, 1) 2.0 (1, 1) 3.0 (2, 1) 6.0
print(imp.transform(X_test).toarray()) [[3. 2.] [6. 3.] [7. 6.]] import pandas as pd df=pd.DataFrame([['a','x'], [np.nan,'y'], ['a',np.nan], ['b','y']],dtype='category') df
|   | 0 | 1 |
|---|---|---|
| 0 | a | x |
| 1 | NaN | y |
| 2 | a | NaN |
| 3 | b | y |
imp=SimpleImputer(strategy='most_frequent') print(imp.fit_transform(df))
[['a' 'x'] ['a' 'y'] ['a' 'y'] ['b' 'y']]
2 多元特徵估計
import numpy as np from sklearn.experimental import enable_iterative_imputer from sklearn.impute import IterativeImputer
imp = IterativeImputer(max_iter=10, random_state=0)
X_train = [[1, 2], [3, 6], [4, 8], [np.nan, 3], [7, np.nan]]
imp.fit(X_train)
IterativeImputer(add_indicator=False, estimator=None, imputation_order='ascending', initial_strategy='mean', max_iter=10, max_value=None, min_value=None, missing_values=nan, n_nearest_features=None, random_state=0, sample_posterior=False, skip_complete=False, tol=0.001, verbose=0)
imp.transform(X_train)
array([[ 1. , 2. ], [ 3. , 6. ], [ 4. , 8. ], [ 1.50004509, 3. ], [ 7. , 14.00004135]])
X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]] print(imp.transform(X_test))
[[ 1.00007297 2. ] [ 6. 12.00002754] [ 2.99996145 6. ]]
3 K-近鄰法
from sklearn.impute import KNNImputer
Imputation for completing missing values using k-Nearest Neighbors.
Each sample's missing values are imputed using the mean value from n_neighbors
nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close.
missing_values : number, string, np.nan or None, default=np.nan
The placeholder for the missing values. All occurrences of missing_values
will be imputed.
n_neighbors : int, default=5 Number of neighboring samples to use for imputation.
weights : {'uniform', 'distance'} or callable, default='uniform' Weight function used in prediction.
import numpy as np
from sklearn.impute import KNNImputer
nan = np.nan
X = [[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
imputer = KNNImputer(n_neighbors=2, weights="uniform")
[[1, 2, nan], [3, 4, 3], [nan, 6, 5], [8, 8, 7]]
imputer.fit_transform(X)
array([[1. , 2. , 4. ],
[3. , 4. , 3. ],
[5.5, 6. , 5. ],
[8. , 8. , 7. ]])
4 標記推算值
from sklearn.impute import MissingIndicator
class MissingIndicator(sklearn.base.TransformerMixin, sklearn.base.BaseEstimator)
Binary indicators for missing values (缺失值的二進位指示符).
MissingIndicator(missing_values=nan, features='missing-only', sparse='auto', error_on_new=True)
X = np.array([[-1, -1, 1, 3],
[4, -1, 0, -1],
[8, -1, 1, 0]])
indicator = MissingIndicator(missing_values=-1)
mask_missing_values_only = indicator.fit_transform(X)
array([[ True, True, False],
[False, True, True],
[False, True, False]])
indicator.features_
array([0, 1, 3])
indicator = MissingIndicator(missing_values=-1, features="all")
mask_all = indicator.fit_transform(X)
array([[ True, True, False, False],
[False, True, False, True],
[False, True, False, False]])
indicator.features_
array([0, 1, 2, 3])