Python 填補缺失值 Pandas SimpleImputer 隨機森林模型 (機器學習)
阿新 • • 發佈:2021-02-12
技術標籤:機器學習、AI找不到女朋友系列、機器學習、python、大資料、人工智慧、深度學習
文章目錄
填補缺失值
1、匯入相關庫
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
2、建立資料
# Synthetic data set: 10000 samples with 5 integer-valued features drawn from
# [1, 100) and one integer label per sample drawn from [1, 10).
x = np.random.randint(1, 100, (10000, 5))
y = np.random.randint(1, 10, 10000)

# 20 random row positions (all within the first 1000 rows) and 20 random
# column positions; both arrays may contain duplicates.
rows = np.random.randint(0, 1000, 20)
cols = np.random.randint(0, 5, 20)

# Build the frame as float so NaN can be stored directly: assigning NaN into an
# integer column relies on an implicit upcast that recent pandas versions
# deprecate.
x = pd.DataFrame(x, dtype=float)

# With two positional arrays, iloc addresses every (row, column) combination,
# so each selected row loses its value in each selected column.
x.iloc[rows, cols] = np.nan
3、利用Pandas填補資料
# Mean imputation with pandas: work on a copy so the original frame keeps its
# missing values for the other approaches.
x1 = x.copy()
for i in x1.columns:
    # Fill only column i, using column i's own mean (mean() skips NaN).
    # Masking the whole frame here would overwrite every column's gaps with
    # the mean of the first column visited.
    x1[i] = x1[i].fillna(x1[i].mean())
# Per-column count of remaining missing values (expected to be all zeros).
x1.isnull().sum()
4、sklearn庫填補
from sklearn.impute import SimpleImputer

# Constant imputation with scikit-learn: every NaN is replaced by 0.
sim = SimpleImputer(
    strategy='constant',
    fill_value=0,
    missing_values=np.nan,
)
# fit_transform returns a NumPy array, so x2 is no longer a DataFrame;
# the input is a copy, leaving the original frame untouched.
x2 = sim.fit_transform(x.copy())
# Wrap the array in a DataFrame again to count remaining missing values
# per column.
pd.DataFrame(x2).isnull().sum()
5、利用模型預測
from sklearn.ensemble import RandomForestRegressor

# Model-based imputation: each feature column with gaps is predicted by a
# random forest trained on the remaining columns plus the original label y.
x3 = x.copy()
# Column positions ordered by ascending number of missing values, so the
# columns with the fewest gaps are filled first.
sortindex = np.argsort(x3.isnull().sum(axis=0)).values
for i in sortindex:
    # Build the new feature matrix and the new label: column i is the target.
    df = x3
    fillc = df.iloc[:, i]
    missing = fillc.isnull().to_numpy()
    if not missing.any():
        # Nothing to impute in this column; predicting on an empty test set
        # would raise, so skip it.
        continue
    df = pd.concat([df.iloc[:, df.columns != i], pd.DataFrame(y)], axis=1)
    # In the new feature matrix, fill the columns that still contain missing
    # values with 0.
    df_0 = SimpleImputer(missing_values=np.nan,
                         strategy='constant',
                         fill_value=0).fit_transform(df)
    # Split into training rows (target known) and test rows (target missing).
    # Boolean masks select rows by position, so this does not depend on the
    # frame's index labels matching array positions.
    y_train = fillc[~missing]
    y_test = fillc[missing]
    x_train = df_0[~missing, :]
    x_test = df_0[missing, :]
    clf = RandomForestRegressor(n_estimators=100)
    clf = clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Write the predicted values back into the feature matrix being filled.
    x3.loc[missing, i] = y_pred
# Per-column count of remaining missing values (expected to be all zeros).
x3.isnull().sum()