1. 程式人生 > >pandas:填充缺失值 fillna("missing") 和fillna("missing",inplace=True)的區別

pandas:填充缺失值 fillna("missing") 和fillna("missing",inplace=True)的區別

當資料中存在NaN缺失值時,我們可以用其他數值替代NaN,主要用到了DataFrame.fillna()方法,下面我們來看看具體的用法:

1.先來建立一個帶有缺失值的DataFrame:

# coding=utf-8
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv
from pandas import DataFrame

# Demo: build a 5x3 DataFrame of random values, blank out some cells with NaN,
# then show several ways of filling them. None of the calls below modify `df`;
# each returns a new, filled DataFrame.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('abcde'),
    columns=['one', 'two', 'three'],
)
print(df)  # original data
print('\n')

# Positional (0-based) indexing via .iloc; slice end points are exclusive.
df.iloc[1, :-1] = np.nan   # row 1, every column except the last -> NaN
df.iloc[1:-1, 2] = np.nan  # rows 1 up to (not including) the last, column 2 -> NaN
print(df)  # data that now contains missing values
print('\n')

print(df.fillna("missing"))  # returns a filled copy
print('\n')
print(df)  # df itself still holds the NaN values
print('\n')

print(df.ffill())  # forward fill: replace NaN with the previous row's value
print('\n')
# Backward fill (opposite of forward fill); limit=1 fills at most one
# consecutive NaN per gap.
print(df.bfill(limit=1))
print('\n')

print(df.fillna(df.mean()))  # fill with each column's mean (or any other statistic)
print('\n')
# Restrict the fill to selected columns: the label slice 'one':'two' is
# inclusive, so only columns 'one' and 'two' get their mean filled in.
print(df.fillna(df.mean()['one':'two']))

結果:
        one       two     three
a  0.348287 -0.579763 -0.687073
b -0.128967  1.734375 -1.530778
c  0.448428 -0.791999  0.620952
d  0.808736 -0.554402 -0.094709
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b       NaN       NaN       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402       NaN
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b   missing   missing   missing
c  0.448428 -0.791999   missing
d  0.808736 -0.554402   missing
e   1.55316  -1.33636 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b       NaN       NaN       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402       NaN
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.348287 -0.579763 -0.687073
c  0.448428 -0.791999 -0.687073
d  0.808736 -0.554402 -0.687073
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.448428 -0.791999       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402 -0.159426
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.789653 -0.815631 -0.423249
c  0.448428 -0.791999 -0.423249
d  0.808736 -0.554402 -0.423249
e  1.553160 -1.336362 -0.159426


        one       two     three
a  0.348287 -0.579763 -0.687073
b  0.789653 -0.815631       NaN
c  0.448428 -0.791999       NaN
d  0.808736 -0.554402       NaN
e  1.553160 -1.336362 -0.159426
2. fillna("missing") 和 fillna("missing",inplace=True) 的區別:
fillna("missing") 會回傳一個填充後的新 DataFrame,原本的 df 不會改變;加上 inplace=True 則會直接修改 df 本身(此時回傳值為 None)。
# coding=utf-8
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import csv
from pandas import DataFrame

# Demo: contrast fillna(...) (returns a filled copy, df unchanged) with
# fillna(..., inplace=True) (modifies df itself).
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('abcde'),
    columns=['one', 'two', 'three'],
)
print(df)  # original data
print('\n')

# Positional (0-based) indexing via .iloc; slice end points are exclusive.
df.iloc[1, :-1] = np.nan   # row 1, every column except the last -> NaN
df.iloc[1:-1, 2] = np.nan  # rows 1 up to (not including) the last, column 2 -> NaN
print(df)  # data that now contains missing values
print('\n')

print(df.fillna("missing"))  # prints a filled copy; df is not changed
print('\n')
print(df)  # df still contains NaN
print('\n')

df.fillna("missing", inplace=True)  # fills df itself in place
print(df)  # df now shows "missing" where the NaN values were
print("----------")

結果:
        one       two     three
a  0.428457 -0.797473 -0.448647
b -1.744598 -0.944395  0.952140
c  1.096071  0.812616  1.980379
d -1.120961  1.193119  0.455609
e  1.039164 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b       NaN       NaN       NaN
c  1.096071  0.812616       NaN
d -1.120961  1.193119       NaN
e  1.039164 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b   missing   missing   missing
c   1.09607  0.812616   missing
d  -1.12096   1.19312   missing
e   1.03916 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b       NaN       NaN       NaN
c  1.096071  0.812616       NaN
d -1.120961  1.193119       NaN
e  1.039164 -0.384459  0.289628


        one       two     three
a  0.428457 -0.797473 -0.448647
b   missing   missing   missing
c   1.09607  0.812616   missing
d  -1.12096   1.19312   missing
e   1.03916 -0.384459  0.289628
----------