python資料分析八_Na值的資料處理
阿新 • • 發佈:2021-07-02
# -*- coding: utf-8 -*-
"""Walkthrough of missing-value (NA) handling in pandas.

Demonstrates detection (isnull / notnull), filtering (dropna) and filling
(fillna / ffill) on Series and DataFrame objects.

The commented tables are sample output. Tables derived from
np.random.randn differ on every run, so only the NaN / filled positions in
them are meaningful.
"""
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Quick reference:
#   dropna()  - filter out missing values; `thresh` tunes how many non-NA
#               values a row/column must have to be kept
#   fillna()  - replace missing values with a default
#   isnull()  - True where a value is missing
#   notnull() - True where a value is present

# ---------------------------------------------------------------------------
# Detecting NA values in a Series
# ---------------------------------------------------------------------------
# dtype=object is explicit so that the integer fill value used further down
# is accepted even if pandas would otherwise infer a dedicated string dtype.
string_data = Series(['aa', 'bb', 'cc', np.nan], dtype=object)
print(string_data)
# 0     aa
# 1     bb
# 2     cc
# 3    NaN

print(string_data.isnull())
# 0    False
# 1    False
# 2    False
# 3     True
# dtype: bool

# Python's None is treated as NA as well.

# Boolean indexing with notnull() and dropna() give the same result.
print(string_data[string_data.notnull()])
print(string_data.dropna())
# 0    aa
# 1    bb
# 2    cc

print(string_data.fillna(1))
# 0    aa
# 1    bb
# 2    cc
# 3     1
# dtype: object

# ---------------------------------------------------------------------------
# Dropping rows / columns of a DataFrame
# ---------------------------------------------------------------------------
data = DataFrame([
    [1, 2, 3],
    [np.nan, 2, 3],
    [np.nan, np.nan, np.nan],
    [np.nan, 2, 3],
])
print(data)
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  2.0  3.0
# 2  NaN  NaN  NaN
# 3  NaN  2.0  3.0

# Drop every row that contains at least one NA.
print(data.dropna())
#      0    1    2
# 0  1.0  2.0  3.0

# Drop only the rows in which every value is NA.
print(data.dropna(how='all'))
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  2.0  3.0
# 3  NaN  2.0  3.0

# Add an all-NA column, then drop the columns in which every value is NA.
data[4] = np.nan
print(data)
#      0    1    2   4
# 0  1.0  2.0  3.0 NaN
# 1  NaN  2.0  3.0 NaN
# 2  NaN  NaN  NaN NaN
# 3  NaN  2.0  3.0 NaN

print(data.dropna(axis=1, how='all'))
#      0    1    2
# 0  1.0  2.0  3.0
# 1  NaN  2.0  3.0
# 2  NaN  NaN  NaN
# 3  NaN  2.0  3.0

# ---------------------------------------------------------------------------
# dropna with a threshold
# ---------------------------------------------------------------------------
data = DataFrame(np.random.randn(7, 3))
print(data)
#           0         1         2
# 0  0.329393 -0.849128  1.864103
# 1 -1.413807  0.461709  1.097039
# 2  0.191843  0.654831 -0.527389
# 3 -1.012322 -0.210649 -0.226674
# 4  0.864600  0.960556 -1.436670
# 5 -1.411272 -0.315413  0.575377
# 6 -0.819563 -0.893195  0.057541

# .loc slices by label and includes the end label, so `:4` covers rows 0-4.
# (.ix, used by older tutorials, was removed in pandas 1.0.)
data.loc[:4, 1] = np.nan
print(data)
#           0         1         2
# 0  0.051068       NaN  0.333383
# 1  0.793481       NaN -1.240897
# 2  0.705667       NaN  0.797441
# 3  0.088447       NaN  0.059333
# 4 -1.638566       NaN -0.853278
# 5  0.676200  0.151795 -1.329852
# 6 -0.849945  0.955315 -0.526976

data.loc[:2, 2] = np.nan
print(data)
#           0         1         2
# 0 -0.429629       NaN       NaN
# 1  0.071094       NaN       NaN
# 2  0.735083       NaN       NaN
# 3 -2.396363       NaN  0.236465
# 4 -2.817603       NaN -0.919750
# 5 -1.031900  0.941620  1.547814
# 6  1.290588  0.116789  0.375252

# thresh=3 keeps only rows with at least 3 non-NA values, so rows 0-4
# (which have one or two) are dropped and only rows 5 and 6 remain.
print(data.dropna(thresh=3))
#           0         1         2
# 5  0.559282  0.406619 -0.633786
# 6  0.061291 -0.586182  0.123497

# ---------------------------------------------------------------------------
# Filling missing data
# ---------------------------------------------------------------------------
print(data.fillna(0))
#           0         1         2
# 0 -0.621390  0.000000  0.000000
# 1 -1.483077  0.000000  0.000000
# 2 -0.948426  0.000000  0.000000
# 3  1.641440  0.000000  2.015218
# 4 -1.036951  0.000000  0.990668
# 5 -0.396387 -0.043747 -0.579406
# 6 -0.272858 -1.523178 -0.494554

# A dict gives a different fill value per column:
# column label 1 is filled with 0.5, column label 2 with -1.
print(data.fillna({1: 0.5, 2: -1}))
#           0         1         2
# 0 -0.857152  0.500000 -1.000000
# 1 -1.054372  0.500000 -1.000000
# 2  0.930643  0.500000 -1.000000
# 3 -1.130063  0.500000  0.240622
# 4 -0.623378  0.500000  0.524232
# 5  0.662496 -0.101754  0.170956
# 6  0.213570  0.864930 -0.383311

# inplace=True modifies the existing object instead of returning a new one.
data.fillna(0, inplace=True)
print(data)
#           0         1         2
# 0 -1.440240  0.000000  0.000000
# 1  0.100231  0.000000  0.000000
# 2 -0.660957  0.000000  0.000000
# 3  0.004898  0.000000 -1.313950
# 4  1.110324  0.000000 -0.276177
# 5  2.536283 -0.294194 -0.706136
# 6 -2.313634 -0.270051  0.295415

# Forward fill: the same interpolation methods that reindex offers.
df = DataFrame(np.random.randn(6, 3))
df.loc[2:, 1] = np.nan
df.loc[4:, 2] = np.nan
print(df)
#           0         1         2
# 0 -1.292891  0.977053 -1.339258
# 1 -0.981534  0.643460 -0.699660
# 2 -0.343731       NaN  0.812251
# 3  0.446141       NaN -0.824229
# 4  0.389609       NaN       NaN
# 5  0.716714       NaN       NaN

# ffill() propagates the last valid value downwards; it replaces the
# deprecated fillna(method='ffill') spelling.
print(df.ffill())
#           0         1         2
# 0 -0.672379  0.088150 -0.765589
# 1  0.225561  1.370398 -1.211027
# 2  0.379040  1.370398 -1.009322
# 3 -0.388188  1.370398 -0.986014
# 4  0.387574  1.370398 -0.986014
# 5 -0.196254  1.370398 -0.986014

# limit=2 fills at most two consecutive NA values per gap.
print(df.ffill(limit=2))
#           0         1         2
# 0  0.816393  0.031747  1.354395
# 1 -0.940994  0.093215  0.837312
# 2 -0.163731  0.093215  0.349830
# 3  0.268226  0.093215 -0.762212
# 4 -2.650622       NaN -0.762212
# 5 -1.195725       NaN -0.762212

# Fill NA values with the mean of the non-NA values.
data = Series([1, np.nan, 2, np.nan, 3])
print(data.fillna(data.mean()))
# 0    1.0
# 1    2.0
# 2    2.0
# 3    2.0
# 4    3.0
# dtype: float64

# fillna parameter summary:
#   value   - scalar (or per-column dict) used to replace missing values
#   method  - fill strategy such as 'ffill' (propagate the previous value
#             forward); no method is applied unless one is requested, and
#             newer pandas prefers the dedicated .ffill() / .bfill() methods
#   axis    - axis along which to fill
#   inplace - modify the object itself instead of producing a copy
#   limit   - maximum number of consecutive values to fill