pandas學習筆記 - 常見的數據處理方式
阿新 • • 發佈:2018-10-16
levels index 3.5 mis 功能 frame excel inner 連接方式
1.缺失值處理 - 拉格朗日插值法
input_file數據文件內容(存在部分缺失值):
from scipy.interpolate import lagrange
import pandas as pd

input_file = './data/catering_sale.xls'
output_file = './data/sales.xls'


def ployinterp_column(s, n, k=5):
    """Estimate the value at position ``n`` of ``s`` by Lagrange interpolation.

    Up to ``k`` neighbours on each side of ``n`` are used. The window is
    clipped to the bounds of ``s`` so that positions near either end never
    request rows that do not exist, and the polynomial is always evaluated
    at ``n`` itself.

    Args:
        s: pandas Series holding the column to interpolate.
        n: Integer position (0-based) of the value to estimate.
        k: Number of neighbours taken before and after ``n``. Defaults to 5.

    Returns:
        The interpolated value at ``n``, or NaN when none of the neighbours
        in the window holds a known value.
    """
    start = max(n - k, 0)
    stop = min(n + k + 1, len(s))
    positions = [i for i in range(start, stop) if i != n]

    window = s.iloc[positions]
    present = window.notnull().tolist()
    # Keep only the neighbours that actually carry a value.
    xs = [pos for pos, ok in zip(positions, present) if ok]
    ys = [float(val) for val, ok in zip(window.tolist(), present) if ok]

    if not xs:
        # Nothing to fit a polynomial through; leave the value missing.
        return float('nan')
    return lagrange(xs, ys)(n)


def main():
    """Clean the sales column of ``input_file`` and write it to ``output_file``."""
    data = pd.read_excel(input_file)

    # Sales below 400 or above 5000 are treated as outliers and blanked out.
    # A single .loc assignment is used instead of chained indexing so the
    # write is guaranteed to reach ``data`` itself.
    sales = data['銷量']
    data.loc[(sales < 400) | (sales > 5000), '銷量'] = None

    column = data.columns.get_loc('銷量')
    missing = [j for j, flag in enumerate(data['銷量'].isnull()) if flag]
    # Fill one gap at a time, in order, so later gaps can use earlier fills.
    for j in missing:
        data.iloc[j, column] = ployinterp_column(data['銷量'], j)

    data.to_excel(output_file)


if __name__ == '__main__':
    main()
output_file結果:
# np.where()
import numpy as np  # first use of NumPy in these notes

a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f', 'e', 'd', 'c', 'b', 'a'])

# Where a is missing take the value of b at the same position,
# otherwise keep the original element of a.
print(np.where(pd.isnull(a), b, a))
# result
# [ 0.   2.5  2.   3.5  4.5  5. ]
# df.combine_first()
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
                    'b': [np.nan, 2., np.nan, 6.],
                    'c': range(2, 18, 4)})
df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
                    'b': [np.nan, 3., 4., 6., 8.]})

# Missing values in df1 are filled with the df2 element at the same
# position; where df1 already has a value, that value is kept.
df1.combine_first(df2)
# result
#      a    b     c
# 0  1.0  NaN   2.0
# 1  4.0  2.0   6.0
# 2  5.0  4.0  10.0
# 3  3.0  6.0  14.0
# 4  7.0  8.0   NaN
2.數據合並:
# pd.merge() # 使用列或者索引,以類似數據庫連接的方式合並多個DataFrame對象 df1 = pd.DataFrame({‘key‘: [‘b‘, ‘b‘, ‘a‘, ‘c‘, ‘a‘, ‘a‘, ‘b‘], ‘data1‘: range(7)}) df2 = pd.DataFrame({‘key‘: [‘a‘, ‘b‘, ‘d‘], ‘data2‘: range(3)}) print(pd.merge(df1, df2)) # 自動匹配合並列, 默認內連接 print(pd.merge(df1, df2, on=‘key‘)) # 顯式指定
# result
data1 key data2
0 0 b 1
1 1 b 1
2 6 b 1
3 2 a 0
4 4 a 0
5 5 a 0
df3 = pd.DataFrame({‘lkey‘: [‘b‘, ‘b‘, ‘a‘, ‘c‘, ‘a‘, ‘a‘, ‘b‘], ‘data1‘: range(7)}) df4 = pd.DataFrame({‘rkey‘: [‘a‘, ‘b‘, ‘d‘], ‘data2‘: range(3)}) print(pd.merge(df3, df4, left_on=‘lkey‘, right_on=‘rkey‘)) # 當不存在相同column時,需要分別指定連接列名
# result
data1 lkey data2 rkey
0 0 b 1 b
1 1 b 1 b
2 6 b 1 b
3 2 a 0 a
4 4 a 0 a
5 5 a 0 a
## 指定連接方式 # 外連接 print(pd.merge(df1, df2, how=‘outer‘)) # result data1 key data2 0 0.0 b 1.0 1 1.0 b 1.0 2 6.0 b 1.0 3 2.0 a 0.0 4 4.0 a 0.0 5 5.0 a 0.0 6 3.0 c NaN 7 NaN d 2.0
# 左連接 df1 = pd.DataFrame({‘key‘: [‘b‘, ‘b‘, ‘a‘, ‘c‘, ‘a‘, ‘b‘], ‘data1‘: range(6)}) df2 = pd.DataFrame({‘key‘: [‘a‘, ‘b‘, ‘a‘, ‘b‘, ‘d‘] ,‘data2‘: range(5)}) print(pd.merge(df1, df2, how=‘left‘)) # result data1 key data2 0 0 b 1.0 1 0 b 3.0 2 1 b 1.0 3 1 b 3.0 4 2 a 0.0 5 2 a 2.0 6 3 c NaN 7 4 a 0.0 8 4 a 2.0 9 5 b 1.0 10 5 b 3.0
# 多列連接 left = pd.DataFrame({‘key1‘: [‘foo‘, ‘foo‘, ‘bar‘], ‘key2‘: [‘one‘, ‘two‘, ‘one‘], ‘lval‘: [1, 2, 3]}) right = pd.DataFrame({‘key1‘: [‘foo‘, ‘foo‘, ‘bar‘, ‘bar‘], ‘key2‘: [‘one‘, ‘one‘, ‘one‘, ‘two‘], ‘rval‘: [4, 5, 6, 7]}) print(pd.merge(left, right, on=[‘key1‘, ‘key2‘])) # 默認內連接 # result key1 key2 lval rval 0 foo one 1 4 1 foo one 1 5 2 bar one 3 6 print(pd.merge(left, right, on=[‘key1‘, ‘key2‘], how=‘outer‘)) # 外連接 # result key1 key2 lval rval 0 foo one 1.0 4.0 1 foo one 1.0 5.0 2 foo two 2.0 NaN 3 bar one 3.0 6.0 4 bar two NaN 7.0
# 只以其中一個列連接,會出現冗余列 pd.merge(left, right, on=‘key1‘) # result key1 key2_x lval key2_y rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7 print(pd.merge(left, right, on=‘key1‘, suffixes=(‘_left‘, ‘_right‘))) # 給冗余列增加後綴 # result key1 key2_left lval key2_right rval 0 foo one 1 one 4 1 foo one 1 one 5 2 foo two 2 one 4 3 foo two 2 one 5 4 bar one 3 one 6 5 bar one 3 two 7
# 使用索引與列進行合並 left1 = pd.DataFrame({‘key‘: [‘a‘, ‘b‘, ‘a‘, ‘a‘, ‘b‘, ‘c‘],‘value‘: range(6)}) right1 = pd.DataFrame({‘group_val‘: [3.5, 7]}, index=[‘a‘, ‘b‘]) print(pd.merge(left1, right1, left_on=‘key‘, right_index=True)) # left1使用key列連接,right1使用index列連接 # result key value group_val 0 a 0 3.5 2 a 2 3.5 3 a 3 3.5 1 b 1 7.0 4 b 4 7.0
# 多列索引連接 lefth = pd.DataFrame({‘key1‘: [‘Ohio‘, ‘Ohio‘, ‘Ohio‘, ‘Nevada‘, ‘Nevada‘], ‘key2‘: [2000, 2001, 2002, 2001, 2002], ‘data‘: np.arange(5.)}) righth = pd.DataFrame(np.arange(12).reshape((6, 2)), index=[[‘Nevada‘, ‘Nevada‘, ‘Ohio‘, ‘Ohio‘, ‘Ohio‘, ‘Ohio‘], [2001, 2000, 2000, 2000, 2001, 2002]], columns=[‘event1‘, ‘event2‘]) print(pd.merge(lefth, righth, left_on=[‘key1‘, ‘key2‘], right_index=True)) # result
data key1 key2 event1 event2
0 0.0 Ohio 2000 4 5
0 0.0 Ohio 2000 6 7
1 1.0 Ohio 2001 8 9
2 2.0 Ohio 2002 10 11
3 3.0 Nevada 2001 0 1
# DataFrame.join()
# DataFrame.join()可以使用index或key合並兩個及以上的DataFrame(列方向上的合並)
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=[‘a‘, ‘c‘, ‘e‘], columns=[‘Ohio‘, ‘Nevada‘]) right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]], index=[‘b‘, ‘c‘, ‘d‘, ‘e‘], columns=[‘Missouri‘, ‘Alabama‘]) print(left2.join(right2, how=‘outer‘)) # result Ohio Nevada Missouri Alabama a 1.0 2.0 NaN NaN b NaN NaN 7.0 8.0 c 3.0 4.0 9.0 10.0 d NaN NaN 11.0 12.0 e 5.0 6.0 13.0 14.0
# 合並多個DataFrame another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], index=[‘a‘, ‘c‘, ‘e‘, ‘f‘], columns=[‘New York‘, ‘Oregon‘]) left2.join([right2, another], how=‘outer‘) # result Ohio Nevada Missouri Alabama New York Oregon a 1.0 2.0 NaN NaN 7.0 8.0 b NaN NaN 7.0 8.0 NaN NaN c 3.0 4.0 9.0 10.0 9.0 10.0 d NaN NaN 11.0 12.0 NaN NaN e 5.0 6.0 13.0 14.0 11.0 12.0 f NaN NaN NaN NaN 16.0 17.0
# 軸向連接 import numpy as np # np.concatenate() arr = np.arange(12).reshape((3,4)) print(np.concatenate([arr, arr], axis=1)) # 在column方向上連接 # result array([[ 0, 1, 2, ..., 1, 2, 3], [ 4, 5, 6, ..., 5, 6, 7], [ 8, 9, 10, ..., 9, 10, 11]])
# pd.concat()
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

print(pd.concat([s1, s2, s3]))  # axis defaults to 0, i.e. along rows
# result
# a    0
# b    1
# c    2
# d    3
# e    4
# f    5
# g    6
# dtype: int64

print(pd.concat([s1, s2, s3], axis=1))  # along columns; absent values become NaN
# result
#      0    1    2
# a  0.0  NaN  NaN
# b  1.0  NaN  NaN
# c  NaN  2.0  NaN
# d  NaN  3.0  NaN
# e  NaN  4.0  NaN
# f  NaN  NaN  5.0
# g  NaN  NaN  6.0

s4 = pd.concat([s1 * 5, s3])
s5 = pd.concat([s1, s4], axis=1)
s5.columns = ['s1', 's4']
print(s5)
# result
#     s1  s4
# a  0.0   0
# b  1.0   5
# f  NaN   5
# g  NaN   6

print(pd.concat([s1, s4], axis=1, join='inner'))  # join= selects the join type
# result
#    0  1
# a  0  0
# b  1  5

# Pick the index labels to keep by hand. concat() no longer accepts the
# join_axes keyword, so the concatenated frame is reindexed instead.
print(pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e']))
# result
#      0    1
# a  0.0  0.0
# c  NaN  NaN
# b  1.0  5.0
# e  NaN  NaN
# 使用keys參數對索引進行分級 result = pd.concat([s1, s2, s3], keys=[‘one‘, ‘two‘, ‘three‘]) # 在row方向合並時,keys對應每個Series的一級index,每個Series原有的index則作為二級index print(result) # result one a 0 b 1 two c 2 d 3 e 4 three f 5 g 6 dtype: int64
# Series.unstack() 將Seris格式轉換為DataFrame格式 print(result.unstack()) # 一級索引將作為index,二級索引作為columns # result a b c d e f g one 0.0 1.0 NaN NaN NaN NaN NaN two NaN NaN 2.0 3.0 4.0 NaN NaN three NaN NaN NaN NaN NaN 5.0 6.0
# 在列合並時使用keys參數指定column名稱 print(pd.concat([s1, s2, s3], axis=1, keys=[‘one‘, ‘two‘, ‘three‘])) # 在column方向合並時,keys對應每個合並的Series的column # result one two three a 0.0 NaN NaN b 1.0 NaN NaN c NaN 2.0 NaN d NaN 3.0 NaN e NaN 4.0 NaN f NaN NaN 5.0 g NaN NaN 6.0
# 指定分級column df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=[‘a‘, ‘b‘, ‘c‘], columns=[‘one‘, ‘two‘]) df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=[‘a‘, ‘c‘], columns=[‘three‘, ‘four‘])
# 因為DataFrame對象已經有了column,所以keys參數會設置新的一級column, df原有的column則作為二級column df3 = pd.concat([df1, df2], axis=1, keys=[‘level1‘, ‘level2‘]) print(df3) print(df3.columns) # result level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0 MultiIndex(levels=[[‘level1‘, ‘level2‘], [‘four‘, ‘one‘, ‘three‘, ‘two‘]], labels=[[0, 0, 1, 1], [1, 3, 2, 0]]) # 使用字典實現相同的功能 print(pd.concat({‘level1‘: df1, ‘level2‘: df2}, axis=1)) #result level1 level2 one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0 # 指定分級column名稱 df = pd.concat([df1, df2], axis=1, keys=[‘level1‘, ‘level2‘], names=[‘levels‘, ‘number‘]) print(df) print(df.columns) # result levels level1 level2 number one two three four a 0 1 5.0 6.0 b 2 3 NaN NaN c 4 5 7.0 8.0 MultiIndex(levels=[[‘level1‘, ‘level2‘], [‘four‘, ‘one‘, ‘three‘, ‘two‘]], labels=[[0, 0, 1, 1], [1, 3, 2, 0]], names=[‘levels‘, ‘number‘])
# ignore_index df1 = pd.DataFrame(np.random.randn(3, 4), columns=[‘a‘, ‘b‘, ‘c‘, ‘d‘]) df2 = pd.DataFrame(np.random.randn(2, 3), columns=[‘b‘, ‘d‘, ‘a‘]) # row方向忽略索引 print(pd.concat([df1, df2], ignore_index=True)) # result a b c d 0 1.261208 0.022188 -2.489475 -1.098245 1 0.618618 -1.179827 1.475738 0.334444 2 -0.319088 -0.153492 0.029245 0.336055 3 -0.999023 -0.502154 NaN 0.722256 4 1.428007 -0.726810 NaN 0.432440 # column方向忽略列名 print(pd.concat([df1, df2], axis=1, ignore_index=True)) # result 0 1 2 3 4 5 6 0 1.261208 0.022188 -2.489475 -1.098245 -0.502154 0.722256 -0.999023 1 0.618618 -1.179827 1.475738 0.334444 -0.726810 0.432440 1.428007 2 -0.319088 -0.153492 0.029245 0.336055 NaN NaN NaN
3.重塑層次化索引
data = pd.DataFrame(np.arange(6).reshape((2, 3)), index=pd.Index([‘Ohio‘, ‘Colorado‘], name=‘state‘), columns=pd.Index([‘one‘, ‘two‘, ‘three‘], name=‘number‘)) # 軸向旋轉 result = data.stack() print(result) # result state number Ohio one 0 two 1 three 2 Colorado one 3 two 4 three 5 # 還原操作 print(result.unstack()) # result number one two three state Ohio 0 1 2 Colorado 3 4 5 # 行列轉置 print(result.unstack(0)) # result state Ohio Colorado number one 0 3 two 1 4 three 2 5 # 指定要轉置的索引名 print(result.unstack(‘number‘)) # result number one two three state Ohio 0 1 2 Colorado 3 4 5
# 例1:
s1 = pd.Series([0, 1, 2, 3], index=[‘a‘, ‘b‘, ‘c‘, ‘d‘]) s2 = pd.Series([4, 5, 6], index=[‘c‘, ‘d‘, ‘e‘]) data2 = pd.concat([s1, s2], keys=[‘one‘, ‘two‘]) print(data2.unstack()) # result a b c d e one 0.0 1.0 2.0 3.0 NaN two NaN NaN 4.0 5.0 6.0 print(data2.unstack().stack()) # result one a 0.0 b 1.0 c 2.0 d 3.0 two c 4.0 d 5.0 e 6.0 dtype: float64 # 不dropnan值 print(data2.unstack().stack(dropna=False)) # result one a 0.0 b 1.0 c 2.0 d 3.0 e NaN two a NaN b NaN c 4.0 d 5.0 e 6.0 dtype: float64
# 例2:
df = pd.DataFrame({‘left‘: result, ‘right‘: result + 5}, columns=pd.Index([‘left‘, ‘right‘], name=‘side‘)) print(df.unstack(‘state‘)) # result side left right state Ohio Colorado Ohio Colorado number one 0 3 5 8 two 1 4 6 9 three 2 5 7 10 print(df.unstack(‘state‘).stack(‘side‘)) # result state Colorado Ohio number side one left 3 0 right 8 5 two left 4 1 right 9 6 three left 5 2 right 10 7
# 未完待續。。。 有點多
pandas學習筆記 - 常見的數據處理方式