1. 程式人生 > >Python-dataframe合併(merge函式)

Python-dataframe合併(merge函式)

import pandas as pd
import numpy as np
# Two frames that share a "key" column: df1 repeats its keys, df2 lists
# each key once.
df1 = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "a", "b"],
        "data1": range(7),
    }
)
df2 = pd.DataFrame(
    {
        "key": ["a", "b", "d"],
        "data2": range(3),
    }
)

# With no join key given, merge uses the overlapping column names as keys.
pd.merge(df1, df2)
# The same join with the key named explicitly, ordered by key.
pd.merge(df1, df2, on="key").sort_values(by="key")

# The same data again, but the key column has a different name on each side.
df3 = pd.DataFrame(
    {
        "lkey": ["b", "b", "a", "c", "a", "a", "b"],
        "data1": range(7),
    }
)
df4 = pd.DataFrame(
    {
        "rkey": ["a", "b", "d"],
        "data2": range(3),
    }
)

# When the key columns are named differently, name each side separately;
# the join is still an inner join by default.
pd.merge(df3, df4, left_on="lkey", right_on="rkey")

# Outer join: key values that are not present in both frames produce NaN.
pd.merge(df1, df2, how="outer")

多對多

# Many-to-many case: both frames contain repeated "a" and "b" keys.
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data1": range(6)})
df2 = pd.DataFrame({"key": ["a", "b", "a", "b", "d"], "data2": range(5)})

# how="left" keeps every row of df1, so every data1 value appears.
# A many-to-many join yields the Cartesian product per key: df1 has three
# "b" rows and df2 has two, so the result has six "b" rows.
pd.merge(df1, df2, how="left")

# Inner join: only keys present in both frames are kept.
pd.merge(df1, df2, how="inner")

根據多個列合併

# Frames whose rows are identified by the pair (key1, key2).
left = pd.DataFrame(
    {
        "key1": ["foo", "foo", "bar"],
        "key2": ["one", "two", "one"],
        "lval": [1, 2, 3],
    }
)
right = pd.DataFrame(
    {
        "key1": ["foo", "foo", "bar", "bar"],
        "key2": ["one", "one", "one", "two"],
        "rval": [4, 5, 6, 7],
    }
)

# Pass a list of column names to join on several keys at once.
pd.merge(left, right, on=["key1", "key2"], how="outer")
pd.merge(left, right, on=["key1", "key2"])

重複列名處理

# Joining on key1 alone leaves key2 as a column name that exists in both
# frames, so it shows up twice in the result.
pd.merge(left, right, on="key1")

# suffixes sets the text appended to the overlapping column names from the
# left and right frames respectively.
pd.merge(left, right, on="key1", suffixes=("_left", "_right"))

 索引上的合併 (列名上無重複,index上有重複)

# left1 carries its keys in a column; right1 carries them in its index.
left1 = pd.DataFrame(
    {
        "key": ["a", "b", "a", "a", "b", "c"],
        "values": range(6),
    }
)
right1 = pd.DataFrame({"group_val": [3.5, 7]}, index=["a", "b"])

# Join left1's "key" column against right1's index.
pd.merge(left1, right1, left_on="key", right_index=True)

# Two frames that are both keyed by their index.
left2 = pd.DataFrame(
    [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
    index=["a", "c", "e"],
    columns=["ohio", "nevada"],
)
right2 = pd.DataFrame(
    [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14]],
    index=["b", "c", "d", "e"],
    columns=["missouri", "alabama"],
)

# Use the index as the join key on both sides.
pd.merge(left2, right2, how="outer", left_index=True, right_index=True)

join方法

# DataFrame.join merges on the index directly; here as an outer join.
left2.join(right2, how="outer")
# Match left1's "key" column against right1's index. join performs a left
# join by default (not an outer join), so every row of left1 is kept; the
# default is spelled out here for clarity.
left1.join(right1, on="key", how="left")