1. 程式人生 > >pandas中concatenate和combine_first的用法

pandas中concatenate和combine_first的用法

concatenate(NumPy 的 np.concatenate 與 pandas 的 pd.concat)主要作用是拼接 ndarray、Series 和 DataFrame 的資料。
combine_first 可以用來填充資料:以另一個物件中的值填補呼叫者的缺失值(NaN)。

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Demo script: joining data with np.concatenate / pd.concat and filling
# missing values with combine_first. Expected output follows each print call
# as comments.

# Fix the random seed so the random DataFrames below are reproducible.
np.random.seed(666)

# Two identical 3x3 NumPy arrays used to demonstrate np.concatenate.
arr1 = np.arange(9).reshape(3, 3)
arr2 = np.arange(9).reshape(3, 3)

# np.concatenate with the default axis (0) appends the rows of arr2 below arr1.
print(np.concatenate([arr1, arr2]))
# [[0 1 2]
#  [3 4 5]
#  [6 7 8]
#  [0 1 2]
#  [3 4 5]
#  [6 7 8]]

# axis=1 places the arrays side by side instead.
print(np.concatenate([arr1, arr2], axis=1))
# [[0 1 2 0 1 2]
#  [3 4 5 3 4 5]
#  [6 7 8 6 7 8]]

s1 = Series([1, 2, 3], index=['A', 'B', 'C'])
s2 = Series([4, 5], index=['E', 'F'])

# pd.concat on Series behaves like the NumPy call above: values are appended.
print(pd.concat([s1, s2]))
# A    1
# B    2
# C    3
# E    4
# F    5
# dtype: int64

# As with NumPy, axis=1 adds a column: each Series becomes one column, rows
# are aligned on the index labels, and missing positions are filled with NaN.
side_by_side = pd.concat([s1, s2], axis=1)
print(side_by_side)
#      0    1
# A  1.0  NaN
# B  2.0  NaN
# C  3.0  NaN
# E  NaN  4.0
# F  NaN  5.0

# Note that the result is a DataFrame, not a Series.
print(type(side_by_side))
# <class 'pandas.core.frame.DataFrame'>

df1 = DataFrame(np.random.randn(4, 3), columns=['X', 'Y', 'Z'])
print(df1)
#           X         Y         Z
# 0  0.824188  0.479966  1.173468
# 1  0.909048 -0.571721 -0.109497
# 2  0.019028 -0.943761  0.640573
# 3 -0.786443  0.608870 -0.931012

df2 = DataFrame(np.random.randn(3, 3), columns=['X', 'Y', 'A'])
print(df2)
#           X         Y         A
# 0  0.978222 -0.736918 -0.298733
# 1 -0.460587 -1.088793 -0.575771
# 2 -1.682901  0.229185 -1.756625

# Concatenating DataFrames with different columns keeps the union of the
# columns; cells that a frame does not provide become NaN.
# NOTE(review): the column order below (A X Y Z) was recorded with an older
# pandas that sorted non-aligned columns; recent versions are documented to
# keep order of appearance (X Y Z A) unless sort=True is passed -- confirm
# against the installed pandas version.
print(pd.concat([df1, df2]))
#           A         X         Y         Z
# 0       NaN  0.824188  0.479966  1.173468
# 1       NaN  0.909048 -0.571721 -0.109497
# 2       NaN  0.019028 -0.943761  0.640573
# 3       NaN -0.786443  0.608870 -0.931012
# 0 -0.298733  0.978222 -0.736918       NaN
# 1 -0.575771 -0.460587 -1.088793       NaN
# 2 -1.756625 -1.682901  0.229185       NaN

# combine_first
s1 = Series([2, np.nan, 4, np.nan], index=['A', 'B', 'C', 'D'])
s2 = Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])

# Fill the NaN entries of s1 with the values of s2 at the same labels.
print(s1.combine_first(s2))
# A    2.0
# B    2.0
# C    4.0
# D    4.0
# dtype: float64

df1 = DataFrame({
    'X': [1, np.nan, 3, np.nan],
    'Y': [5, np.nan, 7, np.nan],
    'Z': [9, np.nan, 11, np.nan],
})
df2 = DataFrame({
    'Z': [np.nan, 10, np.nan, 12],
    'A': [1, 2, 3, 4],
})

# Same filling behaviour for DataFrames: only the shared column Z can be
# patched from df2, X and Y keep their NaN, and column A is added from df2.
print(df1.combine_first(df2))
#      A    X    Y     Z
# 0  1.0  1.0  5.0   9.0
# 1  2.0  NaN  NaN  10.0
# 2  3.0  3.0  7.0  11.0
# 3  4.0  NaN  NaN  12.0