1. 程式人生 > 其它 >速戰速決 Python - python 第三方庫(pandas): DataFrame連線,去重,空值處理

速戰速決 Python - python 第三方庫(pandas): DataFrame連線,去重,空值處理

速戰速決 Python - python 第三方庫(pandas): DataFrame連線,去重,空值處理

速戰速決 Python https://github.com/webabcd/PythonSample
作者 webabcd

速戰速決 Python - python 第三方庫(pandas): DataFrame連線,去重,空值處理

示例如下:

thirdLib/pandas/sample3.py

# pandas
#   DataFrame 連線:append(), concat(), merge(), join()
#   DataFrame 去重
#   DataFrame 空值處理

import pandas as pd

# Two small sample frames. They share the "name", "age" and "city" columns;
# only `a` has "gender" and only `b` has "salary".
data1 = {
    "name": ["zhao", "qian"],
    "age": [40, 25],
    # NOTE: the first value carries a trailing space, so it is a different
    # string from "beijing".
    "city": ["beijing ", "beijing"],
    "gender": ["M", "F"],
}
a = pd.DataFrame(data=data1)
data2 = {
    "name": ["qian", "sun"],
    "age": [25, 22],
    "city": ["beijing", "shanghai"],
    "salary": [100, 150],
}
b = pd.DataFrame(data=data2)
print(a)
# Output recorded in the original sample:
#    name  age      city gender
# 0  zhao   40  beijing       M
# 1  qian   25   beijing      F
print(b)
# Output recorded in the original sample:
#    name  age      city  salary
# 0  qian   25   beijing     100
# 1   sun   22  shanghai     150

# Appending the rows of one DataFrame to the end of another.
#   DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
#   so calling a.append(b) raises AttributeError on current versions.
#   pd.concat([a, b]) is the supported equivalent: columns missing on one side
#   are filled with NaN and each frame keeps its own index labels.
appended = pd.concat([a, b])
print(appended)
# Output recorded in the original sample:
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 0  qian   25   beijing    NaN   100.0
# 1   sun   22  shanghai    NaN   150.0


# drop_duplicates() removes duplicated rows.
#   subset  - only these columns are compared when looking for duplicates
#   keep    - "first" keeps the first duplicate, "last" keeps the last one,
#             False drops every duplicated row
#   inplace - False (default): the original object is left untouched and a
#             modified copy is returned
#             True: the original object is modified and None is returned
deduplicated = appended.drop_duplicates(
    subset=["age", "city"], keep="first", inplace=False
)
print(deduplicated)
# Output recorded in the original sample:
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 1   sun   22  shanghai    NaN   150.0


# pd.concat() glues several DataFrames together. Every call below works on
# the same pair of frames.
frames = [a, b]

# Default behaviour: rows are stacked and each frame keeps its own index.
stacked = pd.concat(frames)
print(stacked)
# Output recorded in the original sample:
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 0  qian   25   beijing    NaN   100.0
# 1   sun   22  shanghai    NaN   150.0

# ignore_index=True rebuilds the index of the result.
reindexed = pd.concat(frames, ignore_index=True)
print(reindexed)
# Output recorded in the original sample:
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 2  qian   25   beijing    NaN   100.0
# 3   sun   22  shanghai    NaN   150.0

# join="outer" (default) keeps columns that exist in only one of the frames;
# join="inner" discards them and keeps only the columns common to all frames.
shared_columns = pd.concat(frames, join="inner", ignore_index=True)
print(shared_columns)
# Output recorded in the original sample:
#    name  age      city
# 0  zhao   40  beijing
# 1  qian   25   beijing
# 2  qian   25   beijing
# 3   sun   22  shanghai

# axis=1 concatenates horizontally (side by side) instead of stacking rows.
side_by_side = pd.concat(frames, axis=1)
print(side_by_side)
# Output recorded in the original sample:
#    name  age      city gender  name  age      city  salary
# 0  zhao   40  beijing       M  qian   25   beijing     100
# 1  qian   25   beijing      F   sun   22  shanghai     150


# merge() combines two DataFrames on a key.
#   on="name" uses the "name" column as the join key on both sides.
#   When the key columns are named differently in the two frames, use
#   something like left_on="name1", right_on="name2" instead.
inner_merged = a.merge(b, on="name")
print(inner_merged)
# Output recorded in the original sample:
#    name  age_x   city_x gender  age_y   city_y  salary
# 0  qian     25  beijing      F     25  beijing     100

# The `how` argument of merge():
#   how="inner"  default, keeps only keys present on both sides
#   how="outer"  keeps every key from both sides
#   how="left"   keeps only the keys present on the left side
#   how="right"  keeps only the keys present on the right side
outer_merged = a.merge(b, how="outer", on="name")
print(outer_merged)
# Output recorded in the original sample (row order and formatting may vary
# with the installed pandas version):
#    name  age_x    city_x gender  age_y    city_y  salary
# 0  zhao   40.0  beijing       M    NaN       NaN     NaN
# 1  qian   25.0   beijing      F   25.0   beijing   100.0
# 2   sun    NaN       NaN    NaN   22.0  shanghai   150.0

# suffixes - appended to column names that exist on both sides, so it stays
# clear which DataFrame each column came from.
outer_suffixed = a.merge(
    b, how="outer", on="name", suffixes=("_left", "_right")
)
print(outer_suffixed)
# Output recorded in the original sample (row order and formatting may vary
# with the installed pandas version):
#    name  age_left city_left gender  age_right city_right  salary
# 0  zhao      40.0  beijing       M        NaN        NaN     NaN
# 1  qian      25.0   beijing      F       25.0    beijing   100.0
# 2   sun       NaN       NaN    NaN       22.0   shanghai   150.0


# join() also combines two DataFrames; as called here it behaves like a
# merge() with how="left". The right-hand frame is indexed by "name" so that
# a's "name" column can be matched against that index.
b_by_name = b.set_index("name")
left_joined = a.join(b_by_name, on="name", lsuffix="_left", rsuffix="_right")
print(left_joined)
# Output recorded in the original sample:
#    name  age_left city_left gender  age_right city_right  salary
# 0  zhao        40  beijing       M        NaN        NaN     NaN
# 1  qian        25   beijing      F       25.0    beijing   100.0



# The rest of the script demonstrates how missing values are handled.
index3 = ["zhao", "qian", "sun", "li", "zhou"]
data3 = {
    "age": [None, None, 22, 28, 28],
    "city": ["beijing", "beijing", None, "beijing", "shanghai"],
}
c = pd.DataFrame(data3, index=index3)
print(c)
# NaN is the missing value for numeric data (numpy's nan); None is the
# missing value for other types.
# Output recorded in the original sample:
#        age      city
# zhao   NaN   beijing
# qian   NaN   beijing
# sun   22.0      None
# li    28.0   beijing
# zhou  28.0  shanghai

# isnull()  marks the cells that are missing
# notnull() marks the cells that are not missing
missing_mask = c.isnull()
print(missing_mask)
# Output recorded in the original sample:
#         age   city
# zhao   True  False
# qian   True  False
# sun   False   True
# li    False  False
# zhou  False  False

# Keep only the rows whose "age" column is not missing.
has_age = c["age"].notnull()
print(c[has_age])
# Output recorded in the original sample:
#        age      city
# sun   22.0      None
# li    28.0   beijing
# zhou  28.0  shanghai

# dropna() removes rows containing missing values.
#   how="any"  drop the row if any field is missing
#   how="all"  drop the row only if every field is missing
#   subset=["column1", "column2"]  only look at the listed columns
complete_rows = c.dropna(how="any")
print(complete_rows)
# Output recorded in the original sample:
#        age      city
# li    28.0   beijing
# zhou  28.0  shanghai
rows_with_age = c.dropna(subset=["age"], how="any")
print(rows_with_age)
# Output recorded in the original sample:
#        age      city
# sun   22.0      None
# li    28.0   beijing
# zhou  28.0  shanghai

# fillna() replaces missing values, here with one fill value per column.
#   inplace=False  default: the original object is left untouched and a
#                  modified copy is returned
#   inplace=True   the original object is modified and None is returned
fill_values = {"age": 0, "city": "unknown"}
fill_result = c.fillna(fill_values, inplace=True)
print(fill_result)  # None
print(c)
# Output recorded in the original sample:
#        age      city
# zhao   0.0   beijing
# qian   0.0   beijing
# sun   22.0   unknown
# li    28.0   beijing
# zhou  28.0  shanghai

速戰速決 Python https://github.com/webabcd/PythonSample
作者 webabcd