速戰速決 Python - python 第三方庫(pandas): DataFrame 連接,去重,空值處理
阿新 • • 發佈:2022-01-20
速戰速決 Python - python 第三方庫(pandas): DataFrame 連接,去重,空值處理
速戰速決 Python https://github.com/webabcd/PythonSample
作者 webabcd
速戰速決 Python - python 第三方庫(pandas): DataFrame 連接,去重,空值處理
示例如下:
thirdLib/pandas/sample3.py
# pandas sample script
#   - Combining DataFrames: concat(), merge(), join()
#     (DataFrame.append() was removed in pandas 2.0; pd.concat() replaces it)
#   - Removing duplicate rows
#   - Handling missing values
#
# The "expected output" comments below are illustrative; exact column spacing
# depends on the pandas version and display settings.

import pandas as pd

data1 = {
    "name": ["zhao", "qian"],
    "age": [40, 25],
    "city": ["beijing ", "beijing"],
    "gender": ["M", "F"],
}
a = pd.DataFrame(data=data1)

data2 = {
    "name": ["qian", "sun"],
    "age": [25, 22],
    "city": ["beijing", "shanghai"],
    "salary": [100, 150],
}
b = pd.DataFrame(data=data2)

print(a)
#    name  age      city gender
# 0  zhao   40  beijing       M
# 1  qian   25   beijing      F

print(b)
#    name  age      city  salary
# 0  qian   25   beijing     100
# 1   sun   22  shanghai     150

# Append the rows of one DataFrame to the end of another.
# This used to be a.append(b); that method no longer exists in pandas >= 2.0,
# so pd.concat() is used to produce the same result.
appended = pd.concat([a, b])
print(appended)
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 0  qian   25   beijing    NaN   100.0
# 1   sun   22  shanghai    NaN   150.0

# drop_duplicates() removes duplicate rows
#   subset  - only these columns are compared when looking for duplicates
#   keep    - 'first' keeps the first duplicate, 'last' keeps the last one,
#             False drops every duplicated row
#   inplace - False (default): the original object is untouched and a
#             modified copy is returned
#             True: the original object is modified and None is returned
deduped = appended.drop_duplicates(
    subset=['age', 'city'], keep='first', inplace=False
)
print(deduped)
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 1   sun   22  shanghai    NaN   150.0

# concat() stacks several DataFrames
print(pd.concat([a, b]))
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 0  qian   25   beijing    NaN   100.0
# 1   sun   22  shanghai    NaN   150.0

# concat() stacks several DataFrames
#   ignore_index=True rebuilds the index
print(pd.concat([a, b], ignore_index=True))
#    name  age      city gender  salary
# 0  zhao   40  beijing       M     NaN
# 1  qian   25   beijing      F     NaN
# 2  qian   25   beijing    NaN   100.0
# 3   sun   22  shanghai    NaN   150.0

# concat() stacks several DataFrames
#   join='outer' (default) keeps columns that exist on only one side
#   join='inner' drops them and keeps only the columns common to all inputs
print(pd.concat([a, b], ignore_index=True, join='inner'))
#    name  age      city
# 0  zhao   40  beijing
# 1  qian   25   beijing
# 2  qian   25   beijing
# 3   sun   22  shanghai

# concat() stacks several DataFrames
#   axis=1 concatenates side by side (column-wise)
print(pd.concat([a, b], axis=1))
#    name  age      city gender  name  age      city  salary
# 0  zhao   40  beijing       M  qian   25   beijing     100
# 1  qian   25   beijing      F   sun   22  shanghai     150

# merge() combines 2 DataFrames
#   on='name' uses the name column as the join key
#   If the key columns are named differently on each side, use something
#   like left_on="name1", right_on="name2" instead
print(pd.merge(a, b, on='name'))
#    name  age_x   city_x gender  age_y   city_y  salary
# 0  qian     25  beijing      F     25  beijing     100

# merge() combines 2 DataFrames
#   how='inner' (default) keeps only keys present on both sides
#   how='outer' keeps all keys from both sides
#   how='left'  keeps only keys present on the left side
#   how='right' keeps only keys present on the right side
# NOTE(review): the row order shown is the one recorded with the original
# sample; newer pandas releases may order outer-merge rows by key - confirm
# against the installed version.
print(pd.merge(a, b, on='name', how='outer'))
#    name  age_x    city_x gender  age_y    city_y  salary
# 0  zhao   40.0  beijing       M    NaN       NaN     NaN
# 1  qian   25.0   beijing      F   25.0   beijing   100.0
# 2   sun    NaN       NaN    NaN   22.0  shanghai   150.0

# merge() combines 2 DataFrames
#   suffixes - appended to overlapping column names so that it is clear
#              which DataFrame each column came from
print(pd.merge(a, b, on='name', how='outer', suffixes=("_left", "_right")))
#    name  age_left city_left gender  age_right city_right  salary
# 0  zhao      40.0  beijing       M        NaN        NaN     NaN
# 1  qian      25.0   beijing      F       25.0    beijing   100.0
# 2   sun       NaN       NaN    NaN       22.0   shanghai   150.0

# join() combines 2 DataFrames
#   comparable to merge() with how='left'
print(a.join(b.set_index("name"), on="name", lsuffix="_left", rsuffix='_right'))
#    name  age_left city_left gender  age_right city_right  salary
# 0  zhao        40  beijing       M        NaN        NaN     NaN
# 1  qian        25   beijing      F       25.0    beijing   100.0

# The rest of the script demonstrates how to handle missing values
index3 = ['zhao', 'qian', 'sun', 'li', 'zhou']
data3 = {
    "age": [None, None, 22, 28, 28],
    "city": ['beijing', 'beijing', None, 'beijing', 'shanghai'],
}
c = pd.DataFrame(data=data3, index=index3)

print(c)
# NaN is the missing value for numeric data (numpy's nan);
# None is the missing value for other types
#        age      city
# zhao   NaN   beijing
# qian   NaN   beijing
# sun   22.0      None
# li    28.0   beijing
# zhou  28.0  shanghai

# isnull()  tells, cell by cell, whether the value is missing
# notnull() tells, cell by cell, whether the value is present
print(c.isnull())
#         age   city
# zhao   True  False
# qian   True  False
# sun   False   True
# li    False  False
# zhou  False  False

# Select the rows whose given column is not missing
print(c[c.age.notnull()])
#        age      city
# sun   22.0      None
# li    28.0   beijing
# zhou  28.0  shanghai

# dropna() removes rows with missing values
#   how="any" drops the row if any column is missing
#   how="all" drops the row only if every column is missing
#   subset=["column1", "column2"] only looks at the listed columns
print(c.dropna(how="any"))
#        age      city
# li    28.0   beijing
# zhou  28.0  shanghai

print(c.dropna(how="any", subset=["age"]))
#        age      city
# sun   22.0      None
# li    28.0   beijing
# zhou  28.0  shanghai

# Fill the missing values of the given columns with a given value
#   inplace=False (default): the original object is untouched and a
#                            modified copy is returned
#   inplace=True: the original object is modified and None is returned
print(c.fillna({'age': 0, 'city': 'unknown'}, inplace=True))  # None
print(c)
#        age      city
# zhao   0.0   beijing
# qian   0.0   beijing
# sun   22.0   unknown
# li    28.0   beijing
# zhou  28.0  shanghai
速戰速決 Python https://github.com/webabcd/PythonSample
作者 webabcd