速戰速決 Python - python 第三方庫(pandas): DataFrame基礎
阿新 • • 發佈:2022-01-20
速戰速決 Python - python 第三方庫(pandas): DataFrame基礎
速戰速決 Python https://github.com/webabcd/PythonSample
作者 webabcd
速戰速決 Python - python 第三方庫(pandas): DataFrame基礎
示例如下:
thirdLib/pandas/sample2.py
# pandas - DataFrame basics
#   Series    - a one-dimensional labelled (indexed) array
#   DataFrame - a two-dimensional labelled (indexed) table
#
# Note: pandas is built on top of numpy.

import pandas as pd


def _elementwise(frame, func):
    """Return a new DataFrame with ``func`` applied to every cell of ``frame``.

    ``DataFrame.applymap`` was deprecated in pandas 2.1 in favour of
    ``DataFrame.map``; use the new name when it exists and fall back to the
    old one so the sample keeps working on older pandas releases.
    """
    apply_cells = getattr(frame, "map", None) or frame.applymap
    return apply_cells(func)


index1 = ['zhao', 'qian', 'sun', 'li', 'zhou']
data1 = {
    "age": [40, 25, 22, 28, 28],
    "city": ['beijing', 'beijing', 'shanghai', 'beijing', 'shanghai'],
}
# Build a DataFrame from a dict of columns (keys are the column names) plus an
# explicit index. Without an index the default is a 0-based integer range.
a = pd.DataFrame(data=data1, index=index1)
print(a)
# The first column is the index; the columns to its right are data columns.
#       age      city
# zhao   40   beijing
# qian   25   beijing
# sun    22  shanghai
# li     28   beijing
# zhou   28  shanghai

index2 = pd.Index(['zhao', 'qian', 'sun', 'li', 'zhou'], dtype=object)
data2 = [[40, "beijing"], [25, "beijing"], [22, "shanghai"], [28, "beijing"], [28, "shanghai"]]
# Build a DataFrame from a list of rows (no column names in the data), an
# Index object with an explicit dtype, and a separate list of column names.
b = pd.DataFrame(data=data2, index=index2, columns=["age", "city"])
print(b)
#       age      city
# zhao   40   beijing
# qian   25   beijing
# sun    22  shanghai
# li     28   beijing
# zhou   28  shanghai

# Number of rows and columns
print(b.shape)  # (5, 2)

# The data columns as an array
print(b.values)
# [[40 'beijing']
#  [25 'beijing']
#  [22 'shanghai']
#  [28 'beijing']
#  [28 'shanghai']]

# The index labels
print(list(b.index))  # ['zhao', 'qian', 'sun', 'li', 'zhou']

# Transpose (swap rows and columns)
print(b.T)
#          zhao     qian       sun       li      zhou
# age        40       25        22       28        28
# city  beijing  beijing  shanghai  beijing  shanghai

# Select one column (both forms return the same Series)
print(b.get('age'))
print(b.age)
# zhao    40
# qian    25
# sun     22
# li      28
# zhou    28
# (pandas also prints a "Name: age, dtype: ..." footer)

# Select one cell: column first, then index label
print(b.get('age').get('zhao'))  # 40
print(b.age.zhao)  # 40

# Index label of the column's maximum value
print(b.age.idxmax())  # zhao
# Index label of the column's minimum value
print(b.age.idxmin())  # sun
# Maximum of the column
print(b.age.max())  # 40
# Minimum of the column
print(b.age.min())  # 22
# Mean of the column
print(b.age.mean())  # 28.6
# Sum of the column
print(b.age.sum())  # 143
# Median of the column (quantile() defaults to q=0.5)
print(b.age.quantile())  # 28.0
# Standard deviation of the column
print(b.age.std())  # 6.841052550594829
# Cumulative sum of the column (returns a Series)
print(b.age.cumsum())
# zhao     40
# qian     65
# sun      87
# li      115
# zhou    143

# Select a row by index label (returns a Series)
print(b.loc['zhao'])
# age          40
# city    beijing

# Select a row by integer position (returns a Series)
print(b.iloc[0])
# age          40
# city    beijing

# Select specific column(s) of a row chosen by index label
print(b.loc['zhao'].city)  # beijing
print(b.loc['zhao'].get('city'))  # beijing
print(a.loc['zhao'][['age', 'city']])
# age          40
# city    beijing

# Maximum across a row selected by label. This only makes sense when every
# value in the row is numeric, which is not the case here, so it is left
# disabled. min(), idxmax(), idxmin(), mean(), sum(), quantile(), std(),
# cumsum() etc. work the same way as shown above for a column.
# print(b.loc['zhao'].max())

# Add a column with the same value in every row
b["gender"] = "M"
print(b)
#       age      city gender
# zhao   40   beijing      M
# qian   25   beijing      M
# sun    22  shanghai      M
# li     28   beijing      M
# zhou   28  shanghai      M

# Add a column with a different value per row
b["salary"] = [100, 200, 150, 150, 150]
print(b)
#       age      city gender  salary
# zhao   40   beijing      M     100
# qian   25   beijing      M     200
# sun    22  shanghai      M     150
# li     28   beijing      M     150
# zhou   28  shanghai      M     150

# Add a column derived from existing columns
b = b.assign(salary_expected=b["salary"] * 2)
print(b)
#       age      city gender  salary  salary_expected
# zhao   40   beijing      M     100              200
# qian   25   beijing      M     200              400
# sun    22  shanghai      M     150              300
# li     28   beijing      M     150              300
# zhou   28  shanghai      M     150              300


# Add a column derived from existing columns, computed by a function
def getLevel(age, salary):
    """Return ``(age * 0.5 + salary * 2) / 50``.

    In this sample it is called with two pandas Series, so the arithmetic is
    applied element by element and the result is itself a Series.
    """
    # Note: when called with DataFrame columns, age and salary are Series.
    print(type(age), type(salary))  # <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
    return (age * 0.5 + salary * 2) / 50


b = b.assign(level=getLevel(b["age"], b["salary"]))
print(b)
#       age      city gender  salary  salary_expected  level
# zhao   40   beijing      M     100              200   4.40
# qian   25   beijing      M     200              400   8.25
# sun    22  shanghai      M     150              300   6.22
# li     28   beijing      M     150              300   6.28
# zhou   28  shanghai      M     150              300   6.28

# Drop a column
b = b.drop(["salary_expected"], axis=1)
print(b)
#       age      city gender  salary  level
# zhao   40   beijing      M     100   4.40
# qian   25   beijing      M     200   8.25
# sun    22  shanghai      M     150   6.22
# li     28   beijing      M     150   6.28
# zhou   28  shanghai      M     150   6.28

# Rename a column
b = b.rename(columns={"level": "employee_level"})
print(b)
#       age      city gender  salary  employee_level
# zhao   40   beijing      M     100            4.40
# qian   25   beijing      M     200            8.25
# sun    22  shanghai      M     150            6.22
# li     28   beijing      M     150            6.28
# zhou   28  shanghai      M     150            6.28

# Drop a row
b = b.drop(["zhou"], axis=0)
print(b)
#       age      city gender  salary  employee_level
# zhao   40   beijing      M     100            4.40
# qian   25   beijing      M     200            8.25
# sun    22  shanghai      M     150            6.22
# li     28   beijing      M     150            6.28

# Replace a given value in a given column with another value
b = b.replace({"city": 'beijing'}, 'bj')
print(b)
#       age      city gender  salary  employee_level
# zhao   40        bj      M     100            4.40
# qian   25        bj      M     200            8.25
# sun    22  shanghai      M     150            6.22
# li     28        bj      M     150            6.28

# Upper-case every cell (each value is converted to str first)
print(_elementwise(a, lambda x: str(x).upper()))
#      age      city
# zhao  40   BEIJING
# qian  25   BEIJING
# sun   22  SHANGHAI
# li    28   BEIJING
# zhou  28  SHANGHAI


# Multiply every cell by 2 (numbers are doubled, strings are repeated)
def temp(x):
    """Return ``x * 2``."""
    return x * 2


print(_elementwise(a, lambda x: temp(x)))
#       age              city
# zhao   80    beijingbeijing
# qian   50    beijingbeijing
# sun    44  shanghaishanghai
# li     56    beijingbeijing
# zhou   56  shanghaishanghai

# Rename an index label
b = b.rename(index={"zhao": "wang"})
print(b)
#       age      city gender  salary  employee_level
# wang   40        bj      M     100            4.40
# qian   25        bj      M     200            8.25
# sun    22  shanghai      M     150            6.22
# li     28        bj      M     150            6.28
速戰速決 Python https://github.com/webabcd/PythonSample
作者 webabcd