pandas資料處理進階詳解

阿新 • • 發佈：2020-01-09

一、pandas的統計分析

1、關於pandas 的數值統計（統計detail 中的單價的相關指標）

import pandas as pd
 
# Load the order-detail sheet and show its basic structure.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail :\n", detail)

print("detail 的列索引名稱:\n", detail.columns)
print("detail 的形狀:\n", detail.shape)
print("detail 資料型別:\n", detail.dtypes)

# Select the two columns once; every statistic below is computed on them.
amounts = detail.loc[:, 'amounts']
counts = detail.loc[:, 'counts']

print("amounts 的最大值：\n", amounts.max())
print("amounts 的最小值：\n", amounts.min())
print("amounts 的均值：\n", amounts.mean())
print("amounts 的中位數：\n", amounts.median())
print("amounts 的方差：\n", amounts.var())
print("amounts 的describe：\n", amounts.describe())
# describe() on two columns at once returns one summary column per input column.
print("counts 和 amounts 的describe：\n", detail.loc[:, ['counts', 'amounts']].describe())
print("amounts 的describe：\n", amounts.describe())
print("counts 的describe：\n", counts.describe())
# Series.ptp() was removed in pandas 1.0; the range is simply max - min.
print("amounts 的極差：\n", amounts.max() - amounts.min())
print("amounts 的標準差：\n", amounts.std())
# mode() returns a Series because several values can tie for most frequent.
print("amounts 的眾數：\n", amounts.mode())
print("counts 的眾數：\n", counts.mode())
print("amounts 的非空值的數目：\n", amounts.count())
# idxmax / idxmin return the index label of the extreme value
# (the label-based counterparts of np.argmax / np.argmin).
print("amounts 的最大值的位置：\n", amounts.idxmax())
print("amounts 的最小值的位置：\n", amounts.idxmin())

2、pandas對於非數值型資料的統計分析

（1）對於dataframe轉化資料型別，其他型別轉化為object型別

# Use plain column assignment: since pandas 2.0, `detail.loc[:, col] = values`
# writes into the existing column in place and keeps its old dtype, so the
# cast to object would silently be lost.
detail['amounts'] = detail['amounts'].astype('object')

（2）類別型資料

# astype() returns a new Series, so the result must be assigned back;
# otherwise the column keeps its previous dtype.
detail['amounts'] = detail['amounts'].astype('category')
# For categorical data describe() reports count / unique / top / freq.
print("統計類別型資料的describe指標:\n", detail['amounts'].describe())

（3）統計例項

# Which dishes in detail are the most popular, and how many portions were sold?

# Case 1: plain rice counts as a dish.
# Plain column assignment is used so that the dtype really changes
# (`detail.loc[:, col] = ...` sets values in place on pandas >= 2.0).
detail['dishes_name'] = detail['dishes_name'].astype('category')
print("按照dishes_name統計描述資訊：\n", detail['dishes_name'].describe())

# Case 2: plain rice does not count -- drop its rows, then describe again.
# drop() needs row labels, so build a boolean mask for the rice rows first ...
bool_id = detail['dishes_name'] == '白飯/大碗'

# ... turn the mask into the index labels of the matching rows ...
index = detail.loc[bool_id, :].index

# ... and delete those rows in place.
detail.drop(labels=index, axis=0, inplace=True)

# Convert again after the drop; the result has to be assigned back to take effect.
detail['dishes_name'] = detail['dishes_name'].astype('category')

print("按照dishes_name統計描述資訊：\n", detail['dishes_name'].describe())

# Which order contains the most dishes, and how many?
# Treat order_id as categorical so describe() reports top / freq instead of
# numeric statistics.
detail['order_id'] = detail['order_id'].astype("category")
print("按照order_id統計描述資訊為:\n", detail['order_id'].describe())

二、pandas時間資料

datetime64[ns] ---numpy 裡面的時間點類
Timestamp ---pandas 預設的時間點型別----封裝了datetime64[ns]
DatetimeIndex ---pandas 預設支援的時間序列結構

1、可以通過 pd.to_datetime 將時間點資料轉化為pandas預設支援的時間點資料

# pd.to_datetime applied to a single date string.
res = pd.to_datetime("2016/01/01")
res_type = type(res)
print("res:\n", res)
print("res 的型別：\n", res_type)

2、時間序列轉化 --可以通過pd.to_datetime 或者pd.DatetimeIndex將時間序列轉化為pandas預設支援的時間序列結構

# Build the same kind of time-series structure two ways: through
# pd.to_datetime and through the pd.DatetimeIndex constructor.
first_dates = ['2016-01-01', '2016-01-01', '2011-01-01']
second_dates = ['2016-01-01', '2016-01-02', '2016-02-05', '2011-09-01']

res = pd.to_datetime(first_dates)
res1 = pd.DatetimeIndex(second_dates)

print("res:\n", type(res))

print("res1:\n", res1)
print("res1 的型別：\n", type(res1))

3、獲取時間序列資料的屬性（年、月、日、季度、周幾、是否閏年）

import pandas as pd
# Load the data.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail 的形狀:\n", detail.shape)
print("*" * 80)

# Show the raw place_order_time column.
print(detail.loc[:, 'place_order_time'])

# Convert the column to pandas' native datetime type. Plain column assignment
# is used so the dtype really changes (`.loc[:, col] = ...` sets in place on
# pandas >= 2.0 and would keep the old dtype).
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])

print("*" * 80)

# Each element of the column is now a Timestamp, so its attributes can be
# collected with list comprehensions.
order_times = detail['place_order_time']

year = [i.year for i in order_times]
print("年：\n", year)

month = [i.month for i in order_times]
print("月：\n", month)

day = [i.day for i in order_times]
print("日：\n", day)

quarter = [i.quarter for i in order_times]
print("季度：\n", quarter)

# Timestamp.weekday is a method: it must be called, otherwise the list holds
# bound-method objects instead of day numbers (Monday == 0).
weekday = [i.weekday() for i in order_times]
print("周幾：\n", weekday)

# Timestamp.weekday_name was removed in pandas 1.0; day_name() replaces it.
weekday_name = [i.day_name() for i in order_times]
print("周幾名稱：\n", weekday_name)

is_leap_year = [i.is_leap_year for i in order_times]
print("是否閏年：\n", is_leap_year)

4、時間加減

import pandas as pd
# Shift a single Timestamp forward by adding Timedelta offsets.
res = pd.to_datetime("2016-01-01")
print("res:\n", type(res))

one_day = pd.Timedelta(days=1)
one_hour = pd.Timedelta(hours=1)
print("時間推後一天：\n", res + one_day)
print("時間推後一小時：\n", res + one_hour)
 
# Adding a Timedelta to a whole column shifts every row; the result is stored
# as the place_over_time column.
shifted_times = detail.loc[:, 'place_order_time'] + pd.Timedelta(days=1)
detail.loc[:, 'place_over_time'] = shifted_times
print(detail)
 
# Difference between two points in time: subtract one Timestamp from the other.
later = pd.to_datetime('2019-10-9')
earlier = pd.to_datetime('1996-11-07')
res = later - earlier
print(res)

5、獲取pandas的Timestamp可以表示的最早時間點和最晚時間點

# Print the smallest and the largest value pd.Timestamp exposes as min / max.
earliest, latest = pd.Timestamp.min, pd.Timestamp.max
print(earliest)
print(latest)

三、分組聚合

import pandas as pd
import numpy as np
 
# Load the data.
users = pd.read_excel("./users.xlsx")
print("users:\n", users)
print("users 的列索引：\n", users.columns)
print("users 的資料型別：\n", users.dtypes)

# groupby: `by` names the grouping column(s) -- a single column or a list.
# Group by one column, aggregate one column:
#   users.groupby(by='ORGANIZE_NAME')['age'].mean()
# Group by one column, aggregate several columns:
#   users.groupby(by='ORGANIZE_NAME')[['age','USER_ID']].mean()
# Group by several columns and take the mean age of each group:
res = users.groupby(by=['ORGANIZE_NAME', 'poo', 'sex'])['age'].mean()
print(res)

# agg: the aggregations are given by name. Passing the NumPy callables
# (np.mean, np.max, ...) is deprecated in recent pandas; the string names
# select the same pandas implementations without the warning.

# Mean of age and maximum of USER_ID in one call.
print(users.agg({'age': 'mean', 'USER_ID': 'max'}))

# Sum and mean of both age and USER_ID.
print(users[['age', 'USER_ID']].agg(['sum', 'mean']))

# A different set of statistics per column.
print(users.agg({'age': 'min', 'USER_ID': ['mean', 'sum']}))
 
 
def hh(x):
    """Return ``x + 1``."""
    incremented = x + 1
    return incremented
 
 
# Apply a custom function to the age column.
# Alternatives kept for reference:
#   res = users['age'].apply(hh)
#   res = users[['age','USER_ID']].apply(lambda x:x+1)
# Note from the author: transform cannot compute across columns.
age_column = users['age']
res = age_column.transform(lambda value: value + 1)
print(res)

四、透視表與交叉表

import pandas as pd
 
# Load the data.
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail :\n", detail)
print("detail 的列名：\n", detail.columns)
print("detail 的資料型別：\n", detail.dtypes)

# The day attribute can only be read from a pandas datetime column, so convert
# place_order_time first. Plain column assignment makes the dtype change stick.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])

# Store the day of month of every order as its own column.
detail['day'] = detail['place_order_time'].dt.day

# A pivot table is an extended form of group-and-aggregate:
#   data    -- the DataFrame to summarise
#   values  -- the column(s) the statistic is computed on
#   index   -- column(s) used to group the rows
#   columns -- column(s) used to group the columns
#   aggfunc -- which statistic to compute on `values`
#
# res = pd.pivot_table(data=detail[['amounts','order_id','counts','dishes_name','day']],values='amounts',columns=['day','counts'],index=['order_id','dishes_name'],aggfunc='mean',margins=True)
# print(res)
# res.to_excel("./hh.xlsx")

# A crosstab is a reduced form of the pivot table.
# With only index and columns it counts how often each pair occurs:
# res = pd.crosstab(index=detail['counts'],columns=detail['amounts'])
# `values` and `aggfunc` must be given together.
res = pd.crosstab(
    index=detail['order_id'],
    columns=detail['counts'],
    values=detail['amounts'],
    aggfunc='mean',
)
print(res)

五、案例

1、營業額案例

import pandas as pd
 
# detail 有時間資料
 
# Load the data (detail contains the order timestamps).
detail = pd.read_excel("./meal_order_detail.xlsx")
print("detail :\n", detail.dtypes)

# Revenue of each row = quantity * unit price, stored as a new column.
detail['pay'] = detail['counts'] * detail['amounts']

# The day attribute can only be read from a pandas datetime column, so convert
# place_order_time first, then store the day of month as its own column.
# Without this `day` column the groupby below raises KeyError.
detail['place_order_time'] = pd.to_datetime(detail['place_order_time'])
detail['day'] = detail['place_order_time'].dt.day

# Group by day and sum the revenue of each day.
res = detail.groupby(by='day')['pay'].sum()
print(res)

# Rebuild the grouped Series as a one-column DataFrame.
# NOTE(review): the column name 'monty' looks like a typo for 'money' -- confirm before renaming.
df = pd.DataFrame(res.values, columns=['monty'], index=res.index)
print(df)
print(type(df))

2、連鎖超市案例

import pandas as pd
 
# Load the data.
# NOTE(review): the 'ansi' codec is a Windows-only alias in Python; on other
# platforms this call raises LookupError. Presumably the file is GBK-encoded --
# confirm and name that encoding explicitly if so.
order = pd.read_csv("./order.csv", encoding='ansi')
print("order:\n", order)
print("order 的列索引：\n", order.columns)

# 1. Which product categories sell best?
# Keep only the rows whose sales quantity is positive.
bool_id = order.loc[:, '銷量'] > 0
# .copy() makes `data` an independent frame: it is modified further down, and
# modifying a bare slice of `order` triggers SettingWithCopyWarning and may not
# behave as intended.
data = order.loc[bool_id, :].copy()

print(data.shape)
print("*" * 80)
print("*" * 80)
 
# 刪除異常
# bool_id = order.loc[:,'銷量'] <= 0
# index = order.loc[bool_id,:].index
#
# data = order.drop(labels=index,inplace=False)
 
# 按照類別進行分組，統計銷量的 和
# 進行dataframe或者series的值排序
# 如果series sort_values()直接按照series的值進行排序
# 如果df 那麼需要指定 按照哪一列進行排序，by= 列名
 
# 預設是升序ascending=True
# ascending=False 降序
# res = data.groupby(by='類別ID')['銷量'].sum().sort_values(ascending=False)
#
# print(res)
 
# 2、哪些商品比較暢銷？
# 分組聚合實現
# res = data.groupby(by='商品ID')['銷量'].sum().sort_values(ascending=False).head(10)
#
# print(res)
 
# 透視表實現
# res = pd.pivot_table(data=data.loc[:,['商品ID','銷量']],index='商品ID',values='銷量',aggfunc='sum').sort_values(by='銷量',ascending=False).head(10)
# print(res)
 
 
# 3、求不同門店的銷售額佔比
# 提示：訂單中沒有銷售額欄位，所有需要新增一個銷售額欄位。增加欄位後按照門店編號進行分組，然後計算佔比。
 
# # 先計算銷售額
# data.loc[:,'銷售額'] = data.loc[:,'單價'] * data.loc[:,'銷量']
#
# # 按照門店編號進行分組統計銷售額的sum
# res = data.groupby(by='門店編號')['銷售額'].sum()
# # print(res)
# # 計算所有的銷售額總和
# all_ = res.sum()
#
# # print(all_)
# per_ = res / all_
#
# print("各個門店的銷售額佔比為：\n",per_.apply(lambda x:format(x,".2%")))
 
# a = 100.105
# print("%.2f"%a)
 
# print("{}%".format(2.0))
 
# 匿名函式
# print(lambda x:x+5) #
#
# def add(x):
# #  return x+5
 
# 4、哪段時間段是超市的客流高峰期？
# 提示：需要知道每個時間段對應的客流量，但是訂單表中既有日期又有時間，我們需要從中提出小時數，這裡利用訂單ID去重計數代表客流量。
 
# Deduplicate on order ID so that every order is counted once.
# `subset` names the column(s) used for duplicate detection (a list for several).
# A new, independent frame is built instead of dropping in place, because
# `data` may be a slice of another frame.
data = data.drop_duplicates(subset='訂單ID').copy()

# Convert the transaction time to pandas' native datetime type. Plain column
# assignment is used so the dtype really changes (`.loc[:, col] = ...` sets in
# place on pandas >= 2.0 and would keep the old dtype).
data['成交時間'] = pd.to_datetime(data['成交時間'])

# Extract the hour of each transaction with the vectorised .dt accessor.
data['hour'] = data['成交時間'].dt.hour

# Count orders per hour, busiest hour first.
res = data.groupby(by='hour')['訂單ID'].count().sort_values(ascending=False)

print(res)

以上就是本文的全部內容，希望對大家的學習有所幫助，也希望大家多多支援我們。