1. 程式人生 > pandas中DataFrame.groupby()的用法

pandas中DataFrame.groupby()的用法

pandas 中的 groupby 與 SQL 語句中的 GROUP BY 有異曲同工之妙。這並不奇怪:關聯式資料庫存放資料的結構同樣是一張二維表,與 DataFrame 的形式相似。

import numpy as np
import pandas as pd
from pandas import Series, DataFrame


# Load the sample weather records. The path is relative to the current
# working directory, so run the script from the folder holding the CSV.
csv_path = './city_weather.csv'
df = pd.read_csv(csv_path)

# Show the full table; the string literal below records the expected output.
print(df)
'''
          date city  temperature  wind
0   03/01/2016   BJ            8     5
1   17/01/2016   BJ           12     2
2   31/01/2016   BJ           19     2
3   14/02/2016   BJ           -3     3
4   28/02/2016   BJ           19     2
5   13/03/2016   BJ            5     3
6   27/03/2016   SH           -4     4
7   10/04/2016   SH           19     3
8   24/04/2016   SH           20     3
9   08/05/2016   SH           17     3
10  22/05/2016   SH            4     2
11  05/06/2016   SH          -10     4
12  19/06/2016   SH            0     5
13  03/07/2016   SH           -9     5
14  17/07/2016   GZ           10     2
15  31/07/2016   GZ           -1     5
16  14/08/2016   GZ            1     5
17  28/08/2016   GZ           25     4
18  11/09/2016   SZ           20     1
19  25/09/2016   SZ          -10     4
'''
# Group the rows of df by the value of the 'city' column.
g = df.groupby(df['city'])
# g is a DataFrameGroupBy object, e.g.
# <pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f10450e12e8>

# g.groups maps each group label to the row labels belonging to that group.
# (The exact index repr depends on the pandas version in use.)
print(g.groups)
# {'BJ': Int64Index([0, 1, 2, 3, 4, 5], dtype='int64'),
#  'GZ': Int64Index([14, 15, 16, 17], dtype='int64'),
#  'SZ': Int64Index([18, 19], dtype='int64'),
#  'SH': Int64Index([6, 7, 8, 9, 10, 11, 12, 13], dtype='int64')}
# g.size() counts the number of rows in each group.
print(g.size())
'''
city
BJ    6
GZ    4
SH    8
SZ    2
dtype: int64
'''

# g.get_group() returns the rows of one group as a DataFrame.
print(g.get_group('BJ'))
'''
         date city  temperature  wind
0  03/01/2016   BJ            8     5
1  17/01/2016   BJ           12     2
2  31/01/2016   BJ           19     2
3  14/02/2016   BJ           -3     3
4  28/02/2016   BJ           19     2
5  13/03/2016   BJ            5     3
'''
# Take a single group and average it.
df_bj = g.get_group('BJ')
# numeric_only=True skips the string columns ('date', 'city'); without it,
# recent pandas versions raise TypeError instead of silently dropping them.
print(df_bj.mean(numeric_only=True))
'''
temperature    10.000000
wind            2.833333
dtype: float64
'''

# The same aggregation can be applied to the GroupBy object directly,
# which computes the mean of every group at once.
print(g.mean(numeric_only=True))
'''
      temperature      wind
city
BJ         10.000  2.833333
GZ          8.750  4.000000
SH          4.625  3.625000
SZ          5.000  2.500000
'''

# NOTE: 'date' is read as plain strings, so its max/min below is the
# lexicographic one (e.g. SH max is 27/03/2016, not 03/07/2016).
print(g.max())
'''
            date  temperature  wind
city
BJ    31/01/2016           19     5
GZ    31/07/2016           25     5
SH    27/03/2016           20     5
SZ    25/09/2016           20     4
'''

print(g.min())
'''
            date  temperature  wind
city
BJ    03/01/2016           -3     2
GZ    14/08/2016           -1     2
SH    03/07/2016          -10     2
SZ    11/09/2016          -10     1
'''

# A GroupBy object can be iterated with for; each item is a
# (group label, sub-DataFrame) pair.
for name, group in g:
    print(name)
    print(group)

# g can be converted to a list: the first element of each tuple is the
# group label, the second is that group's DataFrame.
print(list(g))
'''
[('BJ',          date city  temperature  wind
0  03/01/2016   BJ            8     5
1  17/01/2016   BJ           12     2
2  31/01/2016   BJ           19     2
3  14/02/2016   BJ           -3     3
4  28/02/2016   BJ           19     2
5  13/03/2016   BJ            5     3), ('GZ',           date city  temperature  wind
14  17/07/2016   GZ           10     2
15  31/07/2016   GZ           -1     5
16  14/08/2016   GZ            1     5
17  28/08/2016   GZ           25     4), ('SH',           date city  temperature  wind
6   27/03/2016   SH           -4     4
7   10/04/2016   SH           19     3
8   24/04/2016   SH           20     3
9   08/05/2016   SH           17     3
10  22/05/2016   SH            4     2
11  05/06/2016   SH          -10     4
12  19/06/2016   SH            0     5
13  03/07/2016   SH           -9     5), ('SZ',           date city  temperature  wind
18  11/09/2016   SZ           20     1
19  25/09/2016   SZ          -10     4)]
'''

# ...and to a dict of key/value pairs whose values are DataFrames.
print(dict(list(g)))
'''
{'SH':           date city  temperature  wind
6   27/03/2016   SH           -4     4
7   10/04/2016   SH           19     3
8   24/04/2016   SH           20     3
9   08/05/2016   SH           17     3
10  22/05/2016   SH            4     2
11  05/06/2016   SH          -10     4
12  19/06/2016   SH            0     5
13  03/07/2016   SH           -9     5, 'SZ':           date city  temperature  wind
18  11/09/2016   SZ           20     1
19  25/09/2016   SZ          -10     4, 'GZ':           date city  temperature  wind
14  17/07/2016   GZ           10     2
15  31/07/2016   GZ           -1     5
16  14/08/2016   GZ            1     5
17  28/08/2016   GZ           25     4, 'BJ':          date city  temperature  wind
0  03/01/2016   BJ            8     5
1  17/01/2016   BJ           12     2
2  31/01/2016   BJ           19     2
3  14/02/2016   BJ           -3     3
4  28/02/2016   BJ           19     2
5  13/03/2016   BJ            5     3}
'''