Python科學計算工具Pandas(九)—— 資料分組
阿新 • • 發佈:2018-12-19
Python科學計算工具Pandas(九)—— 資料分組
分組統計 - groupby功能
- 根據某些條件將資料拆分成組
- 對每個組獨立應用函式
- 將結果合併到一個數據結構中
DataFrame在行(axis=0)或列(axis=1)上進行分組,將一個函式應用到各個分組並產生一個新值,然後函式執行結果被合併到最終的結果物件中。
df.groupby(by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, **kwargs)
分組的基本操作
分組
# Basic grouping: split an 8-row frame by key column(s) and aggregate each group.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
print('-----')

dfa = df.groupby('A')
print(dfa.size(), type(dfa))
# groupby() does not return a DataFrame but a dedicated GroupBy object;
# size() reports how many rows fall into each group.
print('========')

# numeric_only=True restricts the sum to the numeric columns C and D.
# Without it, pandas >= 2.0 no longer drops the string column B silently
# and concatenates its values instead.
a = df.groupby('A').sum(numeric_only=True)
print(a, type(a))
# B is a grouping key here, so only the numeric columns remain to be averaged.
b = df.groupby(['A', 'B']).mean()
print(b, type(b))
# Selecting a single column before aggregating yields a Series.
c = df.groupby('A')['D'].mean()
print(c, type(c))
# Aggregating a grouped frame produces a new DataFrame.
# Grouping is done along the rows (axis=0) by default.
# The key may be a single column or a list of columns.
A B C D
0 foo one 2.479737 -2.368789
1 bar one 1.028346 0.950277
2 foo two 1.001758 -1.278156
3 bar three -0.205714 -0.330909
4 foo two 0.337572 1.256110
5 bar two 0.244171 -0.820276
6 foo one 0.554198 0.683419
7 foo three -0.534419 -0.319840
-----
A
bar 3
foo 5
dtype: int64 <class 'pandas.core.groupby.DataFrameGroupBy'>
========
C D
A
bar 1.066804 -0.200907
foo 3.838847 -2.027256 <class 'pandas.core.frame.DataFrame'>
C D
A B
bar one 1.028346 0.950277
three -0.205714 -0.330909
two 0.244171 -0.820276
foo one 1.516967 -0.842685
three -0.534419 -0.319840
two 0.669665 -0.011023 <class 'pandas.core.frame.DataFrame'>
A
bar -0.066969
foo -0.405451
Name: D, dtype: float64 <class 'pandas.core.series.Series'>
分組是一個可迭代的物件
# A GroupBy object is iterable: it yields one (group name, sub-frame) pair per group.
df = pd.DataFrame({'X': ['A', 'B', 'A', 'B'], 'Y': [1, 4, 3, 2]})
print(df)
print(df.groupby('X'), type(df.groupby('X')))
print('-----')
# Materialising the iterator gives a list of (name, DataFrame) tuples.
print(list(df.groupby('X')), '→ 可迭代物件,直接生成list\n')
print(list(df.groupby('X'))[0], '→ 以元祖形式顯示\n')
# n is the group name, g is the DataFrame holding that group's rows.
for n, g in df.groupby('X'):
    print(n)
    print(g, type(g))
    print('======')
X Y
0 A 1
1 B 4
2 A 3
3 B 2
<pandas.core.groupby.DataFrameGroupBy object at 0x000002AF2EE7C080> <class 'pandas.core.groupby.DataFrameGroupBy'>
-----
[('A', X Y
0 A 1
2 A 3), ('B', X Y
1 B 4
3 B 2)] → 可迭代物件,直接生成list
('A', X Y
0 A 1
2 A 3) → 以元祖形式顯示
A
X Y
0 A 1
2 A 3 <class 'pandas.core.frame.DataFrame'>
======
B
X Y
1 B 4
3 B 2 <class 'pandas.core.frame.DataFrame'>
======
選擇分組 .get_group()
# Pull one group out of a GroupBy object by its key with get_group().
df = pd.DataFrame({'X': list('ABAB'), 'Y': [1, 4, 3, 2]})
print(df)
print('-------')
group_a = df.groupby('X').get_group('A')
print(group_a)
print('-------')
X Y
0 A 1
1 B 4
2 A 3
3 B 2
-------
X Y
0 A 1
2 A 3
將分組轉化為字典 .groups
# View the grouping as a dict-like mapping through the .groups attribute.
df = pd.DataFrame({'X': list('ABAB'), 'Y': [1, 4, 3, 2]})
print(df)
print('---------')
a = df.groupby('X')
group_index = a.groups
rows_of_a = group_index['A']
print(group_index, '\n')
print(rows_of_a, '\n')
print(rows_of_a[0])
# Each value of the mapping is the index of the rows belonging to that group.
X Y
0 A 1
1 B 4
2 A 3
3 B 2
---------
{'A': Int64Index([0, 2], dtype='int64'), 'B': Int64Index([1, 3], dtype='int64')}
Int64Index([0, 2], dtype='int64')
0
檢視分組裡的記錄數 .size()
# size() counts the rows that fall into each group.
df = pd.DataFrame({'X': list('ABAB'), 'Y': [1, 4, 3, 2]})
print(df)
print('====')
a = df.groupby('X')
rows_per_group = a.size()
print(rows_per_group)
X Y
0 A 1
1 B 4
2 A 3
3 B 2
====
X
A 2
B 2
dtype: int64
多個列分組
# Group on two columns at once; every group key is then an (A, B) tuple.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
by_a_and_b = df.groupby(['A', 'B'])
grouped = by_a_and_b.groups
print(df)
print('---------')
print(grouped)
print('=====')
foo_three_rows = grouped[('foo', 'three')]
print(foo_three_rows)
A B C D
0 foo one -0.539735 0.252334
1 bar one 1.247811 -0.144133
2 foo two -0.965486 0.042095
3 bar three -0.158520 -0.667123
4 foo two 1.283692 1.201100
5 bar two -0.795091 0.368176
6 foo one -0.263945 0.085682
7 foo three 0.710263 -1.238407
---------
{('bar', 'one'): Int64Index([1], dtype='int64'), ('bar', 'three'): Int64Index([3], dtype='int64'), ('bar', 'two'): Int64Index([5], dtype='int64'), ('foo', 'one'): Int64Index([0, 6], dtype='int64'), ('foo', 'three'): Int64Index([7], dtype='int64'), ('foo', 'two'): Int64Index([2, 4], dtype='int64')}
=====
Int64Index([7], dtype='int64')
在其他軸上分組
# Group along the column axis: split the columns of a frame by their dtype.
df = pd.DataFrame({'data1': np.random.rand(2),
                   'data2': np.random.rand(2),
                   'key1': ['a', 'b'],
                   'key2': ['one', 'two']})
print(df)
print('------')
print(df.dtypes)
print('------')
# DataFrame.groupby(axis=1) is deprecated since pandas 2.1, so the transposed
# frame (whose rows are df's columns) is grouped instead. The dtype *names*
# are used as keys so the groups sort as plain strings.
for n, labels in df.T.groupby(df.dtypes.astype(str)):
    # Select from the original frame so each piece keeps its real column dtypes
    # (the transposed pieces are all object dtype).
    p = df[labels.index]
    print(n)
    print(p)
    print('===')
data1 data2 key1 key2
0 0.257623 0.81153 a one
1 0.325821 0.78845 b two
------
data1 float64
data2 float64
key1 object
key2 object
dtype: object
------
float64
data1 data2
0 0.257623 0.81153
1 0.325821 0.78845
===
object
key1 key2
0 a one
1 b two
===
通過字典或者Series分組
# Group by a dict or by a Series.
df = pd.DataFrame(np.arange(16).reshape(4, 4),
                  columns=['a', 'b', 'c', 'd'])
print(df)
print('-----')
# Columns a and b map to 'one', c and d map to 'two'; the key 'e' matches no
# column of df.
mapping = {'a': 'one', 'b': 'one', 'c': 'two', 'd': 'two', 'e': 'three'}
# DataFrame.groupby(axis=1) is deprecated since pandas 2.1: group the rows of
# the transposed frame by the mapping, then transpose the aggregate back so
# the result again has one column per group.
by_column = df.T.groupby(mapping)
print(by_column.sum().T)
print('-----')
s = pd.Series(mapping)
print(s, '\n')
# Grouping a Series by its own values counts how often each value occurs.
print(s.groupby(s).count())
'''??????'''
a b c d
0 0 1 2 3
1 4 5 6 7
2 8 9 10 11
3 12 13 14 15
-----
one two
0 1 5
1 9 13
2 17 21
3 25 29
-----
a one
b one
c two
d two
e three
dtype: object
one 2
three 1
two 2
dtype: int64
'??????'
通過函式分組
# Group by a function: the callable is applied to every index label and its
# return value becomes the group key.
cell_values = np.arange(16).reshape(4, 4)
df = pd.DataFrame(cell_values,
                  index=['abc', 'bcd', 'aa', 'b'],
                  columns=['a', 'b', 'c', 'd'])
print(df, '\n')
by_label_length = df.groupby(len)
print(by_label_length.sum())
# Rows are grouped by the length of their index label.
a b c d
abc 0 1 2 3
bcd 4 5 6 7
aa 8 9 10 11
b 12 13 14 15
a b c d
1 12 13 14 15
2 8 9 10 11
3 4 6 8 10
分組中常見的函式
# Common reduction methods available on a GroupBy object.
s = pd.Series([1, 2, 3, 10, 20, 30], index=[1, 2, 3, 1, 2, 3])
# level=0 groups together the entries that share the same index label.
grouped = s.groupby(level=0)
print(grouped)

# (method name, caption printed after the result), in display order.
reductions = [
    ('first', '→ first:非NaN的第一個值\n'),
    ('last', '→ last:非NaN的最後一個值\n'),
    ('sum', '→ sum:非NaN的和\n'),
    ('mean', '→ mean:非NaN的平均值\n'),
    ('median', '→ median:非NaN的算術中位數\n'),
    ('count', '→ count:非NaN的值\n'),
    ('min', '→ min、max:非NaN的最小值、最大值\n'),
    ('std', '→ std,var:非NaN的標準差和方差\n'),
    ('prod', '→ prod:非NaN的積\n'),
]
for method_name, caption in reductions:
    print(getattr(grouped, method_name)(), caption)
<pandas.core.groupby.SeriesGroupBy object at 0x000002AF2F1B7278>
1 1
2 2
3 3
dtype: int64 → first:非NaN的第一個值
1 10
2 20
3 30
dtype: int64 → last:非NaN的最後一個值
1 11
2 22
3 33
dtype: int64 → sum:非NaN的和
1 5.5
2 11.0
3 16.5
dtype: float64 → mean:非NaN的平均值
1 5.5
2 11.0
3 16.5
dtype: float64 → median:非NaN的算術中位數
1 2
2 2
3 2
dtype: int64 → count:非NaN的值
1 1
2 2
3 3
dtype: int64 → min、max:非NaN的最小值、最大值
1 6.363961
2 12.727922
3 19.091883
dtype: float64 → std,var:非NaN的標準差和方差
1 10
2 40
3 90
dtype: int64 → prod:非NaN的積
多函式計算
# Apply several aggregation functions at once with agg().
df = pd.DataFrame({'a': [1, 1, 2, 2],
                   'b': np.random.randint(100, size=4),
                   'c': np.random.randint(100, size=4),
                   'd': np.random.randint(100, size=4)})
print(df)
# A list of function names yields one sub-column per function under every
# remaining column. The string 'sum' is used rather than the builtin sum,
# which newer pandas versions warn about.
multi_stats = df.groupby('a').agg(['mean', 'sum'])
print(multi_stats)
# Passing a dict to agg() on a grouped Series was deprecated and later removed.
# Named aggregation (output_name=function) gives the same result columns.
b_stats = df.groupby('a')['b'].agg(mean='mean', sum='sum')
print(b_stats)
# Functions can be given by name (str) or as callables.
# With named aggregation each keyword becomes a column of the result.
a b c d
0 1 47 0 61
1 1 83 52 2
2 2 54 77 87
3 2 52 99 97
b c d
mean sum mean sum mean sum
a
1 65 130 26 52 31.5 63
2 53 106 88 176 92.0 184
mean sum
a
1 65 130
2 53 106
F:\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
# Remove the CWD from sys.path while we load stuff.
分組轉換
資料分組轉換 transform
# Group-wise transformation with transform().
df = pd.DataFrame({'data1': np.random.randint(100, size=5),
                   'data2': np.random.randint(100, size=5),
                   'key1': list('aabba'),
                   'key2': ['one', 'two', 'one', 'two', 'one']})
# numeric_only=True keeps the string column key2 out of the mean; pandas >= 2.0
# raises a TypeError instead of silently dropping it.
k_mean = df.groupby('key1').mean(numeric_only=True)
print(df)
print(k_mean)
# add_prefix('mean_') prefixes every column label of the merged frame.
print(pd.merge(df, k_mean, left_on='key1', right_index=True).add_prefix('mean_'))
print('============')
# Grouping followed by a merge gives a frame that carries the group means.
key2_mean = df.groupby('key2').mean(numeric_only=True)
print(key2_mean)  # mean of each numeric column per key2 group
# transform() returns a result aligned with the original rows: each cell of
# data1/data2 is replaced by the mean of its key2 group. The numeric columns
# are selected explicitly because strings cannot be averaged.
key2_broadcast = df.groupby('key2')[['data1', 'data2']].transform('mean')
print(key2_broadcast)
data1 data2 key1 key2
0 7 98 a one
1 77 3 a two
2 50 73 b one
3 74 23 b two
4 21 9 a one
data1 data2
key1
a 35.0 36.666667
b 62.0 48.000000
mean_data1_x mean_data2_x mean_key1 mean_key2 mean_data1_y mean_data2_y
0 7 98 a one 35.0 36.666667
1 77 3 a two 35.0 36.666667
4 21 9 a one 35.0 36.666667
2 50 73 b one 62.0 48.000000
3 74 23 b two 62.0 48.000000
============
data1 data2
key2
one 26.0 60.0
two 75.5 13.0
data1 data2
0 26.0 60
1 75.5 13
2 26.0 60
3 75.5 13
4 26.0 60
一般化Groupby方法:apply
# General-purpose GroupBy method: apply().
df = pd.DataFrame({
    'data1': np.random.randint(100, size=5),
    'data2': np.random.randint(100, size=5),
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
})
# apply() calls the given function once per group; here an anonymous function
# produces descriptive statistics for each group.
per_group_summary = df.groupby('key1').apply(lambda group: group.describe())
print(per_group_summary)
print('=========================')
def f_df1(d, n):
    """Return the first ``n`` rows of ``d`` after sorting it by its index."""
    return d.sort_index()[:n]
def f_df2(d, k1):
    """Return the item of ``d`` selected by key ``k1`` (i.e. ``d[k1]``)."""
    return d[k1]
by_key1 = df.groupby('key1')
print(by_key1.apply(f_df1, 2), '\n')
data2_per_group = by_key1.apply(f_df2, 'data2')
print(data2_per_group)
print(type(data2_per_group))
# f_df1 returns the first n rows of each group after sorting by index.
# f_df2 returns column k1 of each group; the combined result is a Series with
# a hierarchical index.
# apply() invokes the given function directly on every group.
# Extra arguments are written after the function and may also be passed by
# keyword, e.g. .apply(f_df1, n=2).
data1 data2
key1
a count 3.000000 3.000000
mean 39.666667 47.333333
std 45.566801 33.306656
min 4.000000 10.000000
25% 14.000000 34.000000
50% 24.000000 58.000000
75% 57.500000 66.000000
max 91.000000 74.000000
b count 2.000000 2.000000
mean 25.500000 18.500000
std 3.535534 16.263456
min 23.000000 7.000000
25% 24.250000 12.750000
50% 25.500000 18.500000
75% 26.750000 24.250000
max 28.000000 30.000000
=========================
data1 data2 key1 key2
key1
a 0 4 10 a one
1 91 58 a two
b 2 28 7 b one
3 23 30 b two
key1
a 0 10
1 58
4 74
b 2 7
3 30
Name: data2, dtype: int32
<class 'pandas.core.series.Series'>