python學習筆記(4)-理論:資料分析工具Pandas
阿新 • • 發佈:2020-07-10
Pandas数据结构¶
In[487]:import pandas as pd %matplotlib inline
- ## Series
# 通過list構建Series ser_obj = pd.Series(range(10, 20)) print(type(ser_obj))
<class 'pandas.core.series.Series'>In[489]:
# 獲取資料 print(ser_obj.values) # 獲取索引 print(ser_obj.index)
[10 11 12 13 14 15 16 17 18 19] RangeIndex(start=0, stop=10, step=1)In[490]:
# 預覽資料 print(ser_obj.head(3))
0 10 1 11 2 12 dtype: int64In[491]:
print(ser_obj)
0 10 1 11 2 12 3 13 4 14 5 15 6 16 7 17 8 18 9 19 dtype: int64In[492]:
# 通過索引獲取資料 print(ser_obj[0]) print(ser_obj[8])
10 18In[493]:
# 索引與資料的對應關係仍保持在陣列運算的結果中 print(ser_obj * 2) print(ser_obj > 15) print(ser_obj[ser_obj > 15])
0 20 1 22 2 24 3 26 4 28 5 30 6 32 7 34 8 36 9 38 dtype: int64 0 False 1 False 2 False 3 False 4 False 5 False 6 True 7 True 8 True 9 True dtype: bool 6 16 7 17 8 18 9 19 dtype: int64In[494]:
# 通過dict構建Series year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5} ser_obj2 = pd.Series(year_data) print(ser_obj2.head()) print(ser_obj2.index)
2001 17.8 2002 20.1 2003 16.5 dtype: float64 Int64Index([2001, 2002, 2003], dtype='int64')In[495]:
# name屬性 ser_obj2.name = 'temp' ser_obj2.index.name = 'year' print(ser_obj2.head())
year 2001 17.8 2002 20.1 2003 16.5 Name: temp, dtype: float64In[496]:
import numpy as np # 通過ndarray構建DataFrame array = np.random.randn(5,4) print(array) df_obj = pd.DataFrame(array) print(df_obj.head())
[[-0.01886471 -1.40819766 0.1409696 1.23839493] [-0.95234362 0.84017655 0.96431593 1.22662473] [-0.23179682 1.1027172 1.75420058 1.54925205] [-0.2529594 -1.12648376 -0.52386023 -0.80252582] [ 1.44400167 -3.26985176 -1.77381084 0.56538251]] 0 1 2 3 0 -0.018865 -1.408198 0.140970 1.238395 1 -0.952344 0.840177 0.964316 1.226625 2 -0.231797 1.102717 1.754201 1.549252 3 -0.252959 -1.126484 -0.523860 -0.802526 4 1.444002 -3.269852 -1.773811 0.565383In[497]:
# 通過dict構建DataFrame dict_data = {'A': 1., 'B': pd.Timestamp('20161217'), 'C': pd.Series(1, index=list(range(4)), dtype='float32'), 'D': np.array([3] * 4, dtype='int32'), 'E': pd.Categorical(["Python", "Java", "C++", "C#"]), 'F': "China"} # print(dict_data) df_obj2 = pd.DataFrame(dict_data) df_obj2.head()Out[497]:
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1.0 | 2016-12-17 | 1.0 | 3 | Python | China |
1 | 1.0 | 2016-12-17 | 1.0 | 3 | Java | China |
2 | 1.0 | 2016-12-17 | 1.0 | 3 | C++ | China |
3 | 1.0 | 2016-12-17 | 1.0 | 3 | C# | China |
# 通過列索引獲取列資料 print(df_obj2['A']) print(type(df_obj2['A'])) print(df_obj2.A)
0 1.0 1 1.0 2 1.0 3 1.0 Name: A, dtype: float64 <class 'pandas.core.series.Series'> 0 1.0 1 1.0 2 1.0 3 1.0 Name: A, dtype: float64In[499]:
# 增加列 df_obj2['G'] = df_obj2['D'] + 4 print(df_obj2)
A B C D E F G 0 1.0 2016-12-17 1.0 3 Python China 7 1 1.0 2016-12-17 1.0 3 Java China 7 2 1.0 2016-12-17 1.0 3 C++ China 7 3 1.0 2016-12-17 1.0 3 C# China 7In[500]:
# 刪除列 del(df_obj2['G']) print(df_obj2)
A B C D E F 0 1.0 2016-12-17 1.0 3 Python China 1 1.0 2016-12-17 1.0 3 Java China 2 1.0 2016-12-17 1.0 3 C++ China 3 1.0 2016-12-17 1.0 3 C# ChinaIn[501]:
print(type(ser_obj.index)) print(type(df_obj2.index)) print(df_obj2.index)
<class 'pandas.core.indexes.range.RangeIndex'> <class 'pandas.core.indexes.numeric.Int64Index'> Int64Index([0, 1, 2, 3], dtype='int64')In[502]:
# 索引物件不可變 # df_obj2.index[0] = 2
import pandas as pd ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e']) print(ser_obj.head())
a 0 b 1 c 2 d 3 e 4 dtype: int64In[504]:
# 行索引 print(ser_obj['a']) print(ser_obj[0])
0 0In[505]:
# 切片索引 print(ser_obj[1:3]) print(ser_obj['b': 'd'])
b 1 c 2 dtype: int64 b 1 c 2 d 3 dtype: int64In[506]:
# 不連續索引 print(ser_obj[[0, 2, 4]]) print(ser_obj[['a', 'e']])
a 0 c 2 e 4 dtype: int64 a 0 e 4 dtype: int64In[507]:
# 布林索引 ser_bool = ser_obj > 2 print(ser_bool) print(ser_obj[ser_bool]) print(ser_obj[ser_obj > 2])
a False b False c False d True e True dtype: bool d 3 e 4 dtype: int64 d 3 e 4 dtype: int64In[508]:
import numpy as np df_obj = pd.DataFrame(np.random.randn(5, 4), columns = ['a', 'b', 'c', 'd']) print(df_obj.head())
a b c d 0 0.816394 -0.002626 0.514936 0.044557 1 -0.890728 -0.246314 0.097609 -0.687022 2 -0.095511 1.070962 -0.364357 0.056241 3 -1.900075 0.380913 0.740835 -1.336294 4 -0.451464 0.364149 0.596956 -0.073667In[509]:
# 列索引 print('列索引') print(df_obj['a']) # 返回Series型別 # print(type(df_obj[[0]])) # 返回DataFrame型別 會拋KeyError:列名為 'a'~'d',整數 0 不是列標籤 # 不連續索引 print('不連續索引') print(df_obj[['a', 'c']]) # print(df_obj[[1, 3]]) # 同理會拋KeyError,應使用列標籤或 df_obj.iloc[:, [1, 3]]
列索引 0 0.816394 1 -0.890728 2 -0.095511 3 -1.900075 4 -0.451464 Name: a, dtype: float64 不連續索引 a c 0 0.816394 0.514936 1 -0.890728 0.097609 2 -0.095511 -0.364357 3 -1.900075 0.740835 4 -0.451464 0.596956In[510]:
# 標籤索引 loc # Series ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e']) df_obj = pd.DataFrame(np.random.randn(5, 4), columns = ['a', 'b', 'c', 'd']) print(ser_obj['b' : 'd']) print(ser_obj.loc['b' : 'd']) # DataFrame print(df_obj['a']) print(df_obj.loc[0:2, 'a'])
b 1 c 2 d 3 dtype: int64 b 1 c 2 d 3 dtype: int64 0 0.325330 1 0.168478 2 0.298383 3 -1.192005 4 -1.069254 Name: a, dtype: float64 0 0.325330 1 0.168478 2 0.298383 Name: a, dtype: float64In[511]:
# 整型位置索引 iloc print(ser_obj[1:3]) print(ser_obj.iloc[1:3]) # DataFrame print(df_obj.iloc[0:2, 0]) # 注意和df_obj.loc[0:2, 'a']的區別
b 1 c 2 dtype: int64 b 1 c 2 dtype: int64 0 0.325330 1 0.168478 Name: a, dtype: float64In[512]:
# 混合索引 ix(已棄用,pandas 1.0 起已移除,應改用 loc / iloc) print(ser_obj.ix[1:3]) print(ser_obj.ix['b' : 'c']) #DataFrame print(df_obj.ix[0:2, 0])# 先按照標籤索引嘗試操作,然後再按照位置索引嘗試操作
b 1 c 2 dtype: int64 b 1 c 2 dtype: int64 0 0.325330 1 0.168478 2 0.298383 Name: a, dtype: float64
e:\python_lesson\installer\anaconda3\lib\site-packages\ipykernel_launcher.py:2: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated e:\python_lesson\installer\anaconda3\lib\site-packages\ipykernel_launcher.py:6: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecatedIn[513]:
s1 = pd.Series(range(10, 20), index = range(10)) s2 = pd.Series(range(20, 25), index = range(5)) print(s1) print(s2) # Series 對齊運算 print(s1 + s2)
0 10 1 11 2 12 3 13 4 14 5 15 6 16 7 17 8 18 9 19 dtype: int64 0 20 1 21 2 22 3 23 4 24 dtype: int64 0 30.0 1 32.0 2 34.0 3 36.0 4 38.0 5 NaN 6 NaN 7 NaN 8 NaN 9 NaN dtype: float64In[514]:
df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b']) df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c']) print(df1) print(df2) # DataFrame對齊操作 print(df1 + df2)
a b 0 1.0 1.0 1 1.0 1.0 a b c 0 1.0 1.0 1.0 1 1.0 1.0 1.0 2 1.0 1.0 1.0 a b c 0 2.0 2.0 NaN 1 2.0 2.0 NaN 2 NaN NaN NaNIn[515]:
# 填充未對齊的資料進行計算 s1.add(s2, fill_value = -1) df1.sub(df2, fill_value = 2.)Out[515]:
a | b | c | |
---|---|---|---|
0 | 0.0 | 0.0 | 1.0 |
1 | 0.0 | 0.0 | 1.0 |
2 | 1.0 | 1.0 | 1.0 |
# 填充NaN s3 = s1 + s2 print(s3)
0 30.0 1 32.0 2 34.0 3 36.0 4 38.0 5 NaN 6 NaN 7 NaN 8 NaN 9 NaN dtype: float64In[517]:
s3_filled = s3.fillna(-1) print(s3_filled)
0 30.0 1 32.0 2 34.0 3 36.0 4 38.0 5 -1.0 6 -1.0 7 -1.0 8 -1.0 9 -1.0 dtype: float64In[518]:
df3 = df1 + df2 df3Out[518]:
a | b | c | |
---|---|---|---|
0 | 2.0 | 2.0 | NaN |
1 | 2.0 | 2.0 | NaN |
2 | NaN | NaN | NaN |
df3.fillna(100, inplace = True) df3Out[519]:
a | b | c | |
---|---|---|---|
0 | 2.0 | 2.0 | 100.0 |
1 | 2.0 | 2.0 | 100.0 |
2 | 100.0 | 100.0 | 100.0 |
# Numpy ufunc 函式 df = pd.DataFrame(np.random.randn(5, 4) - 1) print(df) print(np.abs(df))
0 1 2 3 0 -2.119619 -1.889218 0.354441 -0.568196 1 0.123669 0.683149 -1.900245 -0.289735 2 -0.883802 1.783550 -1.749171 -0.025582 3 0.071452 0.661371 0.000046 -2.492441 4 -0.054194 -1.471238 -0.750263 -0.210742 0 1 2 3 0 2.119619 1.889218 0.354441 0.568196 1 0.123669 0.683149 1.900245 0.289735 2 0.883802 1.783550 1.749171 0.025582 3 0.071452 0.661371 0.000046 2.492441 4 0.054194 1.471238 0.750263 0.210742In[521]:
# 使用apply應用行或列資料 print(df.apply(lambda x: x.max()))
0 0.123669 1 1.783550 2 0.354441 3 -0.025582 dtype: float64In[522]:
# 指定軸方向 print(df.apply(lambda x: x.max(), axis=1))
0 0.354441 1 0.683149 2 1.783550 3 0.661371 4 -0.054194 dtype: float64In[523]:
# 使用applymap應用到每個資料 print(df.applymap(lambda x: '%.2f' % x))
0 1 2 3 0 -2.12 -1.89 0.35 -0.57 1 0.12 0.68 -1.90 -0.29 2 -0.88 1.78 -1.75 -0.03 3 0.07 0.66 0.00 -2.49 4 -0.05 -1.47 -0.75 -0.21In[524]:
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5)) print(s4)
1 10 3 11 0 12 2 13 1 14 dtype: int64In[525]:
# 索引排序 s4.sort_index()Out[525]:
0 12 1 10 1 14 2 13 3 11 dtype: int64In[526]:
df4 = pd.DataFrame(np.random.randn(3,4), index=np.random.randint(3, size=3), columns=np.random.randint(4, size=4)) df4Out[526]:
0 | 2 | 0 | 0 | |
---|---|---|---|---|
2 | 0.685978 | 0.065535 | -1.904955 | -0.844186 |
2 | -2.321484 | 0.827531 | -1.664262 | -0.736157 |
1 | -0.813704 | -0.588677 | 1.321903 | 0.323762 |
#df4.sort_index(ascending=False) df4.sort_index(axis=1)Out[527]:
0 | 0 | 0 | 2 | |
---|---|---|---|---|
2 | 0.685978 | -1.904955 | -0.844186 | 0.065535 |
2 | -2.321484 | -1.664262 | -0.736157 | 0.827531 |
1 | -0.813704 | 1.321903 | 0.323762 | -0.588677 |
# 按值排序 # df4.sort_values(by=2)In[529]:
df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan], [4., np.nan, np.nan], [1., np.nan, 2.]]) df_dataOut[529]:
0 | 1 | 2 | |
---|---|---|---|
0 | 1.611935 | -0.311622 | 0.670604 |
1 | 1.000000 | NaN | NaN |
2 | 4.000000 | NaN | NaN |
3 | 1.000000 | NaN | 2.000000 |
# isnull df_data.isnull()Out[530]:
0 | 1 | 2 | |
---|---|---|---|
0 | False | False | False |
1 | False | True | True |
2 | False | True | True |
3 | False | True | False |
# dropna df_data.dropna() # df_data.dropna(axis=1)Out[531]:
0 | 1 | 2 | |
---|---|---|---|
0 | 1.611935 | -0.311622 | 0.670604 |
# fillna df_data.fillna(-100.)Out[532]:
0 | 1 | 2 | |
---|---|---|---|
0 | 1.611935 | -0.311622 | 0.670604 |
1 | 1.000000 | -100.000000 | -100.000000 |
2 | 4.000000 | -100.000000 | -100.000000 |
3 | 1.000000 | -100.000000 | 2.000000 |
import numpy as np import pandas as pdIn[534]:
df_obj = pd.DataFrame(np.random.randn(5,4),columns=['a', 'b', 'c', 'd']) df_objOut[534]:
a | b | c | d | |
---|---|---|---|---|
0 | -0.985329 | -0.240711 | 0.735334 | -1.650240 |
1 | 0.933588 | 1.289711 | -1.183634 | 0.539915 |
2 | 0.311238 | 0.925572 | 0.825993 | -0.598073 |
3 | 1.137575 | -0.114538 | 0.244627 | 0.019387 |
4 | 0.541883 | -0.674057 | 0.094889 | -0.753914 |
df_obj.sum()Out[535]:
a 1.938956 b 1.185977 c 0.717209 d -2.442924 dtype: float64In[536]:
df_obj.max()Out[536]:
a 1.137575 b 1.289711 c 0.825993 d 0.539915 dtype: float64In[537]:
df_obj.min(axis=1)Out[537]:
0 -1.650240 1 -1.183634 2 -0.598073 3 -0.114538 4 -0.753914 dtype: float64In[538]:
df_obj.describe()Out[538]:
a | b | c | d | |
---|---|---|---|---|
count | 5.000000 | 5.000000 | 5.000000 | 5.000000 |
mean | 0.387791 | 0.237195 | 0.143442 | -0.488585 |
std | 0.832937 | 0.831286 | 0.804664 | 0.828806 |
min | -0.985329 | -0.674057 | -1.183634 | -1.650240 |
25% | 0.311238 | -0.240711 | 0.094889 | -0.753914 |
50% | 0.541883 | -0.114538 | 0.244627 | -0.598073 |
75% | 0.933588 | 0.925572 | 0.735334 | 0.019387 |
max | 1.137575 | 1.289711 | 0.825993 | 0.539915 |
import pandas as pd import numpy as npIn[540]:
ser_obj = pd.Series(np.random.randn(12), index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]]) print(ser_obj)
a 0 1.651622 1 0.812348 2 0.643199 b 0 -1.540444 1 -0.219316 2 -0.526368 c 0 0.908866 1 0.107464 2 0.693538 d 0 0.521462 1 0.128478 2 0.288094 dtype: float64
- ## MultilIndex索引物件
type(ser_obj.index) ser_obj.indexOut[541]:
MultiIndex(levels=[['a', 'b', 'c', 'd'], [0, 1, 2]], labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
- ## 選取子集
# 外層選取 ser_obj['c']Out[542]:
0 0.908866 1 0.107464 2 0.693538 dtype: float64In[543]:
# 內層選取 ser_obj[:, 2]Out[543]:
a 0.643199 b -0.526368 c 0.693538 d 0.288094 dtype: float64
- ## 交換分層順序
ser_obj.swaplevel()Out[544]:
0 a 1.651622 1 a 0.812348 2 a 0.643199 0 b -1.540444 1 b -0.219316 2 b -0.526368 0 c 0.908866 1 c 0.107464 2 c 0.693538 0 d 0.521462 1 d 0.128478 2 d 0.288094 dtype: float64
- ## 交換並排序分層
ser_obj.swaplevel().sortlevel()
e:\python_lesson\installer\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: sortlevel is deprecated, use sort_index(level=...) """Entry point for launching an IPython kernel.Out[545]:
0 a 1.651622 b -1.540444 c 0.908866 d 0.521462 1 a 0.812348 b -0.219316 c 0.107464 d 0.128478 2 a 0.643199 b -0.526368 c 0.693538 d 0.288094 dtype: float64
- ## GroupBy物件
import pandas as pd import numpy as npIn[547]:
dict_obj = {'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'], 'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'data1': np.random.randint(1,10,(8)), 'data2': np.random.randint(1,10,(8))} df_obj = pd.DataFrame(dict_obj) df_objOut[547]:
data1 | data2 | key1 | key2 | |
---|---|---|---|---|
0 | 3 | 6 | a | one |
1 | 9 | 8 | b | one |
2 | 8 | 5 | a | two |
3 | 9 | 6 | b | three |
4 | 4 | 2 | a | two |
5 | 4 | 1 | b | two |
6 | 8 | 9 | a | one |
7 | 3 | 9 | a | three |
# dataframe 根據key1進行分組 type(df_obj.groupby('key1'))Out[548]:
pandas.core.groupby.DataFrameGroupByIn[549]:
# data1列根據key1進行分組 type(df_obj['data1'].groupby(df_obj['key1']))Out[549]:
pandas.core.groupby.SeriesGroupByIn[550]:
# 分組運算 grouped1 = df_obj.groupby('key1') print(grouped1.mean()) grouped2 = df_obj['data1'].groupby(df_obj['key1']) print(grouped2.mean())
data1 data2 key1 a 5.200000 6.2 b 7.333333 5.0 key1 a 5.200000 b 7.333333 Name: data1, dtype: float64In[551]:
grouped1.size() grouped2.size()Out[551]:
key1 a 5 b 3 Name: data1, dtype: int64In[552]:
# 按列名分組 df_obj.groupby('key1')Out[552]:
<pandas.core.groupby.DataFrameGroupBy object at 0x04AF9890>In[553]:
# 按自定義key分組,列表 self_def_key = [1, 1, 2, 2, 2, 1, 1, 1] df_obj.groupby(self_def_key).size()Out[553]:
1 5 2 3 dtype: int64In[554]:
# 按自定義key分組,多層列表 df_obj.groupby([df_obj['key1'], df_obj['key2']]).size()Out[554]:
key1 key2 a one 2 three 1 two 2 b one 1 three 1 two 1 dtype: int64In[555]:
# 按多個列多層分組 grouped2 = df_obj.groupby(['key1', 'key2']) grouped2.size()Out[555]:
key1 key2 a one 2 three 1 two 2 b one 1 three 1 two 1 dtype: int64In[556]:
# 多層分組按key得順序進行 grouped3 = df_obj.groupby(['key2', 'key1']) grouped3.mean() grouped3.mean().unstack()Out[556]:
data1 | data2 | |||
---|---|---|---|---|
key1 | a | b | a | b |
key2 | ||||
one | 5.5 | 9.0 | 7.5 | 8.0 |
three | 3.0 | 9.0 | 9.0 | 6.0 |
two | 6.0 | 4.0 | 3.5 | 1.0 |
- ## GroupBy物件分組迭代
# 單層分組 for group_name, group_data in grouped1: print(group_name) print(group_data)
a data1 data2 key1 key2 0 3 6 a one 2 8 5 a two 4 4 2 a two 6 8 9 a one 7 3 9 a three b data1 data2 key1 key2 1 9 8 b one 3 9 6 b three 5 4 1 b twoIn[558]:
# 多層分組 for group_name, group_data in grouped2: print(group_name) print(group_data)
('a', 'one') data1 data2 key1 key2 0 3 6 a one 6 8 9 a one ('a', 'three') data1 data2 key1 key2 7 3 9 a three ('a', 'two') data1 data2 key1 key2 2 8 5 a two 4 4 2 a two ('b', 'one') data1 data2 key1 key2 1 9 8 b one ('b', 'three') data1 data2 key1 key2 3 9 6 b three ('b', 'two') data1 data2 key1 key2 5 4 1 b twoIn[559]:
# GroupBy物件轉換list list(grouped1)Out[559]:
[('a', data1 data2 key1 key2 0 3 6 a one 2 8 5 a two 4 4 2 a two 6 8 9 a one 7 3 9 a three), ('b', data1 data2 key1 key2 1 9 8 b one 3 9 6 b three 5 4 1 b two)]In[560]:
# GroupBy物件轉換dict dict(list(grouped1))Out[560]:
{'a': data1 data2 key1 key2 0 3 6 a one 2 8 5 a two 4 4 2 a two 6 8 9 a one 7 3 9 a three, 'b': data1 data2 key1 key2 1 9 8 b one 3 9 6 b three 5 4 1 b two}In[561]:
# print(df_obj.dtypes) #按資料型別分組 # df_obj.groupby(df_obj.dtypes, axis=1).size() df_obj.groupby(df_obj.dtypes, axis=1).sum()Out[561]:
int32 | object | |
---|---|---|
0 | 9 | aone |
1 | 17 | bone |
2 | 13 | atwo |
3 | 15 | bthree |
4 | 6 | atwo |
5 | 5 | btwo |
6 | 17 | aone |
7 | 12 | athree |
- ## 其他分組方法
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)), columns=['a', 'b', 'c', 'd', 'e'], index=['A', 'B', 'C', 'D', 'E']) df_obj2.iloc[1, 1:4] = np.NaN df_obj2Out[562]:
a | b | c | d | e | |
---|---|---|---|---|---|
A | 3 | 4.0 | 8.0 | 3.0 | 3 |
B | 3 | NaN | NaN | NaN | 9 |
C | 7 | 9.0 | 9.0 | 1.0 | 9 |
D | 2 | 9.0 | 8.0 | 6.0 | 2 |
E | 4 | 9.0 | 3.0 | 3.0 | 8 |
# 通過字典分組 mapping_dict = {'a':'Python', 'b':'python', 'c':'java', 'd':'C', 'e':'java'} df_obj2.groupby(mapping_dict, axis=1).size() df_obj2.groupby(mapping_dict, axis=1).count() # 非NaN的個數 df_obj2.groupby(mapping_dict, axis=1).sum()Out[563]:
C | Python | java | python | |
---|---|---|---|---|
A | 3.0 | 3.0 | 11.0 | 4.0 |
B | 0.0 | 3.0 | 9.0 | 0.0 |
C | 1.0 | 7.0 | 18.0 | 9.0 |
D | 6.0 | 2.0 | 10.0 | 9.0 |
E | 3.0 | 4.0 | 11.0 | 9.0 |
# 通過函式分組 df_obj3 = pd.DataFrame(np.random.randint(1, 10, (5,5)), columns=['a', 'b', 'c', 'd', 'e'], index=['AA', 'BBB', 'CC', 'D', 'EE']) df_obj3Out[564]:
a | b | c | d | e | |
---|---|---|---|---|---|
AA | 3 | 1 | 9 | 7 | 5 |
BBB | 7 | 6 | 7 | 9 | 9 |
CC | 6 | 2 | 3 | 3 | 4 |
D | 7 | 1 | 3 | 9 | 7 |
EE | 9 | 6 | 4 | 4 | 9 |
def group_key(idx): """ idx 為列索引或行索引 """ #return idx return len(idx) df_obj3.groupby(group_key).size() # 以上自定義函式等價於 #df_obj3.groupby(len).size()Out[565]:
1 1 2 3 3 1 dtype: int64In[566]:
# 通過索引級別分組 columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'], ['A', 'A', 'B', 'C', 'B']], names=['language', 'index']) df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5,5)), columns=columns) df_obj4Out[566]:
language | Python | Java | Python | Java | Python |
---|---|---|---|---|---|
index | A | A | B | C | B |
0 | 2 | 2 | 9 | 1 | 6 |
1 | 4 | 6 | 2 | 6 | 5 |
2 | 7 | 3 | 5 | 3 | 8 |
3 | 4 | 8 | 4 | 1 | 3 |
4 | 6 | 2 | 9 | 9 | 2 |
# 根據language進行分組 df_obj4.groupby(level='language', axis=1).sum() df_obj4.groupby(level='index', axis=1).sum()Out[567]:
index | A | B | C |
---|---|---|---|
0 | 4 | 15 | 1 |
1 | 10 | 7 | 6 |
2 | 10 | 13 | 3 |
3 | 12 | 7 | 1 |
4 | 8 | 11 | 9 |
- ## 聚合
import pandas as pd import numpy as np dict_obj = {'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'], 'key2': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'data1': np.random.randint(1,10,8), 'data2': np.random.randint(1,10,8)} df_obj5 = pd.DataFrame(dict_obj) df_obj5Out[568]:
data1 | data2 | key1 | key2 | |
---|---|---|---|---|
0 | 4 | 6 | a | one |
1 | 6 | 4 | b | one |
2 | 8 | 7 | a | two |
3 | 4 | 9 | b | three |
4 | 3 | 6 | a | two |
5 | 6 | 3 | b | two |
6 | 4 | 2 | a | one |
7 | 5 | 6 | a | three |
# 內建的聚合函式 print(df_obj5.groupby('key1').sum()) print(df_obj5.groupby('key1').max()) print(df_obj5.groupby('key1').min()) print(df_obj5.groupby('key1').mean()) print(df_obj5.groupby('key1').size()) print(df_obj5.groupby('key1').count()) print(df_obj5.groupby('key1').describe())
data1 data2 key1 a 24 27 b 16 16 data1 data2 key2 key1 a 8 7 two b 6 9 two data1 data2 key2 key1 a 3 2 one b 4 3 one data1 data2 key1 a 4.800000 5.400000 b 5.333333 5.333333 key1 a 5 b 3 dtype: int64 data1 data2 key2 key1 a 5 5 5 b 3 3 3 data1 data2 \ count mean std min 25% 50% 75% max count mean key1 a 5.0 4.800000 1.923538 3.0 4.0 4.0 5.0 8.0 5.0 5.400000 b 3.0 5.333333 1.154701 4.0 5.0 6.0 6.0 6.0 3.0 5.333333 std min 25% 50% 75% max key1 a 1.949359 2.0 6.0 6.0 6.0 7.0 b 3.214550 3.0 3.5 4.0 6.5 9.0In[570]:
# 自定義聚合函式 def peak_range(df): """ 返回數值範圍 """ #print(type(df)) #引數為索引所對應的記錄 return df.max() - df.min() print(df_obj5.groupby('key1').agg(peak_range)) print(df_obj5.groupby('key1').agg(lambda df: df.max() - df.min()))
data1 data2 key1 a 5 5 b 2 6 data1 data2 key1 a 5 5 b 2 6In[571]:
# 應用多個聚合函式 #同時應用多個聚合函式 print(df_obj5.groupby('key1').agg(['mean', 'std', 'count', peak_range])) # 預設列名為函式名
data1 data2 mean std count peak_range mean std count peak_range key1 a 4.800000 1.923538 5 5 5.400000 1.949359 5 5 b 5.333333 1.154701 3 2 5.333333 3.214550 3 6In[572]:
print(df_obj5.groupby('key1').agg(['mean', 'std', 'count', ('range', peak_range)]))
data1 data2 mean std count range mean std count range key1 a 4.800000 1.923538 5 5 5.400000 1.949359 5 5 b 5.333333 1.154701 3 2 5.333333 3.214550 3 6In[573]:
# 每列作用不同的聚合函式 dict_mapping = {'data1':'mean', 'data2':'sum'} print(df_obj5.groupby('key1').agg(dict_mapping))
data1 data2 key1 a 4.800000 27 b 5.333333 16
数据分组运算¶
In[574]:import pandas as pd import numpy as np # 分組運算後保持shape dict_obj = {'key1' : ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'], 'key2' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'data1' : np.random.randint(1, 10, 8), 'data2' : np.random.randint(1, 10, 8)} df_obj = pd.DataFrame(dict_obj) df_objOut[574]:
data1 | data2 | key1 | key2 | |
---|---|---|---|---|
0 | 5 | 3 | a | one |
1 | 9 | 3 | b | one |
2 | 7 | 1 | a | two |
3 | 2 | 9 | b | three |
4 | 1 | 6 | a | two |
5 | 5 | 7 | b | two |
6 | 5 | 7 | a | one |
7 | 1 | 1 | a | three |
# 按key1分組後,計算data1,data2的統計資訊並附加到原始表格中 k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_') k1_sumOut[575]:
sum_data1 | sum_data2 | |
---|---|---|
key1 | ||
a | 19 | 18 |
b | 16 | 19 |
# 方法1,使用merge pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)Out[576]:
data1 | data2 | key1 | key2 | sum_data1 | sum_data2 | |
---|---|---|---|---|---|---|
0 | 5 | 3 | a | one | 19 | 18 |
2 | 7 | 1 | a | two | 19 | 18 |
4 | 1 | 6 | a | two | 19 | 18 |
6 | 5 | 7 | a | one | 19 | 18 |
7 | 1 | 1 | a | three | 19 | 18 |
1 | 9 | 3 | b | one | 16 | 19 |
3 | 2 | 9 | b | three | 16 | 19 |
5 | 5 | 7 | b | two | 16 | 19 |
- ## transform方法
# 方法2,使用transform k1_sum_tf = df_obj.groupby('key1').transform(np.sum).add_prefix('sum_') df_obj[k1_sum_tf.columns] = k1_sum_tf df_objOut[577]:
data1 | data2 | key1 | key2 | sum_data1 | sum_data2 | sum_key2 | |
---|---|---|---|---|---|---|---|
0 | 5 | 3 | a | one | 19 | 18 | onetwotwoonethree |
1 | 9 | 3 | b | one | 16 | 19 | onethreetwo |
2 | 7 | 1 | a | two | 19 | 18 | onetwotwoonethree |
3 | 2 | 9 | b | three | 16 | 19 | onethreetwo |
4 | 1 | 6 | a | two | 19 | 18 | onetwotwoonethree |
5 | 5 | 7 | b | two | 16 | 19 | onethreetwo |
6 | 5 | 7 | a | one | 19 | 18 | onetwotwoonethree |
7 | 1 | 1 | a | three | 19 | 18 | onetwotwoonethree |
# 自定義函式傳入transform def diff_mean(s): """ 返回資料與均值的差值 """ return s - s.mean() # df_obj.groupby('key1').transform(diff_mean)In[579]:
dataset_path = './startcraft.csv' # df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek', 'TotalHours', 'APM'])In[580]:
import pandas as pd import numpy as np df_obj1 = pd.DataFrame({'key' : ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1' : np.random.randint(0, 10, 7)}) df_obj2 = pd.DataFrame({'key' : ['a', 'b', 'd'], 'data2' : np.random.randint(0, 10, 3)}) print(df_obj1) print(df_obj2)
data1 key 0 4 b 1 4 b 2 4 a 3 5 c 4 2 a 5 8 a 6 8 b data2 key 0 3 a 1 2 b 2 4 dIn[581]:
# 預設將重疊列的列名作為"外來鍵"進行連線 pd.merge(df_obj1, df_obj2)Out[581]:
data1 | key | data2 | |
---|---|---|---|
0 | 4 | b | 2 |
1 | 4 | b | 2 |
2 | 8 | b | 2 |
3 | 4 | a | 3 |
4 | 2 | a | 3 |
5 | 8 | a | 3 |
# on 顯示指定"外來鍵" pd.merge(df_obj1, df_obj2, on='key')Out[582]:
data1 | key | data2 | |
---|---|---|---|
0 | 4 | b | 2 |
1 | 4 | b | 2 |
2 | 8 | b | 2 |
3 | 4 | a | 3 |
4 | 2 | a | 3 |
5 | 8 | a | 3 |
# left_on, right_on分別指定左側資料和右側資料的"外來鍵" # 更改列名 df_obj1 = df_obj1.rename(columns={'key':'key1'}) df_obj2 = df_obj2.rename(columns={'key':'key2'})In[584]:
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')Out[584]:
data1 | key1 | data2 | key2 | |
---|---|---|---|---|
0 | 4 | b | 2 | b |
1 | 4 | b | 2 | b |
2 | 8 | b | 2 | b |
3 | 4 | a | 3 | a |
4 | 2 | a | 3 | a |
5 | 8 | a | 3 | a |
# "外連線" pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')Out[585]:
data1 | key1 | data2 | key2 | |
---|---|---|---|---|
0 | 4.0 | b | 2.0 | b |
1 | 4.0 | b | 2.0 | b |
2 | 8.0 | b | 2.0 | b |
3 | 4.0 | a | 3.0 | a |
4 | 2.0 | a | 3.0 | a |
5 | 8.0 | a | 3.0 | a |
6 | 5.0 | c | NaN | NaN |
7 | NaN | NaN | 4.0 | d |
# "左連線" pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')Out[586]:
data1 | key1 | data2 | key2 | |
---|---|---|---|---|
0 | 4 | b | 2.0 | b |
1 | 4 | b | 2.0 | b |
2 | 4 | a | 3.0 | a |
3 | 5 | c | NaN | NaN |
4 | 2 | a | 3.0 | a |
5 | 8 | a | 3.0 | a |
6 | 8 | b | 2.0 | b |
# "右連線" pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')Out[587]:
data1 | key1 | data2 | key2 | |
---|---|---|---|---|
0 | 4.0 | b | 2 | b |
1 | 4.0 | b | 2 | b |
2 | 8.0 | b | 2 | b |
3 | 4.0 | a | 3 | a |
4 | 2.0 | a | 3 | a |
5 | 8.0 | a | 3 | a |
6 | NaN | NaN | 4 | d |
import pandas as pd import numpy as np
- ## stack
df_obj = pd.DataFrame(np.random.randint(0,10,(5,2)), columns=['data1', 'data2']) df_objOut[589]:
data1 | data2 | |
---|---|---|
0 | 6 | 5 |
1 | 3 | 8 |
2 | 1 | 8 |
3 | 0 | 9 |
4 | 5 | 3 |
stacked = df_obj.stack() stackedOut[590]:
0 data1 6 data2 5 1 data1 3 data2 8 2 data1 1 data2 8 3 data1 0 data2 9 4 data1 5 data2 3 dtype: int32In[591]:
print(type(stacked)) print(stacked.index)
<class 'pandas.core.series.Series'> MultiIndex(levels=[[0, 1, 2, 3, 4], ['data1', 'data2']], labels=[[0, 0, 1, 1, 2, 2, 3, 3, 4, 4], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])In[592]:
# 預設操作內層索引 stacked.unstack()Out[592]:
data1 | data2 | |
---|---|---|
0 | 6 | 5 |
1 | 3 | 8 |
2 | 1 | 8 |
3 | 0 | 9 |
4 | 5 | 3 |
- ## 例項
# kmeans_tool.py
import math
import random


class Cluster(object):
    """A cluster of Sample points with a cached centroid."""

    def __init__(self, samples):
        """samples: non-empty list of Sample objects, all with the same dimension."""
        if len(samples) == 0:
            # A cluster must contain at least one point to have a centroid
            raise Exception("錯誤:一個空的聚類!")
        # Sample points belonging to this cluster
        self.samples = samples
        # Dimensionality, taken from the first point
        self.n_dim = samples[0].n_dim
        # Every point in the cluster must share the same dimensionality
        for sample in samples:
            if sample.n_dim != self.n_dim:
                raise Exception("錯誤,聚類中樣本點的維度不一致!")
        # Initial centroid computed from the given points
        self.centroid = self.cal_centroid()

    def __repr__(self):
        return str(self.samples)

    def update(self, samples):
        """Replace the cluster's points and return how far the centroid moved.

        Fix: if `samples` is empty (a cluster lost every point during a
        k-means iteration), keep the previous centroid instead of dividing
        by zero inside cal_centroid; the reported shift is then 0.0.
        """
        old_centroid = self.centroid
        self.samples = samples
        if samples:
            self.centroid = self.cal_centroid()
        return get_distance(old_centroid, self.centroid)

    def cal_centroid(self):
        """Return the mean point (centroid) of the cluster's samples."""
        n_samples = len(self.samples)
        coords = [sample.coords for sample in self.samples]
        # Transpose so we can iterate one dimension's values at a time
        unzipped = zip(*coords)
        # math.fsum gives an accurate floating-point sum per dimension
        centroid_coords = [math.fsum(d_list) / n_samples for d_list in unzipped]
        return Sample(centroid_coords)


class Sample(object):
    """A point in n-dimensional space."""

    def __init__(self, coords):
        self.coords = coords       # coordinate list of the point
        self.n_dim = len(coords)   # dimensionality of the point

    def __repr__(self):
        """Show the point's coordinates."""
        return str(self.coords)


def get_distance(a, b):
    """Return the Euclidean distance between samples a and b.

    Raises Exception when a and b have different dimensionality.
    """
    if a.n_dim != b.n_dim:
        raise Exception("錯誤,樣本點維度不同,無法計算距離!")
    acc_diff = 0.0
    for i in range(a.n_dim):
        square_diff = pow((a.coords[i] - b.coords[i]), 2)
        acc_diff += square_diff
    return math.sqrt(acc_diff)


def gen_random_sample(n_dim, lower, upper):
    """Return a Sample with n_dim coordinates drawn uniformly from [lower, upper]."""
    return Sample([random.uniform(lower, upper) for _ in range(n_dim)])
# main.py
import random
# from kmeans_tools import Cluster, get_distance, gen_random_sample
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors


def kmeans(samples, k, cutoff):
    """Run Lloyd's k-means algorithm.

    samples: list of Sample points
    k:       number of clusters
    cutoff:  convergence threshold — stop once no centroid moves farther
             than this during one iteration
    Returns the list of k Cluster objects.
    """
    # Pick k distinct samples as the initial cluster centers
    init_samples = random.sample(samples, k)
    clusters = [Cluster([sample]) for sample in init_samples]

    n_loop = 0
    while True:
        # One empty bucket per cluster for this iteration's assignments
        lists = [[] for _ in clusters]
        n_loop += 1
        # Assign each sample to the cluster with the nearest centroid
        for sample in samples:
            smallest_distance = get_distance(sample, clusters[0].centroid)
            cluster_index = 0
            for i in range(k - 1):
                distance = get_distance(sample, clusters[i + 1].centroid)
                if distance < smallest_distance:
                    smallest_distance = distance
                    cluster_index = i + 1
            lists[cluster_index].append(sample)

        # Recompute every centroid and track the largest movement
        biggest_shift = 0.0
        for i in range(k):
            shift = clusters[i].update(lists[i])
            biggest_shift = max(biggest_shift, shift)

        # Converged: no centroid moved more than the threshold
        if biggest_shift < cutoff:
            # print("第{}次迭代後,聚類穩定。".format(n_loop))
            break
    return clusters


def run_main():
    """Generate random 2-D samples, cluster them with k-means, plot the result."""
    n_samples = 1000   # number of samples
    n_feat = 2         # feature dimensionality
    lower = 0          # lower bound of feature values
    upper = 200        # upper bound of feature values
    n_cluster = 3      # number of clusters
    cutoff = 0.2       # convergence threshold

    samples = [gen_random_sample(n_feat, lower, upper) for _ in range(n_samples)]
    clusters = kmeans(samples, n_cluster, cutoff)

    # 輸出結果
    # for i, c in enumerate(clusters):
    #     for sample in c.samples:
    #         print('聚類--{},樣本點--{}'.format(i, sample))

    # Visualize: one named color per cluster.
    # (Removed a stray no-op `random.choice` statement that was never called.)
    plt.subplot()
    color_names = list(mcolors.cnames)
    for i, c in enumerate(clusters):
        x = [sample.coords[0] for sample in c.samples]
        y = [sample.coords[1] for sample in c.samples]
        color = [color_names[i]] * len(c.samples)
        plt.scatter(x, y, c=color)
    plt.show()


if __name__ == '__main__':
    run_main()