1. 程式人生 > 實用技巧 >python學習筆記(4)-理論:資料分析工具Pandas

python學習筆記(4)-理論:資料分析工具Pandas

python學習筆記(4)-理論:資料分析工具Pandas





Pandas数据结构

In[487]:
import pandas as pd
%matplotlib inline
  • ## Series
In[488]:
# 通過list構建Series
ser_obj = pd.Series(range(10, 20))
print(type(ser_obj))
<class 'pandas.core.series.Series'>
In[489]:
# 獲取資料
print(ser_obj.values)

# 獲取索引
print(ser_obj.index)
[10 11 12 13 14 15 16 17 18 19]
RangeIndex(start=0, stop=10, step=1)
In[490]:
# 預覽資料
print(ser_obj.head(3))
0    10
1    11
2    12
dtype: int64
In[491]:
print(ser_obj)
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64

In[492]:
# 通過索引獲取資料
print(ser_obj[0])
print(ser_obj[8])
10
18
In[493]:
# 索引與資料的對應關係仍保持在陣列運算的結果中
print(ser_obj * 2)
print(ser_obj > 15)
print(ser_obj[ser_obj > 15])
0    20
1    22
2    24
3    26
4    28
5    30
6    32
7    34
8    36
9    38
dtype: int64
0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool
6    16
7    17
8    18
9    19
dtype: int64
In[494]:
# 通過dict構建Series
year_data = {2001: 17.8, 2002: 20.1, 2003: 16.5}
ser_obj2 = pd.Series(year_data)
print(ser_obj2.head())
print(ser_obj2.index)
2001    17.8
2002    20.1
2003    16.5
dtype: float64
Int64Index([2001, 2002, 2003], dtype='int64')
In[495]:
# name屬性
ser_obj2.name = 'temp'
ser_obj2.index.name = 'year'
print(ser_obj2.head())
year
2001    17.8
2002    20.1
2003    16.5
Name: temp, dtype: float64

In[496]:
import numpy as np

# 通過ndarray構建DataFrame
array = np.random.randn(5,4)
print(array)

df_obj = pd.DataFrame(array)
print(df_obj.head())
[[-0.01886471 -1.40819766  0.1409696   1.23839493]
 [-0.95234362  0.84017655  0.96431593  1.22662473]
 [-0.23179682  1.1027172   1.75420058  1.54925205]
 [-0.2529594  -1.12648376 -0.52386023 -0.80252582]
 [ 1.44400167 -3.26985176 -1.77381084  0.56538251]]
          0         1         2         3
0 -0.018865 -1.408198  0.140970  1.238395
1 -0.952344  0.840177  0.964316  1.226625
2 -0.231797  1.102717  1.754201  1.549252
3 -0.252959 -1.126484 -0.523860 -0.802526
4  1.444002 -3.269852 -1.773811  0.565383
In[497]:
# 通過dict構建DataFrame

dict_data = {'A': 1., 'B': pd.Timestamp('20161217'), 'C': pd.Series(1, index=list(range(4)), dtype='float32'),
             'D': np.array([3] * 4, dtype='int32'), 'E': pd.Categorical(["Python", "Java", "C++", "C#"]),
              'F': "China"}

# print(dict_data)
df_obj2 = pd.DataFrame(dict_data)
df_obj2.head()
Out[497]:
A B C D E F
0 1.0 2016-12-17 1.0 3 Python China
1 1.0 2016-12-17 1.0 3 Java China
2 1.0 2016-12-17 1.0 3 C++ China
3 1.0 2016-12-17 1.0 3 C# China
In[498]:
# 通過列索引獲取列資料
print(df_obj2['A'])
print(type(df_obj2['A']))

print(df_obj2.A)
0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
<class 'pandas.core.series.Series'>
0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64
In[499]:
# 增加列
df_obj2['G'] = df_obj2['D'] + 4
print(df_obj2)
     A          B    C  D       E      F  G
0  1.0 2016-12-17  1.0  3  Python  China  7
1  1.0 2016-12-17  1.0  3    Java  China  7
2  1.0 2016-12-17  1.0  3     C++  China  7
3  1.0 2016-12-17  1.0  3      C#  China  7
In[500]:
# 刪除列
del(df_obj2['G'])
print(df_obj2)
     A          B    C  D       E      F
0  1.0 2016-12-17  1.0  3  Python  China
1  1.0 2016-12-17  1.0  3    Java  China
2  1.0 2016-12-17  1.0  3     C++  China
3  1.0 2016-12-17  1.0  3      C#  China

In[501]:
print(type(ser_obj.index))
print(type(df_obj2.index))

print(df_obj2.index)
<class 'pandas.core.indexes.range.RangeIndex'>
<class 'pandas.core.indexes.numeric.Int64Index'>
Int64Index([0, 1, 2, 3], dtype='int64')
In[502]:
# 索引物件不可變
# df_obj2.index[0] = 2





In[503]:
import pandas as pd
ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
print(ser_obj.head())
a    0
b    1
c    2
d    3
e    4
dtype: int64
In[504]:
# 行索引
print(ser_obj['a'])
print(ser_obj[0])
0
0
In[505]:
#  切片索引
print(ser_obj[1:3])
print(ser_obj['b': 'd'])
b    1
c    2
dtype: int64
b    1
c    2
d    3
dtype: int64
In[506]:
# 不連續索引
print(ser_obj[[0, 2, 4]])
print(ser_obj[['a', 'e']])
a    0
c    2
e    4
dtype: int64
a    0
e    4
dtype: int64
In[507]:
# 布林索引
ser_bool = ser_obj > 2
print(ser_bool)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2])
a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int64
d    3
e    4
dtype: int64

In[508]:
import numpy as np
df_obj = pd.DataFrame(np.random.randn(5, 4), columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())
          a         b         c         d
0  0.816394 -0.002626  0.514936  0.044557
1 -0.890728 -0.246314  0.097609 -0.687022
2 -0.095511  1.070962 -0.364357  0.056241
3 -1.900075  0.380913  0.740835 -1.336294
4 -0.451464  0.364149  0.596956 -0.073667
In[509]:
# 列索引
print('列索引')
print(df_obj['a']) # 返回Series型別
# print(type(df_obj[[0]])) # 返回DataFrame型別;注意:列標籤是字串('a'~'d'),整數 0 不是有效的列標籤,因此會拋出 KeyError

# 不連續索引
print('不連續索引')
print(df_obj[['a', 'c']])
# print(df_obj[[1, 3]]) # 列標籤為字串,整數標籤 1、3 不存在,因此會拋出 KeyError
列索引
0    0.816394
1   -0.890728
2   -0.095511
3   -1.900075
4   -0.451464
Name: a, dtype: float64
不連續索引
          a         c
0  0.816394  0.514936
1 -0.890728  0.097609
2 -0.095511 -0.364357
3 -1.900075  0.740835
4 -0.451464  0.596956

In[510]:
# 標籤索引 loc
# Series
ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
df_obj = pd.DataFrame(np.random.randn(5, 4), columns = ['a', 'b', 'c', 'd'])
print(ser_obj['b' : 'd'])
print(ser_obj.loc['b' : 'd'])

# DataFrame
print(df_obj['a'])
print(df_obj.loc[0:2, 'a'])
b    1
c    2
d    3
dtype: int64
b    1
c    2
d    3
dtype: int64
0    0.325330
1    0.168478
2    0.298383
3   -1.192005
4   -1.069254
Name: a, dtype: float64
0    0.325330
1    0.168478
2    0.298383
Name: a, dtype: float64
In[511]:
# 整形位置索引 iloc
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])

# DataFrame
print(df_obj.iloc[0:2, 0]) # 注意和df_obj.loc[0:2, 'a']的區別
b    1
c    2
dtype: int64
b    1
c    2
dtype: int64
0    0.325330
1    0.168478
Name: a, dtype: float64
In[512]:
# 混合索引 ix
print(ser_obj.ix[1:3])
print(ser_obj.ix['b' : 'c'])

#DataFrame
print(df_obj.ix[0:2, 0])# 先按照標籤索引嘗試操作,然後再按照位置索引嘗試操作
b    1
c    2
dtype: int64
b    1
c    2
dtype: int64
0    0.325330
1    0.168478
2    0.298383
Name: a, dtype: float64
e:\python_lesson\installer\anaconda3\lib\site-packages\ipykernel_launcher.py:2: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  
e:\python_lesson\installer\anaconda3\lib\site-packages\ipykernel_launcher.py:6: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  

In[513]:
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))
print(s1)
print(s2)

# Series 對齊運算
print(s1 + s2)
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int64
0    20
1    21
2    22
3    23
4    24
dtype: int64
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64
In[514]:
df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])

print(df1)
print(df2)

# DataFrame對齊操作
print(df1 + df2)
     a    b
0  1.0  1.0
1  1.0  1.0
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0
     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN
In[515]:
# 填充未對齊的資料進行計算

s1.add(s2, fill_value = -1)
df1.sub(df2, fill_value = 2.)
Out[515]:
a b c
0 0.0 0.0 1.0
1 0.0 0.0 1.0
2 1.0 1.0 1.0
In[516]:
# 填充NaN
s3 = s1 + s2
print(s3)
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64
In[517]:
s3_filled = s3.fillna(-1)
print(s3_filled)
0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64
In[518]:
df3 = df1 + df2
df3
Out[518]:
a b c
0 2.0 2.0 NaN
1 2.0 2.0 NaN
2 NaN NaN NaN
In[519]:
df3.fillna(100, inplace = True)
df3
Out[519]:
a b c
0 2.0 2.0 100.0
1 2.0 2.0 100.0
2 100.0 100.0 100.0

In[520]:
# Numpy ufunc 函式
df = pd.DataFrame(np.random.randn(5, 4) - 1)
print(df)

print(np.abs(df))
          0         1         2         3
0 -2.119619 -1.889218  0.354441 -0.568196
1  0.123669  0.683149 -1.900245 -0.289735
2 -0.883802  1.783550 -1.749171 -0.025582
3  0.071452  0.661371  0.000046 -2.492441
4 -0.054194 -1.471238 -0.750263 -0.210742
          0         1         2         3
0  2.119619  1.889218  0.354441  0.568196
1  0.123669  0.683149  1.900245  0.289735
2  0.883802  1.783550  1.749171  0.025582
3  0.071452  0.661371  0.000046  2.492441
4  0.054194  1.471238  0.750263  0.210742
In[521]:
# 使用apply應用行或列資料
print(df.apply(lambda x: x.max()))
0    0.123669
1    1.783550
2    0.354441
3   -0.025582
dtype: float64
In[522]:
# 指定軸方向
print(df.apply(lambda x: x.max(), axis=1))
0    0.354441
1    0.683149
2    1.783550
3    0.661371
4   -0.054194
dtype: float64
In[523]:
# 使用applymap應用到每個資料
print(df.applymap(lambda x: '%.2f' % x))
       0      1      2      3
0  -2.12  -1.89   0.35  -0.57
1   0.12   0.68  -1.90  -0.29
2  -0.88   1.78  -1.75  -0.03
3   0.07   0.66   0.00  -2.49
4  -0.05  -1.47  -0.75  -0.21

In[524]:
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5))
print(s4)
1    10
3    11
0    12
2    13
1    14
dtype: int64
In[525]:
# 索引排序
s4.sort_index()
Out[525]:
0    12
1    10
1    14
2    13
3    11
dtype: int64
In[526]:
df4 = pd.DataFrame(np.random.randn(3,4), 
                   index=np.random.randint(3, size=3),
                   columns=np.random.randint(4, size=4))
df4
Out[526]:
0 2 0 0
2 0.685978 0.065535 -1.904955 -0.844186
2 -2.321484 0.827531 -1.664262 -0.736157
1 -0.813704 -0.588677 1.321903 0.323762
In[527]:
#df4.sort_index(ascending=False)
df4.sort_index(axis=1)
Out[527]:
0 0 0 2
2 0.685978 -1.904955 -0.844186 0.065535
2 -2.321484 -1.664262 -0.736157 0.827531
1 -0.813704 1.321903 0.323762 -0.588677
In[528]:
# 按值排序
# df4.sort_values(by=2)

In[529]:
df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan],
                        [4., np.nan, np.nan], [1., np.nan, 2.]])
df_data
Out[529]:
0 1 2
0 1.611935 -0.311622 0.670604
1 1.000000 NaN NaN
2 4.000000 NaN NaN
3 1.000000 NaN 2.000000
In[530]:
# isnull
df_data.isnull()
Out[530]:
0 1 2
0 False False False
1 False True True
2 False True True
3 False True False
In[531]:
# dropna
df_data.dropna()
# df_data.dropna(axis=1)
Out[531]:
0 1 2
0 1.611935 -0.311622 0.670604
In[532]:
# fillna
df_data.fillna(-100.)
Out[532]:
0 1 2
0 1.611935 -0.311622 0.670604
1 1.000000 -100.000000 -100.000000
2 4.000000 -100.000000 -100.000000
3 1.000000 -100.000000 2.000000





In[533]:
import numpy as np
import pandas as pd
In[534]:
df_obj = pd.DataFrame(np.random.randn(5,4),columns=['a', 'b', 'c', 'd'])
df_obj
Out[534]:
a b c d
0 -0.985329 -0.240711 0.735334 -1.650240
1 0.933588 1.289711 -1.183634 0.539915
2 0.311238 0.925572 0.825993 -0.598073
3 1.137575 -0.114538 0.244627 0.019387
4 0.541883 -0.674057 0.094889 -0.753914
In[535]:
df_obj.sum()
Out[535]:
a    1.938956
b    1.185977
c    0.717209
d   -2.442924
dtype: float64
In[536]:
df_obj.max()
Out[536]:
a    1.137575
b    1.289711
c    0.825993
d    0.539915
dtype: float64
In[537]:
df_obj.min(axis=1)
Out[537]:
0   -1.650240
1   -1.183634
2   -0.598073
3   -0.114538
4   -0.753914
dtype: float64
In[538]:
df_obj.describe()
Out[538]:
a b c d
count 5.000000 5.000000 5.000000 5.000000
mean 0.387791 0.237195 0.143442 -0.488585
std 0.832937 0.831286 0.804664 0.828806
min -0.985329 -0.674057 -1.183634 -1.650240
25% 0.311238 -0.240711 0.094889 -0.753914
50% 0.541883 -0.114538 0.244627 -0.598073
75% 0.933588 0.925572 0.735334 0.019387
max 1.137575 1.289711 0.825993 0.539915





In[539]:
import pandas as pd
import numpy as np
In[540]:
ser_obj = pd.Series(np.random.randn(12),
                    index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c', 'd', 'd', 'd'], 
                           [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])

print(ser_obj)
a  0    1.651622
   1    0.812348
   2    0.643199
b  0   -1.540444
   1   -0.219316
   2   -0.526368
c  0    0.908866
   1    0.107464
   2    0.693538
d  0    0.521462
   1    0.128478
   2    0.288094
dtype: float64
  • ## MultiIndex索引物件
In[541]:
type(ser_obj.index)
ser_obj.index
Out[541]:
MultiIndex(levels=[['a', 'b', 'c', 'd'], [0, 1, 2]],
           labels=[[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]])
  • ## 選取子集
In[542]:
# 外層選取
ser_obj['c']
Out[542]:
0    0.908866
1    0.107464
2    0.693538
dtype: float64
In[543]:
# 內層選取
ser_obj[:, 2]
Out[543]:
a    0.643199
b   -0.526368
c    0.693538
d    0.288094
dtype: float64
  • ## 交換分層順序
In[544]:
ser_obj.swaplevel()
Out[544]:
0  a    1.651622
1  a    0.812348
2  a    0.643199
0  b   -1.540444
1  b   -0.219316
2  b   -0.526368
0  c    0.908866
1  c    0.107464
2  c    0.693538
0  d    0.521462
1  d    0.128478
2  d    0.288094
dtype: float64
  • ## 交換並排序分層
In[545]:
ser_obj.swaplevel().sortlevel()
e:\python_lesson\installer\anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: sortlevel is deprecated, use sort_index(level=...)
  """Entry point for launching an IPython kernel.
Out[545]:
0  a    1.651622
   b   -1.540444
   c    0.908866
   d    0.521462
1  a    0.812348
   b   -0.219316
   c    0.107464
   d    0.128478
2  a    0.643199
   b   -0.526368
   c    0.693538
   d    0.288094
dtype: float64





  • ## GroupBy物件
In[546]:
import pandas as pd
import numpy as np
In[547]:
dict_obj = {'key1': ['a', 'b', 'a', 'b', 'a', 'b', 'a', 'a'],
            'key2': ['one', 'one', 'two', 'three',
                     'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1,10,(8)),
            'data2': np.random.randint(1,10,(8))}
df_obj = pd.DataFrame(dict_obj)
df_obj
Out[547]:
data1 data2 key1 key2
0 3 6 a one
1 9 8 b one
2 8 5 a two
3 9 6 b three
4 4 2 a two
5 4 1 b two
6 8 9 a one
7 3 9 a three
In[548]:
# dataframe 根據key1進行分組
type(df_obj.groupby('key1'))
Out[548]:
pandas.core.groupby.DataFrameGroupBy
In[549]:
# data1列根據key1進行分組
type(df_obj['data1'].groupby(df_obj['key1']))
Out[549]:
pandas.core.groupby.SeriesGroupBy
In[550]:
# 分組運算
grouped1 = df_obj.groupby('key1')
print(grouped1.mean())

grouped2 = df_obj['data1'].groupby(df_obj['key1'])
print(grouped2.mean())
         data1  data2
key1                 
a     5.200000    6.2
b     7.333333    5.0
key1
a    5.200000
b    7.333333
Name: data1, dtype: float64
In[551]:
grouped1.size()
grouped2.size()
Out[551]:
key1
a    5
b    3
Name: data1, dtype: int64
In[552]:
# 按列名分組
df_obj.groupby('key1')
Out[552]:
<pandas.core.groupby.DataFrameGroupBy object at 0x04AF9890>
In[553]:
# 按自定義key分組,列表
self_def_key = [1, 1, 2, 2, 2, 1, 1, 1]
df_obj.groupby(self_def_key).size()
Out[553]:
1    5
2    3
dtype: int64
In[554]:
# 按自定義key分組,多層列表
df_obj.groupby([df_obj['key1'], df_obj['key2']]).size()
Out[554]:
key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64
In[555]:
# 按多個列多層分組
grouped2 = df_obj.groupby(['key1', 'key2'])
grouped2.size()
Out[555]:
key1  key2 
a     one      2
      three    1
      two      2
b     one      1
      three    1
      two      1
dtype: int64
In[556]:
# 多層分組按key的順序進行
grouped3 = df_obj.groupby(['key2', 'key1'])
grouped3.mean()
grouped3.mean().unstack()
Out[556]:
data1 data2
key1 a b a b
key2
one 5.5 9.0 7.5 8.0
three 3.0 9.0 9.0 6.0
two 6.0 4.0 3.5 1.0

  • ## GroupBy物件分組迭代
In[557]:
# 單層分組
for group_name, group_data in grouped1:
    print(group_name)
    print(group_data)
a
   data1  data2 key1   key2
0      3      6    a    one
2      8      5    a    two
4      4      2    a    two
6      8      9    a    one
7      3      9    a  three
b
   data1  data2 key1   key2
1      9      8    b    one
3      9      6    b  three
5      4      1    b    two
In[558]:
# 多層分組
for group_name, group_data in grouped2:
    print(group_name)
    print(group_data)
('a', 'one')
   data1  data2 key1 key2
0      3      6    a  one
6      8      9    a  one
('a', 'three')
   data1  data2 key1   key2
7      3      9    a  three
('a', 'two')
   data1  data2 key1 key2
2      8      5    a  two
4      4      2    a  two
('b', 'one')
   data1  data2 key1 key2
1      9      8    b  one
('b', 'three')
   data1  data2 key1   key2
3      9      6    b  three
('b', 'two')
   data1  data2 key1 key2
5      4      1    b  two
In[559]:
# GroupBy物件轉換list
list(grouped1)
Out[559]:
[('a',    data1  data2 key1   key2
  0      3      6    a    one
  2      8      5    a    two
  4      4      2    a    two
  6      8      9    a    one
  7      3      9    a  three), ('b',    data1  data2 key1   key2
  1      9      8    b    one
  3      9      6    b  three
  5      4      1    b    two)]
In[560]:
# GroupBy物件轉換dict
dict(list(grouped1))
Out[560]:
{'a':    data1  data2 key1   key2
 0      3      6    a    one
 2      8      5    a    two
 4      4      2    a    two
 6      8      9    a    one
 7      3      9    a  three, 'b':    data1  data2 key1   key2
 1      9      8    b    one
 3      9      6    b  three
 5      4      1    b    two}
In[561]:
# print(df_obj.dtypes)

#按資料型別分組
# df_obj.groupby(df_obj.dtypes, axis=1).size()
df_obj.groupby(df_obj.dtypes, axis=1).sum()
Out[561]:
int32 object
0 9 aone
1 17 bone
2 13 atwo
3 15 bthree
4 6 atwo
5 5 btwo
6 17 aone
7 12 athree
  • ## 其他分組方法
In[562]:
df_obj2 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['A', 'B', 'C', 'D', 'E'])
df_obj2.iloc[1, 1:4] = np.NaN
df_obj2
Out[562]:
a b c d e
A 3 4.0 8.0 3.0 3
B 3 NaN NaN NaN 9
C 7 9.0 9.0 1.0 9
D 2 9.0 8.0 6.0 2
E 4 9.0 3.0 3.0 8
In[563]:
# 通過字典分組
mapping_dict = {'a':'Python', 'b':'python', 'c':'java', 'd':'C', 'e':'java'}
df_obj2.groupby(mapping_dict, axis=1).size()
df_obj2.groupby(mapping_dict, axis=1).count() # 非NaN的個數
df_obj2.groupby(mapping_dict, axis=1).sum()
Out[563]:
C Python java python
A 3.0 3.0 11.0 4.0
B 0.0 3.0 9.0 0.0
C 1.0 7.0 18.0 9.0
D 6.0 2.0 10.0 9.0
E 3.0 4.0 11.0 9.0
In[564]:
# 通過函式分組
df_obj3 = pd.DataFrame(np.random.randint(1, 10, (5,5)),
                       columns=['a', 'b', 'c', 'd', 'e'],
                       index=['AA', 'BBB', 'CC', 'D', 'EE'])
df_obj3
Out[564]:
a b c d e
AA 3 1 9 7 5
BBB 7 6 7 9 9
CC 6 2 3 3 4
D 7 1 3 9 7
EE 9 6 4 4 9
In[565]:
def group_key(idx):
    """Grouping key function: group rows/columns by the length of their
    index label (``idx`` is a single row or column label)."""
    return len(idx)

df_obj3.groupby(group_key).size()

# 以上自定義函式等價於
#df_obj3.groupby(len).size()
Out[565]:
1    1
2    3
3    1
dtype: int64
In[566]:
# 通過索引級別分組
columns = pd.MultiIndex.from_arrays([['Python', 'Java', 'Python', 'Java', 'Python'],
                                     ['A', 'A', 'B', 'C', 'B']], names=['language', 'index'])
df_obj4 = pd.DataFrame(np.random.randint(1, 10, (5,5)), columns=columns)
df_obj4
Out[566]:
language Python Java Python Java Python
index A A B C B
0 2 2 9 1 6
1 4 6 2 6 5
2 7 3 5 3 8
3 4 8 4 1 3
4 6 2 9 9 2
In[567]:
# 根據language進行分組
df_obj4.groupby(level='language', axis=1).sum()
df_obj4.groupby(level='index', axis=1).sum()
Out[567]:
index A B C
0 4 15 1
1 10 7 6
2 10 13 3
3 12 7 1
4 8 11 9





  • ## 聚合
In[568]:
import pandas as pd
import numpy as np

dict_obj = {'key1': ['a', 'b', 'a', 'b',
                     'a', 'b', 'a', 'a'],
            'key2': ['one', 'one', 'two', 'three',
                     'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1,10,8),
            'data2': np.random.randint(1,10,8)}
df_obj5 = pd.DataFrame(dict_obj)
df_obj5
Out[568]:
data1 data2 key1 key2
0 4 6 a one
1 6 4 b one
2 8 7 a two
3 4 9 b three
4 3 6 a two
5 6 3 b two
6 4 2 a one
7 5 6 a three
In[569]:
# 內建的聚合函式

print(df_obj5.groupby('key1').sum())
print(df_obj5.groupby('key1').max())
print(df_obj5.groupby('key1').min())
print(df_obj5.groupby('key1').mean())
print(df_obj5.groupby('key1').size())
print(df_obj5.groupby('key1').count())
print(df_obj5.groupby('key1').describe())
      data1  data2
key1              
a        24     27
b        16     16
      data1  data2 key2
key1                   
a         8      7  two
b         6      9  two
      data1  data2 key2
key1                   
a         3      2  one
b         4      3  one
         data1     data2
key1                    
a     4.800000  5.400000
b     5.333333  5.333333
key1
a    5
b    3
dtype: int64
      data1  data2  key2
key1                    
a         5      5     5
b         3      3     3
     data1                                              data2            \
     count      mean       std  min  25%  50%  75%  max count      mean   
key1                                                                      
a      5.0  4.800000  1.923538  3.0  4.0  4.0  5.0  8.0   5.0  5.400000   
b      3.0  5.333333  1.154701  4.0  5.0  6.0  6.0  6.0   3.0  5.333333   

                                         
           std  min  25%  50%  75%  max  
key1                                     
a     1.949359  2.0  6.0  6.0  6.0  7.0  
b     3.214550  3.0  3.5  4.0  6.5  9.0  
In[570]:
# 自定義聚合函式
def peak_range(df):
    """Custom aggregation: return the value range (max minus min)
    of the records belonging to one group."""
    return df.max() - df.min()

print(df_obj5.groupby('key1').agg(peak_range))
print(df_obj5.groupby('key1').agg(lambda df: df.max() - df.min()))
      data1  data2
key1              
a         5      5
b         2      6
      data1  data2
key1              
a         5      5
b         2      6

In[571]:
# 應用多個聚合函式

#同時應用多個聚合函式
print(df_obj5.groupby('key1').agg(['mean', 'std', 'count', peak_range])) # 預設列名為函式名
         data1                                data2                           
          mean       std count peak_range      mean       std count peak_range
key1                                                                          
a     4.800000  1.923538     5          5  5.400000  1.949359     5          5
b     5.333333  1.154701     3          2  5.333333  3.214550     3          6
In[572]:
print(df_obj5.groupby('key1').agg(['mean', 'std', 'count', ('range', peak_range)]))
         data1                           data2                      
          mean       std count range      mean       std count range
key1                                                                
a     4.800000  1.923538     5     5  5.400000  1.949359     5     5
b     5.333333  1.154701     3     2  5.333333  3.214550     3     6
In[573]:
# 每列作用不同的聚合函式
dict_mapping = {'data1':'mean',
                'data2':'sum'}
print(df_obj5.groupby('key1').agg(dict_mapping))
         data1  data2
key1                 
a     4.800000     27
b     5.333333     16





数据分组运算

In[574]:
import pandas as pd
import numpy as np
# 分組運算後保持shape
dict_obj = {'key1' : ['a', 'b', 'a', 'b',
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1' : np.random.randint(1, 10, 8),
            'data2' : np.random.randint(1, 10, 8)}

df_obj = pd.DataFrame(dict_obj)
df_obj
Out[574]:
data1 data2 key1 key2
0 5 3 a one
1 9 3 b one
2 7 1 a two
3 2 9 b three
4 1 6 a two
5 5 7 b two
6 5 7 a one
7 1 1 a three
In[575]:
# 按key1分組後,計算data1,data2的統計資訊並附加到原始表格中
k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_')
k1_sum
Out[575]:
sum_data1 sum_data2
key1
a 19 18
b 16 19
In[576]:
# 方法1,使用merge
pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)
Out[576]:
data1 data2 key1 key2 sum_data1 sum_data2
0 5 3 a one 19 18
2 7 1 a two 19 18
4 1 6 a two 19 18
6 5 7 a one 19 18
7 1 1 a three 19 18
1 9 3 b one 16 19
3 2 9 b three 16 19
5 5 7 b two 16 19
  • ## transform方法
In[577]:
# 方法2,使用transform
k1_sum_tf = df_obj.groupby('key1').transform(np.sum).add_prefix('sum_')
df_obj[k1_sum_tf.columns] = k1_sum_tf
df_obj
Out[577]:
data1 data2 key1 key2 sum_data1 sum_data2 sum_key2
0 5 3 a one 19 18 onetwotwoonethree
1 9 3 b one 16 19 onethreetwo
2 7 1 a two 19 18 onetwotwoonethree
3 2 9 b three 16 19 onethreetwo
4 1 6 a two 19 18 onetwotwoonethree
5 5 7 b two 16 19 onethreetwo
6 5 7 a one 19 18 onetwotwoonethree
7 1 1 a three 19 18 onetwotwoonethree
In[578]:
# 自定義函式傳入transform
def diff_mean(s):
    """Return each value's deviation from the mean of its group
    (value minus group mean)."""
    return s - s.mean()

# df_obj.groupby('key1').transform(diff_mean)
In[579]:
dataset_path = './startcraft.csv'
# df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerweek', 'TotalHours', 'APM'])

In[580]:
import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame({'key' : ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0, 10, 7)})
df_obj2 = pd.DataFrame({'key' : ['a', 'b', 'd'],
                        'data2' : np.random.randint(0, 10, 3)})
print(df_obj1)
print(df_obj2)
   data1 key
0      4   b
1      4   b
2      4   a
3      5   c
4      2   a
5      8   a
6      8   b
   data2 key
0      3   a
1      2   b
2      4   d
In[581]:
# 預設將重疊列的列名作為"外來鍵"進行連線
pd.merge(df_obj1, df_obj2)
Out[581]:
data1 key data2
0 4 b 2
1 4 b 2
2 8 b 2
3 4 a 3
4 2 a 3
5 8 a 3
In[582]:
# on 顯示指定"外來鍵"
pd.merge(df_obj1, df_obj2, on='key')
Out[582]:
data1 key data2
0 4 b 2
1 4 b 2
2 8 b 2
3 4 a 3
4 2 a 3
5 8 a 3
In[583]:
# left_on, right_on分別指定左側資料和右側資料的"外來鍵"

# 更改列名
df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})
In[584]:
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
Out[584]:
data1 key1 data2 key2
0 4 b 2 b
1 4 b 2 b
2 8 b 2 b
3 4 a 3 a
4 2 a 3 a
5 8 a 3 a
In[585]:
# "外連線"
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')
Out[585]:
data1 key1 data2 key2
0 4.0 b 2.0 b
1 4.0 b 2.0 b
2 8.0 b 2.0 b
3 4.0 a 3.0 a
4 2.0 a 3.0 a
5 8.0 a 3.0 a
6 5.0 c NaN NaN
7 NaN NaN 4.0 d
In[586]:
# "左連線"
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
Out[586]:
data1 key1 data2 key2
0 4 b 2.0 b
1 4 b 2.0 b
2 4 a 3.0 a
3 5 c NaN NaN
4 2 a 3.0 a
5 8 a 3.0 a
6 8 b 2.0 b
In[587]:
# "右連線"
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
Out[587]:
data1 key1 data2 key2
0 4.0 b 2 b
1 4.0 b 2 b
2 8.0 b 2 b
3 4.0 a 3 a
4 2.0 a 3 a
5 8.0 a 3 a
6 NaN NaN 4 d

In[588]:
import pandas as pd
import numpy as np
  • ## stack
In[589]:
df_obj = pd.DataFrame(np.random.randint(0,10,(5,2)), columns=['data1', 'data2'])
df_obj
Out[589]:
data1 data2
0 6 5
1 3 8
2 1 8
3 0 9
4 5 3
In[590]:
stacked = df_obj.stack()
stacked
Out[590]:
0  data1    6
   data2    5
1  data1    3
   data2    8
2  data1    1
   data2    8
3  data1    0
   data2    9
4  data1    5
   data2    3
dtype: int32
In[591]:
print(type(stacked))
print(stacked.index)
<class 'pandas.core.series.Series'>
MultiIndex(levels=[[0, 1, 2, 3, 4], ['data1', 'data2']],
           labels=[[0, 0, 1, 1, 2, 2, 3, 3, 4, 4], [0, 1, 0, 1, 0, 1, 0, 1, 0, 1]])
In[592]:
# 預設操作內層索引
stacked.unstack()
Out[592]:
data1 data2
0 6 5
1 3 8
2 1 8
3 0 9
4 5 3





  • ## 例項
In[593]:
# kmeans_tool.py

import math
import random

class Cluster(object):
    """A cluster of sample points with a cached centroid."""

    def __init__(self, samples):
        """Build a cluster from a non-empty list of Sample objects,
        all of which must share the same dimensionality."""
        if not samples:
            # A cluster must contain at least one sample point.
            raise Exception("錯誤:一個空的聚類!")

        # Sample points belonging to this cluster.
        self.samples = samples

        # Dimensionality shared by every sample in the cluster.
        self.n_dim = samples[0].n_dim

        # Reject mixed-dimension input early.
        if any(sample.n_dim != self.n_dim for sample in samples):
            raise Exception("錯誤,聚類中樣本點的維度不一致!")

        # Initial centroid computed from the given samples.
        self.centroid = self.cal_centroid()

    def __repr__(self):
        return str(self.samples)

    def update(self, samples):
        """Replace the member samples, recompute the centroid, and
        return how far the centroid moved."""
        previous = self.centroid
        self.samples = samples
        self.centroid = self.cal_centroid()
        return get_distance(previous, self.centroid)

    def cal_centroid(self):
        """Return a new Sample located at the arithmetic mean of all
        member samples (per-dimension average)."""
        count = len(self.samples)
        dims = zip(*(sample.coords for sample in self.samples))
        return Sample([math.fsum(axis) / count for axis in dims])

class Sample(object):
    """A single data point in n-dimensional space."""

    def __init__(self, coords):
        self.coords = coords      # coordinate values of the point
        self.n_dim = len(coords)  # number of dimensions

    def __repr__(self):
        """Display the point as its coordinate list."""
        return str(self.coords)

def get_distance(a, b):
    """Return the Euclidean distance between sample points *a* and *b*.

    Both arguments must expose ``coords`` (sequence of numbers) and
    ``n_dim`` (its length).

    Raises:
        Exception: if the two points do not have the same dimensionality.
    """
    if a.n_dim != b.n_dim:
        # Distance is only defined for points of equal dimension.
        raise Exception("錯誤,樣本點維度不同,無法計算距離!")

    # Pair coordinates with zip and accumulate squared differences in a
    # single pass instead of indexing positions manually with pow().
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a.coords, b.coords)))

def gen_random_sample(n_dim, lower, upper):
    """Create one random Sample whose n_dim coordinates are drawn
    uniformly from [lower, upper]."""
    coords = []
    for _ in range(n_dim):
        coords.append(random.uniform(lower, upper))
    return Sample(coords)
In[595]:
# main.py

import random
# from kmeans_tools import Cluster, get_distance, gen_random_sample
import matplotlib.pyplot as plt
from matplotlib import colors as mcolors

def kmeans(samples, k, cutoff):
    """Partition *samples* into *k* clusters with Lloyd's algorithm.

    Repeats assign-then-update passes until no centroid moves further
    than *cutoff* in a single pass, then returns the list of Cluster
    objects.
    """
    # Seed k clusters, each centered on a randomly chosen sample point.
    seeds = random.sample(samples, k)
    clusters = [Cluster([seed]) for seed in seeds]

    n_loop = 0
    while True:
        n_loop += 1

        # One bucket of samples per cluster for this pass.
        buckets = [[] for _ in clusters]

        # Assign every sample to the cluster with the nearest centroid.
        for sample in samples:
            nearest = 0
            best = get_distance(sample, clusters[0].centroid)
            for idx in range(1, k):
                # Strict '<' keeps the lowest-index cluster on ties.
                d = get_distance(sample, clusters[idx].centroid)
                if d < best:
                    best = d
                    nearest = idx
            buckets[nearest].append(sample)

        # Move each centroid and record the largest shift of this pass.
        biggest_shift = 0.0
        for idx in range(k):
            shift = clusters[idx].update(buckets[idx])
            biggest_shift = max(biggest_shift, shift)

        # Converged once every centroid moved less than the threshold.
        if biggest_shift < cutoff:
            break

    # Return the stabilized clustering.
    return clusters

def run_main():
    """Generate random 2-D samples, cluster them with k-means, and plot
    the result with one color per cluster."""
    # Number of sample points.
    n_samples = 1000

    # Number of features (dimensionality of each sample).
    n_feat = 2

    # Value range for each feature.
    lower = 0
    upper = 200

    # Number of clusters.
    n_cluster = 3

    # Generate the random sample set.
    samples = [gen_random_sample(n_feat, lower, upper) for _ in range(n_samples)]

    # Convergence threshold: stop once no centroid moves further than this.
    cutoff = 0.2

    clusters = kmeans(samples, n_cluster, cutoff)

    # Visualize each cluster with its own color.
    # (Removed a stray `random.choice` expression here — it accessed the
    # attribute without calling it, a no-op left over from editing.)
    plt.subplot()
    color_names = list(mcolors.cnames)
    for i, c in enumerate(clusters):
        color = [color_names[i]] * len(c.samples)
        x = [sample.coords[0] for sample in c.samples]
        y = [sample.coords[1] for sample in c.samples]
        plt.scatter(x, y, c=color)

    plt.show()

if __name__ == '__main__':
    run_main()
In[]: