Python之Pandas(3)
阿新 • • 發佈:2018-12-03
#常用數學,統計方法 import numpy as np import pandas as pd In [7]: df = pd.DataFrame({'key1':[4,5,3,np.nan,2], 'key2':[1,2,np.nan,4,5], 'key3':['a','b','c','d','e']}) print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype) float64 float64 object In [14]: print(df) #單一列的均值 axis預設為0,按列來計算 print(df.mean()) print(df['key2'].mean()) #axis為1,按照行來計算 print(df.mean(axis = 1)) key1 key2 key3 0 4.0 1.0 a 1 5.0 2.0 b 2 3.0 NaN c 3 NaN 4.0 d 4 2.0 5.0 e key1 3.5 key2 3.0 dtype: float64 3.0 0 2.5 1 3.5 2 3.0 3 4.0 4 3.5 dtype: float64 In [30]: #skipna 引數選擇是否忽略NaN,預設為True;如果設為False,含有NaN的列統計結果仍為NaN df = pd.DataFrame(np.random.rand(20).reshape(10,2),columns=['a','b']) #列計算(新增'mean'列,值為每行的均值) df['mean'] = df.mean(axis = 1) #行計算(新增'mean'行,值為每列的均值) df.loc['mean'] = df.mean() df Out[30]: a b mean 0 0.713674 0.378652 0.546163 1 0.881657 0.230902 0.556280 2 0.342403 0.473300 0.407851 3 0.321717 0.740015 0.530866 4 0.649596 0.021103 0.335350 5 0.011607 0.829877 0.420742 6 0.964894 0.209440 0.587167 7 0.338171 0.541400 0.439786 8 0.909710 0.121635 0.515672 9 0.934586 0.768681 0.851634 mean 0.606802 0.431500 0.519151 In [35]: print("統計非NAN的值的數量\n",df.count()) 統計非NAN的值的數量 a 11 b 11 mean 11 dtype: int64 In [36]: print("最小值\n",df.min()) 最小值 a 0.011607 b 0.021103 mean 0.335350 dtype: float64 In [37]: print("最大值\n",df.max()) 最大值 a 0.964894 b 0.829877 mean 0.851634 dtype: float64 In [38]: print("求和\n",df.sum()) 求和 a 6.674818 b 4.746505 mean 5.710661 dtype: float64 In [39]: print("平均值\n",df.mean()) 平均值 a 0.606802 b 0.431500 mean 0.519151 dtype: float64 In [40]: print("中位數\n",df.median()) 中位數 a 0.649596 b 0.431500 mean 0.519151 dtype: float64 In [41]: print("標準差,方差\n",df.std(),df.var()) 標準差,方差 a 0.315250 b 0.271696 mean 0.134008 dtype: float64 a 0.099382 b 0.073819 mean 0.017958 dtype: float64 In [42]: print("skew樣本偏度\n",df.skew()) skew樣本偏度 a -0.549653 b 0.087022 mean 1.427140 dtype: float64 In [43]: print("kurt樣本偏度\n",df.kurt()) kurt樣本偏度 a -0.743035 b -1.173489 mean 3.605701 dtype: float64 In [51]: #主要數學計算方法 #累計和 df['a_sum'] = df['a'].cumsum() 
df['b_sum'] = df['b'].cumsum() #累計積 df['a_p'] = df['a'].cumprod() df['b_p'] = df['b'].cumprod() print(df) #累計計算最大值和最小值 print(df.cummax(),"\n",df.cummin()) a b mean a_sum b_sum a_p b_p 0 0.713674 0.378652 0.546163 0.713674 0.378652 0.713674 0.378652 1 0.881657 0.230902 0.556280 1.595332 0.609554 0.629216 0.087432 2 0.342403 0.473300 0.407851 1.937735 1.082854 0.215445 0.041381 3 0.321717 0.740015 0.530866 2.259452 1.822868 0.069312 0.030623 4 0.649596 0.021103 0.335350 2.909048 1.843972 0.045025 0.000646 5 0.011607 0.829877 0.420742 2.920655 2.673849 0.000523 0.000536 6 0.964894 0.209440 0.587167 3.885549 2.883288 0.000504 0.000112 7 0.338171 0.541400 0.439786 4.223721 3.424688 0.000171 0.000061 8 0.909710 0.121635 0.515672 5.133431 3.546323 0.000155 0.000007 9 0.934586 0.768681 0.851634 6.068016 4.315004 0.000145 0.000006 mean 0.606802 0.431500 0.519151 6.674818 4.746505 0.000088 0.000002 a b mean a_sum b_sum a_p b_p 0 0.713674 0.378652 0.546163 0.713674 0.378652 0.713674 0.378652 1 0.881657 0.378652 0.556280 1.595332 0.609554 0.713674 0.378652 2 0.881657 0.473300 0.556280 1.937735 1.082854 0.713674 0.378652 3 0.881657 0.740015 0.556280 2.259452 1.822868 0.713674 0.378652 4 0.881657 0.740015 0.556280 2.909048 1.843972 0.713674 0.378652 5 0.881657 0.829877 0.556280 2.920655 2.673849 0.713674 0.378652 6 0.964894 0.829877 0.587167 3.885549 2.883288 0.713674 0.378652 7 0.964894 0.829877 0.587167 4.223721 3.424688 0.713674 0.378652 8 0.964894 0.829877 0.587167 5.133431 3.546323 0.713674 0.378652 9 0.964894 0.829877 0.851634 6.068016 4.315004 0.713674 0.378652 mean 0.964894 0.829877 0.851634 6.674818 4.746505 0.713674 0.378652 a b mean a_sum b_sum a_p b_p 0 0.713674 0.378652 0.546163 0.713674 0.378652 0.713674 0.378652 1 0.713674 0.230902 0.546163 0.713674 0.378652 0.629216 0.087432 2 0.342403 0.230902 0.407851 0.713674 0.378652 0.215445 0.041381 3 0.321717 0.230902 0.407851 0.713674 0.378652 0.069312 0.030623 4 0.321717 0.021103 0.335350 0.713674 0.378652 0.045025 
0.000646 5 0.011607 0.021103 0.335350 0.713674 0.378652 0.000523 0.000536 6 0.011607 0.021103 0.335350 0.713674 0.378652 0.000504 0.000112 7 0.011607 0.021103 0.335350 0.713674 0.378652 0.000171 0.000061 8 0.011607 0.021103 0.335350 0.713674 0.378652 0.000155 0.000007 9 0.011607 0.021103 0.335350 0.713674 0.378652 0.000145 0.000006 mean 0.011607 0.021103 0.335350 0.713674 0.378652 0.000088 0.000002 In [74]: #唯一值 得到唯一值 s = pd.Series(list("aabacdefg")) print(s.unique()) print(s.count()) ['a' 'b' 'c' 'd' 'e' 'f' 'g'] 9 In [75]: #成員資格 isin print(s.isin(['a'])) 0 True 1 True 2 False 3 True 4 False 5 False 6 False 7 False 8 False dtype: bool In [79]: #小作業 ip = eval(input("please input a list:")) s = pd.Series(ip) def f(s): s1 = s.unique() if len(s1) == len(s): print("yes") else: print("no") f(s) please input a list:1,2,3,4,5,1 no
#文字資料 import numpy as np import pandas as pd In [3]: s = pd.Series(['a','b','c','hello','123',np.nan,'shit']) df = pd.DataFrame({'key1':list('abcdef'), 'key2':['hee','a','hija','123','w',np.nan]}) In [5]: print(s) print(df) 0 a 1 b 2 c 3 hello 4 123 5 NaN 6 shit dtype: object key1 key2 0 a hee 1 b a 2 c hija 3 d 123 4 e w 5 f NaN In [15]: #呼叫字串方法 print(s.str.upper())#自動跳過NaN(結果中仍為NaN) print(s.str.count('a')) print(df['key1'].str.upper()) 0 A 1 B 2 C 3 HELLO 4 123 5 NaN 6 SHIT dtype: object 0 1.0 1 0.0 2 0.0 3 0.0 4 0.0 5 NaN 6 0.0 dtype: float64 0 A 1 B 2 C 3 D 4 E 5 F Name: key1, dtype: object In [21]: #常用字串方法 print(s.str.upper()) print(s.str.lower()) print(s.str.len()) print(s.str.startswith('a')) print(s.str.endswith('f')) #去掉字串的空格 還可以是左空格或者是右空格 print(s.str.strip()) print(s.str.lstrip()) print(s.str.rstrip()) 0 A 1 B 2 C 3 HELLO 4 123 5 NaN 6 SHIT dtype: object 0 a 1 b 2 c 3 hello 4 123 5 NaN 6 shit dtype: object 0 1.0 1 1.0 2 1.0 3 5.0 4 3.0 5 NaN 6 4.0 dtype: float64 0 True 1 False 2 False 3 False 4 False 5 NaN 6 False dtype: object 0 False 1 False 2 False 3 False 4 False 5 NaN 6 False dtype: object 0 a 1 b 2 c 3 hello 4 123 5 NaN 6 shit dtype: object 0 a 1 b 2 c 3 hello 4 123 5 NaN 6 shit dtype: object 0 a 1 b 2 c 3 hello 4 123 5 NaN 6 shit dtype: object Out[21]: 0 a 1 b 2 c 3 hello 4 123 5 NaN 6 shit dtype: object In [32]: #replace n是替換的個數 df = pd.DataFrame(np.random.rand(3,2),columns=[' Colum A',' Colun B'],index=range(3)) df.columns.str.replace(' ','_',n=1) Out[32]: Index(['_Colum A', '_Colun B'], dtype='object') In [44]: #拆分 s = pd.Series(['a,b,c','1,2,3',['a...c'],np.nan]) print(s) print(s.str.split(',')) #expand = True 分裂 print(s.str.split(',',expand = True)) 0 a,b,c 1 1,2,3 2 [a...c] 3 NaN dtype: object 0 [a, b, c] 1 [1, 2, 3] 2 NaN 3 NaN dtype: object 0 1 2 0 a b c 1 1 2 3 2 NaN None None 3 NaN None None In [67]: df = pd.DataFrame({'key1':['a,b,c','1,2,3',['...,..,..']], 'key2':['a-b-c','1-2-3',['...-...-']]}) df['k200'] = df['key1'].str.split(',').str[0] 
print(a) df['k201'] = df['key1'].str.split(',').str[1] df['k202'] = df['key1'].str.split(',').str[2] df 0 a,b,c 1 1,2,3 2 NaN Name: key1, dtype: object Out[67]: key1 key2 k200 k201 k202 0 a,b,c a-b-c a b c 1 1,2,3 1-2-3 1 2 3 2 [...,..,..] [...-...-] NaN NaN NaN