第九章 資料分組與聚合(中)
阿新 • • 發佈:2019-01-09
close_px=pd.read_csv('e:/stock_px.csv',parse_dates=True,index_col=0) close_px[-4:] #計算日收益率與SPX年度相關係數組成的DataFrame rets=close_px.pct_change().dropna() spx_corr=lambda x:x.corrwith(x['SPX']) by_year=rets.groupby(lambda x:x.year) by_year.apply(spx_corr) AAPL MSFT XOM SPX 2003 0.541124 0.745174 0.661265 1.0 2004 0.374283 0.588531 0.557742 1.0 2005 0.467540 0.562374 0.631010 1.0 2006 0.428267 0.406126 0.518514 1.0 2007 0.508118 0.658770 0.786264 1.0 2008 0.681434 0.804626 0.828303 1.0 2009 0.707103 0.654902 0.797921 1.0 2010 0.710105 0.730118 0.839057 1.0 2011 0.691931 0.800996 0.859975 1.0 #計算列與列之間的相關係數 by_year.apply(lambda g:g['AAPL'].corr(g['MSFT'])) 2003 0.480868 2004 0.259024 2005 0.300093 2006 0.161735 2007 0.417738 2008 0.611901 2009 0.432738 2010 0.571946 2011 0.581987 dtype: float64 import statsmodels.api as sm def regress(data,yvar,xvars): Y=data[yvar] X=data[xvars] X['intercept']=1. result=sm.OLS(Y,X).fit() return result.params #按年計算AAPL對SPX收益率的線性迴歸 by_year.apply(regress,'AAPL',['SPX']) Out[161]: SPX intercept 2003 1.195406 0.000710 2004 1.363463 0.004201 2005 1.766415 0.003246 2006 1.645496 0.000080 2007 1.198761 0.003438 2008 0.968016 -0.001110 2009 0.879103 0.002954 2010 1.052608 0.001261 2011 0.806605 0.001514
透視表和交叉表
要使用其他的聚合函式,將其傳給aggfunc即可,使用count或len可以得到有關分組大小的交叉表:tips.pivot_table(index=['sex','smoker']) Out[163]: size tip tip_pct total_bill sex smoker Female No 2.592593 2.773519 0.156921 18.105185 Yes 2.242424 2.931515 0.182150 17.977879 Male No 2.711340 3.113402 0.160669 19.791237 Yes 2.500000 3.051167 0.152771 22.284500 #只想聚合tip_pct和size,根據day分組,將smoker放在列上,day放在行上 tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker') tip_pct size smoker No Yes No Yes sex day Female Fri 0.165296 0.209129 2.500000 2.000000 Sat 0.147993 0.163817 2.307692 2.200000 Sun 0.165710 0.237075 3.071429 2.500000 Thur 0.155971 0.163073 2.480000 2.428571 Male Fri 0.138005 0.144730 2.000000 2.125000 Sat 0.162132 0.139067 2.656250 2.629630 Sun 0.158291 0.173964 2.883721 2.600000 Thur 0.165706 0.164417 2.500000 2.300000 #加邊界 tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True) Out[166]: tip_pct size smoker No Yes All No Yes All sex day Female Fri 0.165296 0.209129 0.199388 2.500000 2.000000 2.111111 Sat 0.147993 0.163817 0.156470 2.307692 2.200000 2.250000 Sun 0.165710 0.237075 0.181569 3.071429 2.500000 2.944444 Thur 0.155971 0.163073 0.157525 2.480000 2.428571 2.468750 Male Fri 0.138005 0.144730 0.143385 2.000000 2.125000 2.100000 Sat 0.162132 0.139067 0.151577 2.656250 2.629630 2.644068 Sun 0.158291 0.173964 0.162344 2.883721 2.600000 2.810345 Thur 0.165706 0.164417 0.165276 2.500000 2.300000 2.433333 All 0.159328 0.163196 0.160803 2.668874 2.408602 2.569672
如果存在空的組合,設定一個fill_valuetips.pivot_table('tip_pct',index=['sex','smoker'],columns='day',aggfunc=len,margins=True) Out[167]: day Fri Sat Sun Thur All sex smoker Female No 2.0 13.0 14.0 25.0 54.0 Yes 7.0 15.0 4.0 7.0 33.0 Male No 2.0 32.0 43.0 20.0 97.0 Yes 8.0 27.0 15.0 10.0 60.0 All 19.0 87.0 76.0 62.0 244.0
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc='sum',fill_value=0)
Out[168]:
day Fri Sat Sun Thur
time sex smoker
Dinner Female No 2 30 43 2
Yes 8 33 10 0
Male No 4 85 124 0
Yes 12 71 39 0
Lunch Female No 3 0 0 60
Yes 6 0 0 17
Male No 0 0 0 50
Yes 5 0 0 23
交叉表
data=DataFrame({'Sample':range(1,11),'Gender':['Female','Male','Female','Male','Male','Male',
'Female','Female','Male','Female'],'Handedness':['Right-handed','Left-handed',
'Right-handed','Right-handed','Left-handed','Right-handed','Right-handed','Left-handed','Right-handed','Right-handed']})
data
data1=data[['Sample','Gender','Handedness']]
data1
Sample Gender Handedness
0 1 Female Right-handed
1 2 Male Left-handed
2 3 Female Right-handed
3 4 Male Right-handed
4 5 Male Left-handed
5 6 Male Right-handed
6 7 Female Right-handed
7 8 Female Left-handed
8 9 Male Right-handed
9 10 Female Right-handed
pd.crosstab(data1.Gender,data1.Handedness,margins=True)
Handedness Left-handed Right-handed All
Gender
Female 1 4 5
Male 2 3 5
All 3 7 10
pd.crosstab([data1.Gender,data1.Handedness],data1.Sample,margins=True)
Out[185]:
Sample 1 2 3 4 5 6 7 8 9 10 All
Gender Handedness
Female Left-handed 0 0 0 0 0 0 0 1 0 0 1
Right-handed 1 0 1 0 0 0 1 0 0 1 4
Male Left-handed 0 1 0 0 1 0 0 0 0 0 2
Right-handed 0 0 0 1 0 1 0 0 1 0 3
All 1 1 1 1 1 1 1 1 1 1 10