1. 程式人生 > >pandas_cookbook學習(二)

pandas_cookbook學習(二)

選擇資料

dataframes

> df = pd.DataFrame(
   ....:      {'AAA' : [4,5,6,7], 'BBB' : [10,20,30,40],'CCC' : [100,50,-30,-50]}); df
	AAA	BBB	CCC
0	4	10	100
1	5	20	50
2	6	30	-30
3	7	40	-50

#設定兩個“或”條件篩選資料
> df[(df.AAA > 5) | (df.index.isin([0, 2]))]
	AAA	BBB	CCC
0	4	10	100
2	6	30	-30
3	7	40	-50

#使用~表示“非”,注意書寫是全非還是部分非
#下面是一個部分非的例子
> df[~(df.AAA > 5) & (df.index.isin([0, 2, 4]))]
	AAA	BBB	CCC
0	4	10	100

df.loc的方法已經比較熟練,不再贅述。

面板資料

# 可能會出現警告或報錯:Panel 自 pandas 0.20 起已被棄用,並在 0.25 版中移除。在舊版本中僅顯示警告、不影響使用;新版本請改用 MultiIndex 的 DataFrame 或 xarray
# 三個時間序列資料組成了面板資料
> rng = pd.date_range('1/1/2013',periods=100,freq='D')
> data = np.random.randn(100, 4)
> cols = ['A','B','C','D']
> df1, df2, df3 = pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols), pd.DataFrame(data, rng, cols)
> pf = pd.Panel({'df1':df1,'df2':df2,'df3':df3});pf
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 4 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to D

> pf.loc[:,:,'F'] = pd.DataFrame(data, rng, cols);pf
<class 'pandas.core.panel.Panel'>
Dimensions: 3 (items) x 100 (major_axis) x 5 (minor_axis)
Items axis: df1 to df3
Major_axis axis: 2013-01-01 00:00:00 to 2013-04-10 00:00:00
Minor_axis axis: A to F

> pf['df3']
	A	B	C	D	F
2013-01-01	-0.321314	-1.264130	-0.701352	-0.097620	NaN
2013-01-02	-0.798209	0.767679	0.073943	-1.692146	NaN
2013-01-03	0.927847	0.555940	0.590923	0.350158	NaN
2013-01-04	-0.042397	-1.003914	0.952686	-0.911484	NaN

生成新列

> df = pd.DataFrame(
   ....:      {'AAA' : [1,2,1,3], 'BBB' : [1,1,2,2], 'CCC' : [2,1,3,1]}); df
	AAA	BBB	CCC
0	1	1	2
1	2	1	1
2	1	2	3
3	3	2	1

> source_cols = df.columns
> new_cols = [str(x) + "_cat" for x in source_cols]
> categories = {1 : 'Alpha', 2 : 'Beta', 3 : 'Charlie' }
> df[new_cols] = df[source_cols].applymap(categories.get);df
	AAA	BBB	CCC	AAA_cat	BBB_cat	CCC_cat
0	1	1	2	Alpha	Alpha	Beta
1	2	1	1	Beta	Alpha	Alpha
2	1	2	3	Alpha	Beta	Charlie
3	3	2	1	Charlie	Beta	Alpha

> df = pd.DataFrame(
   ....:      {'AAA' : [1,1,1,2,2,2,3,3], 'BBB' : [2,1,3,4,5,1,2,3]}); df
	AAA	BBB
0	1	2
1	1	1
2	1	3
3	2	4
4	2	5
5	2	1
6	3	2
7	3	3

#按AAA分組,找出每組中BBB最小值所在的索引,並以此選取對應的行
> df.loc[df.groupby("AAA")["BBB"].idxmin()]
	AAA	BBB
1	1	1
5	2	1
6	3	2
#另一種方法:
> df.sort_values(by="BBB").groupby("AAA", as_index=False).first()
	AAA	BBB
0	1	1
1	2	1
2	3	2