【model02】pandas
阿新 • • 發佈:2018-12-12
基本介紹
import pandas as pd
import numpy as np
s = pd.Series([1,3,6,np.nan,44,1])
print(s)
dates = pd.date_range('20160101',periods = 6)
print(dates)
df = pd.DataFrame(np.random.randn(6,4),index = dates,columns = ['a','b','c','d'])
print(df)
df1 = pd.DataFrame(np.arange(12).reshape(3,4))
print(df1)
df2 = pd.DataFrame({
'A':1.,
'B':pd.Timestamp('20130102'),
'C':pd.Series(1,index = list(range(4)),dtype = 'float32'),
'D':np.array([3]*4,dtype = 'int32'),
'E':pd.Categorical(['test','train','test','train']),
'F':'foo'
})
print(df2)
print(df2.dtypes)#每一列的資料形式
print(df2.index)
print(df2. columns)
print(df2.values)
print(df2.describe())
print(df2.T)
print(df2.sort_index(axis=1,ascending = False))#ascending = 1排序倒著排序
print(df2.sort_index(axis=0,ascending = False))
df2.sort_values(by='E')#對E列進行排序
0 1.0 1 3.0 2 6.0 3 NaN 4 44.0 5 1.0 dtype: float64 DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04', '2016-01-05', '2016-01-06'], dtype='datetime64[ns]', freq='D') a b c d 2016-01-01 -1.487623 0.888221 -1.506509 -0.776984 2016-01-02 -0.040059 0.262338 1.658207 -0.713937 2016-01-03 -1.316003 1.942315 1.483371 2.473241 2016-01-04 -0.014973 0.465519 2.120004 1.569695 2016-01-05 -2.075723 -1.336707 -0.409277 -0.431981 2016-01-06 -0.776415 -0.293674 0.240641 -0.402298 0 1 2 3 0 0 1 2 3 1 4 5 6 7 2 8 9 10 11 A B C D E F 0 1.0 2013-01-02 1.0 3 test foo 1 1.0 2013-01-02 1.0 3 train foo 2 1.0 2013-01-02 1.0 3 test foo 3 1.0 2013-01-02 1.0 3 train foo A float64 B datetime64[ns] C float32 D int32 E category F object dtype: object Int64Index([0, 1, 2, 3], dtype='int64') Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object') [[1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo'] [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'test' 'foo'] [1.0 Timestamp('2013-01-02 00:00:00') 1.0 3 'train' 'foo']] A C D count 4.0 4.0 4.0 mean 1.0 1.0 3.0 std 0.0 0.0 0.0 min 1.0 1.0 3.0 25% 1.0 1.0 3.0 50% 1.0 1.0 3.0 75% 1.0 1.0 3.0 max 1.0 1.0 3.0 0 1 2 \ A 1 1 1 B 2013-01-02 00:00:00 2013-01-02 00:00:00 2013-01-02 00:00:00 C 1 1 1 D 3 3 3 E test train test F foo foo foo 3 A 1 B 2013-01-02 00:00:00 C 1 D 3 E train F foo F E D C B A 0 foo test 3 1.0 2013-01-02 1.0 1 foo train 3 1.0 2013-01-02 1.0 2 foo test 3 1.0 2013-01-02 1.0 3 foo train 3 1.0 2013-01-02 1.0 A B C D E F 3 1.0 2013-01-02 1.0 3 train foo 2 1.0 2013-01-02 1.0 3 test foo 1 1.0 2013-01-02 1.0 3 train foo 0 1.0 2013-01-02 1.0 3 test foo
A | B | C | D | E | F | |
---|---|---|---|---|---|---|
0 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
2 | 1.0 | 2013-01-02 | 1.0 | 3 | test | foo |
1 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
3 | 1.0 | 2013-01-02 | 1.0 | 3 | train | foo |
選擇資料
import pandas as pd
import numpy as np
dates = pd.date_range('20130101',periods = 6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns = ['A','B','C','D'])
print(df)
print(df.A,'\n',df['A'])
print(df[0:3],'\n',df['20130102':'20130104'])
print(df.loc['20130102'])#select by label:loc
print(df.loc[:,['A','B']])#列印所有行,A、B兩列元素
print(df.loc['20130102',['A','B']])
print(df.iloc[3])#select by position:iloc
print(df.iloc[3][2])
print(df.iloc[3:5,1:3])
print(df.iloc[[1,3,5],1:3])
print(df.ix[:3,['A','C']])#mixed selection:ix
print(df[df.A>8])
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32
2013-01-01 0
2013-01-02 4
2013-01-03 8
2013-01-04 12
2013-01-05 16
2013-01-06 20
Freq: D, Name: A, dtype: int32
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
A B C D
2013-01-02 4 5 6 7
2013-01-03 8 9 10 11
2013-01-04 12 13 14 15
A 4
B 5
C 6
D 7
Name: 2013-01-02 00:00:00, dtype: int32
A B
2013-01-01 0 1
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21
A 4
B 5
Name: 2013-01-02 00:00:00, dtype: int32
A 12
B 13
C 14
D 15
Name: 2013-01-04 00:00:00, dtype: int32
14
B C
2013-01-04 13 14
2013-01-05 17 18
B C
2013-01-02 5 6
2013-01-04 13 14
2013-01-06 21 22
A C
2013-01-01 0 2
2013-01-02 4 6
2013-01-03 8 10
A B C D
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
I:\Anaconda3\lib\site-packages\ipykernel_launcher.py:16: DeprecationWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
app.launch_new_instance()
設定值
import pandas as pd
import numpy as np
dates = pd.date_range('20130101',periods = 6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns = ['A','B','C','D'])
df.iloc[2,2] = 1111
print(df)
df.loc['20130101','B'] = 2222
print(df)
df.A[df.A>4] = 0
print(df)
df[df.B>9]= 0
print(df)
df['F'] = np.nan
print(df)
df['E'] = pd.Series([1,2,3,4,5,6],index = pd.date_range('20130101',periods = 6))
print(df)
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 1111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-01 0 2222 2 3
2013-01-02 4 5 6 7
2013-01-03 8 9 1111 11
2013-01-04 12 13 14 15
2013-01-05 16 17 18 19
2013-01-06 20 21 22 23
A B C D
2013-01-01 0 2222 2 3
2013-01-02 4 5 6 7
2013-01-03 0 9 1111 11
2013-01-04 0 13 14 15
2013-01-05 0 17 18 19
2013-01-06 0 21 22 23
A B C D
2013-01-01 0 0 0 0
2013-01-02 4 5 6 7
2013-01-03 0 9 1111 11
2013-01-04 0 0 0 0
2013-01-05 0 0 0 0
2013-01-06 0 0 0 0
A B C D F
2013-01-01 0 0 0 0 NaN
2013-01-02 4 5 6 7 NaN
2013-01-03 0 9 1111 11 NaN
2013-01-04 0 0 0 0 NaN
2013-01-05 0 0 0 0 NaN
2013-01-06 0 0 0 0 NaN
A B C D F E
2013-01-01 0 0 0 0 NaN 1
2013-01-02 4 5 6 7 NaN 2
2013-01-03 0 9 1111 11 NaN 3
2013-01-04 0 0 0 0 NaN 4
2013-01-05 0 0 0 0 NaN 5
2013-01-06 0 0 0 0 NaN 6
處理丟失資料
import pandas as pd
import numpy as np
dates = pd.date_range('20130101',periods = 6)
df = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns = ['A','B','C','D'])
df.iloc[0,1] = np.nan
df.iloc[1,2] = np.nan
print(df)
print(df.dropna(axis = 0,how = 'any'))#how = {'any','all'} 丟失元素中有NAN的行
print(df.dropna(axis = 1,how = 'any'))#丟失元素中有NAN的列
print(df.fillna(value = 0))
print(df.isnull())
print(np.any(df.isnull()) == True)#是否至少有一個元素是NAN
A B C D
2013-01-01 0 NaN 2.0 3
2013-01-02 4 5.0 NaN 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
A B C D
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
A D
2013-01-01 0 3
2013-01-02 4 7
2013-01-03 8 11
2013-01-04 12 15
2013-01-05 16 19
2013-01-06 20 23
A B C D
2013-01-01 0 0.0 2.0 3
2013-01-02 4 5.0 0.0 7
2013-01-03 8 9.0 10.0 11
2013-01-04 12 13.0 14.0 15
2013-01-05 16 17.0 18.0 19
2013-01-06 20 21.0 22.0 23
A B C D
2013-01-01 False True False False
2013-01-02 False False True False
2013-01-03 False False False False
2013-01-04 False False False False
2013-01-05 False False False False
2013-01-06 False False False False
True
匯入匯出資料
import pandas as pd
data = pd.read_csv (‘D:\list.csv’)
data.to_pickle(‘student.pickle’)
合併concat
import pandas as pd
import numpy as np
#concatenating
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns = ['a','b','c','d'])
print(df1)
print(df2)
print(df3)
res = pd.concat([df1,df2,df3],axis = 0)
print(res)#索引發生重複,可能存在問題
res = pd.concat([df1,df2,df3],axis = 1)
print(res)
res = pd.concat([df1,df2,df3],axis = 0,ignore_index = True)
print(res)#合併矩陣索引,產生新的索引
#join,['inner','outer']
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'],index = [1,2,3])
df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['b','c','d','e'],index = [2,3,4])
print(df1)
print(df2)
res = pd.concat([df1,df2],join = 'outer')#合併兩個矩陣中沒有的部分填充NAN
print(res)
res = pd.concat([df1,df2],join = 'inner',ignore_index = True)#裁剪出兩個矩陣都有的部分
print(res)
res = pd.concat([df1,df2],axis = 1,join_axes=[df1.index])
#按照df1的索引進行填充,df2中沒有的部分用NAN進行填充
print(res)
res = pd.concat([df1,df2],axis = 1)
print(res)
#append
df1 = pd.DataFrame(np.ones((3,4))*0,columns = ['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns = ['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns = ['a','b','c','d'])
res = df1.append(df2,ignore_index = True)
print(res)
res = df1.append([df2,df3],ignore_index = True)
print(res)
s1 = pd.Series([1,2,3,4],index = ['a','b','c','d'])
res = df1.append(s1,ignore_index = True)
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
a b c d a b c d a b c d
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
a b c d
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
b c d e
2 1.0 1.0 1.0 1.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
合併merge
import pandas as pd
#merging two df by key/keys.(may be used in database)
left = pd.DataFrame({'key':['K0','K1','K2','K3'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key':['K0','K1','K2','K3'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
res = pd.merge(left,right,on='key')
print(res)
#consider two keys
left = pd.DataFrame({'key1':['K0','K0','K1','K2'],
'key2':['K0','K1','K0','K1'],
'A':['A0','A1','A2','A3'],
'B':['B0','B1','B2','B3']})
right = pd.DataFrame({'key1':['K0','K1','K1','K2'],
'key2':['K0','K0','K0','K0'],
'C':['C0','C1','C2','C3'],
'D':['D0','D1','D2','D3']})
print(left)
print(right)
#how = ['inner','outer','left','right']
res = pd.merge(left,right,on=['key1','key2'],how='inner')
print(res)
res = pd.merge(left,right,on=['key1','key2'],how='outer')
print(res)
res = pd.merge(left,right,on=['key1','key2'],how='left')
print(res)
#indicator
df1 = pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
df2 = pd.DataFrame({'col1':[1,2,2],'col_left':[2,2,2]