1. 程式人生 > >pandas基礎知識(DataFrame)

pandas基礎知識(DataFrame)

> 多維的pandas.Series
import pandas as pd 
import numpy as np
data = {
    'color':['blue', 'green', 'yellow', 'red', 'white'],
    'object':['ball', 'pen', 'pencil', 'paper', 'mug'],
    'price':[1.2, 1.0, 0.6, 0.9, 1.7]
}
frame = pd.DataFrame(data)
frame
color object price
0 blue ball 1.2
1 green pen 1.0
2 yellow pencil 0.6
3 red paper 0.9
4 white mug 1.7
# 指定列名讀取
frame2 = pd.DataFrame(data, columns=['object', 'price'])
frame2
object price
0 ball 1.2
1 pen 1.0
2 pencil 0.6
3 paper 0.9
4 mug 1.7
# 修改索引
frames3 = pd.DataFrame(data, index=['one', 'two', 'three', 'four', 'five'])
frames3
color object price
one blue ball 1.2
two green pen 1.0
three yellow pencil 0.6
four red paper 0.9
five white mug 1.7
frame4 = pd.DataFrame(np.arange(16).reshape(4,4),
                     index=['one', 'two', 'three', 'four'],
                     columns=['blue', 'green', 'yellow', 'red'])
frame4
blue green yellow red
one 0 1 2 3
two 4 5 6 7
three 8 9 10 11
four 12 13 14 15
### 選取資料
frame4.columns
Index(['blue', 'green', 'yellow', 'red'], dtype='object')
frame4.index
Index(['one', 'two', 'three', 'four'], dtype='object')
frame4.values
array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15]])
frame4.red
one 3
two 7
three 11
four 15
Name: red, dtype: int64
# frame4.ix[2] —— .ix 已在 pandas 1.0 中移除,請改用 .iloc(按位置)或 .loc(按標籤)
frame4.iloc[:,2]
one 2
two 6
three 10
four 14
Name: yellow, dtype: int64
# 先選列('red'),再取該列中位置為 3 的元素
frame4['red'][3]
15
### 賦值
frame
color object price
0 blue ball 1.2
1 green pen 1.0
2 yellow pencil 0.6
3 red paper 0.9
4 white mug 1.7
frame.index.name = "id"
frame.columns.name = "item"
frame
item color object price
id
0 blue ball 1.2
1 green pen 1.0
2 yellow pencil 0.6
3 red paper 0.9
4 white mug 1.7
frame['new'] = 12
frame
item color object price new
id
0 blue ball 1.2 12
1 green pen 1.0 12
2 yellow pencil 0.6 12
3 red paper 0.9 12
4 white mug 1.7 12
frame['new'] = [1.2, 2.3, 3.5, 5.8, 9.6]
frame
item color object price new
id
0 blue ball 1.2 1.2
1 green pen 1.0 2.3
2 yellow pencil 0.6 3.5
3 red paper 0.9 5.8
4 white mug 1.7 9.6
### numpy.array->pandas.DataFrame
ser = pd.Series(np.arange(5))
ser
0 0
1 1
2 2
3 3
4 4
dtype: int64
frame['new'] = ser
frame
item color object price new
id
0 blue ball 1.2 0
1 green pen 1.0 1
2 yellow pencil 0.6 2
3 red paper 0.9 3
4 white mug 1.7 4
### 元素所屬關係
frame.isin([1.0,'pen'])
item color object price new
id
0 False False False False
1 False True True True
2 False False False False
3 False False False False
4 False False False False
frame[frame.isin([1.0,'pen'])]
item color object price new
id
0 NaN NaN NaN NaN
1 NaN pen 1.0 1.0
2 NaN NaN NaN NaN
3 NaN NaN NaN NaN
4 NaN NaN NaN NaN
### 刪除一列
del frame['new']
frame
item color object price
id
0 blue ball 1.2
1 green pen 1.0
2 yellow pencil 0.6
3 red paper 0.9
4 white mug 1.7
### 篩選
frame4
blue green yellow red
one 0 1 2 3
two 4 5 6 7
three 8 9 10 11
four 12 13 14 15
frame4[frame4<12]
blue green yellow red
one 0.0 1.0 2.0 3.0
two 4.0 5.0 6.0 7.0
three 8.0 9.0 10.0 11.0
four NaN NaN NaN NaN
### 巢狀字典生成DataFrame
nestdict = {'red':{2012:22, 2013:33},
            'white':{2011:13, 2012:22, 2013:16},
            'blue':{2011:17, 2012:27, 2013:18}}
frame5 = pd.DataFrame(nestdict)
frame5
blue red white
2011 17 NaN 13
2012 27 22.0 22
2013 18 33.0 16
### 轉置
frame5.T
2011 2012 2013
blue 17.0 27.0 18.0
red NaN 22.0 33.0
white 13.0 22.0 16.0
nestdict = {'red':{2012:22, 2013:33},
            'white':{2011:13, 2012:22, 2013:16},
            'blue':{2011:17, 2012:27, 2013:18}}
frame5 = pd.DataFrame(nestdict)
frame5
blue red white
2011 17 NaN 13
2012 27 22.0 22
2013 18 33.0 16