pandas學習(快速入門)
阿新 • • 發佈:2018-11-02
資料結構Series
#Series 是一個一維陣列結構,可以存入任意一種 Python 資料型別(integers, strings, floating point numbers, Python objects, etc.)。
# Series quick tour: build from a list, relabel, filter, then build from a dict.
from pandas import Series

print("用列表生成Series")
obj = Series([4, 7, -5, 3])
print(obj)  # Printed as index/value pairs, much like a dict.
print(obj.values)
print(obj.index)
print(".........................")

# Assign explicit index labels to a Series.
obj2 = Series([4, -7, 5, 3])
obj2.index = ['d', 'c', 'b', 'a']
print(obj2)
print(obj2.index)
print(obj2['a'])
obj2['d'] = 2
# Positional slice; .iloc keeps it unambiguous on a label-indexed Series.
print(obj2.iloc[2:])
print("///////////")
print(obj2[obj2 > 0])  # Keep only the elements greater than 0.
print("///////////")
# `in` tests membership against the index labels, not the values.
print('b' in obj2)
print('e' in obj2)

print("用字典生成Series")
sdata = {'Ohio': 45000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = Series(sdata)
print(obj3)
print()

print('使用字典生成Series,並額外指定index,不匹配部分為NaN。')
states = ['California', 'Ohio', 'Oregon', 'Texas']
# 'California' has no entry in sdata, so it becomes NaN; 'Utah' is dropped.
obj4 = Series(sdata, index=states)
print(obj4)
print()

print('Series相加,相同索引部分相加。')
# Addition aligns on labels; labels present on only one side give NaN.
print(obj3 + obj4)
print()

print('指定Series及其索引的名字')
obj4.name = 'population'
obj4.index.name = 'state'
print(obj4)
print()

print('替換index')
# Replacing the index in place requires one label per existing element.
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
print(obj)
資料結構DataFrame
# DataFrame quick tour: construction, column access/assignment, transpose, naming.
import numpy as np
from pandas import Series, DataFrame

print('用字典生成DataFrame,key為列的名字。')
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
print(DataFrame(data))
print(DataFrame(data, columns=['year', 'state', 'pop']))  # Explicit column order.
print()

print('指定索引,在列中指定不存在的列,預設資料用NaN。')
# 'debt' is not a key of `data`, so that column is created filled with NaN.
frame2 = DataFrame(data,
                   columns=['year', 'state', 'pop', 'debt'],
                   index=['one', 'two', 'three', 'four', 'five'])
print(frame2)
print("//////////////////////")
print(frame2.state)
print("、、、、、、、、、、、、")
print(frame2.year)
print(".......................")
# Row lookup by label (.ix was removed from pandas; .loc is the label-based accessor).
print(frame2.loc['three'])
print("!!!!!!!!!!!!!!!!!!!!!!!")
frame2['debt'] = 16.5  # A scalar is broadcast to the whole column.
print(frame2)
frame2['debt'] = np.arange(5)  # An array must match the number of rows.
print(frame2)
print()

print('用Series指定要修改的索引及其對應的值,沒有指定的預設資料用NaN。')
# Assignment from a Series aligns on the index; unmatched rows become NaN.
val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)
print()

print('賦值給新列')
frame2['eastern'] = (frame2.state == 'Ohio')  # True where state equals 'Ohio'.
print(frame2)
print(frame2.columns)
print()

print('DataFrame轉置')
# Nested dict: outer keys become columns, inner keys become the row index.
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
print(frame3)
print(frame3.T)
print()

print('指定索引順序,以及使用切片初始化資料。')
print(DataFrame(pop, index=[2001, 2002, 2003]))
# Positional slices of existing columns; .iloc makes the positional intent explicit.
pdata = {'Ohio': frame3['Ohio'].iloc[:-1],
         'Nevada': frame3['Nevada'].iloc[:2]}
print(DataFrame(pdata))
print()

print('指定索引和列的名稱')
frame3.index.name = 'year'
frame3.columns.name = 'state'
print(frame3)
print(frame3.values)
print(frame2.values)
索引物件
# Index objects: obtaining one, immutability, building one, membership tests.
import numpy as np
import sys
from pandas import Series, DataFrame, Index

print('獲取index')
obj = Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index[1:])
try:
    index[1] = 'd'  # Index objects are immutable, so item assignment raises.
except TypeError:
    # Print the exception class, as the original did via sys.exc_info().
    print(sys.exc_info()[0])
print()

print('使用Index物件')
# Index wraps the array in an immutable Index object (it does not make a list);
# the two prints below show the raw array next to the resulting Index.
index = Index(np.arange(3))
print(np.arange(3))
print(index)
obj2 = Series([1.5, -2.5, 0], index=index)
print(obj2)
# NOTE(review): whether the Series keeps the very same Index object may depend
# on the pandas version - confirm before relying on identity.
print(obj2.index is index)
print()

print('判斷列和索引是否存在')
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = DataFrame(pop)
print('Ohio' in frame3.columns)
# The row index holds integers, so membership is tested with an integer.
print(2003 in frame3.index)
重新索引
import numpy as np
from pandas import DataFrame, Series
# Reindexing: conform a Series/DataFrame to a new set of labels.
print('重新指定索引及順序')
obj = Series([4.5, 7.2, -5.3, 3.6], index=list('dbac'))
print(obj)

new_labels = ['a', 'b', 'd', 'c', 'e']
obj2 = obj.reindex(new_labels)  # 'e' is new, so its value is NaN.
print(obj2)
# fill_value supplies the value for labels missing from the original.
print(obj.reindex(new_labels, fill_value=0))
print()

print('重新指定索引並指定填元素充方法')
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
# 'ffill' carries the last seen value forward into the gaps.
print(obj3.reindex(range(6), method='ffill'))
print()

print('對DataFrame重新指定索引')
frame = DataFrame(
    np.arange(9).reshape((3, 3)),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)
print(frame)
# A positional argument to DataFrame.reindex targets the row index.
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)
print()

print('重新指定column')
states = ['Texas', 'Utah', 'California']
print(frame.reindex(columns=states))
print()
刪除指定軸上的項
import numpy as np
from pandas import Series, DataFrame
# Dropping entries from an axis; drop() returns a new object each time.
print('Series根據索引刪除元素')
obj = Series(np.arange(5.), index=list('abcde'))
new_obj = obj.drop('c')  # Remove a single element by its label.
print(new_obj)
print(obj.drop(['d', 'c']))  # Pass a list to remove several labels at once.
print()

print('DataFrame刪除元素,可指定索引或列。')
data = DataFrame(
    np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data)
print(data.drop(['Colorado', 'Ohio']))  # Default axis drops rows.
print(data.drop('two', axis='columns'))
print(data.drop(['two', 'four'], axis='columns'))
索引、選取和過濾
import numpy as np
from pandas import Series, DataFrame
# Indexing, selection and filtering on Series and DataFrame.
print('Series的索引,預設數字索引可以工作。')
obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
print(obj)
print(obj['b'])  # Label-based lookup.
# Positional lookups go through .iloc (0-based); plain obj[3] relied on a
# positional fallback that newer pandas versions no longer provide.
print(obj.iloc[3])
print(obj.iloc[[1, 3]])  # Elements at positions 1 and 3.
print(obj[obj < 2])  # Boolean mask.
print()

print('Series的陣列切片')
print(obj['b':'c'])  # Label slices include both endpoints.
print(obj.iloc[1:3])  # Positional slice selecting the same two elements.
obj['b':'c'] = 5  # Assign through a label slice.
print(obj)
print()

print('DataFrame的索引')
data = DataFrame(np.arange(16).reshape((4, 4)),
                 index=['Ohio', 'Colorado', 'Utah', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
print(data)
print(data['two'])  # A single column.
print(data[['three', 'one']])
print(data[:2])  # A slice inside [] selects rows, not columns.
# .ix was removed from pandas; use .loc for labels and .iloc for positions.
print(data.loc['Colorado', ['two', 'three']])  # Row label + column labels.
# Row labels combined with column positions: translate positions to labels first.
print(data.loc[['Colorado', 'Utah'], data.columns[[3, 0, 1]]])
print(data.iloc[2])  # Row at position 2 (0-based).
print(data.loc[:'Utah', 'two'])  # Rows from the start through 'Utah', column 'two'.
print()

print('根據條件選擇')
print(data[data.three > 5])
print(data < 5)  # Element-wise boolean frame.
data[data < 5] = 0  # Overwrite every cell where the mask is True.
print(data)
算術運算和資料對齊
import numpy as np
from pandas import Series, DataFrame
# Arithmetic and data alignment between Series and DataFrame objects.
print('加法')
s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
print(s1)
print(s2)
# Values are added where labels match; labels on one side only give NaN.
print(s1 + s2)
print()

print('DataFrame加法,索引和列都必須匹配。')
df1 = DataFrame(np.arange(9.).reshape((3, 3)),
                columns=list('bcd'),
                index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape((4, 3)),
                columns=list('bde'),
                index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(df1)
print(df2)
print(df1 + df2)
print()

print('資料填充')
df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
print(df1)
print(df2)
# Like df1 + df2, but a cell missing from one frame is treated as fill_value.
print(df1.add(df2, fill_value=1))
# Existing columns are kept; columns new to df1 are filled with fill_value.
print(df1.reindex(columns=df2.columns, fill_value=0))
print()

print('DataFrame與Series之間的操作')
arr = np.arange(12.).reshape((3, 4))
print(arr)
print(arr[0])
print(arr[0][0])
print(arr - arr[0])  # NumPy broadcasting: the first row is subtracted from every row.
frame = DataFrame(np.arange(12).reshape((4, 3)),
                  columns=list('bde'),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# First row by position (.ix was removed from pandas; .iloc is positional).
series = frame.iloc[0]
print(frame)
print(series)
print("////////////")
# The Series index is matched against the frame's columns and applied to each row.
print(frame - series)
series2 = Series(range(3), index=list('bef'))
# Labels present on only one side ('d' and 'f') yield all-NaN columns.
print(frame + series2)
series3 = frame['d']
# axis=0 matches the Series against the row index instead of the columns.
print(frame.sub(series3, axis=0))
函式應用和對映
import numpy as np
from pandas import Series, DataFrame
# Applying NumPy ufuncs and custom functions to a DataFrame.
print('函式')
# randn draws from the standard normal distribution, so values may be negative
# (np.random.rand would draw from [0, 1) instead).
random_values = np.random.randn(4, 3)
frame = DataFrame(
    random_values,
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
print(frame)
print(np.abs(frame))  # Element-wise absolute value.
print()

print('lambda以及應用')
f = lambda x: x.max() - x.min()  # Spread (max minus min) of a Series.
print(frame.apply(f))  # By default the function receives each column.
print(frame.apply(f, axis='columns'))  # Here it receives each row instead.
排序
import numpy as np
from pandas import Series, DataFrame
# Sorting by labels, sorting by values, and ranking.
print('根據索引排序,對於DataFrame可以指定軸。')
obj = Series(range(4), index=list('dabc'))
print(obj)
print(obj.sort_index())

frame = DataFrame(
    np.arange(8).reshape((2, 4)),
    index=['three', 'one'],
    columns=list('dabc'),
)
print(frame.sort_index())  # Sort by row labels.
print(frame.sort_index(axis='columns'))  # Sort by column labels.
print(frame.sort_index(axis='columns', ascending=False))  # Descending order.
print()

print('根據值排序')
obj = Series([4, 7, -3, 2])
print(obj.sort_values())  # Replaces the retired Series.order().
print()

print('DataFrame指定列排序')
frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)
print(frame.sort_values(by='b'))  # Replaces the retired sort_index(by=...).
print(frame.sort_values(by=['a', 'b']))
print()

print('rank,求排名的平均位置(從1開始)')
obj = Series([7, -5, 7, 4, 2, 0, 4])
# Sorted positions: -5(1), 0(2), 2(3), 4(4), 4(5), 7(6), 7(7)
print(obj.rank())  # Ties share the average of their positions.
print(obj.rank(method='first'))  # Ties are ranked in order of appearance.
# Descending, ties take the largest position of their group, so -5 ranks 7.
print(obj.rank(ascending=False, method='max'))

frame = DataFrame({
    'b': [4.3, 7, -3, 2],
    'a': [0, 1, 0, 1],
    'c': [-2, 5, 8, -2.5],
})
print(frame)
print(frame.rank(axis='columns'))  # Rank the values within each row.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Six consecutive dates starting 2016-01-01; the commented-out examples below
# use this as a row index.
data = pd.date_range(start='20160101', periods=6)
# print(data)
# df=pd.DataFrame(np.random.randn(6,4),index=data,columns=['a','b','c','d'])
# print (df)
# #根據值進行排序通過by來指定根據哪一個值排序
# df=pd.DataFrame({'name':['sunshunli','sunguozhu','sundejiang','wangli'],
# 'age':[22,21,46,45]},index=['one','two','three','four'])
# print(df)
# # print(df.describe())
# # print(df.T)
# print("###########")
# print(df.sort_index(axis=1,ascending=False))#axis如果是1的話,那麼是對列標籤(columns)進行排序
# # print("##############")
# print(df.sort_index(axis=0,ascending=False))#axis如果是0 那麼是對行索引(index)進行排序
# print(df.sort_values(by='age'))
#資料篩選
# df=pd.DataFrame(np.arange(24).reshape(6,4),index=data,columns=['A','B','C','D'])
# print(df)
# print(df['A'])
# print(df[0:3])
# print(df['20160102':'20160105'])
# print(df.loc['20160102'])
# print(df.loc['20160102',['A','B']])
# print(df.loc[:,['A','B']])
# print(df.iloc[3])
# print(df.iloc[3,2])#相當於座標
# print(df.iloc[3:5,1:3])#切片
# print(df.iloc[[1,3,5],2:4])#不連續的
# print(df.ix[3,['A','C']])#混合
# print(df[df.A > 8])
# df=pd.DataFrame(np.arange(24).reshape(6,4),index=data,columns=['A','B','C','D'])
# print(df)
# df.iloc[2,2]=100
# print(df)
# df.loc['20160102','A']=200
# print(df)
# df.ix[3,'B']=300
# print(df)
# df[df.D > 7]=600
# print(df)
# df.A[df.D > 2]=600
# print(df)
# df['F']=np.nan
# df['F']=df['B']+df['C']
# print(df)
# df.ix[0,'B']=np.nan
# df.ix[1,'C']=np.nan
# print(df.dropna(axis=0,how='any'))#如果是any 的話,只要有一個NaN那麼就會丟掉這個資料
# print("###########")
# print(df.dropna(axis=0,how='all'))#如果是all 的話,只有全部是NaN那麼就會丟掉這個資料
# print("##########")
# print(df.dropna(axis=1,how="any"))
# print("############")
# print(df.dropna(axis=1,how='all'))
# print(df.fillna(0))#把為NaN的值填充值
# print(df.isnull())
# print(np.any(df.isnull()==True))#判斷資料中是否有空值
#pandas資料匯入匯出
# col_names = ["ID", "K1K2驅動訊號", "電子鎖驅動訊號", "急停訊號", "門禁訊號", "THDV-M", "THDI-M", "label"]
# data=pd.read_csv('data_test.csv',names=col_names)
# print(data)
# #儲存
# data.to_csv('channge')
#pandas合併
#concatenating 這種連線只能是整個表進行連線,不能按照某一列
# df1=pd.DataFrame(np.ones([3,4])*0,columns=['a','b','c','d'])
# df2=pd.DataFrame(np.ones([3,4])*1,columns=['a','b','c','d'])
# df3=pd.DataFrame(np.ones([3,4])*2,columns=['a','b','c','d'])
# print(df1)
# print(df2)
# # print(df3)
# res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)#列的方向合併
# print(res)
# res1 = pd.concat([df1,df2,df3],axis=1,ignore_index=True)
# print(res1)
#join ['inner','outer']
# df1=pd.DataFrame(np.ones([3,4])*0,columns=['a','b','c','d'],index=[1,2,3])
# df2=pd.DataFrame(np.ones([3,4])*1,columns=['b','c','d','e'],index=[2,3,4])
# print(df1)
# print(df2)
# res = pd.concat([df1,df2],axis=0)
# print(res)
# res = pd.concat([df1,df2],join='outer')#合併是把所有的列都合併,沒有的用NaN來填充
# print(res)
# res = pd.concat([df1,df2],join='inner')#合併是隻合併這兩個資料共有的部分
# print(res)
#join_axes
# res = pd.concat([df1,df2],axis=1)
# print(res)
# res1 = pd.concat([df1,df2],axis=1,join_axes=[df1.index])#join_axes作用是用哪一個資料的index來進行合併(注意:join_axes 已於 pandas 1.0 移除,可改用 pd.concat([df1,df2],axis=1).reindex(df1.index))
# print(res1)
#append 新增(注意:DataFrame.append 已於 pandas 2.0 移除,請改用 pd.concat)
# df1=pd.DataFrame(np.ones([3,4])*0,columns=['a','b','c','d'],index=[1,2,3])
# df2=pd.DataFrame(np.ones([3,4])*1,columns=['a','b','c','d'],index=[1,2,3])
# df3=pd.DataFrame(np.ones([3,4])*2,columns=['a','b','c','d'],index=[1,2,3])
# res = df1.append(df2)
# print(res)
# res = df1.append([df2,df3],ignore_index=True)
# print(res)
# df1=pd.DataFrame(np.ones([3,4])*0,columns=['a','b','c','d'],index=[1,2,3])
# s1=pd.Series([1,2,3,4],index=['a','b','c','d'])
# res = df1.append(s1,ignore_index=True)#必須要加上ignore_index
# print(res)
#merging 方法合併DataFrame
# left = pd.DataFrame({'key':['k0','k1','k2','k3'],
# 'A':['A0','A1','A2','A3'],
# 'B':['B0','B1','B2','B3']})
# right = pd.DataFrame({'key':['k0','k1','k2','k3'],
# 'C':['C0','C1','C2','C3'],
# 'D':['D0','D1','D2','D3']})
# res = pd.merge(left,right,on='key')#基於兩個表中相同的鍵列(key)進行合併
# print(res)
#資料視覺化
#plot data
# data = pd.Series(np.random.randn(1000),index=np.arange(1000))
# data = data.cumsum()
# data.plot()
# plt.show()
import pandas as pd
import numpy as np
# dates = pd.date_range('20130101', periods=6)
# print(dates)
# DatetimeIndex=['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
# '2013-01-05', '2013-01-06'],
# df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
# print(df)
# print(df.describe())#函式的作用是你描述資料,顯示資料的一些資訊
# Handling missing data.
# np.nan is the canonical spelling; the np.NaN / np.NAN aliases were removed
# in NumPy 2.0.
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, np.nan, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, np.nan, 2.9]}
a = pd.DataFrame(data)
print(a)
# Drop every row that contains at least one missing value.
res1 = a.dropna(how='any')
print(res1)
# Replace every missing value with 0.0.
res2 = a.fillna(0.0)
print(res2)