Series物件與DataFrame物件
阿新 • • 發佈:2020-12-18
import numpy as np
import pandas as pd

# Walkthrough of pandas Series / DataFrame basics: construction, Index set
# operations, indexers, index alignment and missing-data handling.
# Every section prints its results, so the script reads top to bottom.

# --- Creating Series objects ------------------------------------------------
# An explicit index need not be ordered or contiguous:
# pd.Series(data, index=index)
x = pd.Series([1, 2, 3, 4], index=[3, 4, 5, 6])
print(x)

# A scalar is repeated for every index label (similar to broadcasting).
x = pd.Series("Hanks", index=[1, 2, 4, 5])
print(x)

# With dict data the keys become the index; passing `index` keeps only the
# listed key-value pairs.
x = pd.Series({3: 'c', 2: 'b', 1: 'a'}, index=[2, 3])
print(x)

# --- Creating DataFrame objects ---------------------------------------------
population = {'henan': 1000, 'shandong': 200, 'hubei': 400}
area = {'henan': 98, 'shandong': 900, 'hubei': 4000}
population = pd.Series(population)
# One column is a Series and the other a plain dict; both are keyed by the
# same province names.
province = pd.DataFrame({'population': population, 'area': area})
print(province)
# Unlike a plain multi-dimensional array, indexing by name returns a column.
print(province['area'])

# Build a DataFrame from a two-dimensional array.
abc = pd.DataFrame(np.random.rand(3, 2),
                   columns=['foo', 'bar'],
                   index=['a', 'b', 'c'])
print(abc)

# --- Index: an immutable array ----------------------------------------------
x = pd.Index([3, 2, 5, 9])
try:
    # Item assignment is rejected: "Index does not support mutable operations".
    x[4] = 5
except TypeError as err:
    # Catch the error so the demonstration does not abort the whole script.
    print(err)
print(x)

y = pd.Index([4, 6, 9, 23, 3])
# Use the named set methods: the &, | and ^ operators on Index objects are
# not set operations in current pandas releases.
index_intersection = x.intersection(y)
index_union = x.union(y)
index_symmetric_difference = x.symmetric_difference(y)
print(index_intersection)           # intersection
print(index_union)                  # union
print(index_symmetric_difference)   # symmetric difference

# --- Indexers: loc and iloc (ix has been removed from pandas) ---------------
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data[3])          # explicit (label) index
print(data[1:3])        # implicit (positional) index when slicing
print(data.loc[1:3])    # always explicit
print(data.iloc[1:3])   # always implicit

# --- Selecting data from a DataFrame ----------------------------------------
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area': area, 'pop': pop})
# Attribute access is not always usable: it fails when the column name
# collides with a method name or is not a string. `data.pop` is the
# DataFrame.pop method, so the 'pop' column must be read with brackets.
print(data.area, data['pop'])
# Attribute access and bracket access yield the same column. Compare by
# value rather than by object identity.
print(data.area.equals(data['area']))

data['density'] = data['pop'] / data['area']
print(data.values)
print(data.T)
# iloc: positional slicing (end excluded).
print(data.iloc[:3, :2])
# loc: label slicing (end included).
print(data.loc[:'Illinois', :'pop'])
# The hybrid `ix` indexer no longer exists; combine iloc (rows by position)
# with loc (columns by label) to get the same selection.
mixed_selection = data.iloc[:3].loc[:, :'pop']
print(mixed_selection)

x = np.random.RandomState(43)
print(x)

# --- Operating on data ------------------------------------------------------
# Unary operations preserve index and column labels; binary operations
# align the operands on their indexes before computing.
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')
print(area / population)    # index alignment
x = area / population

A = np.random.randint(10, size=(3, 4))
print(A)
print(A - A[0])     # the first row is subtracted from every row

# --- Handling missing data --------------------------------------------------
print(x.isnull())
print(x[x.notnull()])
print(x.dropna())
print(x)

df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
print(df.dropna())                  # drop rows containing any NaN
print(df.dropna(axis='columns'))    # drop columns containing any NaN
df[3] = np.nan
print(df)
print(df.dropna(axis='columns', how='all'))     # drop all-NaN columns only
print(df.dropna(axis='index', thresh=3))        # keep rows with >= 3 values

# Fill every missing value with a constant.
print(df.fillna(9999))
# Forward fill along each row (fillna(method=...) is deprecated).
filled_forward = df.ffill(axis=1)
print(filled_forward)
# Backward fill along each row.
filled_backward = df.bfill(axis=1)
print(filled_backward)