python pandas(一)
阿新 • • 發佈:2018-12-21
# coding=utf-8
# pandas walkthrough: build small example tables, inspect them, clean them,
# then index/filter/group/sample them and compute summary statistics before
# exporting the result to CSV and Excel.
import numpy as np
import pandas as pd

# Read a .csv file.  read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
# NOTE(review): header=1 uses the SECOND line of the file as the column
# names -- confirm this is intended for this dataset.
df1 = pd.read_csv('./house_data/all/test.csv', header=1)
print(df1.shape)

df2 = pd.DataFrame(
    {
        "id": [1002, 1001, 1003, 1004, 1005, 1006],
        "date": pd.date_range('20130102', periods=6),
        "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        "age": [23, 44, 54, 32, 34, 32],
        "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        "price": [1200, np.nan, 2133, 5433, np.nan, 4432],
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price'],
)

df3 = pd.DataFrame(
    {
        "id": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],
        "gender": ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        "pay": ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y'],
        "m-point": [10, 12, 20, 40, 40, 40, 30, 20],
    }
)
# info() prints its summary itself (and returns None), so it is called
# directly rather than printing the bound method object.
df3.info()

# Optional inspection examples (disabled):
# print(df2.shape)              # dimensions
# df2.info()                    # basic table info (shape, column names, dtypes, memory)
# print(df2.dtypes)             # dtype of every column
# print(df2["city"].dtype)      # object
# print(df2.isnull())
# print(df2['price'].unique())  # distinct values of one column
# print(df2.values)             # underlying values of the table
# print('==============')
# print(df2.head(3))            # first 3 rows (head() defaults to 5)
# print(df2.tail(3))            # last 3 rows (tail() defaults to 5)

# ---- Data cleaning ----
# Fill missing values with 0.  The result gets its own name so that df3 (the
# gender/pay table used by the merge examples below) is not overwritten.
df_filled = df2.fillna(value=0)
df_filled.info()

# Fill NA in the price column with the column mean; note the assignment back
# into df2['price'] on the left-hand side.
df2['price'] = df2['price'].fillna(df2['price'].mean())
df2.info()

df2['city'] = df2['city'].str.strip()  # remove surrounding whitespace
print(df2)

df2['city'] = df2['city'].str.lower()  # normalise case
df2.info()
print(df2.dtypes)  # dtype of every column

# df2['price'] = df2['price'].astype(int)  # change a column's dtype
# print(df2.dtypes)

# Rename a column; rename() returns a new frame, so rebind df2.
df2 = df2.rename(columns={'category': 'category-size'})
df2.info()

# df2['city'] = df2['city'].drop_duplicates()  # would drop the later duplicates
# df2.info()

# Drop the earlier duplicates.  Assigning the shorter result back aligns on
# the index, so the dropped rows become NaN in the city column.
df2['city'] = df2['city'].drop_duplicates(keep='last')
df2.info()

df2['city'] = df2['city'].replace('sh', 'shanghai')  # value replacement
df2.info()
print('=======')

# Merge examples (disabled).  They join df2 and df3 on their shared 'id'
# column and must run before 'id' is moved into the index below.
# df_inner = pd.merge(df2, df3, how='inner')  # intersection
# print('inner:')
# df_inner.info()
# df_outer = pd.merge(df2, df3, how='outer')  # union
# print('outer:')
# df_outer.info()
# print('=========')
# df_left = pd.merge(df2, df3, how='left')
# print('left:')
# print(df_left)
# print('=========')
# df_right = pd.merge(df2, df3, how='right')
# print('right:')
# print(df_right)

df2 = df2.set_index('id')  # use id as the index
df2 = df2.sort_index()     # sort by index value
print(df2)

df2 = df2.sort_values(by=['age'])
print(df2)

df2['group'] = np.where(df2['price'] > 3000, 'high', 'low')  # adds the group column
print(df2)

# df2['price'] = np.where(df2['price'] > 3000, 'high', 'low')
# print(df2)

print(df2.dtypes)
# Flag matching rows; this creates the 'sign' column, NaN for all other rows.
df2.loc[(df2['city'] == 'beijing') & (df2['price'] >= 4000), 'sign'] = 1
print(df2)
print('=====')
print(df2.loc[df2['city'] == 'beijing'])  # loc selects by label / boolean mask
print('====')
print(df2.iloc[0:2])  # iloc selects rows by position

df2 = df2.reset_index()       # restore the default index
df2 = df2.set_index('date')   # use the date as the index
print('****')

# Rows dated on or before 2013-01-03.  The date index is not sorted here
# (rows are still ordered by age), and label slicing such as
# df2[:'2013-01-03'] is rejected on a non-monotonic DatetimeIndex by current
# pandas, so a boolean mask is used; it also keeps the row order unchanged.
up_to_jan3 = df2.index <= pd.Timestamp('2013-01-03')
print(df2.loc[up_to_jan3])

# With iloc the numbers are positions (starting at 0), not index labels:
# first three rows, first two columns.
print(df2.iloc[:3, :2])
# Rows at positions 0, 2 and 5, column at position 2.  The date index is
# always displayed alongside the selected column.
print(df2.iloc[[0, 2, 5], [2]])
print(df2)

# Rows up to 2013-01-03, first four columns.  The removed .ix indexer mixed
# label and position access; mask selection followed by iloc replaces it.
print(df2.loc[up_to_jan3].iloc[:, :4])

# Boolean Series: whether each city value is 'beijing'.
print(df2['city'].isin(['beijing']))
# Boolean Series: whether each city value is 'beijing' or 'shanghai'.
print(df2['city'].isin(['beijing', 'shanghai']))
# Keep only the rows whose city is 'beijing' or 'shanghai'.
print(df2.loc[df2['city'].isin(['beijing', 'shanghai'])])
print(df2)
# First three characters of city, as a one-column table.
print(pd.DataFrame(df2['city'].str[:3]))

# Combine and / or / not with comparisons to filter, then count and sum.
# Filter with "and".
print(df2.loc[(df2['age'] > 25) & (df2['city'] == 'beijing'),
              ['id', 'city', 'age', 'category-size', 'price']])
# Filter with "or".
print('或:', df2.loc[(df2['age'] > 25) | (df2['city'] == 'beijing'),
                    ['id', 'city', 'age', 'category-size', 'price']])
# Filter with "not", sort by id and count the non-null city values (4).
# Only columns that exist in df2 are selected: 'category' was renamed to
# 'category-size' above and 'gender' belongs to df3, so requesting either
# would raise KeyError.
print(df2.loc[(df2['city'] != 'beijing'),
              ['id', 'city', 'age', 'category-size']].sort_values(['id']).city.count())

# Filter with query().
print(df2.query('city == ["beijing","shanghai"]'))
# Sum price over the filtered rows.
print(df2.query('city == ["beijing","shanghai"]').price.sum())  # 11031.0
print('*****')
print(df2)
print(df2.groupby('city').count())  # per-city count of every column
print('*****')
print(df2.groupby('city').id.count())  # per-city count of the id column
# Group by two fields; rows are counted together only when both city and age match.
print(df2.groupby(['city', 'age']).id.count())
# Per-city row count, total and mean of price.  String aggregator names are
# used for sum/mean instead of passing the NumPy callables.
print(df2.groupby('city')['price'].agg([len, 'sum', 'mean']))
# Expected output:
#            len     sum    mean
# city
# beijing    1.0  4432.0  4432.0
# guangzhou  1.0  2133.0  2133.0
# shanghai   2.0  6599.0  3299.5
# shenzhen   1.0  5433.0  5433.0

df = df2.sample(n=3)  # draw 3 random rows
print(df)
print('====')
weights = [0.8, 0, 0, 0, 0.1, 0.1]  # per-row sampling weights
print(df2.sample(n=2, weights=weights))

print(df2.describe().round(2).T)  # descriptive statistics
# Expected output:
#        count    mean      std     min      25%     50%      75%     max
# id       6.0  1003.5     1.87  1001.0  1002.25  1003.5  1004.75  1006.0
# age      6.0    36.5    10.88    23.0    32.00    33.0    41.50    54.0
# price    6.0  3299.5  1523.35  1200.0  2424.62  3299.5  4148.88  5433.0
# sign     1.0     1.0      NaN     1.0     1.00     1.0     1.00     1.0

print(df2['price'].std())  # standard deviation of one column: 1523.35163373
print(df2['price'].cov(df2['age']))  # covariance of two numeric columns (strings unsupported)

# Covariance / correlation matrices are only defined for numeric data, so the
# string columns (city, category-size, group) are excluded explicitly.
numeric_df = df2.select_dtypes(include='number')
print(numeric_df.cov())  # covariance between all numeric columns
# Expected output:
#            id     age      price  sign
# id        3.5    -4.9     1526.1   NaN
# age      -4.9   118.3    -1353.5   NaN
# price  1526.1 -1353.5  2320600.2   NaN
# sign      NaN     NaN        NaN   NaN

# Correlation of two columns: the coefficient lies in [-1, 1]; near 1 is a
# positive correlation, near -1 a negative one, 0 none.  -0.0816894035549328
print(df2['price'].corr(df2['age']))
# Correlation between all numeric columns.
print(numeric_df.corr())

df2.to_csv('./excel_to_python.csv')  # write CSV
# Write Excel (pandas needs an Excel writer engine such as openpyxl installed).
df2.to_excel('./excel_to_python.xlsx', sheet_name='bluewhale_cc')