1. 程式人生 > >python pandas(一)

python pandas(一)

#coding=utf-8
import numpy as np
import pandas as pd

# Load the CSV. read_csv already returns a DataFrame, so no extra
# pd.DataFrame(...) wrapper is needed.
# header=1 takes the column names from the second line of the file; the
# first line is skipped.
# NOTE(review): confirm the file really has a throw-away first line,
# otherwise header=0 is the right value.
df1 = pd.read_csv('./house_data/all/test.csv', header=1)
print(df1.shape)  # (number of rows, number of columns)

# Sample table used by the rest of the script. 'city' is intentionally messy
# (stray spaces, mixed case) and 'price' contains NaN so that the cleaning
# steps have something to work on. `columns=` fixes the column order.
df2 = pd.DataFrame(
    {
        "id": [1002, 1001, 1003, 1004, 1005, 1006],
        "date": pd.date_range('20130102', periods=6),
        "city": ['Beijing ', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
        "age": [23, 44, 54, 32, 34, 32],
        "category": ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
        "price": [1200, np.nan, 2133, 5433, np.nan, 4432],
    },
    columns=['id', 'date', 'city', 'category', 'age', 'price']
)

# Second table keyed by the same kind of id (gender / payment flag / points).
df3 = pd.DataFrame(
    {
        "id": [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008],
        "gender": ['male', 'female', 'male', 'female', 'male', 'female', 'male', 'female'],
        "pay": ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y'],
        "m-point": [10, 12, 20, 40, 40, 40, 30, 20],
    }
)
# info is a method: it has to be called, and it prints the summary itself
# (index range, columns, non-null counts, dtypes, memory usage).
df3.info()

# Disabled inspection snippets. They sit inside a bare string literal, so
# none of them run. What each line would show if re-enabled:
#   shape              -> (rows, columns)
#   info               -> table summary; NOTE: it must be called as
#                         df2.info() to print the summary
#   dtypes             -> dtype of every column
#   df2["city"].dtype  -> dtype of one column
#   isnull()           -> boolean mask of missing values
#   unique()           -> distinct values of one column
#   values             -> the underlying array of the table
#   head(3) / tail(3)  -> first / last 3 rows (with no argument pandas
#                         returns 5 rows, not 10 as the inline notes say)
'''
print df2.shape  # 維度檢視
print df2.info   # 資料表基本資訊(維度、列名稱、資料格式、所佔空間等)
print df2.dtypes  # 檢視每一列的資料型別

print df2["city"].dtype  # object
print df2.isnull()
print df2['price'].unique()  # 即看某一列的值有哪些,避免重複
print  df2.values  # 檢視資料表的值
print '=============='
print df2.head(3) #預設前10行資料
print df2.tail(3)    #預設後10 行資料
'''

# ---- Data cleaning ----

# Zero-filled copy, for inspection only. It gets its own name so that df3
# (the gender/pay table) is not overwritten.
df2_zero_filled = df2.fillna(value=0)
df2_zero_filled.info()

# Fill missing prices with the column mean. fillna returns a new Series, so
# the result has to be assigned back to the column.
df2['price'] = df2['price'].fillna(df2['price'].mean())
df2.info()

# Trim surrounding whitespace from the city names.
df2['city'] = df2['city'].str.strip()
print(df2)

# Normalise the city names to lower case.
df2['city'] = df2['city'].str.lower()
df2.info()
print(df2.dtypes)  # dtype of every column
# To change a column's dtype:
#   df2['price'] = df2['price'].astype(int)

# rename returns a new frame; rebind df2 to keep the new column name.
df2 = df2.rename(columns={'category': 'category-size'})
df2.info()

# drop_duplicates(keep='last') returns a shorter Series in which only the last
# occurrence of each city survives. Assigning it back aligns on the index, so
# no row is removed from df2: the earlier duplicates end up with NaN in 'city'.
# (The default keep='first' would blank out the later duplicates instead.)
df2['city'] = df2['city'].drop_duplicates(keep='last')
df2.info()

# Replace the abbreviation with the full city name (whole-value match).
df2['city'] = df2['city'].replace('sh', 'shanghai')
df2.info()
print('=======')

# Disabled merge examples (inner / outer / left / right joins of df2 with
# df3). They sit inside a string literal, so none of them run. The opening
# delimiter has four quotes, so the literal simply starts with a stray "'".
# NOTE(review): with no `on=` argument pd.merge joins on the columns the two
# frames have in common; before re-enabling, check that df3 still refers to
# the gender/pay table at this point in the script.
''''
df_inner=pd.merge(df2,df3,how='inner')  # 匹配合並,交集
print 'inner:'
print df_inner.info

df_outer=pd.merge(df2,df3,how='outer')  #並集
print 'outer:'
print df_outer.info

print '========='
df_left=pd.merge(df2,df3,how='left')
print 'left:'
print df_left

print '========='
df_rigth=pd.merge(df2,df3,how='right')
print 'right:'
print df_rigth
'''

# Use id as the index and order the rows by it.
df2 = df2.set_index('id').sort_index()
print(df2)

# Re-order the rows by ascending age.
df2 = df2.sort_values('age')
print(df2)

# New 'group' column: 'high' where price is above 3000, 'low' elsewhere.
df2['group'] = np.where(df2['price'].gt(3000), 'high', 'low')
print(df2)

# Disabled alternative: overwrite 'price' itself with the labels instead of
# adding a separate column.
#   df2['price'] = np.where(df2['price'] > 3000, 'high', 'low')
#   print df2
print(df2.dtypes)
# Set 'sign' to 1 on beijing rows priced at 4000 or more; the column is
# created by this assignment and every non-matching row is left as NaN.
df2.loc[df2['city'].eq('beijing') & df2['price'].ge(4000), 'sign'] = 1
print(df2)

print('=====')
print(df2.loc[df2['city'] == 'beijing'])  # loc with a boolean mask: rows whose city is beijing
print('====')
print(df2.iloc[0:2])  # iloc is positional: the first two rows

df2 = df2.reset_index()      # move the current index back into a regular column
df2 = df2.set_index('date')  # use the date column as the index
print('****')
print(df2[:'2013-01-03'])    # label slice on the date index, through 2013-01-03
# With iloc the numbers around the colon are positions (starting at 0), not
# index labels: first three rows, first two columns.
print(df2.iloc[:3, :2])

# Rows at positions 0, 2 and 5, and only the column at position 2. The date
# index is always displayed alongside the selected data.
print(df2.iloc[[0, 2, 5], [2]])
print(df2)

# Mixed selection: rows by date label, columns by position (first four).
# The old .ix indexer did this in one call but has been removed from pandas,
# so the label part goes through .loc and the positional part through .iloc.
print(df2.loc[:'2013-01-03'].iloc[:, :4])
print(df2['city'].isin(['beijing']))  # boolean Series: is the city beijing?
print(df2['city'].isin(['beijing', 'shanghai']))  # boolean Series: is the city beijing or shanghai?
print(df2.loc[df2['city'].isin(['beijing', 'shanghai'])])  # keep only those rows
print(df2)
print(pd.DataFrame(df2['city'].str[:3]))  # first three characters of city, as a one-column table

# Filter with AND / OR / NOT conditions combined with >, <, ==, then count
# and sum the filtered rows.

# Columns shown by the three filters below. Every name must exist in df2:
# 'category' was renamed to 'category-size' during cleaning and df2 has no
# 'gender' column, so neither of those may be requested here.
report_columns = ['id', 'city', 'age', 'category-size', 'price']

# AND: older than 25 and located in beijing.
print(df2.loc[(df2['age'] > 25) & (df2['city'] == 'beijing'), report_columns])

# OR: older than 25 or located in beijing.
# Formatted into a single string so the output is the same whether print is
# the Python 2 statement or the Python 3 function.
print('或: %s' % (df2.loc[(df2['age'] > 25) | (df2['city'] == 'beijing'), report_columns],))

# NOT: rows whose city is not beijing, sorted by id; city.count() then counts
# the non-null city values (4 for the sample data).
print(df2.loc[df2['city'] != 'beijing', report_columns].sort_values(['id']).city.count())

# The same kind of filter written as a query expression.
print(df2.query('city == ["beijing","shanghai"]'))

# Sum of price over the filtered rows (11031.0 for the sample data).
print(df2.query('city == ["beijing","shanghai"]').price.sum())
print('*****')
print(df2)
# Per-city count of non-null values, for every column.
print(df2.groupby('city').count())
print('*****')
# Per-city count of the id column only.
print(df2.groupby('city')['id'].count())
# Count per (city, age) pair: rows are pooled only when both values match.
print(df2.groupby(['city', 'age'])['id'].count())
# Per-city row count, total and mean of price.
print(df2.groupby('city')['price'].agg([len, np.sum, np.mean]))
'''
           len     sum    mean
city
beijing    1.0  4432.0  4432.0
guangzhou  1.0  2133.0  2133.0
shanghai   2.0  6599.0  3299.5
shenzhen   1.0  5433.0  5433.0

'''
# Draw three rows at random.
df = df2.sample(3)
print(df)
print('====')
# One sampling weight per row of df2: rows with weight 0 can never be drawn.
weights = [0.8] + [0] * 3 + [0.1] * 2
print(df2.sample(n=2, weights=weights))

# Descriptive statistics, rounded to 2 decimals and transposed so that each
# column of df2 becomes one row of the report.
print(df2.describe().round(2).T)
'''
      count    mean      std     min      25%     50%      75%     max
id       6.0  1003.5     1.87  1001.0  1002.25  1003.5  1004.75  1006.0
age      6.0    36.5    10.88    23.0    32.00    33.0    41.50    54.0
price    6.0  3299.5  1523.35  1200.0  2424.62  3299.5  4148.88  5433.0
sign     1.0     1.0      NaN     1.0     1.00     1.0     1.00     1.0
'''
print(df2['price'].std())  # standard deviation of one column (1523.35163373 for the sample data)
print(df2['price'].cov(df2['age']))  # covariance of two columns; string columns are not supported

# Table-wide covariance / correlation only make sense for numeric columns.
# Selecting them explicitly avoids relying on pandas silently dropping the
# string columns, which newer releases no longer do.
numeric_df2 = df2.select_dtypes(include=[np.number])
print(numeric_df2.cov())  # covariance between every pair of numeric columns
'''
        id     age      price  sign
id        3.5    -4.9     1526.1   NaN
age      -4.9   118.3    -1353.5   NaN
price  1526.1 -1353.5  2320600.2   NaN
sign      NaN     NaN        NaN   NaN
'''
# Correlation of two columns: the coefficient lies between -1 and 1; close to
# 1 is positive correlation, close to -1 negative, 0 means uncorrelated
# (-0.0816894035549328 for the sample data).
print(df2['price'].corr(df2['age']))
# Correlation between every pair of numeric columns.
print(numeric_df2.corr())

# Persist the final table: once as CSV, once as an Excel workbook whose sheet
# is named 'bluewhale_cc'.
df2.to_csv(path_or_buf='./excel_to_python.csv')
df2.to_excel(excel_writer='./excel_to_python.xlsx', sheet_name='bluewhale_cc')