基於Kaggle的經典AI專案三—特徵轉換、衍生
阿新 • • 發佈:2018-12-11
import pandas as pd
import numpy as np
import seaborn as sns #繪製heatmap類似於pyEcharts
import matplotlib
from scipy import stats
from scipy.stats import norm
from scipy.stats import skew
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
準備工作
%run ".基於Kaggle的經典AI專案二—資料清洗.ipynb" #引入資料清洗的工作
# Score how well each categorical feature separates the target (one-way ANOVA).
def anova(train, categorical, y):
    """Rank categorical columns by how strongly they separate the target *y*.

    For every column in *categorical*, the target values are grouped by
    category level and a one-way ANOVA (``scipy.stats.f_oneway``) is run.
    A small p-value means the group means differ, i.e. the feature is
    informative about the target.

    Parameters
    ----------
    train : pandas.DataFrame
        Source data containing both the categorical columns and *y*.
    categorical : list of str
        Names of the categorical columns to score.
    y : str
        Name of the numeric target column.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with columns ``feature``, ``pval`` and
        ``disparity`` (log base 20 of ``20 / pval``; larger = more
        discriminative), sorted by ascending p-value.
    """
    anv = pd.DataFrame(index=categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        # Target values for every distinct (non-NaN) level of column c —
        # a list of 1-D arrays, one per category level.
        samples = [train[train[c] == cls][y].values
                   for cls in train[c].dropna().unique()]
        # One-way ANOVA returns (F, p); keep p — the smaller it is, the
        # more the between-group variance dominates.
        pvals.append(stats.f_oneway(*samples)[1])
    anv['pval'] = pvals
    # Rescale the p-value into a readable score: log_20(20 / pval).
    anv['disparity'] = np.log(20 * 1. / anv['pval'].values) / np.log(20)
    return anv.sort_values('pval')
# Collect the object-typed (categorical) columns and score them with anova().
cate_feature = [c for c in train.columns if train.dtypes[c] == 'object']
y = 'SalePrice'
corr_cate = anova(train, cate_feature, y)

# Per-column summary table:
#   feature_type - pandas dtype of the column
#   cate_cnt     - number of distinct values
#   conti_corr   - Pearson correlation with SalePrice (numeric columns only)
#   cate_corr    - ANOVA disparity score (categorical columns only)
train_ana = pd.DataFrame()
train_ana['feature_type'] = train.dtypes
train_ana['cate_cnt'] = train.apply(lambda col: col.nunique())
train_ana['conti_corr'] = train.corr()['SalePrice']
train_ana['cate_corr'] = corr_cate['disparity']
# train_ana.sort_values(by=['feature_type', 'conti_corr', 'cate_corr'], ascending=False)
一、特徵構造
1.1 分型別變數—重分組
# Order neighborhoods by their median sale price so the plots read low -> high.
neighborhood_order = (train.groupby('Neighborhood')
                           .median()
                           .sort_values(by='SalePrice')
                           .index)

# Price distribution per neighborhood, sorted by median price.
plt.figure(figsize=(14, 4))
sns.boxplot(x='Neighborhood', y='SalePrice',
            data=train, order=neighborhood_order)
plt.xticks(rotation=90)  # rotate the x-axis labels 90 degrees for readability

# Sample count per neighborhood, same ordering.
plt.figure(figsize=(14, 4))
sns.countplot(x='Neighborhood', data=train, order=neighborhood_order)
plt.xticks(rotation=90)

# Merge pairs of neighborhoods with similar price profiles into single
# levels — fewer, denser categories without losing discriminative power.
pair_merge = {
    'IDOTRR': 'IDOTRR-BrDale', 'BrDale': 'IDOTRR-BrDale',
    'Blueste': 'Blueste-SWISU', 'SWISU': 'Blueste-SWISU',
    'NPkVill': 'NPkVill-Mitchel', 'Mitchel': 'NPkVill-Mitchel'
}
train["SimpleNeighborhood"] = train.Neighborhood.replace(pair_merge)

# Compare the discrimination of the merged feature with the original.
anova(train, ['Neighborhood', 'SimpleNeighborhood'], y)
feature | pval | disparity | |
---|---|---|---|
SimpleNeighborhood | SimpleNeighborhood | 1.725211e-243 | 187.593052 |
Neighborhood | Neighborhood | 1.019383e-240 | 185.462820 |
1.2 分型別變數—one-hot編碼
# Categorical (object-dtype) columns.
# FIX: the original used np.object, which was deprecated in NumPy 1.20 and
# removed in 1.24; the builtin `object` is the supported spelling.
featurego_cat_column = train.select_dtypes(include=[object]).columns
# One-hot encode every categorical column into 0/1 indicator features.
train_onehot = pd.get_dummies(train[featurego_cat_column])
train_onehot.columns
2.1 連續型變數—非線性衍生
# Rank the numeric (non-object) columns by correlation with the target,
# highest first, then peek at the summary table.
numeric_rows = train_ana.feature_type != 'object'
train_ana.loc[numeric_rows, ].sort_values('conti_corr', ascending=False)
train_ana.head()
feature_type | cate_cnt | conti_corr | cate_corr | |
---|---|---|---|---|
Id | int64 | 1458 | -0.027300 | NaN |
MSSubClass | object | 15 | NaN | 65.931200 |
MSZoning | object | 5 | NaN | 29.838963 |
LotFrontage | float64 | 111 | 0.209700 | NaN |
LotArea | float64 | 1039 | 0.421355 | NaN |
# Non-linear derivations: square, cube and square root of the two features
# most correlated with the target.
nonlinear_specs = {
    "OverallQual": ("OverallQual-s2", "OverallQual-s3", "OverallQual-Sq"),
    "GrLivArea": ("GrLivArea-2", "GrLivArea-3", "GrLivArea-Sq"),
}
for base_col, (sq_col, cube_col, sqrt_col) in nonlinear_specs.items():
    train[sq_col] = train[base_col] ** 2
    train[cube_col] = train[base_col] ** 3
    train[sqrt_col] = np.sqrt(train[base_col])

# Does any derived form correlate better with the target than the raw one?
train[['SalePrice', 'OverallQual', 'OverallQual-s2', 'OverallQual-s3',
       'OverallQual-Sq', 'GrLivArea', 'GrLivArea-2', 'GrLivArea-3',
       'GrLivArea-Sq']].corr()['SalePrice']
2.2 連續型變數—簡單組合
# Simple additive combinations of related count / area columns.

# Total number of bathrooms; half baths count as 0.5.
train["TotalBath"] = (train["BsmtFullBath"] + 0.5 * train["BsmtHalfBath"]
                      + train["FullBath"] + 0.5 * train["HalfBath"])
# Total living surface including the basement.
train["AllSF"] = train["GrLivArea"] + train["TotalBsmtSF"]
# Combined surface of the first and second floors.
train["AllFlrsSF"] = train["1stFlrSF"] + train["2ndFlrSF"]
# Combined surface of every porch type.
train["AllPorchSF"] = (train["OpenPorchSF"] + train["EnclosedPorch"]
                       + train["3SsnPorch"] + train["ScreenPorch"])

# Correlation of each combined feature with the target.
train[['TotalBath', 'AllSF', 'AllFlrsSF', 'AllPorchSF', 'SalePrice']].corr()['SalePrice']
2.3 連續型變數—正態轉換
# Target normality: plot the SalePrice distribution against a fitted normal.
sns.distplot(train['SalePrice'], fit=norm)
# SalePrice is right-skewed; keep a log(1 + y) version as the modelling target.
train['SalePrice_log'] = np.log1p(train['SalePrice'])
# Q-Q plot to visualise how far the raw prices deviate from a normal law.
stats.probplot(train['SalePrice'], plot=plt)
# Pearson correlations with the log target — show the top five rows only.
train.corr().sort_values('SalePrice_log', ascending=False)[['SalePrice', 'SalePrice_log']].head()
# Normalise skewed numeric features with log1p to dampen the effect of
# outliers in the long right tails.
# Rule of thumb: |skewness| > 0.75 counts as heavily skewed.
train_num = (train.select_dtypes(include=[np.number])
                  .drop(['SalePrice', 'SalePrice_log'], axis=1))
skewness = train_num.apply(lambda col: skew(col.dropna()))
skewness = skewness[abs(skewness) > 0.75]
skewness
featurego_skewed_features = skewness.index
train[featurego_skewed_features] = np.log1p(train[featurego_skewed_features])
2.4 連續型變數—無量綱化轉換
from sklearn.preprocessing import MinMaxScaler  # interval scaling to [0, 1]

# Numeric features to rescale; the target columns are left untouched.
train_num = train.select_dtypes(include=[np.number]).drop(['SalePrice', 'SalePrice_log'], axis=1)
featurego_scaler_numcolumn = train_num.columns
featurego_min_max_scaler = MinMaxScaler()
# FIX: the original called fit_transform() and discarded the result, then
# transformed the very same columns again — fit once, transform once.
featurego_min_max_scaler.fit(train_num)
train_num_minmax = np.round(
    featurego_min_max_scaler.transform(train[featurego_scaler_numcolumn]), 2)  # round to 2 dp
train_num_minmax = pd.DataFrame(train_num_minmax,
                                columns=featurego_scaler_numcolumn + '_minmax',  # suffixed names
                                index=train.index)
整合處理
# Final assembly: append the one-hot and the min-max scaled features to the
# original frame column-wise, then drop the raw target (SalePrice_log stays).
train = pd.concat([train, train_onehot, train_num_minmax], axis=1)
train = train.drop(columns=['SalePrice'])