程式人生 > 基於Kaggle的經典AI專案三—特徵轉換、衍生

基於Kaggle的經典AI專案三—特徵轉換、衍生

開發環境 jupyter notebook
import pandas as pd
import numpy as np
import seaborn as sns               #繪製heatmap類似於pyEcharts
import matplotlib
from scipy import stats
from scipy.stats import norm
from scipy.stats import skew

import matplotlib.pyplot as plt
%matplotlib inline


# Show the result of every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

準備工作

%run ".基於Kaggle的經典AI專案二—資料清洗.ipynb"           #引入資料清洗的工作 

def anova(train, categorical, y):
    """Rank categorical features by how strongly they separate the target.

    For each feature, the values of column ``y`` are grouped by the feature's
    non-null levels and a one-way ANOVA F-test is run across those groups.

    Args:
        train: DataFrame holding the categorical columns and the target column.
        categorical: List of categorical column names to evaluate.
        y: Name of the target column.

    Returns:
        DataFrame indexed by feature name with the columns ``feature``,
        ``pval`` (ANOVA p-value) and ``disparity`` (``log_20(20 / pval)``),
        sorted by ascending ``pval``.
    """
    anv = pd.DataFrame(index=categorical)
    anv['feature'] = categorical
    pvals = []
    for c in categorical:
        # One array of target values per distinct (non-null) level of the feature.
        samples = [
            train[train[c] == cls][y].values
            for cls in train[c].dropna().unique()
        ]
        # f_oneway returns (F statistic, p-value); only the p-value is kept.
        # The smaller the p-value, the more the feature's levels differ in y.
        pvals.append(stats.f_oneway(*samples)[1])
    anv['pval'] = pvals
    # Log-scaled score: p = 1 maps to 1, p = 0.05 maps to 2, smaller p maps higher.
    anv['disparity'] = np.log(20 * 1. / anv['pval'].values) / np.log(20)
    return anv.sort_values('pval')


# Names of the categorical (object-dtype) columns.
cate_feature = [column for column in train.columns
                if train.dtypes[column] == 'object']
y = 'SalePrice'
corr_cate = anova(train, cate_feature, y)

# Per-column summary of the training set.
train_ana = pd.DataFrame()
train_ana['feature_type'] = train.dtypes                       # column dtype
train_ana['cate_cnt'] = train.apply(lambda x: x.nunique())     # number of distinct values
# Correlation with the target, computed on numeric columns only: calling
# corr() on a frame that still holds object columns raises in pandas >= 2.0.
train_ana['conti_corr'] = train.select_dtypes(include=[np.number]).corr()['SalePrice']
train_ana['cate_corr'] = corr_cate['disparity']                # ANOVA disparity of categorical columns
# train_ana.sort_values(by = ['feature_type', 'conti_corr', 'cate_corr'], ascending=False)

一、特徵構造

1.1 分型別變數—重分組

# Neighborhoods ordered by ascending median sale price. Only the SalePrice
# column is aggregated: a median over every column is unnecessary work and
# fails on non-numeric columns in recent pandas releases.
neighborhood_order = (
    train.groupby('Neighborhood')['SalePrice']
         .median()
         .sort_values()
         .index
)

# Sale-price distribution per neighborhood.
plt.figure(figsize=(14, 4))
sns.boxplot(x='Neighborhood', y='SalePrice',
            data=train, order=neighborhood_order)
plt.xticks(rotation=90)                       # rotate the x tick labels by 90 degrees

# Number of rows per neighborhood, drawn in the same order.
plt.figure(figsize=(14, 4))
sns.countplot(x='Neighborhood', data=train,
              order=neighborhood_order)
plt.xticks(rotation=90)

(圖:依房價中位數排序的各 Neighborhood 之 SalePrice 箱形圖,以及各 Neighborhood 的樣本數長條圖)

# Merge pairs of neighborhoods into combined groups. Each key is the merged
# label, each value lists the original labels folded into it.
merged_neighborhoods = {
    'IDOTRR-BrDale': ('IDOTRR', 'BrDale'),
    'Blueste-SWISU': ('Blueste', 'SWISU'),
    'NPkVill-Mitchel': ('NPkVill', 'Mitchel'),
}
neighborhood_regroup_map = {
    original: merged
    for merged, originals in merged_neighborhoods.items()
    for original in originals
}
train["SimpleNeighborhood"] = train["Neighborhood"].replace(neighborhood_regroup_map)

# Compare the ANOVA scores of the original and the regrouped feature.
anova(train, ['Neighborhood', 'SimpleNeighborhood'], y)
| | feature | pval | disparity |
|---|---|---|---|
| SimpleNeighborhood | SimpleNeighborhood | 1.725211e-243 | 187.593052 |
| Neighborhood | Neighborhood | 1.019383e-240 | 185.462820 |

1.2 分型別變數—one-hot編碼

# Names of the categorical (object-dtype) columns.
# The builtin `object` is used because the `np.object` alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
featurego_cat_column = train.select_dtypes(include=[object]).columns

# One-hot encode the categorical columns into dummy indicator columns.
train_onehot = pd.get_dummies(train[featurego_cat_column])
train_onehot.columns

2.1 連續型變數—非線性衍生

# Non-object features sorted by their correlation with SalePrice, highest first.
is_non_object = train_ana['feature_type'] != 'object'
train_ana.loc[is_non_object].sort_values('conti_corr', ascending=False)

train_ana.head()
~ feature_type cate_cnt conti_corr cate_corr
Id int64 1458 -0.027300 NaN
MSSubClass object 15 NaN 65.931200
MSZoning object 5 NaN 29.838963
LotFrontage float64 111 0.209700 NaN
LotArea float64 1039 0.421355 NaN
# Non-linear derivations: square, cube and square root of two numeric features.
# Each tuple is (source column, squared name, cubed name, square-root name).
nonlinear_specs = [
    ("OverallQual", "OverallQual-s2", "OverallQual-s3", "OverallQual-Sq"),
    ("GrLivArea", "GrLivArea-2", "GrLivArea-3", "GrLivArea-Sq"),
]
for source_name, squared_name, cubed_name, root_name in nonlinear_specs:
    source_values = train[source_name]
    train[squared_name] = source_values ** 2
    train[cubed_name] = source_values ** 3
    train[root_name] = np.sqrt(source_values)

# Correlation of the original and derived columns with the sale price.
nonlinear_columns = [
    'SalePrice',
    'OverallQual', 'OverallQual-s2', 'OverallQual-s3', 'OverallQual-Sq',
    'GrLivArea', 'GrLivArea-2', 'GrLivArea-3', 'GrLivArea-Sq',
]
train[nonlinear_columns].corr()['SalePrice']

2.2 連續型變數—簡單組合

# Total number of bathrooms; half baths count as 0.5.
basement_baths = train["BsmtFullBath"] + 0.5 * train["BsmtHalfBath"]
train["TotalBath"] = basement_baths + train["FullBath"] + 0.5 * train["HalfBath"]

# Total square footage of the house, basement included.
above_ground_sf = train["GrLivArea"]
train["AllSF"] = above_ground_sf + train["TotalBsmtSF"]

# Total square footage of the first and second floors.
first_floor_sf = train["1stFlrSF"]
train["AllFlrsSF"] = first_floor_sf + train["2ndFlrSF"]

# Total porch square footage, accumulated left to right over the porch columns.
porch_total = train["OpenPorchSF"]
for porch_column in ("EnclosedPorch", "3SsnPorch", "ScreenPorch"):
    porch_total = porch_total + train[porch_column]
train["AllPorchSF"] = porch_total

# Correlation of the combined features with the sale price.
combined_columns = ['TotalBath', 'AllSF', 'AllFlrsSF', 'AllPorchSF', 'SalePrice']
train[combined_columns].corr()['SalePrice']

2.3 連續型變數—正態轉換

# Distribution of the target, with a fitted normal curve for comparison.
sns.distplot(train['SalePrice'], fit=norm)

# Log-transformed target: log1p(x) = log(1 + x).
train['SalePrice_log'] = np.log1p(train['SalePrice'])
# Q-Q plot to judge how far the data deviates from a normal distribution.
# NOTE(review): this plots the untransformed 'SalePrice'; if the intent is to
# check the effect of the log transform, 'SalePrice_log' should be plotted too.
stats.probplot(train['SalePrice'], plot=plt)

# Pearson correlations, showing only the SalePrice / SalePrice_log columns for
# the five features most correlated with SalePrice_log. Restricted to numeric
# columns because corr() on object columns raises in pandas >= 2.0.
train.select_dtypes(include=[np.number])\
     .corr()\
     .sort_values('SalePrice_log', ascending=False)\
     [['SalePrice', 'SalePrice_log']].head()

(圖:SalePrice 的分佈圖(含常態擬合曲線)與 Q-Q 圖)

# Normalising transform for continuous features.
#
# Log-transform numeric features to reduce the influence of skewed outliers.
# Rule of thumb used here: an absolute skewness above 0.75 counts as heavily skewed.
numeric_columns = train.select_dtypes(include=[np.number])
train_num = numeric_columns.drop(columns=['SalePrice', 'SalePrice_log'])

# Skewness of every numeric feature, ignoring missing values.
skewness = train_num.apply(lambda column: skew(column.dropna()))
heavily_skewed = skewness.abs() > 0.75
skewness = skewness[heavily_skewed]
skewness
featurego_skewed_features = skewness.index

# Replace the heavily skewed features with log1p of their values, in place.
train[featurego_skewed_features] = np.log1p(train[featurego_skewed_features])

2.4 連續型變數—無量綱化轉換

from sklearn.preprocessing import MinMaxScaler    # interval (min-max) scaling

# Numeric features only; the two target columns are excluded from scaling.
train_num = train.select_dtypes(include=[np.number]).drop(['SalePrice', 'SalePrice_log'], axis=1)
featurego_scaler_numcolumn = train_num.columns

# Fit and transform in a single pass. The scaler was previously fitted with
# fit_transform (result discarded) and the same data then transformed again.
featurego_min_max_scaler = MinMaxScaler()
train_num_minmax = np.round(featurego_min_max_scaler.fit_transform(train_num), 2)    # round to 2 decimals
train_num_minmax = pd.DataFrame(train_num_minmax,
                                columns=featurego_scaler_numcolumn + '_minmax',   # renamed with a suffix
                                index=train.index)

整合處理

# Join the three prepared DataFrames column-wise (current frame, one-hot
# columns, min-max scaled columns), then remove the raw SalePrice column.
prepared_frames = [train, train_onehot, train_num_minmax]
train = pd.concat(prepared_frames, axis=1).drop(columns=['SalePrice'])