python資料處理,特徵工程,比賽等一定會用到的方法
阿新 • • 發佈:2019-01-29
1. 資料處理
# 資料預處理 # 1. 讀取資料: data_macro = pd.read_csv("macro.csv", parse_dates=['timestamp'], usecols=['timestamp'] + macro_cols) # 2. 顯示為object的屬性: data_train.dtypes[data_train.dtypes=='object'] # 3. 改變資料型別 data_train['material'] = data_train['material'].astype('object') # 4. 概覽資料 data_train.describe(include=['object']) # 5. 合併兩個表(上下) data_all = pd.concat([data_train, data_test], ignore_index=True) # 6. 合併兩個表(左右) data_all = pd.merge(data_all, data_macro, on='timestamp', how='left') # 7. 提取Number, Object特徵: object_columns = data_all.columns[data_all.dtypes == 'object'] number_columns = data_all.columns[data_all.dtypes != 'object'] # 8. 計算兩個特徵平均 sa_price = train_df.groupby('sub_area')[['work_share', 'price_doc']].mean()
2. 資料視覺化
# 資料視覺化 # 1. seaborn 畫圖技巧 # https://zhuanlan.zhihu.com/p/24464836 plt.figure(figsize=(8, 6)) sns.distplot(a=np.log1p(data_train['price_doc']), bins=50, kde=True) plt.xlabel("price", fontsize=12) plt.show() # 2. 資料中各特徵值缺失的個數排序 missing_df = (data_train.isnull().sum(axis=0)/data_train.shape[0]).reset_index() missing_df.columns = ['column_name', 'missing_count'] missing_df = missing_df.ix[missing_df['missing_count']>0] missing_df = missing_df.sort_values('missing_count', axis=0, ascending=True) width = 0.8 ind = np.arange(missing_df.shape[0]) fig, ax = plt.subplots(figsize=(12, 18)) ax.barh(ind, missing_df['missing_count'], color='y') ax.set_yticks(ind) ax.set_yticklabels(missing_df['column_name'], rotation='horizontal') ax.set_xlabel("Count of missing values") ax.set_title("Number of missing values in each column") plt.show() train_na = (train_df.isnull().sum() / len(train_df)) * 100 train_na = train_na.drop(train_na[train_na == 0].index).sort_values(ascending=False) f, ax = plt.subplots(figsize=(12, 8)) plt.xticks(rotation='90') sns.barplot(x=train_na.index, y=train_na) ax.set(title='Percent missing data by feature', ylabel='% missing') # 3. 相關性熱圖 internal_chars = ['full_sq', 'life_sq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'price_doc'] corrmat = train_df[internal_chars].corr() f, ax = plt.subplots(figsize=(10, 7)) plt.xticks(rotation='90') sns.heatmap(corrmat, square=True, linewidths=.5, annot=True) # 4. 散點圖 f, ax = plt.subplots(figsize=(10, 7)) ind = train_df[train_df['full_sq'] > 2000].index plt.scatter(x=train_df.drop(ind)['full_sq'], y=train_df.drop(ind)['price_doc'], c='r', alpha=0.5) ax.set(title='Price by area in sq meters', xlabel='Area', ylabel='Price') # 5. 個數圖 f, ax = plt.subplots(figsize=(10, 7)) plt.xticks(rotation='90') sns.countplot(x=train_df['num_room']) ax.set(title='Distribution of room count', xlabel='num_room') # 6. 曲線和擬合曲線圖 f, ax = plt.subplots(figsize=(12, 6)) by_price = by_df.groupby('build_year')[['build_year', 'price_doc']].mean() sns.regplot(x="build_year", y="price_doc", data=by_price, scatter=False, order=3, truncate=True) plt.plot(by_price['build_year'], by_price['price_doc'], color='r') ax.set(title='Mean price by year of build') # 7. 小提琴圖 f, ax = plt.subplots(figsize=(12, 8)) ind = train_df[train_df['state'].isnull()].index train_df['price_doc_log10'] = np.log10(train_df['price_doc']) sns.violinplot(x="state", y="price_doc_log10", data=train_df.drop(ind), inner="box") # sns.swarmplot(x="state", y="price_doc_log10", data=train_df.dropna(), color="w", alpha=.2); ax.set(title='Log10 of median price by state of home', xlabel='state', ylabel='log10(price)') # 8. barplot ax = sns.barplot(x="count", y="sub_area", data=sa_vc, orient="h")
3.特徵工程
# 特徵工程 # 1. 移除異常點 ulimit = np.percentile(data_train.price_doc.values, 99) llimit = np.percentile(data_train.price_doc.values, 1) data_train.loc[data_train['price_doc'] >ulimit, 'price_doc'] = ulimit data_train.loc[data_train['price_doc'] <llimit, 'price_doc'] = llimit # 2. 刪除缺失值過半的特徵 drop_columns = missing_df.ix[missing_df['missing_count']>0.5, 'column_name'].values data_train.drop(drop_columns, axis=1, inplace=True) data_test.drop(drop_columns, axis=1, inplace=True) # 3. 刪除不正常的行資料 data_all.drop(data_train[data_train["life_sq"] > 7000].index, inplace=True) # 4. 提取時間 # week of year # data_all["week_of_year"] = data_all["timestamp"].dt.weekofyear # day of week # data_all["day_of_week"] = data_all["timestamp"].dt.weekday # yearmonth data_all['yearmonth'] = pd.to_datetime(data_all['timestamp']) data_all['yearmonth'] = data_all['yearmonth'].dt.year*100 + data_all['yearmonth'].dt.month data_all_groupby = data_all.groupby('yearmonth') # 5. 連續資料離散化 data_all['floor_25'] = (data_all['floor']>25.0)*1 # 6. 分組來填補平均值 for num in number_columns: if(sum(data_all[num].isnull())>0): isnull_raw = data_all[num].isnull() isnull_yearmonth = data_all.ix[isnull_raw, 'yearmonth'].values data_all_groupby[num].transform(lambda x: x.fillna(x.mean())) # 7. Get_dummies離散化 dummies = pd.get_dummies(data=data_all[ob], prefix="{}#".format(ob)) data_all.drop(ob, axis=1, inplace=True) data_all = data_all.join(dummies) # 8. 用radio中位數填補空缺 kitch_ratio = train_df['full_sq']/train_df['kitch_sq'] train_df['kitch_sq']=train_df['kitch_sq'].fillna(train_df['full_sq'] /kitch_ratio.median()) # 9. LabelEncoder for ob in object_columns: lbl = preprocessing.LabelEncoder() lbl.fit(list(data_train[ob].values)) data_train[ob] = lbl.fit_transform(list(data_train[ob].values)) # 10. PCA的視覺化與轉換 from sklearn.decomposition import PCA components = 20 model = PCA(n_components=components) model.fit(data_train) ex_variance = pd.DataFrame({'ex_variance':model.explained_variance_ratio_ [0:components], 'n_component':range(1,components+1)}) ax = sns.barplot(x='n_component', y='ex_variance', data=ex_variance) ax.set_title('PCA_variance_explained') plt.show() data_train = model.fit_transform(data_train) data_test = model.fit_transform(data_test)
4. 建立模型
# 建立模型
# 1. import xgboost as xgb
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.grid_search import GridSearchCV
train_X = data_train
test_X = data_test
dtrain = xgb.DMatrix(train_X, train_y)
xgb_params={ 'eta': 0.05,
'max_depth': 5,
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
cv_output = xgb.cv(dict(xgb_params, silent=0), dtrain, num_boost_round=1000, early_stopping_rounds=20, verbose_eval=20)
num_boost_round = len(cv_output)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=320)
num_boost_round = model.best_iteration
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_round)
preds = np.exp(model.predict(xgb.DMatrix(test_X, feature_names=test_X.columns.values)))-1
submission = pd.DataFrame()
submission['id'] = test_id
submission["price_doc"]= preds
submission.to_csv("sub.csv",index=False)
# 畫feature_importance
%matplotlib inline
fig, ax = plt.subplots(1, 1, figsize=(8, 60))
xgb.plot_importance(model, height=0.5, ax=ax)
# 提取feature_importance
import operator
importance = model.get_fscore()
df_importance = pd.DataFrame(importance, columns=['feature', 'fscore'])
df_importance.sort_values(ascending=False)
5. 其他
# 其他
# 去除共線性
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import Imputer
from statsmodels.stats.outliers_influence import variance_inflation_factor
class ReduceVIF(BaseEstimator, TransformerMixin):
def __init__(self, thresh=5.0, impute=True, impute_strategy='median'):
# From looking at documentation, values between 5 and 10 are "okay".
# Above 10 is too high and so should be removed.
self.thresh = thresh
# The statsmodel function will fail with NaN values, as such we have to impute them.
# By default we impute using the median value.
# This imputation could be taken out and added as part of an sklearn Pipeline.
if impute:
self.imputer = Imputer(strategy=impute_strategy)
def fit(self, X, y=None):
print('ReduceVIF fit')
if hasattr(self, 'imputer'):
self.imputer.fit(X)
return self
def transform(self, X, y=None):
print('ReduceVIF transform')
columns = X.columns.tolist()
if hasattr(self, 'imputer'):
X = pd.DataFrame(self.imputer.transform(X), columns=columns)
return ReduceVIF.calculate_vif(X, self.thresh)
@staticmethod
def calculate_vif(X, thresh=5.0):
# Taken from https://stats.stackexchange.com/a/253620/53565 and modified
dropped=True
while dropped:
# Loop repeatedly until we find that all columns within our dataset
# have a VIF value we're happy with.
variables = X.columns
dropped=False
vif = []
new_vif = 0
for var in X.columns:
new_vif = variance_inflation_factor(X[variables].values, X.columns.get_loc(var))
vif.append(new_vif)
if np.isinf(new_vif):
break
max_vif = max(vif)
if max_vif > thresh:
maxloc = vif.index(max_vif)
#print 'Dropping {X.columns[{0}]} with vif={{1}}'.format(maxloc, max_vif)
print X.columns[maxloc]
print max_vif
X = X.drop([X.columns.tolist()[maxloc]], axis=1)
dropped=True
return X
transformer = ReduceVIF()
X = data_all
# Only use 10 columns for speed in this example
data_all = transformer.fit_transform(data_train[data_train.columns[0:50]], train_y)
data_all.head()
# 2. Stacking
# Stacking Starter based on Allstate Faron's Script
#https://www.kaggle.com/mmueller/allstate-claims-severity/stacking-starter/run/390867
# Preprocessing from Alexandru Papiu
#https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
SEED = 1
NFOLDS = 3
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from math import sqrt
ntrain = data_train.shape[0]
ntest = data_test.shape[0]
print ntrain
print ntest
x_train = np.array(data_train)
x_test = np.array(data_test)
y_train = train_y
kf = KFold(ntrain, n_folds=3, shuffle=True, random_state=SEED)
class SklearnWrapper(object):
def __init__(self, clf, seed=0, params=None):
params['random_state'] = seed
self.clf = clf(**params)
def train(self, x_train, y_train):
self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
class XgbWrapper(object):
def __init__(self, seed=0, params=None):
self.param = params
self.param['seed'] = seed
self.nrounds = params.pop('nrounds', 250)
def train(self, x_train, y_train):
dtrain = xgb.DMatrix(x_train, label=y_train)
self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
def predict(self, x):
return self.gbdt.predict(xgb.DMatrix(x))
def get_oof(clf):
oof_train = np.zeros((ntrain,))
oof_test = np.zeros((ntest,))
oof_test_skf = np.empty((NFOLDS, ntest))
for i, (train_index, test_index) in enumerate(kf):
x_tr = x_train[train_index]
y_tr = y_train[train_index]
x_te = x_train[test_index]
clf.train(x_tr, y_tr)
oof_train[test_index] = clf.predict(x_te)
oof_test_skf[i, :] = clf.predict(x_test)
oof_test[:] = oof_test_skf.mean(axis=0)
return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
et_params = {
'n_jobs': 16,
'n_estimators': 100,
'max_features': 0.5,
'max_depth': 12,
'min_samples_leaf': 2,
}
rf_params = {
'n_jobs': 16,
'n_estimators': 100,
'max_features': 0.2,
'max_depth': 12,
'min_samples_leaf': 2,
}
xgb_params = {
'seed': 0,
'colsample_bytree': 0.7,
'silent': 1,
'subsample': 0.7,
'learning_rate': 0.075,
'objective': 'reg:linear',
'max_depth': 4,
'num_parallel_tree': 1,
'min_child_weight': 1,
'eval_metric': 'rmse',
'nrounds': 500
}
rd_params={
'alpha': 10
}
ls_params={
'alpha': 0.005
}
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RD-CV: {}".format(sqrt(mean_squared_error(y_train, rd_oof_train))))
print("LS-CV: {}".format(sqrt(mean_squared_error(y_train, ls_oof_train))))
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, rd_oof_train, ls_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, rd_oof_test, ls_oof_test), axis=1)
print("{},{}".format(x_train.shape, x_test.shape))
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)
xgb_params = {
'seed': 0,
'colsample_bytree': 0.8,
'silent': 1,
'subsample': 0.6,
'learning_rate': 0.01,
'objective': 'reg:linear',
'max_depth': 1,
'num_parallel_tree': 1,
'min_child_weight': 1,
'eval_metric': 'rmse'}
res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED, stratified=False,
early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]
print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))