基於Kaggle的經典AI專案五—模型訓練
阿新 • • 發佈:2018-12-11
# Execute the previous notebook of this series through the IPython %run magic.
# Names used below that are not defined in this file (train_num, pd, np, plt)
# are presumably created by it -- TODO confirm.
%run "基於Kaggle的經典AI專案四—特徵篩選.ipynb"
# Target vector: the 'SalePrice_log' column of train_num.
train_y = train_num['SalePrice_log']
# Feature matrix: every other column of train_num.
train_X = train_num.drop('SalePrice_log', axis=1)
# Remember the feature column labels.
model_column = train_X.columns
train_X.shape # inspect the dataset dimensions
model_column # inspect the feature column labels
1 決策樹迴歸DecisionTreeRegressor
from sklearn.tree import DecisionTreeRegressor
# GridSearchCV is imported from sklearn.model_selection, the module this
# notebook already relies on for cross_val_score and KFold. The legacy
# sklearn.grid_search module no longer exists in scikit-learn >= 0.20.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter tuning and cross-validation are both handled by GridSearchCV.
# The split criterion is deliberately left at the estimator default (mean
# squared error): its spelling changed from "mse" to "squared_error" between
# scikit-learn releases, so pinning either name fails on some versions.
tuned_parameters = {
    "min_samples_split": [2, 10, 20],
    "max_depth": [2, 10, 20, 40],
    "min_samples_leaf": [1, 5, 10],
    "max_leaf_nodes": [5, 10, 20, 40],
}

# Exhaustive search over the grid with 5-fold cross-validation.
clf = DecisionTreeRegressor()
clf = GridSearchCV(clf, tuned_parameters, cv=5)
clf.fit(train_X, train_y)
clf.best_params_

# Print the mean validation score +/- two standard deviations for every
# parameter combination. cv_results_ replaces the removed grid_scores_
# attribute and exposes the same information as parallel arrays.
cv_results = clf.cv_results_
for mean_score, std_score, params in zip(cv_results["mean_test_score"],
                                         cv_results["std_test_score"],
                                         cv_results["params"]):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean_score, std_score * 2, params))
# Present the fitted model: bar chart of the best tree's feature importances.
tree_importances = clf.best_estimator_.feature_importances_
important_features = pd.Series(tree_importances, index=train_X.columns)
important_features = important_features.sort_values(ascending=False)
# Only features that the tree actually used (importance above zero) are shown.
important_features = important_features[important_features > 0]
plt.figure(figsize=(20, 10))
important_features.plot.bar()
# Evaluate the tuned tree on the training data.
from sklearn.metrics import mean_squared_error

pred_y = clf.predict(train_X)
# expm1 maps target and prediction back from the log scale
# (presumably the inverse of a log1p applied upstream -- TODO confirm).
actual_price = np.expm1(train_y)
predicted_price = np.expm1(pred_y)
# Root mean squared error on the back-transformed values.
np.sqrt(mean_squared_error(actual_price, predicted_price))
# Scatter of actual (x) against predicted (y) values.
plt.figure(figsize=(20, 10))
plt.scatter(x=actual_price, y=predicted_price)
2 嶺迴歸Ridge
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
# Hyper-parameter tuning: explicit for-loop over the candidates.
# Cross-validation: KFold.
n_folds = 5


def rmse_cv(model):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    The model is scored on the module-level ``train_X`` / ``train_y`` with
    shuffled ``n_folds``-fold splitting and a fixed random seed (42).

    Args:
        model: estimator handed unchanged to ``cross_val_score``.

    Returns:
        Array of per-fold root mean squared errors.
    """
    folds = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train_X, train_y,
                              scoring="neg_mean_squared_error", cv=folds)
    # The scorer yields negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse)
# Mean cross-validated RMSE for each candidate regularisation strength.
alphas = [0.05, 0.1, 0.3, 1, 3, 5]
cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas]
# Only the fold mean is kept; the per-fold spread (.std()) is not recorded.
cv_ridge = pd.Series(cv_ridge, index=alphas)
# The title names the model actually being validated in this sweep (Ridge).
cv_ridge.plot(title="Validation - Ridge")
plt.xlabel("alpha")
plt.ylabel("rmse")
# Fit the ridge model with alpha = 1 on the full training data.
ridge = Ridge(alpha = 1)
ridge.fit(train_X, train_y)
# The bare string below is a no-op expression; it appears to record the
# estimator repr shown by the notebook after fitting.
"""
Ridge(alpha=1, copy_X=True,
fit_intercept=True, max_iter=None,
normalize=False, random_state=None,
solver='auto', tol=0.001)
"""
# Present the fitted model: bar chart of the ridge coefficients.
ridge_coefs = pd.Series(ridge.coef_, index=train_X.columns)
important_features = ridge_coefs.sort_values(ascending=False)
# Keep only coefficients whose magnitude exceeds 0.01.
important_features = important_features[important_features.abs() > 0.01]
plt.figure(figsize=(20, 10))
important_features.plot.bar()
3 彈性網迴歸ElasticNet
# Hyper-parameter tuning and cross-validation are both done by ElasticNetCV.
from sklearn.linear_model import ElasticNetCV

# Stage 1: coarse search over the L1/L2 mix and the penalty strength,
# using 10-fold cross-validation.
l1_ratio_grid = [0.1, 0.3, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95, 1]
alpha_grid = [0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
              0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6]
elasticNet = ElasticNetCV(l1_ratio=l1_ratio_grid, alphas=alpha_grid,
                          max_iter=50000, cv=10)
elasticNet.fit(train_X, train_y)

# Remember the winning pair; the next stage reads `ratio`.
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )
print("Try again for more precision with l1_ratio centered around " + str(ratio))
# Stage 2: refine l1_ratio on a grid of multiples (0.85x .. 1.15x) of the
# current best value, keeping the coarse alpha grid.
# l1_ratio is only meaningful within [0, 1]. When the current best is close
# to 1 the upper multiples overshoot, so every candidate is capped at 1 and
# the duplicates created by capping are dropped (order preserved). This way
# the model is never fitted with an invalid mix, and the fitted l1_ratio_
# never has to be overwritten after the fact.
l1_ratio_factors = (.85, .9, .95, 1, 1.05, 1.1, 1.15)
l1_ratio_grid = list(dict.fromkeys(min(ratio * factor, 1)
                                   for factor in l1_ratio_factors))
elasticNet = ElasticNetCV(l1_ratio=l1_ratio_grid,
                          alphas=[0.0001, 0.0003, 0.0006, 0.001,
                                  0.003, 0.006, 0.01, 0.03, 0.06,
                                  0.1, 0.3, 0.6, 1, 3, 6],
                          max_iter=50000, cv=10)
elasticNet.fit(train_X, train_y)

# Remember the winning pair; the next stage reads both values.
alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )
print("Now try again for more precision on alpha, with l1_ratio fixed at " + str(ratio) +
      " and alpha centered around " + str(alpha))
# Stage 3: hold l1_ratio fixed and refine alpha on a grid of multiples
# (0.6x .. 1.4x) of the current best value.
alpha_factors = (.6, .65, .7, .75, .8, .85, .9, .95,
                 1, 1.05, 1.1, 1.15, 1.25, 1.3, 1.35, 1.4)
elasticNet = ElasticNetCV(l1_ratio=ratio,
                          alphas=[alpha * factor for factor in alpha_factors],
                          max_iter=50000, cv=10)
elasticNet.fit(train_X, train_y)

# Clamp the reported mix so that it never exceeds 1.
if elasticNet.l1_ratio_ > 1:
    elasticNet.l1_ratio_ = 1

alpha = elasticNet.alpha_
ratio = elasticNet.l1_ratio_
print("Best l1_ratio :", ratio)
print("Best alpha :", alpha )
# Model stability: mean and spread of the cross-validated RMSE across folds.
score = rmse_cv(elasticNet)
mean_rmse, rmse_spread = score.mean(), score.std()
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(mean_rmse, rmse_spread))
# Recorded output: Averaged base models score: 0.1199 (0.0098)
# Present the fitted model: bar chart of the elastic-net coefficients.
enet_coefs = pd.Series(elasticNet.coef_, index=train_X.columns)
important_features = enet_coefs.sort_values(ascending=False)
# Keep only coefficients whose magnitude exceeds 0.01.
important_features = important_features[important_features.abs() > 0.01]
plt.figure(figsize=(20, 10))
important_features.plot.bar()
# Evaluate the elastic net on the training data.
from sklearn.metrics import mean_squared_error

pred_y = elasticNet.predict(train_X)
# Error is measured after mapping target and prediction back with expm1.
price_mse = mean_squared_error(np.expm1(train_y), np.expm1(pred_y))
np.sqrt(price_mse)
# Recorded output: 19977.375075754539
# Residual plot (called the "left figure" in the original notes):
# prediction on x, prediction minus target on y, with a red zero line.
plt.scatter(pred_y, pred_y - train_y, c = "blue", marker = "s", label = "Training data")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
# Prediction plot (the "right figure"): prediction on x, target on y,
# with a red y = x reference line.
# NOTE(review): no new figure is created between the two plots, so if both
# run in the same notebook cell they are drawn on the same axes -- confirm
# the intended cell split.
plt.scatter(pred_y, train_y, c = "blue", marker = "s", label = "Training data")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
4 演算法融合
from sklearn.base import BaseEstimator
from sklearn.base import RegressorMixin
from sklearn.base import TransformerMixin
from sklearn.metrics import mean_squared_error
from sklearn.base import clone
from scipy.stats import pearsonr
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Ensemble regressor whose prediction is the mean of its base models.

    Args:
        models: iterable of estimators. They are cloned at fit time, so the
            instances passed in are not fitted by this class.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh clone of every base model on ``(X, y)``; return self."""
        # Work on clones so the caller's estimators stay untouched.
        self.models_ = list(map(clone, self.models))
        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return the row-wise mean of the fitted clones' predictions."""
        per_model = [fitted.predict(X) for fitted in self.models_]
        # One column per base model; average across the columns.
        return np.column_stack(per_model).mean(axis=1)
# Correlation analysis between the models' predictions on the training data.
pred_y_ridge = ridge.predict(train_X)
pred_y_elasticNet = elasticNet.predict(train_X)
pred_y_clf = clf.predict(train_X)
# Pearson correlation of ridge vs. elastic net ...
pearsonr(pred_y_ridge.T, pred_y_elasticNet.T)
# ... and of the grid-searched decision tree vs. elastic net.
pearsonr(pred_y_clf.T, pred_y_elasticNet.T)
# Model training: build the averaging ensemble from ridge and elastic net,
# then fit it on the full training data.
averaged_models = AveragingModels(models = (ridge, elasticNet))
averaged_models.fit(train_X, train_y)
# Cross-validation
n_folds = 5


def rmse_cv(model):
    """Return the cross-validated RMSE of ``model``, one value per fold.

    The model is scored on the module-level ``train_X`` / ``train_y`` with
    shuffled ``n_folds``-fold splitting and a fixed random seed (42).

    Args:
        model: estimator handed unchanged to ``cross_val_score``.

    Returns:
        Array of per-fold root mean squared errors.
    """
    folds = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, train_X, train_y,
                              scoring="neg_mean_squared_error", cv=folds)
    # The scorer yields negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse)
# Cross-validated RMSE of the averaged model: mean and spread across folds.
score = rmse_cv(averaged_models)
mean_rmse, rmse_spread = score.mean(), score.std()
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(mean_rmse, rmse_spread))
# Recorded output, correlation not removed: Averaged base models score: 0.1124 (0.0090)
# Recorded output, elasticNet alone:        Averaged base models score: 0.1180 (0.0081)

# Evaluate on the training data, after mapping values back with expm1.
pred_y = averaged_models.predict(train_X)
price_mse = mean_squared_error(np.expm1(train_y), np.expm1(pred_y))
np.sqrt(price_mse)
# Recorded output, correlation not removed: 19865.234119524695
# Recorded output, elasticNet alone:        20128.678118486128
# Residual plot (called the "left figure" in the original notes):
# prediction on x, prediction minus target on y, with a red zero line.
plt.scatter(pred_y, pred_y - train_y, c = "blue", marker = "s", label = "Training data")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
# Prediction plot (the "right figure"): prediction on x, target on y,
# with a red y = x reference line.
# NOTE(review): no new figure is created between the two plots, so if both
# run in the same notebook cell they are drawn on the same axes -- confirm
# the intended cell split.
plt.scatter(pred_y, train_y, c = "blue", marker = "s", label = "Training data")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
# Final models: bind the fitted estimators to model_* names
# (presumably consumed by a later notebook -- TODO confirm).
model_ridge = ridge
model_elasticNet = elasticNet
model_averaged_models = averaged_models