A Small Machine Learning Goal: Task3
阿新 · Published 2018-12-22
Task
Build XGBoost and LightGBM models and use them for prediction.
Problems encountered
- With the native LGB and XGB interfaces, predict() returns probabilities.
- With the sklearn interfaces of LGB and XGB, predict() returns class labels, while predict_proba() returns probabilities (see the sketch after this list).
- Before training, the data must be converted into the format each model expects (DMatrix for XGB, Dataset for LGB).
- I am still not sure how to set the parameters (a tuning sketch follows the LGB script below).
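A minimal sketch of the interface difference, using a tiny synthetic dataset (the data and parameter values here are illustrative only, not part of the task):

# Minimal sketch of the interface difference (synthetic data, illustrative only).
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

rng = np.random.RandomState(0)
X_tr, y_tr = rng.rand(200, 5), rng.randint(0, 2, 200)
X_te = rng.rand(50, 5)

# Native interface: wrap the data in a DMatrix first; with a
# 'binary:logistic' objective, predict() returns probabilities.
booster = xgb.train({'objective': 'binary:logistic'},
                    xgb.DMatrix(X_tr, label=y_tr), num_boost_round=10)
print(booster.predict(xgb.DMatrix(X_te))[:3])  # floats in (0, 1)

# sklearn interface: predict() returns class labels,
# predict_proba() returns class probabilities.
clf = XGBClassifier(n_estimators=10).fit(X_tr, y_tr)
print(clf.predict(X_te)[:3])           # 0/1 labels
print(clf.predict_proba(X_te)[:3, 1])  # probabilities for class 1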
Implementation code
XGB
#!/usr/bin/env python3
#-*- coding:utf-8 -*-
# @File : XGBoost.py
# @Date : 2018-11-17
# @Author : 黑桃
# @Software: PyCharm
import xgboost as xgb
import time
from sklearn.externals import joblib  # deprecated in newer scikit-learn; use "import joblib" there
from xgboost.sklearn import XGBClassifier
import pickle
from sklearn import metrics
start_time = time.time()
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 讀取特徵
"""
print("0 讀取特徵")
f = open(path + 'feature/feature_V1.pkl', 'rb')
train, test, y_train,y_test= pickle.load(f)
f.close()
"""【將資料格式轉換成xgb模型所需的格式】"""
xgb_val = xgb.DMatrix(test,label=y_test)
xgb_train = xgb.DMatrix(train, label=y_train)
xgb_test = xgb.DMatrix(test)
"""=====================================================================================================================
2 設定模型訓練引數
"""
## XGB自帶介面
params={
'booster':'gbtree',
'objective': 'binary:logistic', # binary classification; predict() then returns probabilities
'gamma':0.1, # controls post-pruning; larger is more conservative, typically around 0.1 or 0.2
'max_depth':12, # tree depth; deeper trees overfit more easily
'lambda':2, # L2 regularization on the weights; larger values make the model less prone to overfitting
'subsample':0.7, # row subsampling ratio for the training instances
'colsample_bytree':0.7, # column subsampling ratio when building each tree
'min_child_weight':3,
# Defaults to 1: the minimum sum of instance hessians (h) required in a leaf. For an
# imbalanced 0-1 classification task where h is around 0.01, min_child_weight=1 means a
# leaf must contain roughly 100 samples. This parameter strongly affects the result:
# smaller values make overfitting more likely.
'silent':0 ,# 1 suppresses run-time output; 0 is usually better so you can watch training
'eta': 0.007, # acts like the learning rate
'seed':1000,
# 'nthread':7, # number of CPU threads
#'eval_metric': 'auc'
}
plst = list(params.items()) ## xgb.train also accepts the dict directly; the list-of-tuples
                            ## form is a legacy idiom (useful e.g. to repeat 'eval_metric')
num_rounds =50 # number of boosting rounds
# sklearn interface
## XGBClassifier for classification
## XGBRegressor for regression
clf = XGBClassifier(
n_estimators=30,# 30 trees
learning_rate =0.3,
max_depth=3,
min_child_weight=1,
gamma=0.3,
subsample=0.8,
colsample_bytree=0.8,
objective= 'binary:logistic',
nthread=12,
scale_pos_weight=1,
reg_lambda=1,
seed=27)
watchlist = [(xgb_train, 'train'),(xgb_val, 'val')]
"""=====================================================================================================================
3 模型訓練
"""
# training model
# early_stopping_rounds 當設定的迭代次數較大時,early_stopping_rounds 可在一定的迭代次數內準確率沒有提升就停止訓練
# 使用XGBoost有自帶介面
"""【使用XGBoost自帶介面訓練】"""
model_xgb = xgb.train(plst, xgb_train, num_rounds, watchlist,early_stopping_rounds=100)
"""【Train with the Scikit-Learn interface】"""
model_xgb_sklearn=clf.fit(train, y_train)
"""【Save the models】"""
print('3 Save models')
joblib.dump(model_xgb, path + "model/xgb.pkl")
joblib.dump(model_xgb_sklearn, path + "model/xgb_sklearn.pkl")
"""=====================================================================================================================
4 模型預測
"""
"""【使用XGBoost自帶介面預測】"""
y_xgb = model_xgb.predict(xgb_test)
"""【Scikit-Learn介面預測】"""
y_sklearn_pre= model_xgb_sklearn.predict(test)
y_sklearn_proba= model_xgb_sklearn.predict_proba(test)[:,1]
"""=====================================================================================================================
5 模型評分
"""
print("XGBoost_自帶介面(predict) : %s" % y_xgb)
print("XGBoost_sklearn介面(proba): %s" % y_sklearn_proba)
print("XGBoost_sklearn介面(predict) : %s" % y_sklearn_pre)
# print("XGBoost_自帶介面(predict) AUC Score : %f" % metrics.roc_auc_score(y_test, y_xgb))
# print("XGBoost_sklearn介面(proba) AUC Score : %f" % metrics.roc_auc_score(y_test, y_sklearn_proba))
# print("XGBoost_sklearn介面(predict) AUC Score : %f" % metrics.roc_auc_score(y_test, y_sklearn_pre))
"""【roc_auc_score】"""
#直接根據真實值(必須是二值)、預測值(可以是0/1,也可以是proba值)計算出auc值,中間過程的roc計算省略。
# f1 = f1_score(y_test, predictions, average='macro')
print("XGBoost_自帶介面(predict) AUC Score :{}".format(metrics.roc_auc_score(y_test, y_xgb)))
print("XGBoost_sklearn介面(proba) AUC Score : {}".format(metrics.roc_auc_score(y_test, y_sklearn_proba)))
print("XGBoost_sklearn介面(predict) AUC Score :{}".format(metrics.roc_auc_score(y_test, y_sklearn_pre)))
## [XGBoost in practice: handwritten digit recognition (DMatrix)](https://blog.csdn.net/u010159842/article/details/78053669)
## [Installing the Python XGBoost library in Anaconda3 on Windows](https://blog.csdn.net/zz860890410/article/details/78682041)
## [XGBoost Plotting API and GBDT combined-feature practice](https://blog.csdn.net/sb19931201/article/details/65445514)
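The saved models can be reloaded with joblib for later prediction. A short sketch, assuming the same `path` and `test` objects as in the script above:

# Reload the models saved above and predict again (sketch; assumes the
# script's `path` and `test` are in scope).
from sklearn.externals import joblib  # on modern scikit-learn: import joblib
import xgboost as xgb

model_xgb = joblib.load(path + "model/xgb.pkl")                  # native Booster
model_xgb_sklearn = joblib.load(path + "model/xgb_sklearn.pkl")  # sklearn wrapper

y_loaded = model_xgb.predict(xgb.DMatrix(test))            # probabilities
y_loaded_sk = model_xgb_sklearn.predict_proba(test)[:, 1]  # probabilities for class 1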
LGB
#!/usr/bin/env python3
#-*- coding:utf-8 -*-
# @File : Lightgbm.py
# @Date : 2018-11-17
# @Author : 黑桃
# @Software: PyCharm
import lightgbm as lgb
import pickle
from sklearn import metrics
from sklearn.externals import joblib  # deprecated in newer scikit-learn; use "import joblib" there
path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 讀取特徵
"""
print("0 讀取特徵")
f = open(path + 'feature/feature_V1.pkl', 'rb')
train, test, y_train,y_test= pickle.load(f)
f.close()
"""【將資料格式轉換成lgb模型所需的格式】"""
lgb_train = lgb.Dataset(train, y_train)
lgb_eval = lgb.Dataset(test, y_test, reference=lgb_train)
"""=====================================================================================================================
2 設定模型訓練引數
"""
"""【LGB_自帶介面的引數】"""
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': {'l2', 'auc'},
'num_leaves': 31,
'learning_rate': 0.05,
'feature_fraction': 0.9,
'bagging_fraction': 0.8,
'bagging_freq': 5,
'verbose': 0
}
"""=====================================================================================================================
3 模型訓練
"""
##分類使用的是 LGBClassifier
##迴歸使用的是 LGBRegression
"""【LGB_自帶介面訓練】"""
model_lgb = lgb.train(params,lgb_train,num_boost_round=100,valid_sets=lgb_eval,early_stopping_rounds=10)
# y_lgb_proba = model_lgb.predict_proba(test)
"""【LGB_sklearn介面訓練】"""
lgb_sklearn = lgb.LGBMClassifier(learning_rate=0.1,
max_bin=150,
num_leaves=32,
max_depth=11,
reg_alpha=0.1,
reg_lambda=0.2,
# objective='multiclass',
n_estimators=300,)
lgb_sklearn.fit(train,y_train)
"""【儲存模型】"""
print('3 儲存模型')
joblib.dump(model_lgb, path + "model/lgb.pkl")
joblib.dump(lgb_sklearn, path + "model/lgb_sklearn.pkl")
"""=====================================================================================================================
4 模型預測
"""
"""【LGB_自帶介面預測】"""
y_lgb_pre = model_lgb.predict(test, num_iteration=model_lgb.best_iteration)
"""【LGB_sklearn介面預測】"""
y_sklearn_pre= lgb_sklearn.predict(test)
y_sklearn_proba= lgb_sklearn.predict_proba(test)[:,1]
"""=====================================================================================================================
5 模型評分
"""
print('LGB_自帶介面(predict) AUC Score:', metrics.roc_auc_score(y_test, y_lgb_pre) )
print('LGB_sklearn介面(proba) AUC Score:', metrics.roc_auc_score(y_test, y_sklearn_proba) )
print('LGB_sklearn介面(predict) AUC Score:', metrics.roc_auc_score(y_test, y_sklearn_pre) )
## [Comparison of LightGBM's native and sklearn interface versions](https://blog.csdn.net/PIPIXIU/article/details/82709899)
## [A brief overview of lightGBM principles and improvements](https://blog.csdn.net/niaolianjiulin/article/details/76584785)
## [How to tune LightGBM](https://blog.csdn.net/aliceyangxi1987/article/details/80711014)
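On the open question in the problems list about how to set parameters: one common starting point is a cross-validated grid search over the sklearn wrapper. A minimal sketch, assuming the `train`/`y_train` objects from the script above; the grid values are illustrative, not recommendations:

# Hedged sketch: small grid search over a few LGBMClassifier parameters.
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

param_grid = {
    'num_leaves': [15, 31, 63],   # illustrative values only
    'learning_rate': [0.05, 0.1],
    'n_estimators': [100, 300],
}
gs = GridSearchCV(lgb.LGBMClassifier(), param_grid, scoring='roc_auc', cv=3)
gs.fit(train, y_train)
print(gs.best_params_, gs.best_score_)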
Model scoring
【roc_auc_score】
Computes the AUC directly from the true labels (which must be binary) and the predictions (either 0/1 labels or probability scores); the intermediate ROC computation is skipped.
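A tiny illustration of this point (values chosen by hand): thresholded 0/1 predictions throw away ranking information, so the probability-based AUC is usually at least as high.

from sklearn.metrics import roc_auc_score

y_true  = [0, 1, 1]
y_proba = [0.3, 0.4, 0.9]   # probability scores: the ranking is perfect
y_label = [0, 0, 1]         # the same scores thresholded at 0.5

print(roc_auc_score(y_true, y_proba))  # 1.0
print(roc_auc_score(y_true, y_label))  # 0.75 (tied scores get half credit)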