1. 程式人生 > >機器學習一個小目標——Task3

機器學習一個小目標——Task3

任務

構建xgboost和lightgbm模型進行預測

遇到的問題

  • LGB和XGB自帶介面預測(predict)的都是概率
  • LGB和XGB用sklearn的介面(predict)是分類結果,預測(predict_proba)是概率
  • 訓練之前都要將資料轉化為相應模型所需的格式
  • 怎麼設定引數還不太瞭解

實現程式碼

XGB

#!/usr/bin/env python 3.6
#-*- coding:utf-8 -*-
# @File    : XGBoost.py
# @Date    : 2018-11-17
# @Author  : 黑桃
# @Software: PyCharm
#
# Train an XGBoost binary classifier two ways — the native Booster API and the
# sklearn wrapper — save both models, then compare AUC on the held-out set.
import pickle
import time

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
try:
    # sklearn.externals.joblib was deprecated in 0.21 and removed in 0.23;
    # prefer the standalone package (installed alongside sklearn).
    import joblib
except ImportError:
    from sklearn.externals import joblib

start_time = time.time()
path = "E:/MyPython/Machine_learning_GoGoGo/"

"""=====================================================================================================================
1 讀取特徵
"""
print("0 讀取特徵")
# feature_V1.pkl holds (train, test, y_train, y_test); `with` closes the file
# even if unpickling raises.
with open(path + 'feature/feature_V1.pkl', 'rb') as f:
    train, test, y_train, y_test = pickle.load(f)

"""【將資料格式轉換成xgb模型所需的格式】"""
# The native Booster API needs DMatrix objects; the sklearn wrapper takes arrays.
xgb_val = xgb.DMatrix(test, label=y_test)
xgb_train = xgb.DMatrix(train, label=y_train)
xgb_test = xgb.DMatrix(test)

"""=====================================================================================================================
2 設定模型訓練引數
"""
## Native-API parameters.
params = {
    'booster': 'gbtree',
    # FIX: the original used 'reg:linear' and mislabelled it "multi-class".
    # This is a binary-classification task scored with AUC, so use the logistic
    # objective — consistent with the XGBClassifier configured below.
    'objective': 'binary:logistic',
    'gamma': 0.1,              # min loss reduction to split; larger = more conservative
    'max_depth': 12,           # tree depth; deeper trees overfit more easily
    'lambda': 2,               # L2 regularisation on leaf weights
    'subsample': 0.7,          # row subsampling per tree
    'colsample_bytree': 0.7,   # column subsampling per tree
    'min_child_weight': 3,     # min sum of instance hessians per leaf; smaller values overfit more
    'silent': 0,               # 0 = print training info
    'eta': 0.007,              # learning rate
    'seed': 1000,
    'nthread': 7,              # CPU threads
    # 'eval_metric': 'auc'
}
plst = list(params.items())  # xgb.train also accepts the dict directly; list form is legacy
num_rounds = 50              # number of boosting rounds

# sklearn interface: XGBClassifier for classification, XGBRegressor for regression.
clf = XGBClassifier(
    n_estimators=30,
    learning_rate=0.3,
    max_depth=3,
    min_child_weight=1,
    gamma=0.3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=12,
    scale_pos_weight=1,
    reg_lambda=1,
    seed=27)

watchlist = [(xgb_train, 'train'), (xgb_val, 'val')]

"""=====================================================================================================================
3 模型訓練
"""
# NOTE: early_stopping_rounds=100 exceeds num_rounds=50, so early stopping can
# never fire here; it only becomes meaningful if num_rounds is raised.
"""【使用XGBoost自帶介面訓練】"""
model_xgb = xgb.train(plst, xgb_train, num_rounds, watchlist, early_stopping_rounds=100)

"""【Scikit-Learn介面訓練】"""
model_xgb_sklearn = clf.fit(train, y_train)

"""【儲存模型】"""
print('3 儲存模型')
joblib.dump(model_xgb, path + "model/xgb.pkl")
joblib.dump(model_xgb_sklearn, path + "model/xgb_sklearn.pkl")

"""=====================================================================================================================
4 模型預測
"""
"""【使用XGBoost自帶介面預測】"""
y_xgb = model_xgb.predict(xgb_test)  # native predict returns probabilities

"""【Scikit-Learn介面預測】"""
y_sklearn_pre = model_xgb_sklearn.predict(test)                 # hard 0/1 labels
y_sklearn_proba = model_xgb_sklearn.predict_proba(test)[:, 1]   # P(class == 1)

"""=====================================================================================================================
5 模型評分
"""
print("XGBoost_自帶介面(predict) : %s" % y_xgb)
print("XGBoost_sklearn介面(proba): %s" % y_sklearn_proba)
print("XGBoost_sklearn介面(predict) : %s" % y_sklearn_pre)
"""【roc_auc_score】"""
# roc_auc_score takes the true binary labels plus either hard 0/1 predictions
# or probability scores; intermediate ROC-curve computation is handled internally.
print("XGBoost_自帶介面(predict) AUC Score :{}".format(metrics.roc_auc_score(y_test, y_xgb)))
print("XGBoost_sklearn介面(proba) AUC Score : {}".format(metrics.roc_auc_score(y_test, y_sklearn_proba)))
print("XGBoost_sklearn介面(predict) AUC Score :{}".format(metrics.roc_auc_score(y_test, y_sklearn_pre)))

## [機器學習xgboost實戰—手寫數字識別 (DMatrix)](https://blog.csdn.net/u010159842/article/details/78053669)
## [Windows下在Anaconda3中安裝python版的XGBoost庫](https://blog.csdn.net/zz860890410/article/details/78682041)
## [XGBoost Plotting API以及GBDT組合特徵實踐](https://blog.csdn.net/sb19931201/article/details/65445514)

LGB

#!/usr/bin/env python 3.6
#-*- coding:utf-8 -*-
# @File    : Lightgbm.py
# @Date    : 2018-11-17
# @Author  : 黑桃
# @Software: PyCharm
#
# Train a LightGBM binary classifier two ways — the native lgb.train API and
# the sklearn wrapper — save both models, then compare AUC on the held-out set.
import pickle

import lightgbm as lgb
from sklearn import metrics
try:
    # sklearn.externals.joblib was deprecated in 0.21 and removed in 0.23;
    # prefer the standalone package (installed alongside sklearn).
    import joblib
except ImportError:
    from sklearn.externals import joblib

path = "E:/MyPython/Machine_learning_GoGoGo/"
"""=====================================================================================================================
1 讀取特徵
"""
print("0 讀取特徵")
# feature_V1.pkl holds (train, test, y_train, y_test); `with` closes the file
# even if unpickling raises.
with open(path + 'feature/feature_V1.pkl', 'rb') as f:
    train, test, y_train, y_test = pickle.load(f)

"""【將資料格式轉換成lgb模型所需的格式】"""
# The native API needs lgb.Dataset objects; the validation set references the
# training set so bin boundaries are shared.
lgb_train = lgb.Dataset(train, y_train)
lgb_eval = lgb.Dataset(test, y_test, reference=lgb_train)
"""=====================================================================================================================
2 設定模型訓練引數
"""
"""【LGB_自帶介面的引數】"""
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',       # binary classification
    'metric': {'l2', 'auc'},     # metrics reported on the validation set
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,     # column subsampling per tree
    'bagging_fraction': 0.8,     # row subsampling
    'bagging_freq': 5,           # re-bag every 5 iterations
    'verbose': 0
}

"""=====================================================================================================================
3 模型訓練
"""
# sklearn interface: LGBMClassifier for classification, LGBMRegressor for regression.
"""【LGB_自帶介面訓練】"""
model_lgb = lgb.train(params, lgb_train, num_boost_round=100,
                      valid_sets=lgb_eval, early_stopping_rounds=10)

"""【LGB_sklearn介面訓練】"""
lgb_sklearn = lgb.LGBMClassifier(learning_rate=0.1,
    max_bin=150,
    num_leaves=32,
    max_depth=11,
    reg_alpha=0.1,
    reg_lambda=0.2,
    n_estimators=300,)
lgb_sklearn.fit(train, y_train)

"""【儲存模型】"""
print('3 儲存模型')
joblib.dump(model_lgb, path + "model/lgb.pkl")
joblib.dump(lgb_sklearn, path + "model/lgb_sklearn.pkl")
"""=====================================================================================================================
4 模型預測
"""
"""【LGB_自帶介面預測】"""
# Native Booster.predict returns probabilities; cap at the early-stopped best iteration.
y_lgb_pre = model_lgb.predict(test, num_iteration=model_lgb.best_iteration)

"""【LGB_sklearn介面預測】"""
y_sklearn_pre = lgb_sklearn.predict(test)                  # hard 0/1 labels
y_sklearn_proba = lgb_sklearn.predict_proba(test)[:, 1]    # P(class == 1)
"""=====================================================================================================================
5 模型評分
"""
# roc_auc_score accepts either hard labels or probability scores.
print('LGB_自帶介面(predict)    AUC Score:', metrics.roc_auc_score(y_test, y_lgb_pre) )
print('LGB_sklearn介面(proba)    AUC Score:', metrics.roc_auc_score(y_test, y_sklearn_proba) )
print('LGB_sklearn介面(predict)    AUC Score:', metrics.roc_auc_score(y_test, y_sklearn_pre) )



## [lightgbm的原生版本與sklearn 介面版本對比](https://blog.csdn.net/PIPIXIU/article/details/82709899)
## [lightGBM原理、改進簡述](https://blog.csdn.net/niaolianjiulin/article/details/76584785)
## [LightGBM 如何調參](https://blog.csdn.net/aliceyangxi1987/article/details/80711014)





模型評分

【roc_auc_score】

直接根據真實值(必須是二值)、預測值(可以是0/1,也可以是proba值)計算出auc值,中間過程的roc計算省略。
在這裡插入圖片描述

在這裡插入圖片描述

參考資料