Logistic 迴歸—LogisticRegressionCV實現引數優化
阿新 • 發佈:2018-12-16
1、準備
# Load the Otto dataset and take a first look at it.
# First, import the required modules.
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
# The competition's evaluation metric is logloss.
from sklearn.metrics import log_loss
from matplotlib import pyplot
import seaborn as sns
# NOTE(review): the original had '%matplotlib inline' here — that is an
# IPython magic command, not valid Python, and breaks this file when run
# as a plain script. Removed; it is only needed inside a notebook.

data = pd.read_csv('Otto_train.csv')
# Bare expressions like `data.head()` only display output in a notebook;
# print explicitly so the script shows them too.
print(data.head())
data.info()
print(data.describe())
print(data.shape)

# Limited by machine resources: keep only the first 20000 rows.
data = data[:20000]

# Target distribution — check whether the classes are balanced.
# Pass the series as a keyword: positional Series input to countplot was
# deprecated and then removed in seaborn 0.12+.
sns.countplot(x=data.target)
pyplot.xlabel('target')
pyplot.ylabel('Number of occurrences')
2、資料的標準化
# Convert the class label strings to integers and standardize features,
# then compute a 5-fold cross-validation logloss baseline.

# Turn label strings into numbers: "Class_7" -> "7" -> 6 (zero-based).
y_train = data.target
y_train = y_train.map(lambda s: s[6:])
y_train = y_train.map(lambda s: int(s) - 1)
data = data.drop(['target', 'id'], axis=1)
X_train = np.array(data)

# Standardize the features (zero mean, unit variance).
from sklearn.preprocessing import StandardScaler
# Initialize the feature scaler.
ss_X = StandardScaler()
# Fit on the training data and transform it.
X_train = ss_X.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression
# BUG FIX: `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `cross_val_score` now lives in `sklearn.model_selection`.
from sklearn.model_selection import cross_val_score

lr = LogisticRegression()
# Cross-validation is used to estimate model performance and tune
# hyperparameters (model selection). For classification tasks,
# cross_val_score uses StratifiedKFold by default.
loss = cross_val_score(lr, X_train, y_train, cv=5, scoring='neg_log_loss')
print('logloss of each fold is: ', -loss)
print('cv logloss is:', -loss.mean())
3、用LogisticRegressionCV的L1正則
# Tune C with LogisticRegressionCV using the L1 penalty.
from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]
# Many samples (60k+), 93 features, L1 penalty --> the saga solver (new
# in 0.19) is an option; liblinear is used here.
# LogisticRegressionCV is faster than GridSearchCV because it reuses the
# regularization path across the Cs grid.
lrcv_L1 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss',
                               penalty='l1', solver='liblinear',
                               multi_class='ovr')
lrcv_L1.fit(X_train, y_train)
# NOTE(review): the original pasted the notebook's repr output here as a
# bare `LogisticRegressionCV(...)` expression — it built and discarded a
# second, unfitted estimator. Removed.

# scores_: dict with classes as the keys, and the values as the grid of
# scores obtained during cross-validating each fold.
# Each dict value has shape (n_folds, len(Cs)).
n_Cs = len(Cs)
# BUG FIX: the Otto dataset has 9 target classes, not 3 — the hard-coded
# value silently dropped 6 of the 9 per-class score grids. Derive the
# class count from the fitted model instead.
n_classes = len(lrcv_L1.scores_)
scores = np.zeros((n_classes, n_Cs))
for j in range(n_classes):
    # Average over folds for each class.
    scores[j][:] = np.mean(lrcv_L1.scores_[j], axis=0)
# Negate: scoring is neg_log_loss, so flip sign to plot the logloss.
mse_mean = -np.mean(scores, axis=0)

pyplot.plot(np.log10(Cs), mse_mean.reshape(n_Cs, 1))
pyplot.xlabel('log(C)')
pyplot.ylabel('neg-logloss')
pyplot.show()
# print('C is:', lrcv_L1.C_)  # for multiclass, there is one C per class
lrcv_L1.coef_
4、用LogisticRegressionCV的L2正則
# Tune C with LogisticRegressionCV using the L2 penalty (liblinear).
from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]
# Many samples (60k+), 93 features, L2 penalty --> lbfgs is the default
# solver, but use liblinear here for a fair comparison with GridSearchCV.
lr_cv_L2 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss',
                                penalty='l2', solver='liblinear',
                                multi_class='ovr')
lr_cv_L2.fit(X_train, y_train)
# scores_: dict with classes as the keys, and the values as the grid of
# scores obtained during cross-validating each fold.
# Each dict value has shape (n_folds, len(Cs)).
n_Cs = len(Cs)
# BUG FIX: the Otto dataset has 9 target classes, not 3 as originally
# hard-coded — derive the class count from the fitted model.
n_classes = len(lr_cv_L2.scores_)
scores = np.zeros((n_classes, n_Cs))
for j in range(n_classes):
    # Average over folds for each class.
    scores[j][:] = np.mean(lr_cv_L2.scores_[j], axis=0)
# Negate: scoring is neg_log_loss, so flip sign to plot the logloss.
mse_mean = -np.mean(scores, axis=0)
pyplot.plot(np.log10(Cs), mse_mean.reshape(n_Cs, 1))
pyplot.xlabel('log(C)')
pyplot.ylabel('neg-logloss')
pyplot.show()
# print('C is:', lr_cv_L2.C_)  # for multiclass, there is one C per class
# Tune C with LogisticRegressionCV using the L2 penalty (default solver).
from sklearn.linear_model import LogisticRegressionCV

Cs = [1, 10, 100, 1000]
# Many samples (60k+), 93 features, L2 penalty --> use the default lbfgs
# solver. LogisticRegressionCV is faster than GridSearchCV because it
# reuses the regularization path across the Cs grid.
lrcv_L2 = LogisticRegressionCV(Cs=Cs, cv=5, scoring='neg_log_loss',
                               penalty='l2', multi_class='ovr')
lrcv_L2.fit(X_train, y_train)
# scores_: dict with classes as the keys, and the values as the grid of
# scores obtained during cross-validating each fold.
# Each dict value has shape (n_folds, len(Cs)).
n_Cs = len(Cs)
# BUG FIX: the Otto dataset has 9 target classes, not 3 as originally
# hard-coded — derive the class count from the fitted model.
n_classes = len(lrcv_L2.scores_)
scores = np.zeros((n_classes, n_Cs))
for j in range(n_classes):
    # Average over folds for each class.
    scores[j][:] = np.mean(lrcv_L2.scores_[j], axis=0)
# Negate: scoring is neg_log_loss, so flip sign to plot the logloss.
mse_mean = -np.mean(scores, axis=0)
pyplot.plot(np.log10(Cs), mse_mean.reshape(n_Cs, 1))
pyplot.xlabel('log(C)')
pyplot.ylabel('neg-logloss')
pyplot.show()