1. 程式人生 > >用機器學習對CTR預估建模(一)

用機器學習對CTR預估建模(一)

資料集介紹:

train - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks
are subsampled according to different strategies.
Train.csv 解壓後有5.6G,樣本個數非常大,一般200m的csv資料(20~30維)用pandas讀取成資料幀(dataframe)格式,大概會佔用記憶體1G左右,所以這麼的資料集單機記憶體一般吃不消。

test - Test set. 1 day of ads to for testing your model predictions.
Test.csv解壓後有673m,不是很大。

sampleSubmission.csv - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark.

對特徵進行篩選和down sampling來降低資料集

# -*- coding: utf-8 -*-
"""
Created on Wed Feb 01 12:51:31 2017

@author: JR.Lu
"""
import pandas as pd
import numpy as np

train_df=pd.read_csv('train.csv',nrows=10000000
) test_df=pd.read_csv('test.csv') #down sampling temp_0=train_df.click==0 data_0=train_df[temp_0] # 16546986./20000000 佔了0.8273493左右 temp_1=train_df.click==1 data_1=train_df[temp_1] # 3453014 data_0_ed=data_0[0:len(data_1)] data_downsampled=pd.concat([data_1,data_0_ed]) #select features #通過每個columns對label的影響來選擇feature,這裡使用grouby實現
#train_df.groupby(train_df['device_model'])['click'].mean() columns_select_test=['id','device_type','C1','C15','C16','banner_pos','banner_pos','site_category'] columns_select=['click','device_type','C1','C15','C16','banner_pos','banner_pos','site_category'] data_downsampled_1=data_downsampled[columns_select] test_small=test_df[columns_select_test] # 打亂資料 sampler = np.random.permutation(len(data_downsampled_1)) data_downsampled_1=data_downsampled_1.take(sampler) data_downsampled_1.to_csv('train_small.csv') test_small.to_csv('test_small.csv')

其次是用簡單的特徵來測試模型,用網格搜尋的方式來進行引數優選

# -*- coding: utf-8 -*-
"""
Created on Wed Feb 01 20:36:46 2017

@author: JR.Lu
"""
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.learning_curve import learning_curve
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp



def logloss(act, pred):
    '''
    比賽使用logloss作為evaluation
    '''
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

# 結果衡量
def print_metrics(true_values, predicted_values):
    print "logloss: ", logloss(true_values, predicted_values)
    print "Accuracy: ", metrics.accuracy_score(true_values, predicted_values)
    print "AUC: ", metrics.roc_auc_score(true_values, predicted_values)
    print "Confusion Matrix: ", + metrics.confusion_matrix(true_values, predicted_values)
    print metrics.classification_report(true_values, predicted_values)    


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    '''
    輸入一個模型、title、x、y,返回模型學習過程曲線
    '''
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt


#讀取經過down sampling後的small資料
train_df=pd.read_csv('train_small.csv',nrows=100000) 
test_df=pd.read_csv('test_small.csv') 

feature_columns=['device_type','C1','C15','C16','banner_pos',
                'banner_pos','site_category']
train_x=train_df[feature_columns]
test_x=test_df[feature_columns]
x=pd.concat([train_x,test_x])

#變成one-hot encoding
temp=x
for each in feature_columns:
    temp_dummies=pd.get_dummies(x[each])
    temp=pd.concat([temp,temp_dummies],axis=1)

x_dummies=temp.drop(feature_columns,axis=1)

X_train=x_dummies[0:len(train_x)]
Y_train=train_df['click']

x_train, x_test, y_train, y_test=train_test_split(X_train,Y_train,test_size=0.33)


#建模


#模型引數選擇,使用GridSearchCV實現
"""
LR模型可調的引數,沒幾個能調的,gs調參只能輸入list,不能對str進行選擇。

LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                   intercept_scaling=1, class_weight=None, random_state=None, 
                   solver='liblinear', max_iter=100, multi_class='ovr', 
                   verbose=0, warm_start=False, n_jobs=1)

        solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’}, default: ‘liblinear’
"""
#param_LR= {'C':[0.1,1,2]}
#
#    
#gsearch_LR = GridSearchCV(estimator = LogisticRegression(penalty='l1',solver='liblinear'),
#                          param_grid=param_LR,cv=3)
#gsearch_LR.fit(x_train,y_train)
#gsearch_LR.grid_scores_, gsearch_LR.best_params_, gsearch_LR.best_score_

title='LRlearning{penalty=l1,solver=liblinear,cv=3}'                          
plot_learning_curve(LogisticRegression(penalty='l1',solver='liblinear',C=1),
                    title=title,cv=10,X=x_train,y=y_train)

#gsearch_LR.fit(x_train,y_train)


#gbdt模型

#param_GBDT= {'learning_rate':[0.1,0.5],
#             'n_estimators':[100,200,300,400],
#             'max_depth':[3,4]}
#
#gsearch_GBDT = GridSearchCV(estimator =GradientBoostingClassifier(),
#                          param_grid=param_GBDT,cv=10)
#gsearch_GBDT.fit(x_train,y_train)
##gsearch_GBDT.grid_scores_
#gsearch_GBDT.best_params_
#gsearch_GBDT.best_score_
#最佳引數:'n_estimators': 200, 'learning_rate': 0.1, 'max_depth': 3

title='GDBTlearning{n_estimators: 200, learning_rate: 0.1, max_depth: 3}'
plot_learning_curve(estimator=GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3),
                    title=title,cv=2,X=x_train,y=y_train)
#比LR好那麼一點點

#rf建模


#param_rf= {'n_estimators':[100,200,300],
#            'max_depth':[2,3,4]}
#
#gsearch_rf = GridSearchCV(estimator =RandomForestClassifier(),
#                          param_grid=param_rf,cv=3)
#
#gsearch_rf.fit(x_train,y_train)
##gsearch_GBDT.grid_scores_
#gsearch_rf.best_params_
#gsearch_rf.best_score_

# 最佳引數: {'n_estimators': 200, 'max_depth': 4} 

title='RFlearning{n_estimators: 200,  max_depth: 4}'
plot_learning_curve(estimator=RandomForestClassifier(n_estimators=200, max_depth=4),
                    title=title,cv=2,X=x_train,y=y_train)



# predict

lr_model=LogisticRegression(penalty='l1',solver='liblinear',C=1)
gbdt_model=GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3)
rf_model=RandomForestClassifier(n_estimators=200, max_depth=4)

lr_model.fit(x_train,y_train)
gbdt_model.fit(x_train,y_train)
rf_model.fit(x_train,y_train)

lr_predict=lr_model.predict( x_test)
gbdt_predict=gbdt_model.predict(x_test)
rf_predict=rf_model.predict(x_test)

print "LRmodel 效能如下:-------"
print_metrics(y_test, lr_predict)

print "GBDTmodel 效能如下:-------"
print_metrics(y_test, gbdt_predict)

print "RFmodel 效能如下:-------"
print_metrics(y_test, rf_predict)

結果大概如下:

LRmodel 效能如下:-------
logloss:  14.8549419892
Accuracy:  0.569909090909
AUC:  0.570339428461
Confusion Matrix:  [[11141  5293]
 [ 8900  7666]]
             precision    recall  f1-score   support

          0       0.56      0.68      0.61     16434
          1       0.59      0.46      0.52     16566

avg / total       0.57      0.57      0.56     33000
GBDTmodel 效能如下:-------
logloss:  14.7952832304
Accuracy:  0.571636363636
AUC:  0.572068547036
Confusion Matrix:  [[11177  5257]
 [ 8879  7687]]
             precision    recall  f1-score   support

          0       0.56      0.68      0.61     16434
          1       0.59      0.46      0.52     16566

avg / total       0.58      0.57      0.57     33000
RFmodel 效能如下:-------
logloss:  15.4713065032
Accuracy:  0.552060606061
AUC:  0.553565705536
Confusion Matrix:  [[15281  1153]
 [13629  2937]]
             precision    recall  f1-score   support

          0       0.53      0.93      0.67     16434
          1       0.72      0.18      0.28     16566

avg / total       0.62      0.55      0.48     33000

插個圖看看結果:
這裡寫圖片描述

這裡寫圖片描述

這裡寫圖片描述