【機器學習 sklearn】XGBClassifier 超引數尋優
阿新 • • 發佈:2019-01-03
程式碼片段
# encoding: utf-8
import sys

# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# Python 3 has no builtin ``reload`` and no ``sys.setdefaultencoding`` (its
# default encoding is already UTF-8), so the hack is skipped there instead of
# failing with a NameError.
if sys.version_info[0] == 2:
    # reload(sys) makes sys.setdefaultencoding available again on Python 2.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')
import pandas as pd

# Load the training and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Uncomment to inspect column dtypes and non-null counts:
# train.info()
# test.info()

# Columns used as model inputs; 'Survived' is the prediction target.
selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
X_train = train[selected_features]
X_test = test[selected_features]
y_train = train['Survived']

############# Missing-value handling #########
# Show how often each 'Embarked' value occurs before filling the gaps.
print(X_train['Embarked'].value_counts())
print(X_test['Embarked'].value_counts())

# Fill the gaps with DataFrame.fillna and rebind the result. The previous
# ``X['col'].fillna(..., inplace=True)`` pattern mutated a column taken from a
# column-selected frame, which raises SettingWithCopyWarning and leaves the
# NaNs in place when pandas copy-on-write is active.
# 'Embarked' gets the constant 'S' (presumably the most frequent value in the
# counts printed above -- confirm); numeric columns get their column mean.
X_train = X_train.fillna({
    'Embarked': 'S',
    'Age': X_train['Age'].mean(),
})
# NOTE(review): the test rows are imputed with the test set's own means, not
# the training means; this is kept exactly as in the original script.
X_test = X_test.fillna({
    'Embarked': 'S',
    'Age': X_test['Age'].mean(),
    'Fare': X_test['Fare'].mean(),
})

### Sanity check: uncomment to confirm no selected column still has gaps.
# X_train.info()
# X_test.info()
##### Feature vectorization ############
from sklearn.feature_extraction import DictVectorizer

# sparse=False asks the vectorizer for dense arrays instead of sparse matrices.
dict_vec = DictVectorizer(sparse=False)

# Turn each row into a {column: value} dict and encode it as a feature vector
# (no scaling or standardization happens here). The orient has to be spelled
# 'records': the abbreviation 'record' is deprecated and newer pandas versions
# reject it with a ValueError.
# The vectorizer is fitted on the training rows only; the same fitted mapping
# is then applied to the test rows.
X_train = dict_vec.fit_transform(X_train.to_dict(orient='records'))
X_test = dict_vec.transform(X_test.to_dict(orient='records'))
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier

# Two baseline classifiers, both with their library-default settings.
rfc = RandomForestClassifier()
xgbc = XGBClassifier()

# Report the mean 5-fold cross-validation score of each baseline on the
# training data. The XGBoost score used to be computed and silently thrown
# away; it is now printed like the random-forest one.
print(cross_val_score(rfc, X_train, y_train, cv=5).mean())
print(cross_val_score(xgbc, X_train, y_train, cv=5).mean())

# Fit the random forest on the full training set and predict the test set.
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)
rfc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': rfc_y_predict})
# Uncomment to write the submission file:
# rfc_submission.to_csv('rfc_submission.csv', index=False)

# Same procedure for the default XGBoost classifier.
xgbc.fit(X_train, y_train)
xgbc_y_predict = xgbc.predict(X_test)
xgbc_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_y_predict})
# Uncomment to write the submission file:
# xgbc_submission.to_csv('xgbc_submission.csv', index=False)
# GridSearchCV lives in sklearn.model_selection, the module this script already
# uses for cross_val_score; the old sklearn.grid_search module was deprecated
# and later removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 5 depths x 5 estimator counts x 5 learning rates
# = 125 candidate settings, each evaluated with 5-fold cross-validation.
params = {
    'max_depth': list(range(2, 7)),
    'n_estimators': list(range(100, 1100, 200)),
    'learning_rate': [0.05, 0.1, 0.25, 0.5, 1.0],
}

xgbc_best = XGBClassifier()
# n_jobs=-1 runs the candidate fits on all available cores; verbose=1 prints
# progress messages while the search runs.
gs = GridSearchCV(xgbc_best, params, n_jobs=-1, cv=5, verbose=1)
gs.fit(X_train, y_train)

# Best mean cross-validation score and the parameter combination behind it.
print(gs.best_score_)
print(gs.best_params_)

# Predict the test set through the fitted search object.
xgbc_best_y_predict = gs.predict(X_test)
xgbc_best_submission = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': xgbc_best_y_predict})
# Uncomment to write the submission file:
# xgbc_best_submission.to_csv('xgbc_best_submission.csv', index=False)