Kaggle: Titanic Survival Prediction
阿新 • Published: 2019-01-30
Here is the Titanic code I wrote quite a while ago.
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 30 14:23:12 2017
@author: Yichengfan
"""
import pandas as pd
train = pd.read_csv(r"F:\TS\03_other_parts\Titanic\02_data\train.csv")
test = pd.read_csv(r"F:\TS\03_other_parts\Titanic\02_data\test.csv")
# Start by printing the basic information of the training and test sets.
# This is a good habit: it gives an overall picture of the data size, the
# dtype of each feature, and which features have missing values.
print(train.info())
print(test.info())
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
None
'''
selected_features = ['Pclass', 'Sex', 'Age', 'Embarked', 'SibSp', 'Parch', 'Fare']
# .copy() so that the in-place fills below do not trigger pandas'
# SettingWithCopyWarning on a view of the original frame
X_train = train[selected_features].copy()
X_test = test[selected_features].copy()
y_train = train['Survived']
# The earlier overview showed that the Embarked feature has missing values,
# which need to be filled.
print (X_train['Embarked'].value_counts())
print (X_test['Embarked'].value_counts())
'''
S 644
C 168
Q 77
Name: Embarked, dtype: int64
S 270
C 102
Q 46
Name: Embarked, dtype: int64
'''
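# (A quick extra check, not in the original run: the missing counts can be
# read off directly with isnull(). From the info() output above this gives
# Age: 177 and Embarked: 2 missing in the training set, and Age: 86 and
# Fare: 1 missing in the test set.)
print(X_train.isnull().sum())
print(X_test.isnull().sum())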
# For a categorical feature like Embarked, we fill missing values with the
# most frequent value; this is a common way to keep the introduced bias small.
X_train['Embarked'].fillna('S', inplace = True)
X_test['Embarked'].fillna('S', inplace = True)
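# (A more general sketch: instead of hard-coding 'S', the most frequent value
# can be looked up with mode(); for this dataset that is also 'S', so the
# hard-coded fill above is equivalent.)
most_common_port = X_train['Embarked'].mode()[0]   # == 'S'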
# For a numerical feature like Age, the usual practice is to fill missing
# values with the mean or the median, again to keep the introduced bias small.
X_train['Age'].fillna(X_train['Age'].mean(), inplace = True)
X_test['Age'].fillna(X_test['Age'].mean(), inplace = True)
X_test['Fare'].fillna(X_test['Fare'].mean(), inplace = True)
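# (A stricter variant, assuming we want to avoid using test-set statistics:
# fill the test set's Age and Fare with means computed on the training set
# only, so no information flows from the test set into preprocessing.)
# X_test['Age'].fillna(X_train['Age'].mean(), inplace = True)
# X_test['Fare'].fillna(X_train['Fare'].mean(), inplace = True)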
# Verify the training and test data after the fills.
print(X_train.info())
print(X_test.info())
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Pclass 891 non-null int64
Sex 891 non-null object
Age 891 non-null float64
Embarked 891 non-null object
SibSp 891 non-null int64
Parch 891 non-null int64
Fare 891 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 48.8+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
Pclass 418 non-null int64
Sex 418 non-null object
Age 418 non-null float64
Embarked 418 non-null object
SibSp 418 non-null int64
Parch 418 non-null int64
Fare 418 non-null float64
dtypes: float64(2), int64(3), object(2)
memory usage: 22.9+ KB
None
'''
# Next, vectorize the features with DictVectorizer.
from sklearn.feature_extraction import DictVectorizer
dict_vec = DictVectorizer(sparse = False)
X_train = dict_vec.fit_transform(X_train.to_dict(orient = 'records'))
dict_vec.feature_names_
'''
['Age',
'Embarked=C',
'Embarked=Q',
'Embarked=S',
'Fare',
'Parch',
'Pclass',
'Sex=female',
'Sex=male',
'SibSp']
'''
# transform (not fit_transform), so the test set is mapped onto exactly the
# columns learned from the training set
X_test = dict_vec.transform(X_test.to_dict(orient = 'records'))
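# Quick sanity check: train and test now share the same 10-column feature
# space defined by dict_vec.feature_names_ above.
assert X_train.shape[1] == X_test.shape[1] == len(dict_vec.feature_names_)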
# Import RandomForestClassifier from sklearn
from sklearn.ensemble import RandomForestClassifier
# Initialize a RandomForestClassifier with the default configuration
rfc = RandomForestClassifier()
# Import XGBClassifier from the popular XGBoost package
from xgboost import XGBClassifier
xgbc = XGBClassifier()
# sklearn.cross_validation was removed in scikit-learn 0.20;
# cross_val_score now lives in model_selection
from sklearn.model_selection import cross_val_score
# Evaluate the default-configured RandomForestClassifier and XGBClassifier
# on the training set with 5-fold cross-validation, and report the mean
# classification accuracy of each.
cross_val_score(rfc, X_train, y_train, cv= 5).mean()
'''
0.80476830342149963
'''
cross_val_score(xgbc, X_train, y_train, cv= 5).mean()
'''
0.81824559798311003
'''
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
cross_val_score(lr, X_train, y_train, cv= 5).mean()
'''
0.79128522828142689
'''
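# (The same comparison written as a loop, a minimal sketch: all three
# default-configured models evaluated with the same 5-fold cross-validation.)
for name, model in [('rfc', rfc), ('xgbc', xgbc), ('lr', lr)]:
    print(name, cross_val_score(model, X_train, y_train, cv = 5).mean())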
# Make predictions with the default-configured RandomForestClassifier
rfc.fit(X_train, y_train)
rfc_y_predict = rfc.predict(X_test)
rfc_submission = pd.DataFrame({'PassengerId':test['PassengerId'],
'Survived':rfc_y_predict})
# Save the RandomForestClassifier predictions on the test data to a file
rfc_submission.to_csv(r'F:\TS\03_other_parts\Titanic\04_output\rfc_submission.csv'
,index = False)
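# (A sanity check before uploading, based on Kaggle's required submission
# format: exactly two columns, PassengerId and Survived, one row per test
# passenger.)
assert list(rfc_submission.columns) == ['PassengerId', 'Survived']
assert len(rfc_submission) == len(test)   # 418 rows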
# Make predictions with the default-configured XGBClassifier
xgbc.fit(X_train, y_train)
'''
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1)
'''
xgbc_y_predict = xgbc.predict(X_test)
xgbc_submission = pd.DataFrame({'PassengerId':test['PassengerId'],
'Survived':xgbc_y_predict})
# Save the XGBClassifier predictions on the test data to a file
xgbc_submission.to_csv(r'F:\TS\03_other_parts\Titanic\04_output\xgbc_submission.csv'
,index = False)
# Use a parallel grid search to look for a better hyperparameter combination,
# hoping to further improve the XGBClassifier's prediction performance
# sklearn.grid_search was likewise removed; GridSearchCV now lives in
# model_selection
from sklearn.model_selection import GridSearchCV
params = {'max_depth':list(range(2,7)),'n_estimators':list(range(100,1100,200)),
'learning_rate':[0.05,0.1,0.25,0.5,1.0]}
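# The grid above spans 5 * 5 * 5 = 125 hyperparameter combinations
# (max_depth in 2..6, n_estimators in 100, 300, ..., 900, and 5 learning
# rates); with cv = 5 that amounts to 625 model fits, so the search takes
# a while.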
xgbc_best = XGBClassifier()
# n_jobs = -1 uses all available CPU cores
gs = GridSearchCV(xgbc_best, params, n_jobs= -1, cv = 5,verbose = 1)
gs.fit(X_train, y_train)
# Report the best cross-validated accuracy and the corresponding
# hyperparameter configuration found for XGBClassifier
print (gs.best_score_)
print (gs.best_params_)
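# (Optional inspection, assuming the model_selection API imported above:
# gs.best_estimator_ is the model refitted on the full training set with the
# best hyperparameters, and gs.cv_results_ holds the per-combination scores.)
results = pd.DataFrame(gs.cv_results_)
print(results[['params', 'mean_test_score']].sort_values(
    'mean_test_score', ascending = False).head())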
# Save the optimized XGBClassifier's predictions on the test data to the file
# xgbc_best_submission
xgbc_best_y_predict = gs.predict(X_test)
xgbc_best_submission = pd.DataFrame({'PassengerId':test['PassengerId'],
                                     'Survived':xgbc_best_y_predict})
xgbc_best_submission.to_csv(r'F:\TS\03_other_parts\Titanic\04_output\xgbc_best_submission.csv'
                            ,index = False)