使用 sklearn 進行 Kaggle 案例:泰坦尼克(Titanic)乘客獲救預測
阿新 • • 發佈:2019-01-07
python程式碼:
# Recommended further reading -- two more advanced write-ups of the Kaggle
# Titanic survival case:
#   https://zhuanlan.zhihu.com/p/27550334
#   https://zhuanlan.zhihu.com/p/28795160
#
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""Kaggle Titanic survival prediction with scikit-learn.

Created on Mon Mar 27 20:26:43 2017

@author: Administrator

The script loads the Kaggle ``train.csv``, fills missing values, encodes the
categorical columns as integers and then evaluates, with 3-fold cross
validation:

1. a linear regression thresholded at 0.5,
2. a logistic regression,
3. a random forest,
4. an averaged ensemble of gradient boosting and logistic regression that
   also uses three engineered features (family size, name length, title).

It also plots the univariate feature scores (``-log10`` of the ``f_classif``
p-values).

Only single-argument ``print(...)`` calls are used, so the file parses on both
Python 2 and Python 3.
"""
import re

import numpy as np
import pandas

# Same runtime value as the original literal, written as a raw string so that
# "\p", "\s", "\K" and "\d" are no longer invalid escape sequences.
TRAIN_CSV_PATH = r'D:\python_code\study\titanic\Kaggle_Titanic\data\train.csv'

# Feature columns used by the first three models.
BASE_PREDICTORS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Base features plus the engineered ones created in main().
EXTENDED_PREDICTORS = BASE_PREDICTORS + ['Familysize', 'NameLength', 'titles']

# Integer codes for the categorical columns.
SEX_CODES = {'male': 0, 'female': 1}
EMBARKED_CODES = {'S': 0, 'C': 1, 'Q': 2}
TITLE_MAPPING = {
    "Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Col": 7,
    "Major": 8, "Mlle": 9, "Countess": 10, "Ms": 11, "Lady": 12,
    "Jonkheer": 13, "Don": 14, "Mme": 15, "Capt": 16, "Sir": 17,
}

# A run of ASCII letters immediately followed by a period, e.g. "Mr.".
_TITLE_RE = re.compile(r'([A-Za-z]+)\.')


def get_title(name):
    """Return the title embedded in a passenger name.

    The title is the first run of ASCII letters that is directly followed by
    a period (``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).  An empty string is
    returned when the name contains no such run.
    """
    match = _TITLE_RE.search(name)
    return match.group(1) if match else ""


def encode_titles(titles):
    """Map a Series of title strings to the integer codes in TITLE_MAPPING.

    Titles missing from the mapping become 0.  The original loop left them as
    strings, which would make the later model fitting fail.
    """
    return titles.map(TITLE_MAPPING).fillna(0).astype(int)


def clean_titanic(titanic, verbose=False):
    """Fill missing values and encode ``Sex`` / ``Embarked`` in place.

    * ``Age``: missing values are replaced by the column median.
    * ``Sex``: ``male`` -> 0, ``female`` -> 1.
    * ``Embarked``: missing values are filled with ``'S'``, then
      ``S`` -> 0, ``C`` -> 1, ``Q`` -> 2.

    ``Series.map`` replaces the original masked ``.loc`` assignments so each
    column is rewritten in one step instead of mixing strings and integers in
    the same column.  Values outside the code tables become NaN.

    Args:
        titanic: DataFrame with ``Age``, ``Sex`` and ``Embarked`` columns.
        verbose: when true, print the unique values of the categorical
            columns before and after encoding.

    Returns:
        The same DataFrame object, for convenience.
    """
    titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())

    if verbose:
        print(titanic['Sex'].unique())
    titanic['Sex'] = titanic['Sex'].map(SEX_CODES)
    if verbose:
        print(titanic['Sex'].unique())

    if verbose:
        print(titanic['Embarked'].unique())
    titanic['Embarked'] = titanic['Embarked'].fillna('S').map(EMBARKED_CODES)
    if verbose:
        print(titanic['Embarked'].unique())
    return titanic


def threshold(scores, cutoff=0.5):
    """Turn continuous scores into 0.0 / 1.0 labels (1.0 when > cutoff)."""
    return (np.asarray(scores) > cutoff).astype(float)


def accuracy(predictions, actual):
    """Return the fraction of entries where ``predictions`` equals ``actual``.

    The original expression summed the *values* of the matching predictions,
    so a correctly predicted 0 contributed nothing and the reported figure
    was the share of true positives rather than the accuracy.

    Raises:
        ValueError: if the inputs are empty or differ in shape.
    """
    predictions = np.asarray(predictions)
    actual = np.asarray(actual)
    if predictions.shape != actual.shape:
        raise ValueError("predictions and actual must have the same shape")
    if predictions.size == 0:
        raise ValueError("cannot compute accuracy of empty input")
    return float(np.mean(predictions == actual))


def ensemble_fold_probabilities(algorithms, titanic, train, test,
                                target='Survived'):
    """Fit every model on the train rows and average their test probabilities.

    Args:
        algorithms: iterable of ``(estimator, predictor_columns)`` pairs; each
            estimator must provide ``fit`` and ``predict_proba``.
        titanic: the full DataFrame.
        train, test: positional row indices of the fold.
        target: name of the label column.

    Returns:
        1-D array with the mean of the models' second ``predict_proba``
        column for the test rows.
    """
    train_target = titanic[target].iloc[train]
    per_model = []
    for alg, predictors in algorithms:
        features = titanic[predictors].astype(float)
        alg.fit(features.iloc[train, :], train_target)
        per_model.append(alg.predict_proba(features.iloc[test, :])[:, 1])
    # Average over however many models were supplied (the original hard-coded
    # exactly two).
    return np.mean(per_model, axis=0)


def main(csv_path=TRAIN_CSV_PATH):
    """Run the whole experiment and print each model's score."""
    # Imported here so the helpers above remain importable on machines without
    # scikit-learn / matplotlib.  ``sklearn.model_selection`` replaces the
    # ``sklearn.cross_validation`` module used originally.
    import matplotlib.pyplot as plt
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  RandomForestClassifier)
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.linear_model import LinearRegression, LogisticRegression
    from sklearn.model_selection import KFold, cross_val_score

    titanic = pandas.read_csv(csv_path)
    print(titanic.head())
    clean_titanic(titanic, verbose=True)
    survived = titanic['Survived']

    # ---- Linear regression with manual 3-fold cross validation ----
    predictors = list(BASE_PREDICTORS)
    alg = LinearRegression()
    # No shuffling, as in the original.  random_state is dropped because it
    # has no effect without shuffling.
    kf = KFold(n_splits=3)
    fold_outputs = []
    for train, test in kf.split(titanic):
        alg.fit(titanic[predictors].iloc[train, :], survived.iloc[train])
        test_prediction = alg.predict(titanic[predictors].iloc[test, :])
        print(test_prediction)
        fold_outputs.append(test_prediction)
    # Unshuffled folds are consecutive row ranges, so the concatenation lines
    # up with the row order of ``survived``.
    predictions = threshold(np.concatenate(fold_outputs, axis=0))
    print(accuracy(predictions, survived))

    # ---- Logistic regression ----
    alg = LogisticRegression(random_state=1)
    scores = cross_val_score(alg, titanic[predictors], survived, cv=3)
    print(scores.mean())

    # ---- Random forest ----
    alg = RandomForestClassifier(random_state=1, n_estimators=50,
                                 min_samples_split=4, min_samples_leaf=2)
    scores = cross_val_score(alg, titanic[predictors], survived,
                             cv=KFold(n_splits=3))
    print(scores.mean())

    # ---- Feature engineering ----
    titanic['Familysize'] = titanic['SibSp'] + titanic['Parch']
    titanic['NameLength'] = titanic['Name'].apply(len)
    titles = titanic['Name'].apply(get_title)
    print(titles.value_counts())
    titles = encode_titles(titles)
    print(titles.value_counts())
    titanic['titles'] = titles

    # ---- Univariate feature scores ----
    predictors = list(EXTENDED_PREDICTORS)
    selector = SelectKBest(f_classif, k=5)
    selector.fit(titanic[predictors], survived)
    scores = -np.log10(selector.pvalues_)
    plt.bar(range(len(predictors)), scores)
    plt.xticks(range(len(predictors)), predictors, rotation='vertical')
    plt.show()  # the original referenced plt.show without calling it

    # ---- Ensemble: gradient boosting + logistic regression ----
    algorithms = [
        [GradientBoostingClassifier(random_state=1, n_estimators=25,
                                    max_depth=3),
         list(EXTENDED_PREDICTORS)],
        [LogisticRegression(random_state=1), list(EXTENDED_PREDICTORS)],
    ]
    fold_labels = [
        threshold(ensemble_fold_probabilities(algorithms, titanic, train, test))
        for train, test in KFold(n_splits=3).split(titanic)
    ]
    predictions = np.concatenate(fold_labels, axis=0)
    print(accuracy(predictions, survived))


if __name__ == "__main__":
    main()