
Titanic, Multi-Model Version: A Detailed Walkthrough of the Data Analysis (a Hands-On Project for Machine Learning Beginners)

# Workflow stages
#   1. Define the problem
#   2. Acquire the training and test data
#   3. Wrangle the data
#   4. Analyze the data
#   5. Model to solve the problem
#   6. Visualize the results
#   7. Submit the results



# 1. Define the problem
    # See the competition's official page

# Imports
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization libraries
import seaborn as sns    # seaborn is built on top of matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
# 2. Acquire the data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# Keeping both frames in one list lets every later transform be applied to both at once.
combine = [train_df, test_df]

# 3. Wrangle the data / 4. Analyze the data

# Feature-type analysis:
print(train_df.columns.values)
train_df.info()
print('_'*40)
test_df.info()
# Available features:
# ['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
#  'Ticket' 'Fare' 'Cabin' 'Embarked']
# Seven features are int or float (only six in the test set); five are strings.
train_df.head()
train_df.tail()
# Categorical features help split the samples into classes and pick the right
# visualization. Are they nominal, ordinal, ratio, or interval based?
#   Categorical: Survived, Sex, Embarked. Ordinal: Pclass.
# Numerical features also guide the choice of visualization. Are they discrete,
# continuous, or timeseries based?
#   Continuous: Age, Fare. Discrete: SibSp, Parch.
# Mixed features (mixed data types):
#   Ticket is numeric or alphanumeric; Cabin is alphanumeric.
# Features that may contain errors:
#   Name may contain errors, since names are written in several styles: titles,
#   round brackets, quotes.
# Features with blanks:
#   Training set: Cabin > Age > Embarked contain nulls.
#   Test set: Cabin > Age are incomplete.

# Distribution analysis:
# Numerical features:
train_df.describe()
# 891 samples in total, roughly 40% of the passengers actually aboard the Titanic.
# Survived is a categorical feature with values 0 or 1.
# The sample survival rate is 38%, versus an actual rate of about 32%.
# More than 75% of passengers traveled without parents or children.
# About 30% of passengers had siblings and/or a spouse aboard.
# Fewer than 1% of passengers paid a fare as high as $512.
# Fewer than 1% were elderly (age 65-80).
# Categorical features:
train_df.describe(include=['O'])
# Names are unique across the dataset.
# Sex: 65% male.
# Cabin values repeat; several passengers shared a cabin.
# Embarked takes three values; most passengers boarded at port S.
# Ticket has a 22% duplicate rate.

# Hypotheses based on the analysis
# Before committing to a solution, work out which features relate to Survival.
# Completing:
#   Age is almost certainly related and should be completed.
#   Embarked may be related, directly or through another important feature.
# Correcting:
#   Ticket is probably irrelevant, given its 22% duplicate rate.
#   Cabin is probably irrelevant: it is mostly null in both the training and test sets.
#   PassengerId is clearly irrelevant.
#   Name is written too inconsistently to standardize, and may not contribute directly.
# Creating:
#   We may want a new feature Family, based on the number of parents/children and
#   siblings/spouses aboard.
#   We may want a new feature extracted from the title inside Name.
#   We may want a new feature that turns the continuous Age into an ordinal
#   categorical feature.
#   We may want a new Fare range feature.
# Classifying:
#   Women were probably more likely to survive.
#   Children (Age < ?) were probably more likely to survive.
#   First-class passengers were probably more likely to survive.

# Analysis by pivoting features
# To verify these observations and guesses, quickly pivot each feature on its own.
# Pclass: survival is above 50% for Pclass=1.
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# Sex: survival is above 74% for Sex=female.
train_df[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# SibSp and Parch show no clear pattern on their own; a feature derived from them (or
# from a combination of features) may work better.
train_df[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)
train_df[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)
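# An aside (not in the original post): the four pivots above all follow the same
# pattern, so they can be produced in one loop; `pivot_cols` is just an illustrative name.
pivot_cols = ['Pclass', 'Sex', 'SibSp', 'Parch']
for col in pivot_cols:
    print(train_df[[col, 'Survived']]
          .groupby([col], as_index=False).mean()
          .sort_values(by='Survived', ascending=False))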
# Analysis by visualizing data:
# We keep verifying our assumptions with visualizations, starting with histograms.
#@ Histograms of numerical features:
# Age histogram:
g = sns.FacetGrid(train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)
# Observations:
#   Infants (Age < 4) had a high survival rate.
#   The oldest passenger (Age = 80) survived.
#   A large number of 15-25 year-olds did not survive.
#   Most passengers were 15-35 years old.
# Decisions:
#   Consider Age in the model.
#   Complete the null values of Age.
#   Band Age into groups.

#@ Combined histograms of numerical and ordinal features:
# Pclass and Age:
# grid = sns.FacetGrid(train_df, col='Pclass', hue='Survived')
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)  # note: newer seaborn renames size= to height=
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()
# Observations:
#   Pclass=3 carried the most passengers, but most of them died.
#   Most children in Pclass=1 and Pclass=2 survived.
#   Most passengers in Pclass=1 survived.
# Decision:
#   Consider Pclass in the model.

#@ Correlating categorical features:
# grid = sns.FacetGrid(train_df, col='Embarked')
grid = sns.FacetGrid(train_df, row='Embarked', size=2.2, aspect=1.6)
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')
grid.add_legend()
# Observations:
#   Women were far more likely to survive.
#   Exception: Embarked=C, where men survived at a higher rate. This looks like a
#   correlation between Pclass and Embarked, and in turn Pclass and Survived, rather
#   than a direct Embarked-Survived effect.
#   Men in Pclass=3 survived at a higher rate than in Pclass=2 for ports C and Q.
#   For Pclass=3, the ports show varying survival rates.
# Decisions:
#   Consider Sex in the model.
#   Complete Embarked and add it to the model.

#@ Correlating categorical and numerical features:
# Consider Embarked (categorical non-numeric), Sex (categorical non-numeric) and
# Fare (numeric continuous) together with Survived (categorical numeric):
# grid = sns.FacetGrid(train_df, col='Embarked', hue='Survived', palette={0: 'k', 1: 'w'})
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', size=2.2, aspect=1.6)
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None)
grid.add_legend()
# Observations:
#   Passengers who paid higher fares were more likely to survive.
#   The port of embarkation correlates with survival.
# Decision:
#   Consider the Fare feature.

# Wrangling the data:
#@ Dropping irrelevant features:
# Following the assumptions verified above, drop Cabin and Ticket:
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)
train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
combine = [train_df, test_df]
"After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape

#@ Creating a new feature from an existing one:
# Before dropping Name and PassengerId, check whether the title inside Name relates to
# Survival.
# Extract Title with a regular expression:
for dataset in combine:
    dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
pd.crosstab(train_df['Title'], train_df['Sex'])
# Collapse the rare titles into a shared label and normalize the variants:
for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
                                                 'Don', 'Dr', 'Major', 'Rev', 'Sir',
                                                 'Jonkheer', 'Dona'], 'Rare')
    dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
    dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
# Convert Title to an ordinal feature:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['Title'] = dataset['Title'].map(title_mapping)
    dataset['Title'] = dataset['Title'].fillna(0)
train_df.head()
# Drop Name and PassengerId (the test set keeps PassengerId for the submission file):
train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
test_df = test_df.drop(['Name'], axis=1)
combine = [train_df, test_df]
train_df.shape, test_df.shape
# Plotting Title against Age and Survived (a sketch follows below) shows:
# Observations:
#   Most titles band the ages in a similar way.
#   Survival varies slightly across Title and Age bands.
#   Certain titles mostly survived (Mme, Lady, Sir), others mostly did not
#   (Don, Rev, Jonkheer).
# Decision:
#   Keep Title for the model.
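# A minimal sketch of the Title/Age/Survived plot referenced above (the original post
# does not include its code). The observations mention raw titles like Mme and Sir, so
# the plot was presumably drawn before consolidating titles; at this point in the
# pipeline Title is already the ordinal 0-5, which is what this sketch uses.
grid = sns.FacetGrid(train_df, col='Title', hue='Survived')
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()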
#@ Converting string features to numeric:
# Convert Sex:
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)
train_df.head()

#@ Completing null values:
# There are three ways to complete a continuous numerical feature:
#   1. Simple: draw a random number within one standard deviation of the mean.
#   2. Accurate: guess the missing value from correlated features. Here, use the median
#      Age for each combination of Pclass and Gender.
#   3. Combine 1 and 2: draw a random number between the median and the deviation, per
#      combination of Pclass and Gender.
# Methods 1 and 3 would introduce random noise into the model, so repeated runs could
# give different results. We therefore use method 2.
# grid = sns.FacetGrid(train_df, col='Pclass', hue='Gender')   # earlier draft; this dataset's column is 'Sex'
grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()
# Prepare an empty array to hold the guessed Age values:
guess_ages = np.zeros((2, 3))
guess_ages
# Iterate over Sex (0, 1) and Pclass (1, 2, 3) to compute the six Age guesses:
for dataset in combine:
    for i in range(0, 2):
        for j in range(0, 3):
            guess_df = dataset[(dataset['Sex'] == i) &
                               (dataset['Pclass'] == j+1)]['Age'].dropna()
            # age_mean = guess_df.mean()
            # age_std = guess_df.std()
            # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)
            age_guess = guess_df.median()
            # Round the guessed age to the nearest 0.5:
            guess_ages[i, j] = int(age_guess/0.5 + 0.5) * 0.5
    for i in range(0, 2):
        for j in range(0, 3):
            dataset.loc[(dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),
                        'Age'] = guess_ages[i, j]
    dataset['Age'] = dataset['Age'].astype(int)
train_df.head()
# Create age bands and check how they correlate with Survival:
train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
# Replace Age with ordinals based on the bands:
for dataset in combine:
    dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    dataset.loc[ dataset['Age'] > 64, 'Age'] = 4  # the original omitted "= 4", leaving ages above 64 unbanded
train_df.head()
# AgeBand has served its purpose and can be removed:
train_df = train_df.drop(['AgeBand'], axis=1)
combine = [train_df, test_df]
train_df.head()
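# For completeness, a minimal sketch of method 1 described above (shown for
# illustration only: by this point Age has already been completed with method 2 and
# banded, so running it here would change nothing). Each missing Age is drawn uniformly
# within one standard deviation of the mean, which is exactly the random noise that
# motivated choosing method 2.
for dataset in combine:
    age_mean = dataset['Age'].mean()
    age_std = dataset['Age'].std()
    null_count = dataset['Age'].isnull().sum()
    # one uniform draw per missing entry, inside mean +/- one std
    dataset.loc[dataset['Age'].isnull(), 'Age'] = np.random.uniform(
        age_mean - age_std, age_mean + age_std, size=null_count)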
#@ Creating new features by combining existing ones:
# Create FamilySize from Parch and SibSp, which will let us drop both:
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# From FamilySize we can derive IsAlone:
for dataset in combine:
    dataset['IsAlone'] = 0
    dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
# Drop SibSp, Parch and FamilySize in favor of IsAlone:
train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
combine = [train_df, test_df]
train_df.head()
# We can also create an artificial feature Age*Pclass:
for dataset in combine:
    dataset['Age*Class'] = dataset.Age * dataset.Pclass
train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)

#@ Completing categorical features:
# Embarked has two missing values; fill them with the most frequent port:
freq_port = train_df.Embarked.dropna().mode()[0]
freq_port
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)

#@ Converting categorical features to numeric:
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)
train_df.head()

#@ Quickly completing and converting a numeric feature:
# Fill the single missing Fare in the test set with the median (no new feature and no
# elaborate guessing, just one fill value):
test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
test_df.head()
# Create FareBand (pd.qcut splits on quantiles, giving equal-frequency bands; compare
# pd.cut above, which split Age into equal-width bands -- see the sketch below):
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)
# Convert Fare to ordinals based on FareBand:
for dataset in combine:
    dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
    dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
    dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
train_df = train_df.drop(['FareBand'], axis=1)
combine = [train_df, test_df]
train_df.head(10)
# The wrangled data:
test_df.head(10)
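# An aside (not in the original post) on why Age used pd.cut while Fare used pd.qcut:
# cut makes equal-width bins, qcut makes equal-frequency bins, which suits a heavily
# skewed distribution such as Fare. A toy series makes the difference visible:
demo = pd.Series([1, 2, 3, 4, 5, 100])   # hypothetical skewed values
pd.cut(demo, 2).value_counts()           # equal width: five values land in the lower bin
pd.qcut(demo, 2).value_counts()          # equal frequency: three values per bin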
# Model, predict, and solve:
# This is a classification and regression problem, tackled with supervised learning.
# Candidate models include:
#   Logistic Regression
#   KNN (k-Nearest Neighbors)
#   Support Vector Machines
#   Naive Bayes classifier
#   Decision Tree
#   Random Forest
#   Perceptron
#   Artificial neural network
#   RVM (Relevance Vector Machine)
X_train = train_df.drop("Survived", axis=1)
Y_train = train_df["Survived"]
X_test = test_df.drop("PassengerId", axis=1).copy()
X_train.shape, Y_train.shape, X_test.shape

# Logistic Regression: models the relationship between the categorical dependent
# variable (the outcome) and one or more independent variables through the logistic
# function.
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
acc_log
# 80.36
# Logistic regression also lets us validate our assumptions: the sign of each
# coefficient tells us whether a feature contributes positively or negatively to
# survival.
coeff_df = pd.DataFrame(train_df.columns.delete(0))
coeff_df.columns = ['Feature']
coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
coeff_df.sort_values(by='Correlation', ascending=False)

# Support Vector Machines:
svc = SVC()
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)
acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
acc_svc
# 83.84

# k-Nearest Neighbors:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train)
Y_pred = knn.predict(X_test)
acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
acc_knn
# 84.74

# Gaussian Naive Bayes:
gaussian = GaussianNB()
gaussian.fit(X_train, Y_train)
Y_pred = gaussian.predict(X_test)
acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
acc_gaussian
# 72.28

# Perceptron:
perceptron = Perceptron()
perceptron.fit(X_train, Y_train)
Y_pred = perceptron.predict(X_test)
acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
acc_perceptron
# 78.0

# Linear SVC:
linear_svc = LinearSVC()
linear_svc.fit(X_train, Y_train)
Y_pred = linear_svc.predict(X_test)
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
acc_linear_svc
# 79.01

# Stochastic Gradient Descent:
sgd = SGDClassifier()
sgd.fit(X_train, Y_train)
Y_pred = sgd.predict(X_test)
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
acc_sgd
# 77.33

# Decision Tree:
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, Y_train)
Y_pred = decision_tree.predict(X_test)
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
acc_decision_tree
# 86.76

# Random Forest:
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)
random_forest.score(X_train, Y_train)
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
acc_random_forest

# Model evaluation (note these are training-set accuracies):
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
              'Random Forest', 'Naive Bayes', 'Perceptron',
              'Stochastic Gradient Descent', 'Linear SVC',
              'Decision Tree'],
    'Score': [acc_svc, acc_knn, acc_log,
              acc_random_forest, acc_gaussian, acc_perceptron,
              acc_sgd, acc_linear_svc, acc_decision_tree]})
models.sort_values(by='Score', ascending=False)

# Save the submission (Y_pred here is the random forest prediction, the last model fit):
submission = pd.DataFrame({
        "PassengerId": test_df["PassengerId"],
        "Survived": Y_pred
    })
# submission.to_csv('../output/submission.csv', index=False)
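# A closing caveat (an addition, not from the original post): the scores in the table
# above are accuracies on the training data, so models that can memorize it, such as an
# unconstrained decision tree or random forest, look better than they will generalize.
# A quick held-out estimate via cross-validation:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(random_forest, X_train, Y_train, cv=5)
print(cv_scores.mean(), cv_scores.std())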