Titanic with Multiple Models: A Detailed Walkthrough of the Data Analysis, a Hands-On Project for Machine Learning Beginners
阿新 • Published: 2019-01-10
# Workflow stages
# 1. Problem definition
# 2. Acquire the training and test sets
# 3. Wrangle the data
# 4. Analyze the data
# 5. Model the problem
# 6. Visualize the results
# 7. Submit the results
# 1. Problem definition
# Official site
# Imports
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd
# visualization libraries
import seaborn as sns  # seaborn builds on matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
# 2. Load the training and test sets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
combine = [train_df, test_df]  # later wrangling loops mutate both frames in place through this list
# 3. Wrangle the data / 4. Analyze the data
# Feature type analysis:
# print(train_df.columns.values)
# train_df.info()
# print('_'*40)
# test_df.info()
# available features:
# ['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch' 'Ticket' 'Fare' 'Cabin' 'Embarked']
# Seven features are int or float (six in the test set, which lacks Survived); five are strings (object).
# train_df.head()
# train_df.tail()
# Categorical features help split samples into classes and choose the right plots. Are they nominal, ordinal, ratio, or interval based?
# Categorical: Survived, Sex, and Embarked. Ordinal: Pclass.
# Numerical features likewise guide the choice of plots. Are they discrete, continuous, or timeseries based?
# Continuous: Age, Fare. Discrete: SibSp, Parch.
# Mixed data types:
# Ticket is numeric or alphanumeric; Cabin is alphanumeric.
# Features that may contain errors:
# The Name feature may contain errors, since names appear in several forms: titles, round brackets, quotes.
# Features with missing values (counted in the sketch below):
# Training set: Cabin > Age > Embarked contain nulls, in that order.
# Test set: Cabin > Age are incomplete.
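# A quick way to confirm these gaps is to count nulls per column; a minimal sketch, assuming train_df and test_df are loaded as above:
# print(train_df.isnull().sum().sort_values(ascending=False).head(3))
# print(test_df.isnull().sum().sort_values(ascending=False).head(3))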
# Distribution analysis:
# Numerical features:
# train_df.describe()
# The sample holds 891 passengers, about 40% of the actual 2,224 aboard the Titanic.
# Survived is a categorical feature with values 0 or 1.
# The sample survival rate is 38%, versus an actual rate of about 32%.
# More than 75% of passengers traveled without parents or children.
# Nearly 30% traveled with siblings and/or a spouse.
# Fewer than 1% paid a fare as high as $512.
# Fewer than 1% are elderly, aged 65-80.
# Categorical features:
# train_df.describe(include=['O'])
# Names are unique across the dataset.
# Sex is about 65% male.
# Cabin values repeat: several passengers shared a cabin.
# Embarked takes three values; most passengers boarded at port S.
# Ticket has about a 22% duplication rate (reproduced in the sketch below).
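# These ratios can be reproduced directly; a small sketch, not part of the original notebook:
# dup_rate = 1 - train_df['Ticket'].nunique() / len(train_df)   # share of duplicated tickets, roughly 22%
# male_share = (train_df['Sex'] == 'male').mean()               # roughly 65%
# print('Ticket duplication: {:.0%}, male share: {:.0%}'.format(dup_rate, male_share))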
# Assumptions based on the data analysis
# Before committing to a solution, note which features look related to Survival (screened numerically in the sketch after this list).
# Completing:
# Age is clearly relevant and should be completed.
# Embarked is relevant, directly or through another important feature, and should be completed.
# Correcting:
# Ticket may be dropped: its duplication rate is about 22%, and it may not correlate with survival.
# Cabin may be dropped: it is highly incomplete in both the training and test sets.
# PassengerId is clearly irrelevant and can be dropped.
# Name is recorded too inconsistently to be standardized, so it may not contribute directly to the outcome.
# Creating:
# We may want a new Family feature counting the family members (parents, children, siblings, spouse) on board.
# We may want a new Title feature extracted from Name.
# We may want to band Age, turning a continuous numerical feature into an ordinal categorical one.
# We may also want a Fare range feature.
# Classifying:
# Women were more likely to survive.
# Children (Age < ?) were more likely to survive.
# First-class (Pclass=1) passengers were more likely to survive.
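# A quick numeric screen of these hypotheses is to correlate the numeric columns with Survived; a sketch (Sex and Embarked are still strings at this point, so they are excluded):
# numeric = train_df.select_dtypes(include=[np.number])
# print(numeric.corr()['Survived'].sort_values(ascending=False))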
# Pivoting feature analysis
# To verify the observations and assumptions, quickly pivot each feature against Survived on its own (the loop sketch after these pivots generalizes the pattern):
# Pclass: Pclass=1 correlates significantly (>0.5) with Survived.
# train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# Sex: Sex=female survives at a rate above 74%.
# train_df[["Sex", "Survived"]].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# SibSp and Parch show no consistent pattern across their values; it may be better to derive a new feature or set of features from them.
# train_df[["SibSp", "Survived"]].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# train_df[["Parch", "Survived"]].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# Visual data analysis:
# Keep validating the assumptions with plots, starting with histograms.
#@ Numerical feature histograms:
# Age histogram:
# g = sns.FacetGrid(train_df, col='Survived')
# g.map(plt.hist, 'Age', bins=20)
# Observations:
# Children (Age < 4) had a high survival rate.
# The oldest passenger (Age = 80) survived.
# A large share of 15-25 year-olds did not survive.
# Most passengers are 15-35 years old.
# Decisions:
# Keep the Age feature in the model.
# Complete the nulls in Age.
# Band Age into groups.
#@ Histograms across numerical and ordinal features:
# Pclass and Age:
# alternative: grid = sns.FacetGrid(train_df, col='Pclass', hue='Survived')
# grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)  # height= replaces the older size= argument
# grid.map(plt.hist, 'Age', alpha=.5, bins=20)
# grid.add_legend()
# Observations:
# Pclass=3 had the most passengers, but most did not survive.
# Most children in Pclass=1 and Pclass=2 survived.
# Most passengers in Pclass=1 survived.
# Decisions:
# Keep the Pclass feature in the model.
#@ Correlating categorical features:
# Point plot by category (the source calls this a histogram, but the code draws point plots):
# alternative: grid = sns.FacetGrid(train_df, col='Embarked')
# grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)  # height= replaces the older size= argument
# grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep')  # newer seaborn may require keyword x=/y=/hue= arguments
# grid.add_legend()
# Observations:
# Women were far more likely to survive.
# The exception is Embarked=C, where men survived at a higher rate. This likely reflects a correlation between Pclass and Embarked, and in turn between Pclass and Survived, rather than a direct effect of Embarked on Survived.
# Men survived at a higher rate in Pclass=3 than in Pclass=2 for the C and Q ports.
# Ports of embarkation show varying survival rates within Pclass=3.
# Decisions:
# Keep the Sex feature in the model.
# Complete the Embarked feature and add it to the model.
# Correlating categorical and numerical features:
# Consider Embarked (categorical non-numeric), Sex (categorical non-numeric), and Fare (numeric continuous) together with Survived (categorical numeric):
# alternative: grid = sns.FacetGrid(train_df, col='Embarked', hue='Survived', palette={0: 'k', 1: 'w'})
# grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)  # height= replaces the older size= argument
# grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None)  # newer seaborn (0.12+) replaces ci=None with errorbar=None
# grid.add_legend()
# Observations:
# Passengers who paid higher fares were more likely to survive (sanity-checked in the sketch below).
# The port of embarkation correlates with survival rates.
# Decisions:
# Keep the Fare feature in the model (banded later).
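# The fare observation is easy to sanity-check numerically; a sketch (median rather than mean, to blunt the $512 outliers):
# print(train_df.groupby('Survived')['Fare'].median())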
# Wrangle the data:
#@ Drop irrelevant features:
# Per the assumptions and checks above, drop the Cabin and Ticket features:
# print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)
# train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
# test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)
# combine = [train_df, test_df]
# "After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape
#@ Create new features from existing ones:
# Before dropping Name and PassengerId, check whether a Title extracted from Name relates to Survival.
# Extract Title with a regular expression (the raw string avoids an invalid-escape warning):
# for dataset in combine:
#     dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
# pd.crosstab(train_df['Title'], train_df['Sex'])
# Fold the rare titles into a single Rare category and normalize variants:
# for dataset in combine:
#     dataset['Title'] = dataset['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', \
#         'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
#     dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
#     dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
#     dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
# train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()
# Map Title to ordinal values:
# title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
# for dataset in combine:
#     dataset['Title'] = dataset['Title'].map(title_mapping)
#     dataset['Title'] = dataset['Title'].fillna(0)
# train_df.head()
# Drop the Name feature, and PassengerId from the training set:
# train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
# test_df = test_df.drop(['Name'], axis=1)
# combine = [train_df, test_df]
# train_df.shape, test_df.shape
# Plotting Title against Age and Survived (plot omitted in the source; see the sketch below) suggests:
# Observations:
# Most titles band the Age groups similarly.
# Survival varies slightly across Title and Age bands.
# Certain titles mostly survived (Mme, Lady, Sir); others mostly did not (Don, Rev, Jonkheer).
# Decisions:
# Keep the Title feature in the model.
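# The Title plot itself is not shown in the source; one way to approximate it at this point, a sketch (Title has already been mapped to the integers 1-5 above):
# sns.barplot(x='Title', y='Survived', data=train_df)
# plt.show()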
#@ Convert string features to numeric:
# Convert Sex:
# for dataset in combine:
#     dataset['Sex'] = dataset['Sex'].map( {'female': 1, 'male': 0} ).astype(int)
# train_df.head()
#@ Complete missing values:
# Three ways to complete a continuous numerical feature:
# 1. Simple: draw a random number between the mean ± standard deviation.
# 2. More accurate: guess the missing value from correlated features; here, use the median Age for each combination of Pclass and Sex.
# 3. Combine 1 and 2: per Pclass/Sex combination, draw a random number around the median within the standard deviation.
# Methods 1 and 3 would inject random noise into the model, so repeated runs could differ. We therefore use method 2:
# alternative: grid = sns.FacetGrid(train_df, col='Pclass', hue='Gender')  # leftover from the source; the column is actually named 'Sex'
# grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', height=2.2, aspect=1.6)  # height= replaces the older size= argument
# grid.map(plt.hist, 'Age', alpha=.5, bins=20)
# grid.add_legend()
# Prepare an empty array to hold the Age guess for each Sex x Pclass cell:
# guess_ages = np.zeros((2,3))
# guess_ages
# Iterate over Sex (0, 1) and Pclass (1, 2, 3) to compute the Age guesses, then fill the nulls:
# for dataset in combine:
#     for i in range(0, 2):
#         for j in range(0, 3):
#             guess_df = dataset[(dataset['Sex'] == i) & \
#                                (dataset['Pclass'] == j+1)]['Age'].dropna()
#             # age_mean = guess_df.mean()
#             # age_std = guess_df.std()
#             # age_guess = rnd.uniform(age_mean - age_std, age_mean + age_std)
#             age_guess = guess_df.median()
#             # Round the float guess to the nearest 0.5 year
#             guess_ages[i,j] = int( age_guess/0.5 + 0.5 ) * 0.5
#     for i in range(0, 2):
#         for j in range(0, 3):
#             dataset.loc[ (dataset.Age.isnull()) & (dataset.Sex == i) & (dataset.Pclass == j+1),\
#                         'Age'] = guess_ages[i,j]
#     dataset['Age'] = dataset['Age'].astype(int)
# train_df.head()
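# A quick check that no Age gaps remain after the fill (a sketch):
# assert train_df['Age'].isnull().sum() == 0
# assert test_df['Age'].isnull().sum() == 0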
# Create an AgeBand feature and check its relation to Survival:
# train_df['AgeBand'] = pd.cut(train_df['Age'], 5)
# train_df[['AgeBand', 'Survived']].groupby(['AgeBand'], as_index=False).mean().sort_values(by='AgeBand', ascending=True)
# Replace Age with ordinals based on these bands:
# for dataset in combine:
#     dataset.loc[ dataset['Age'] <= 16, 'Age'] = 0
#     dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
#     dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
#     dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
#     dataset.loc[ dataset['Age'] > 64, 'Age'] = 4  # the source omitted '= 4'; without it this statement has no effect
# train_df.head()
# Now drop AgeBand:
# train_df = train_df.drop(['AgeBand'], axis=1)
# combine = [train_df, test_df]
# train_df.head()
#@ Create new features by combining existing ones:
# Create FamilySize from Parch and SibSp; this will let us drop Parch and SibSp:
# for dataset in combine:
#     dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
# train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)
# We can then create an IsAlone feature:
# for dataset in combine:
#     dataset['IsAlone'] = 0
#     dataset.loc[dataset['FamilySize'] == 1, 'IsAlone'] = 1
# train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()
# Drop SibSp, Parch, and FamilySize in favor of IsAlone:
# train_df = train_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
# test_df = test_df.drop(['Parch', 'SibSp', 'FamilySize'], axis=1)
# combine = [train_df, test_df]
# train_df.head()
# We can likewise create an artificial Age*Pclass feature:
# for dataset in combine:
#     dataset['Age*Class'] = dataset.Age * dataset.Pclass
# train_df.loc[:, ['Age*Class', 'Age', 'Pclass']].head(10)
#@ Complete the categorical features:
# Embarked has two missing values; fill them with the most frequent port:
# freq_port = train_df.Embarked.dropna().mode()[0]
# freq_port
# Fill:
# for dataset in combine:
#     dataset['Embarked'] = dataset['Embarked'].fillna(freq_port)
# train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean().sort_values(by='Survived', ascending=False)
#@ Convert categorical features to numeric:
# for dataset in combine:
#     dataset['Embarked'] = dataset['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int)
# train_df.head()
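# Note that mapping ports to 0/1/2 imposes an ordering that S, C, Q do not really have; an unordered alternative (not used in this walkthrough) is one-hot encoding, e.g.:
# pd.get_dummies(train_df, columns=['Embarked'], prefix='Port').head()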
#@ Quickly complete and convert a numeric feature:
# Fill the single missing Fare value in the test set with the median fare (no new feature, no elaborate guessing, just one value):
# test_df['Fare'].fillna(test_df['Fare'].dropna().median(), inplace=True)
# test_df.head()
# Create a FareBand feature from fare quartiles:
# train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
# train_df[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)
# Convert Fare to ordinals based on FareBand:
# for dataset in combine:
#     dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0
#     dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1
#     dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare'] = 2
#     dataset.loc[ dataset['Fare'] > 31, 'Fare'] = 3
#     dataset['Fare'] = dataset['Fare'].astype(int)
# train_df = train_df.drop(['FareBand'], axis=1)
# combine = [train_df, test_df]
# train_df.head(10)
# The wrangled data:
# test_df.head(10)
# Model, predict, solve:
# This is a supervised classification problem. Candidate models include:
# Logistic Regression
# KNN, or k-Nearest Neighbors
# Support Vector Machines
# Naive Bayes classifier
# Decision Tree
# Random Forest
# Perceptron
# Artificial neural network
# RVM, or Relevance Vector Machine
# X_train = train_df.drop("Survived", axis=1)
# Y_train = train_df["Survived"]
# X_test = test_df.drop("PassengerId", axis=1).copy()
# X_train.shape, Y_train.shape, X_test.shape
# Logistic regression: models the relationship between a categorical dependent variable (the outcome, here Survived) and one or more independent variables (the features) by estimating probabilities with the logistic function.
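# In symbols (standard logistic form, not from the source), the model estimates
# P(Survived = 1 | x) = 1 / (1 + exp(-(b0 + b1*x1 + ... + bn*xn)))
# so each coefficient bi shifts the log-odds of survival up or down.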
# # Logistic Regression
# logreg = LogisticRegression()
# logreg.fit(X_train, Y_train)
# Y_pred = logreg.predict(X_test)
# acc_log = round(logreg.score(X_train, Y_train) * 100, 2)
# acc_log
# 80.36
# Logistic regression also lets us validate the assumptions: the sign of each coefficient tells us whether a feature pushes the predicted survival odds up or down (with Sex coded female=1, a positive Sex coefficient means being female raises the odds).
# coeff_df = pd.DataFrame(train_df.columns.delete(0))
# coeff_df.columns = ['Feature']
# coeff_df["Correlation"] = pd.Series(logreg.coef_[0])
# coeff_df.sort_values(by='Correlation', ascending=False)
# Support Vector Machines (SVM):
# # Support Vector Machines
# svc = SVC()
# svc.fit(X_train, Y_train)
# Y_pred = svc.predict(X_test)
# acc_svc = round(svc.score(X_train, Y_train) * 100, 2)
# acc_svc
# 83.84
# k-Nearest Neighbors (KNN):
# knn = KNeighborsClassifier(n_neighbors = 3)
# knn.fit(X_train, Y_train)
# Y_pred = knn.predict(X_test)
# acc_knn = round(knn.score(X_train, Y_train) * 100, 2)
# acc_knn
# 84.74
# Naive Bayes classifier:
# # Gaussian Naive Bayes
# gaussian = GaussianNB()
# gaussian.fit(X_train, Y_train)
# Y_pred = gaussian.predict(X_test)
# acc_gaussian = round(gaussian.score(X_train, Y_train) * 100, 2)
# acc_gaussian
# 72.28
# Perceptron:
# # Perceptron
# perceptron = Perceptron()
# perceptron.fit(X_train, Y_train)
# Y_pred = perceptron.predict(X_test)
# acc_perceptron = round(perceptron.score(X_train, Y_train) * 100, 2)
# acc_perceptron
# 78.0
# # Linear SVC
# linear_svc = LinearSVC()
# linear_svc.fit(X_train, Y_train)
# Y_pred = linear_svc.predict(X_test)
# acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)
# acc_linear_svc
# 79.01
# # Stochastic Gradient Descent
# sgd = SGDClassifier()
# sgd.fit(X_train, Y_train)
# Y_pred = sgd.predict(X_test)
# acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)
# acc_sgd
# 77.33
# Decision Tree:
# # Decision Tree
# decision_tree = DecisionTreeClassifier()
# decision_tree.fit(X_train, Y_train)
# Y_pred = decision_tree.predict(X_test)
# acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)
# acc_decision_tree
# 86.76
# Random Forest:
# # Random Forest
# random_forest = RandomForestClassifier(n_estimators=100)
# random_forest.fit(X_train, Y_train)
# Y_pred = random_forest.predict(X_test)
# acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)
# acc_random_forest
# Model evaluation:
# models = pd.DataFrame({
# 'Model': ['Support Vector Machines', 'KNN', 'Logistic Regression',
# 'Random Forest', 'Naive Bayes', 'Perceptron',
# 'Stochastic Gradient Decent', 'Linear SVC',
# 'Decision Tree'],
# 'Score': [acc_svc, acc_knn, acc_log,
# acc_random_forest, acc_gaussian, acc_perceptron,
# acc_sgd, acc_linear_svc, acc_decision_tree]})
# models.sort_values(by='Score', ascending=False)
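# Caution: every score above is training-set accuracy, which rewards memorization (note the tree-based models on top). A fairer comparison is k-fold cross-validation; a minimal sketch, not in the original:
# from sklearn.model_selection import cross_val_score
# for name, model in [('LogReg', LogisticRegression()),
#                     ('RandomForest', RandomForestClassifier(n_estimators=100))]:
#     scores = cross_val_score(model, X_train, Y_train, cv=5)
#     print(name, round(scores.mean() * 100, 2))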
# Save the submission:
# submission = pd.DataFrame({
# "PassengerId": test_df["PassengerId"],
# "Survived": Y_pred
# })
# # submission.to_csv('../output/submission.csv', index=False)  # uncomment to write the file for Kaggle