Titanic -----5
阿新 • • 發佈:2019-01-12
# Data analysis and wrangling
import numpy as np
import pandas as pd
# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the training and test splits from the working directory.
train_df, test_df = (pd.read_csv(path) for path in ('./train.csv', './test.csv'))
# Keep both frames in one list so later cells can transform them in a single loop.
combine = [train_df, test_df]
print(train_df.columns)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'], dtype='object')
# Preview the first rows of the training set.
train_df.head()
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Print column dtypes and non-null counts for both splits.
train_df.info()
print('_'*40)  # visual separator between the two summaries
test_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 332 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 417 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
# Summary statistics for the numeric columns of the training set.
train_df.describe()
PassengerId | Survived | Pclass | Age | SibSp | Parch | Fare | |
---|---|---|---|---|---|---|---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
# Summary (count / unique / top / freq) for the object-typed columns.
train_df.describe(include='O')
Name | Sex | Ticket | Cabin | Embarked | |
---|---|---|---|---|---|
count | 891 | 891 | 891 | 204 | 889 |
unique | 891 | 2 | 681 | 147 | 3 |
top | Laleff, Mr. Kristo | male | 1601 | C23 C25 C27 | S |
freq | 1 | 577 | 7 | 4 | 644 |
# Mean survival rate per passenger class, highest first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Pclass | Survived | |
---|---|---|
0 | 1 | 0.629630 |
1 | 2 | 0.472826 |
2 | 3 | 0.242363 |
# Mean survival rate per sex, indexed by 'Sex'.
# The column is selected with a list: tuple-style selection on a GroupBy
# (['Sex', 'Survived']) is rejected by pandas >= 2.0, and 'Sex' is already the
# group key, so only 'Survived' needs to be aggregated.
train_df.groupby(['Sex'])[['Survived']].mean()
Survived | |
---|---|
Sex | |
female | 0.742038 |
male | 0.188908 |
# Mean survival rate per sex as a flat table, highest first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Sex | Survived | |
---|---|---|
0 | female | 0.742038 |
1 | male | 0.188908 |
# Mean survival rate per SibSp value, highest first.
(
    train_df[['SibSp', 'Survived']]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
SibSp | Survived | |
---|---|---|
1 | 1 | 0.535885 |
2 | 2 | 0.464286 |
0 | 0 | 0.345395 |
3 | 3 | 0.250000 |
4 | 4 | 0.166667 |
5 | 5 | 0.000000 |
6 | 8 | 0.000000 |
# Mean survival rate per Parch value, highest first.
(
    train_df[['Parch', 'Survived']]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Parch | Survived | |
---|---|---|
3 | 3 | 0.600000 |
1 | 1 | 0.550847 |
2 | 2 | 0.500000 |
0 | 0 | 0.343658 |
5 | 5 | 0.200000 |
4 | 4 | 0.000000 |
6 | 6 | 0.000000 |
# One age histogram panel per value of 'Survived'.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)  # bins: number of histogram bars
<seaborn.axisgrid.FacetGrid at 0x113396518>
# Age histograms for every (Pclass, Survived) combination.
# `height` replaces the FacetGrid `size` argument, which seaborn renamed in 0.9
# and later removed; passing `size=` fails on current seaborn releases.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
# bins: number of histogram bars; alpha: opacity of the bars
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend()  # draw the legend
<seaborn.axisgrid.FacetGrid at 0x1134c6358>
# Survival rate vs. passenger class, split by sex, one row per embarkation port.
# `height` replaces the FacetGrid `size` argument (renamed in seaborn 0.9, later removed).
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)
# Fix the category order explicitly. Without `order` / `hue_order` each facet
# orders categories by first appearance in its own subset, so positions and
# colours can disagree between facets (the cause of seaborn's UserWarning).
# 'Sex' still holds the raw 'female' / 'male' strings at this point.
grid.map(
    sns.pointplot, 'Pclass', 'Survived', 'Sex',
    palette='deep', order=[1, 2, 3], hue_order=['female', 'male'],
)
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the pointplot function without specifying `order` is likely to produce an incorrect plot.
warnings.warn(warning)
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:708: UserWarning: Using the pointplot function without specifying `hue_order` is likely to produce an incorrect plot.
warnings.warn(warning)
<seaborn.axisgrid.FacetGrid at 0x114cc0f28>
# Mean fare per sex for every (Embarked, Survived) combination.
# `height` replaces the FacetGrid `size` argument (renamed in seaborn 0.9, later removed).
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)
# An explicit `order` keeps the bars in the same position in every facet;
# otherwise each facet orders categories by first appearance in its own subset
# (the cause of seaborn's UserWarning). ci=None turns off the error bars.
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['female', 'male'])
grid.add_legend()
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot.
warnings.warn(warning)
<seaborn.axisgrid.FacetGrid at 0x114b0f7b8>
# Record the frame shapes (direct and via `combine`) before any columns are dropped.
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)
Before (891, 12) (418, 11) (891, 12) (418, 11)
# Remove features that are treated as irrelevant.
unused = ['Ticket', 'Cabin', 'Name']
train_df, test_df = (frame.drop(columns=unused) for frame in (train_df, test_df))
# drop() returned new frames, so rebuild the shared list.
combine = [train_df, test_df]
print("After", *(frame.shape for frame in (train_df, test_df, *combine)))
After (891, 9) (418, 8) (891, 9) (418, 8)
# Convert the categorical feature into a numeric one.
sex_codes = {'female': 1, 'male': 0}
for dataset in combine:
    dataset['Sex'] = dataset['Sex'].map(sex_codes).astype(int)
train_df.head()
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 22.0 | 1 | 0 | 7.2500 | S |
1 | 2 | 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | C |
2 | 3 | 1 | 3 | 1 | 26.0 | 0 | 0 | 7.9250 | S |
3 | 4 | 1 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | S |
4 | 5 | 0 | 3 | 0 | 35.0 | 0 | 0 | 8.0500 | S |
# Numeric feature: missing-value handling.
# 2 x 3 table of age guesses, indexed as [Sex code, Pclass - 1]; starts at zero.
guess_ages = np.zeros(shape=(2, 3))
guess_ages
array([[0., 0., 0.],
[0., 0., 0.]])
# Fill missing ages with the median age of passengers sharing the same Sex and
# Pclass, computed separately for each dataset.
for dataset in combine:
    for sex in range(2):
        for pclass_idx in range(3):
            in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass_idx + 1)
            median_age = dataset.loc[in_group, 'Age'].dropna().median()
            # Round the median to the nearest 0.5.
            guess_ages[sex, pclass_idx] = int(median_age / 0.5 + 0.5) * 0.5
            # Only rows whose age is missing are overwritten, so filling one
            # group never changes the median computed for any group.
            dataset.loc[in_group & dataset['Age'].isnull(), 'Age'] = guess_ages[sex, pclass_idx]
train_df.head()
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 22.0 | 1 | 0 | 7.2500 | S |
1 | 2 | 1 | 1 | 1 | 38.0 | 1 | 0 | 71.2833 | C |
2 | 3 | 1 | 3 | 1 | 26.0 | 0 | 0 | 7.9250 | S |
3 | 4 | 1 | 1 | 1 | 35.0 | 1 | 0 | 53.1000 | S |
4 | 5 | 0 | 3 | 0 | 35.0 | 0 | 0 | 8.0500 | S |
# Turn the continuous value into a categorical feature.
# pd.cut splits the value range into equal-width bins, unlike qcut(), which
# splits so that each bin holds an equal number of values.
train_df['AgeBand'] = pd.cut(train_df['Age'], bins=5)
(
    train_df[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)
AgeBand | Survived | |
---|---|---|
0 | (0.34, 16.336] | 0.550000 |
1 | (16.336, 32.252] | 0.336714 |
2 | (32.252, 48.168] | 0.412844 |
3 | (48.168, 64.084] | 0.434783 |
4 | (64.084, 80.0] | 0.090909 |
# Replace each age with the ordinal code (0-4) of its 16-year-wide band.
# The assignments run from lowest to highest band, so a code written by an
# earlier line (0-3) can never satisfy a later, higher-valued condition.
for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # Fix: the original line only selected these rows and never assigned to
    # them, leaving raw ages above 64 mixed in with the band codes.
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
train_df.head()
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | AgeBand | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 1.0 | 1 | 0 | 7.2500 | S | (16.336, 32.252] |
1 | 2 | 1 | 1 | 1 | 2.0 | 1 | 0 | 71.2833 | C | (32.252, 48.168] |
2 | 3 | 1 | 3 | 1 | 1.0 | 0 | 0 | 7.9250 | S | (16.336, 32.252] |
3 | 4 | 1 | 1 | 1 | 2.0 | 1 | 0 | 53.1000 | S | (32.252, 48.168] |
4 | 5 | 0 | 3 | 0 | 2.0 | 0 | 0 | 8.0500 | S | (32.252, 48.168] |
# Remove the helper AgeBand column; drop() returns a new frame, so the shared
# list is rebuilt to point at it.
train_df = train_df.drop(columns=['AgeBand'])
combine = [train_df, test_df]
train_df.head()
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 1.0 | 1 | 0 | 7.2500 | S |
1 | 2 | 1 | 1 | 1 | 2.0 | 1 | 0 | 71.2833 | C |
2 | 3 | 1 | 3 | 1 | 1.0 | 0 | 0 | 7.9250 | S |
3 | 4 | 1 | 1 | 1 | 2.0 | 1 | 0 | 53.1000 | S |
4 | 5 | 0 | 3 | 0 | 2.0 | 0 | 0 | 8.0500 | S |
# Categorical feature: missing-value handling (only two values are missing, so
# fill them with the most common one).
freq_port = train_df['Embarked'].dropna().mode().iloc[0]  # most frequent value
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].fillna(value=freq_port)
(
    train_df[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
Embarked | Survived | |
---|---|---|
0 | C | 0.553571 |
1 | Q | 0.389610 |
2 | S | 0.339009 |
# Encode the embarkation port as an integer code.
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for dataset in combine:
    dataset['Embarked'] = dataset['Embarked'].map(port_codes).astype(int)
train_df.head()
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 1.0 | 1 | 0 | 7.2500 | 0 |
1 | 2 | 1 | 1 | 1 | 2.0 | 1 | 0 | 71.2833 | 1 |
2 | 3 | 1 | 3 | 1 | 1.0 | 0 | 0 | 7.9250 | 0 |
3 | 4 | 1 | 1 | 1 | 2.0 | 1 | 0 | 53.1000 | 0 |
4 | 5 | 0 | 3 | 0 | 2.0 | 0 | 0 | 8.0500 | 0 |
# Few fares are missing, so fill them with the median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place call works
# on an intermediate object and does not update test_df under pandas
# Copy-on-Write. median() skips missing values by default.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())
# Discretize the fare into four quantile-based bands.
train_df['FareBand'] = pd.qcut(train_df['Fare'], 4)
(
    train_df[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)
FareBand | Survived | |
---|---|---|
0 | (-0.001, 7.91] | 0.197309 |
1 | (7.91, 14.454] | 0.303571 |
2 | (14.454, 31.0] | 0.454955 |
3 | (31.0, 512.329] | 0.581081 |
# Replace each fare with the ordinal code (0-3) of its band.
for dataset in combine:
    # Compare against a snapshot of the raw fares so that codes already
    # written cannot affect the remaining comparisons.
    fare = dataset['Fare'].copy()
    dataset.loc[fare <= 7.91, 'Fare'] = 0
    dataset.loc[(fare > 7.91) & (fare <= 14.454), 'Fare'] = 1
    dataset.loc[(fare > 14.454) & (fare <= 31), 'Fare'] = 2
    dataset.loc[fare > 31, 'Fare'] = 3
    dataset['Fare'] = dataset['Fare'].astype(int)
# Remove the helper FareBand column and refresh the shared list.
train_df = train_df.drop(columns=['FareBand'])
combine = [train_df, test_df]
train_df.head(10)
PassengerId | Survived | Pclass | Sex | Age | SibSp | Parch | Fare | Embarked | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 0 | 1.0 | 1 | 0 | 0 | 0 |
1 | 2 | 1 | 1 | 1 | 2.0 | 1 | 0 | 3 | 1 |
2 | 3 | 1 | 3 | 1 | 1.0 | 0 | 0 | 1 | 0 |
3 | 4 | 1 | 1 | 1 | 2.0 | 1 | 0 | 3 | 0 |
4 | 5 | 0 | 3 | 0 | 2.0 | 0 | 0 | 1 | 0 |
5 | 6 | 0 | 3 | 0 | 1.0 | 0 | 0 | 1 | 2 |
6 | 7 | 0 | 1 | 0 | 3.0 | 0 | 0 | 3 | 0 |
7 | 8 | 0 | 3 | 0 | 0.0 | 3 | 1 | 2 | 0 |
8 | 9 | 1 | 3 | 1 | 1.0 | 0 | 2 | 1 | 0 |
9 | 10 | 1 | 2 | 1 | 0.0 | 1 | 0 | 2 | 1 |
# Try building a new feature: SibSp + Parch + 1.
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'].add(dataset['Parch']).add(1)
(
    train_df[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)
FamilySize | Survived | |
---|---|---|
3 | 4 | 0.724138 |
2 | 3 | 0.578431 |
1 | 2 | 0.552795 |
6 | 7 | 0.333333 |
0 | 1 | 0.303538 |
4 | 5 | 0.200000 |
5 | 6 | 0.136364 |
7 | 8 | 0.000000 |
8 | 11 | 0.000000 |
# Flag rows whose FamilySize is exactly 1 (1 = alone, 0 = otherwise).
for dataset in combine:
    dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype('int64')
(
    train_df[['IsAlone', 'Survived']]
    .groupby(['IsAlone'], as_index=False)
    .mean()
)
IsAlone | Survived | |
---|---|---|