Python 泰坦尼克號資料分析
阿新 • • 發佈:2019-02-20
匯入模組
import numpy as np
import pandas as pd
%matplotlib inline
匯入資料
# Load the training and test splits from CSV files in the working directory.
train = pd.read_csv(filepath_or_buffer="./train.csv")
test = pd.read_csv(filepath_or_buffer="./test.csv")
檢視資料
# Preview the first rows of the training set.
train.head()
# Report the (rows, columns) shape of each dataset, training set first.
for label, frame in (("train data shape:", train), ("test data shape:", test)):
    print(label, frame.shape)
檢視資料空值
# Count the missing values in every column of the training set.
train.isnull().sum()
# Fill the unknown ages with the median age.
# The result is assigned back to the column rather than using inplace=True on
# an attribute-accessed Series, which may operate on a temporary copy and
# leave `train` unchanged.
train["Age"] = train["Age"].fillna(train["Age"].median())
嘗試從性別角度進行生存率分析
# Overall passenger count per sex.
train["Sex"].value_counts()
# Number of men and women among the survivors.
survived = train.loc[train["Survived"] == 1, "Sex"].value_counts()
# Number of men and women among those who did not survive.
dead = train.loc[train["Survived"] == 0, "Sex"].value_counts()
# One row per outcome ("survived", "dead"), one column per sex.
df = pd.DataFrame([survived, dead], index=["survived", "dead"])
繪製柱狀圖
# Grouped bar chart of the counts held in `df`.
df.plot(kind="bar")
繪製生存率百分比圖
# `df` arrives here with "survived"/"dead" as row labels and the sexes as
# columns. Transpose it so each sex is a row and "survived"/"dead" are columns
# that can be selected and divided.
df = df.T
# Total passengers of each sex; shared denominator for both ratios.
group_total = df["survived"] + df["dead"]
df["p_survived"] = df["survived"] / group_total
df["p_dead"] = df["dead"] / group_total
# Stacked bars: for each sex the two shares add up to 1.
df[["p_survived", "p_dead"]].plot.bar(stacked=True)
分析年齡對生存率影響
# Ages of the passengers who survived and of those who did not.
survived = train.loc[train["Survived"] == 1, "Age"]
dead = train.loc[train["Survived"] == 0, "Age"]
# Build one row per outcome, then transpose so "survived" and "dead" become
# columns indexed by passenger.
df = pd.DataFrame([survived, dead], index=["survived", "dead"]).T
# Stacked histogram of the two age columns.
df.plot(kind="hist", stacked=True)
繪製密度圖
# Density (KDE) plot of the columns in `df`, with the x-axis limited to 0-80.
df.plot(kind="kde", xlim=(0, 80))