Titanic -----5

# 資料分析和處理
import numpy as np
import pandas as pd

# 資料視覺化
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load the training and test sets; keep both in one list so that every
# preprocessing step can be applied to each of them in a single loop.
train_df = pd.read_csv(filepath_or_buffer='./train.csv')
test_df = pd.read_csv(filepath_or_buffer='./test.csv')
combine = [train_df, test_df]

# List the column names of the training set to see which features exist.
print(train_df.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

# Preview the first rows of the training set.
train_df.head()

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

# Print dtypes and non-null counts for both sets; a column with fewer
# non-null entries than rows needs missing-value handling.
train_df.info()
print('_'*40)  # visual separator between the two reports
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB

# Summary statistics (count, mean, std, quartiles) of the numeric columns.
train_df.describe()

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

# Summary of the object-dtype columns: count, number of unique values,
# most frequent value and its frequency.
train_df.describe(include='O')

	Name	Sex	Ticket	Cabin	Embarked
count	891	891	891	204	889
unique	891	2	681	147	3
top	Laleff, Mr. Kristo	male	1601	C23 C25 C27	S
freq	1	577	7	4	644

# Mean survival rate per passenger class, highest rate first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

	Pclass	Survived
0	1	0.629630
1	2	0.472826
2	3	0.242363

# Mean survival rate per sex, indexed by Sex.
# The value column is selected with a list (double brackets): subsetting a
# GroupBy with a bare tuple of keys such as ['Sex', 'Survived'] was
# deprecated and is rejected by current pandas.  'Sex' is already the
# grouping key, so only 'Survived' has to be aggregated.
train_df.groupby(['Sex'])[['Survived']].mean()

	Survived
Sex
female	0.742038
male	0.188908

# Mean survival rate per sex as a flat table, highest rate first.
(
    train_df[['Sex', 'Survived']]
    .groupby(['Sex'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

	Sex	Survived
0	female	0.742038
1	male	0.188908

# Mean survival rate per SibSp value, highest rate first.
(
    train_df[['SibSp', 'Survived']]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

	SibSp	Survived
1	1	0.535885
2	2	0.464286
0	0	0.345395
3	3	0.250000
4	4	0.166667
5	5	0.000000
6	8	0.000000

# Mean survival rate per Parch value, highest rate first.
(
    train_df[['Parch', 'Survived']]
    .groupby(['Parch'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

	Parch	Survived
3	3	0.600000
1	1	0.550847
2	2	0.500000
0	0	0.343658
5	5	0.200000
4	4	0.000000
6	6	0.000000

# Age histograms, one panel per value of Survived.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)  # bins: number of histogram bars

<seaborn.axisgrid.FacetGrid at 0x113396518>

png

# Age histograms on a grid with one row per Pclass and one column per
# Survived value.
# `height` is used instead of `size`: seaborn renamed the keyword in 0.9
# and later releases no longer accept `size`.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)  # bins: number of bars; alpha: bar opacity
grid.add_legend()  # draw the figure legend

<seaborn.axisgrid.FacetGrid at 0x1134c6358>

png

# Survival rate against Pclass, one line per sex, one row per port of
# embarkation.
# `height` is used instead of `size` (renamed in seaborn 0.9, later removed).
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)
# FacetGrid.map calls pointplot once per facet on that facet's subset only.
# Without an explicit `order` / `hue_order` each facet derives its own
# category order, so the same colour or x position can stand for different
# categories in different rows (seaborn warns about exactly this).
# Pinning both keeps all facets consistent.  'Sex' still holds the raw
# 'male' / 'female' strings when this cell runs.
grid.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', palette='deep',
         order=[1, 2, 3], hue_order=['male', 'female'])
grid.add_legend()

/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the pointplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)
/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:708: UserWarning: Using the pointplot function without specifying `hue_order` is likely to produce an incorrect plot.
  warnings.warn(warning)





<seaborn.axisgrid.FacetGrid at 0x114cc0f28>

png

# Mean fare per sex on a grid with one row per port of embarkation and one
# column per Survived value.
# `height` is used instead of `size` (renamed in seaborn 0.9, later removed).
grid = sns.FacetGrid(train_df, row='Embarked', col='Survived', height=2.2, aspect=1.6)
# An explicit `order` keeps the bar positions identical in every facet;
# without it each facet orders the categories from its own subset, which is
# what the seaborn warning is about.  'Sex' still holds the raw
# 'male' / 'female' strings when this cell runs.
grid.map(sns.barplot, 'Sex', 'Fare', alpha=.5, ci=None, order=['male', 'female'])
grid.add_legend()

/Users/shenxin/anaconda3/lib/python3.6/site-packages/seaborn/axisgrid.py:703: UserWarning: Using the barplot function without specifying `order` is likely to produce an incorrect plot.
  warnings.warn(warning)





<seaborn.axisgrid.FacetGrid at 0x114b0f7b8>

png

# Record the shapes before any column is dropped; combine[0] / combine[1]
# are the same frames as train_df / test_df.
print("Before", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

Before (891, 12) (418, 11) (891, 12) (418, 11)

# Drop the features that are not used in the rest of the analysis.
train_df = train_df.drop(columns=['Ticket', 'Cabin', 'Name'])
test_df = test_df.drop(columns=['Ticket', 'Cabin', 'Name'])
# drop() returns new frames, so the list is rebuilt to point at them.
combine = [train_df, test_df]
print("After", train_df.shape, test_df.shape, combine[0].shape, combine[1].shape)

After (891, 9) (418, 8) (891, 9) (418, 8)

# Convert the categorical Sex feature into a numeric one
# (female -> 1, male -> 0) in both data sets.
sex_codes = {'female': 1, 'male': 0}
for frame in combine:
    frame['Sex'] = frame['Sex'].map(sex_codes).astype(int)
train_df.head()

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Fare	Embarked
0	1	0	3	0	22.0	1	7.2500	S
1	2	1	1	1	38.0	1	71.2833	C
2	3	1	3	1	26.0	0	7.9250	S
3	4	1	1	1	35.0	1	53.1000	S
4	5	0	3	0	35.0	0	8.0500	S

# Missing-value handling for a numeric feature (Age).
# Start from a zero-filled 2 x 3 table that will hold one age guess per
# (Sex, Pclass) combination.
guess_ages = np.zeros(shape=(2, 3))
guess_ages

array([[0., 0., 0.],
       [0., 0., 0.]])

# Fill the missing ages, separately for each data set, with the median age
# of the passengers that share the same Sex and Pclass.
for frame in combine:
    # Pass 1: median known age of every (Sex, Pclass) group, rounded to the
    # nearest 0.5 (halves round up).
    for sex in range(2):
        for pclass_idx in range(3):
            in_group = (frame['Sex'] == sex) & (frame['Pclass'] == pclass_idx + 1)
            known_ages = frame[in_group]['Age'].dropna()
            median_age = known_ages.median()
            guess_ages[sex, pclass_idx] = int(median_age / 0.5 + 0.5) * 0.5

    # Pass 2: write each group's guess into the rows whose Age is missing.
    for sex in range(2):
        for pclass_idx in range(3):
            needs_age = (
                frame.Age.isnull()
                & (frame.Sex == sex)
                & (frame.Pclass == pclass_idx + 1)
            )
            frame.loc[needs_age, ['Age']] = guess_ages[sex, pclass_idx]
train_df.head()

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Fare	Embarked
0	1	0	3	0	22.0	1	7.2500	S
1	2	1	1	1	38.0	1	71.2833	C
2	3	1	3	1	26.0	0	7.9250	S
3	4	1	1	1	35.0	1	53.1000	S
4	5	0	3	0	35.0	0	8.0500	S

# Turn the continuous Age into a categorical feature.
# cut() splits the value range into 5 equal-width intervals; qcut() would
# instead put an equal number of samples into each interval.
train_df['AgeBand'] = pd.cut(train_df['Age'], bins=5)
(
    train_df[['AgeBand', 'Survived']]
    .groupby(['AgeBand'], as_index=False)
    .mean()
    .sort_values(by='AgeBand', ascending=True)
)

	AgeBand	Survived
0	(0.34, 16.336]	0.550000
1	(16.336, 32.252]	0.336714
2	(32.252, 48.168]	0.412844
3	(48.168, 64.084]	0.434783
4	(64.084, 80.0]	0.090909

# Replace Age by an ordinal band code 0-4 using rounded AgeBand boundaries
# (16, 32, 48, 64).  The assignments run from the lowest band upwards, so a
# value that has already been replaced by its code never matches a later,
# higher band.
for dataset in combine:
    dataset.loc[dataset['Age'] <= 16, 'Age'] = 0
    dataset.loc[(dataset['Age'] > 16) & (dataset['Age'] <= 32), 'Age'] = 1
    dataset.loc[(dataset['Age'] > 32) & (dataset['Age'] <= 48), 'Age'] = 2
    dataset.loc[(dataset['Age'] > 48) & (dataset['Age'] <= 64), 'Age'] = 3
    # The top band must be assigned as well; a bare .loc selection is a
    # no-op and would leave raw ages (65-80) mixed in with the band codes.
    dataset.loc[dataset['Age'] > 64, 'Age'] = 4
train_df.head()

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Fare	Embarked	AgeBand
0	1	0	3	0	1.0	1	7.2500	S	(16.336, 32.252]
1	2	1	1	1	2.0	1	71.2833	C	(32.252, 48.168]
2	3	1	3	1	1.0	0	7.9250	S	(16.336, 32.252]
3	4	1	1	1	2.0	1	53.1000	S	(32.252, 48.168]
4	5	0	3	0	2.0	0	8.0500	S	(32.252, 48.168]

# AgeBand was only needed to choose the band boundaries; remove it again.
train_df = train_df.drop(columns=['AgeBand'])
# drop() returned a new frame, so refresh the list that the loops iterate.
combine = [train_df, test_df]
train_df.head()

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Fare	Embarked
0	1	0	3	0	1.0	1	7.2500	S
1	2	1	1	1	2.0	1	71.2833	C
2	3	1	3	1	1.0	0	7.9250	S
3	4	1	1	1	2.0	1	53.1000	S
4	5	0	3	0	2.0	0	8.0500	S

# Missing-value handling for a categorical feature: only two Embarked
# values are missing, so fill them with the most frequent port.
freq_port = train_df['Embarked'].dropna().mode().iloc[0]   # most common value
for frame in combine:
    frame['Embarked'] = frame['Embarked'].fillna(value=freq_port)

# Mean survival rate per port of embarkation, highest rate first.
(
    train_df[['Embarked', 'Survived']]
    .groupby(['Embarked'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

	Embarked	Survived
0	C	0.553571
1	Q	0.389610
2	S	0.339009

# Encode the port of embarkation as an integer (S -> 0, C -> 1, Q -> 2).
port_codes = {'S': 0, 'C': 1, 'Q': 2}
for frame in combine:
    frame['Embarked'] = frame['Embarked'].map(port_codes).astype(int)
train_df.head()

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Fare	Embarked
0	1	0	3	0	1.0	1	7.2500	0
1	2	1	1	1	2.0	1	71.2833	1
2	3	1	3	1	1.0	0	7.9250	0
3	4	1	1	1	2.0	1	53.1000	0
4	5	0	3	0	2.0	0	8.0500	0

# Few Fare values are missing in the test set, so fill them with the median.
# The filled column is assigned back to the frame instead of calling
# fillna(..., inplace=True) on test_df['Fare']: the in-place call on a
# column selection is chained assignment, which pandas warns about and which
# may modify a temporary object (leaving the NaN in test_df) once
# copy-on-write is active.  median() skips NaN on its own, so no dropna()
# is required.
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

# Discretise the fare: qcut() builds 4 intervals holding roughly the same
# number of passengers each; then compare the survival rate per interval.
train_df['FareBand'] = pd.qcut(train_df['Fare'], q=4)
(
    train_df[['FareBand', 'Survived']]
    .groupby(['FareBand'], as_index=False)
    .mean()
    .sort_values(by='FareBand', ascending=True)
)

	FareBand	Survived
0	(-0.001, 7.91]	0.197309
1	(7.91, 14.454]	0.303571
2	(14.454, 31.0]	0.454955
3	(31.0, 512.329]	0.581081

# Replace Fare by an ordinal band code 0-3 using the FareBand boundaries.
# The bands are assigned from the lowest upwards, so an already-coded value
# never falls into a later, higher band.
inner_fare_bands = [(7.91, 14.454, 1), (14.454, 31, 2)]  # (low, high, code)
for frame in combine:
    frame.loc[frame['Fare'] <= 7.91, 'Fare'] = 0
    for low, high, code in inner_fare_bands:
        in_band = (frame['Fare'] > low) & (frame['Fare'] <= high)
        frame.loc[in_band, 'Fare'] = code
    frame.loc[frame['Fare'] > 31, 'Fare'] = 3
    # All values are band codes now, so the column can be stored as int.
    frame['Fare'] = frame['Fare'].astype(int)

# FareBand was only needed to choose the band boundaries; remove it again.
train_df = train_df.drop(columns=['FareBand'])
# drop() returned a new frame, so refresh the list that the loops iterate.
combine = [train_df, test_df]

train_df.head(n=10)

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	1	0	3	0	1.0	1	0	0	0
1	2	1	1	1	2.0	1	0	3	1
2	3	1	3	1	1.0	0	0	1	0
3	4	1	1	1	2.0	1	0	3	0
4	5	0	3	0	2.0	0	0	1	0
5	6	0	3	0	1.0	0	0	1	2
6	7	0	1	0	3.0	0	0	3	0
7	8	0	3	0	0.0	3	1	2	0
8	9	1	3	1	1.0	0	2	1	0
9	10	1	2	1	0.0	1	0	2	1

# Try building a new feature: family size = siblings/spouses +
# parents/children + the passenger themself.
for frame in combine:
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch']).add(1)

# Mean survival rate per family size, highest rate first.
(
    train_df[['FamilySize', 'Survived']]
    .groupby(['FamilySize'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

	FamilySize	Survived
3	4	0.724138
2	3	0.578431
1	2	0.552795
6	7	0.333333
0	1	0.303538
4	5	0.200000
5	6	0.136364
7	8	0.000000
8	11	0.000000

# Flag passengers travelling alone: IsAlone is 1 when FamilySize is 1,
# otherwise 0.
for frame in combine:
    travels_alone = frame['FamilySize'] == 1
    frame['IsAlone'] = 0
    frame.loc[travels_alone, 'IsAlone'] = 1

# Mean survival rate of passengers with family aboard (IsAlone = 0)
# versus passengers travelling alone (IsAlone = 1).
train_df[['IsAlone', 'Survived']].groupby(['IsAlone'], as_index=False).mean()

# 資料分析和處理 import numpy as np import pandas as pd # 資料視覺化 import seaborn as sns import matplotlib.pyplot as plt %matplotlib inline train_df =

2017.5.3上午學習內容

技術 images logs log -1 alt cnblogs 學習內容 image 聽刑法第十一章2017.5.3上午學習內容

2017 5月3日上午

linux inux 傳播 img 目的今天視頻前三操作今天看通信工程教學視頻，了解了Linux 的一些知識 Linux是一套免費使用和自由傳播的類Unix操作系統，是一個基於POSIX和UNIX的多用戶、多任務、支持多線程和多CPU的操作系統。 Linux的

2017.5.3Am

etc 分享開機存在 xorg log x server 節點 nbsp 微積分基本公式的例題和部分課後題 linux目錄架構/ 根目錄/bin 常用的命令 binary file 的目錄/boot 存放系統啟動時必須讀取的檔案，包括核心 (k

5.3上午

分享 mage blog alt img 編譯原理一個原理 src 今天上午前1個半小時做《接力題典1800》. 後一個半小時上課，編譯原理，老師方紅琴。 5.3上午

5月3日學習收獲

rate add 破解 mdk ext visio 不能區別 .exe keil uvision4 破解：MDK412安裝包→一路next→安裝好了之後，用管理員模式打開軟件→File→License Management→Computer ID→復制CID→keilli

2017.5.3 4.全排列

col font ace urn color span 順序輸出所有一個題目描述給定N(N<10)，按照字典序輸出所有的N排列。輸入第一行輸入N。樣例輸入 3 輸出輸出1到N的全排列，一行一個排列，按照字典序順序輸出。

5月3日上午學習日誌

能力程序系統調用 logs 學習日誌執行 shel 管理軟件 -1 2017年5月3日上午把昨天記憶的英語單詞的多種詞意用自己組句的方法聯系起來再記憶一遍，然後再學10個考研高頻詞匯，完成英語app的打卡和看通信工程視頻。今天基本上完成了通信工程教學視頻中對操作系統組

2017.5.3-morning

man 4.0 time route 路徑 shutdown 實驗 acer sub 實驗目的 l 掌握RIP協議的配置方法： l 掌握查看通過動態路由協議RIP學習產生的路由； l 熟悉廣域網線纜的鏈接方式；實驗背景假設校園網通過一臺三層交換機連到校園網出口路由器上，

CentOS6.5升級手動安裝GCC4.8.2

sta 安裝gcc cout include 介紹 tar.bz2 yum wget lib 一、簡易安裝操作環境 CentOS6.5 64bit，原版本4.4.7，不能支持C++11的特性~，希望升級到4.8.2 不能通過yum的方法升級，需要自己手動下載安裝包並編

5.20親密數

整數算法 actor iostream != 輸出 friendly light 數組 Q：若整數A的全部因子（包括1，不包括本身）之和等於B，並且整數B的全部因子之和等於A，則稱A與B是親密數。求解3000以內的全部親密數。思路：先將1~3000以內所有數的全部因

Centos6.5 安裝zabbix3

reload 地址 dlx operation $1 按鈕 mes load target 1.安裝PHP Zabbix 3.0對PHP的要求最低為5.4，而CentOS6默認為5.3.3，完全不滿足要求，故需要利用第三方源，將PHP升級到5.4以上，註意，不支持PHP7

5.4下午

關於更改 wid 對話框提示打開 span 下拉詳細信息關於Windows Server 2012的安裝在此不再贅述，同WIN8CP沒什麽差別，此文介紹一下我們在企業中部署的一項重要服務：活動目錄服務在新版服務器系統中的部署操作，也是屬於Step by Step

在Sql中將 varchar 值 '1,2,3,4,5,6' 轉換成數據類型 int

給定序列顯示結果空格 sel -方法一個表 affect --問題：將aa轉換為Int類型失敗 string aa="3,5,11,56,88,45,23"; select * from ERPBuMen where ID in(aa) ; --方法sel

正確學習Linux系統的5個建議

windows 服務器應用軟件安全性穩定性最近幾年Linux系統應用越來越廣泛，以至於很多人開始熱衷學習Linux。但是我們都是從小都是學習windows系統長大的，從windows 98到現在的windows 10，而根據學習windows系統的經驗來學習Linux，使很多人越學

vRealize Operations Manager 6.5部署

vcenter vcops vrops　　生產環境一直使用vCenter管理host主機，因規模小沒上vcops。從升級vcenter6.5後個別VM總是莫名其妙的報內存使用率過高的問題，而實際內存使用並不高，資源也夠用。vcenter本身沒有很好的工具分析和監控vsphere，但vmware有專門的管理軟件

Laravel 5.1 中的異常處理器和HTTP異常處理 abort()

錯誤日誌 exce ant upload 記錄再次 .org splay don 原文 http://laravelacademy.org/post/1867.html 錯誤和異常是處理程序開發中不可回避的議題，在本地開發中我們往往希望能捕獲程序拋出的異常並將其顯示打印

HTTP錯誤 404.17 - Not Found" IIS 7.5 請求的內容似乎是腳本，因而將無法由靜態文件處理程序來處理

iis alt pla 重啟 word div microsoft 理論 tar Errore HTTP 404.2 - Not Found" IIS 7.5 請求的內容似乎是腳本，因而將無法由靜態文件處理程序來處理出現這種情況的原因通常是因為先安裝了Framew

【轉】集群/分布式環境下5種session處理策略

學習原理 memcache 可選 ret 當前 memcach uil 服務器轉載至：http://blog.csdn.net/u010028869/article/details/50773174 在搭建完集群環境後，不得不考慮的一個問題就是用戶訪問產生的sessi

2017.5.4下午學習內容

1-1 image .cn 英語 mage http 內容 ima 單詞總結考研英語閱讀，並聽專業相關詞匯，復習上午所學單詞2017.5.4下午學習內容

	IsAlone	Survived

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	1	0	3	0	1.0	1	0	0	0
1	2	1	1	1	2.0	1	0	3	1
2	3	1	3	1	1.0	0	0	1	0
3	4	1	1	1	2.0	1	0	3	0
4	5	0	3	0	2.0	0	0	1	0
5	6	0	3	0	1.0	0	0	1	2
6	7	0	1	0	3.0	0	0	3	0
7	8	0	3	0	0.0	3	1	2	0
8	9	1	3	1	1.0	0	2	1	0
9	10	1	2	1	0.0	1	0	2	1

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	1	0	3	0	1.0	1	0	0	0
1	2	1	1	1	2.0	1	0	3	1
2	3	1	3	1	1.0	0	0	1	0
3	4	1	1	1	2.0	1	0	3	0
4	5	0	3	0	2.0	0	0	1	0
5	6	0	3	0	1.0	0	0	1	2
6	7	0	1	0	3.0	0	0	3	0
7	8	0	3	0	0.0	3	1	2	0
8	9	1	3	1	1.0	0	2	1	0
9	10	1	2	1	0.0	1	0	2	1

Titanic -----5

相關推薦

	PassengerId	Survived	Pclass	Sex	Age	SibSp	Parch	Fare	Embarked
0	1	0	3	0	1.0	1	0	0	0
1	2	1	1	1	2.0	1	0	3	1
2	3	1	3	1	1.0	0	0	1	0
3	4	1	1	1	2.0	1	0	3	0
4	5	0	3	0	2.0	0	0	1	0
5	6	0	3	0	1.0	0	0	1	2
6	7	0	1	0	3.0	0	0	3	0
7	8	0	3	0	0.0	3	1	2	0
8	9	1	3	1	1.0	0	2	1	0
9	10	1	2	1	0.0	1	0	2	1