1. 程式人生 > 實用技巧 > 泰坦尼克號決策樹預測 筆記

泰坦尼克號決策樹預測 筆記

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as mpl
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# Titanic survival prediction with a decision tree.
#
# Pipeline: load the passenger table, keep three features, impute missing
# ages, split into train/test, one-hot encode via DictVectorizer, fit a
# DecisionTreeClassifier and print its accuracy on the held-out split.

# Select the SimHei font for matplotlib's sans-serif family
# (presumably so Chinese labels render in figures -- no plotting is visible here).
mpl.rcParams['font.sans-serif'] = ['SimHei']
#mpl.rcParams['axes.unicode_minus'] = False

# 1. Load the data.
# NOTE(review): the dataset is downloaded on every run; confirm this URL is
# still reachable or point it at a local copy.
titan = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# 2. Basic data preparation.
# 2.1 Choose the feature columns and the target column.
# .copy() makes `x` an independent frame, so the imputation below modifies
# `x` itself instead of a temporary slice of `titan`.
x = titan[['pclass', 'age', 'sex']].copy()
y = titan['survived']

# 2.2 Missing-value handling: replace missing ages with the mean age.
# Assign the result back rather than using a chained in-place fill, which
# warns on current pandas and has no effect under Copy-on-Write.
x['age'] = x['age'].fillna(x['age'].mean())

# 2.3 Split into training and test sets (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 3. Feature engineering (dictionary feature extraction).
# DictVectorizer consumes a list of {column: value} records.
x_train = x_train.to_dict(orient='records')
x_test = x_test.to_dict(orient='records')

transfer = DictVectorizer()

# Learn the feature mapping from the training data only ...
x_train = transfer.fit_transform(x_train)
# ... and reuse that same mapping for the test data. Refitting here could
# produce a different set/order of columns than the model was trained on.
x_test = transfer.transform(x_test)

# 4. Machine learning (decision tree).
estimator = DecisionTreeClassifier()
estimator.fit(x_train, y_train)

# 5. Model evaluation.
# Predicted labels for the test set.
y_pre = estimator.predict(x_test)

# Mean accuracy on the test set.
ret = estimator.score(x_test, y_test)
print(ret)