泰坦尼克號決策樹預測 筆記
阿新 • • 發佈:2020-10-08
import matplotlib.pyplot as plt
import random
import pylab as mpl
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Titanic survival prediction with a decision tree.
#
# Pipeline: load the passenger table, keep three features, fill missing
# ages, split into train/test, one-hot encode via DictVectorizer, fit a
# DecisionTreeClassifier, and print its accuracy on the test split.

# Select the SimHei font for matplotlib text rendering.
mpl.rcParams['font.sans-serif'] = ['SimHei']
#mpl.rcParams['axes.unicode_minus'] = False

# 1. Load the data.
titan = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')

# 2. Basic data preparation.
# 2.1 Choose the feature columns and the target column.
# Take an explicit copy so the fill below modifies an independent frame
# rather than a selection of `titan` (avoids SettingWithCopyWarning and
# keeps working under pandas Copy-on-Write).
x = titan[['pclass', 'age', 'sex']].copy()
y = titan['survived']

# 2.2 Handle missing values: replace missing ages with the mean age.
# Assign the result back instead of using a chained `inplace=True`, which
# may silently operate on a temporary object.
x['age'] = x['age'].fillna(x['age'].mean())

# 2.3 Split into training and test sets (fixed seed for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=22)

# 3. Feature engineering (dictionary feature extraction).
# DictVectorizer consumes a list of {column: value} records.
x_train = x_train.to_dict(orient='records')
x_test = x_test.to_dict(orient='records')

transfer = DictVectorizer()

# Learn the feature mapping from the training records only ...
x_train = transfer.fit_transform(x_train)
# ... and reuse that same mapping for the test records. Re-fitting on the
# test split could produce a different column layout than the one the
# estimator was trained on.
x_test = transfer.transform(x_test)

# 4. Machine learning (decision tree).
estimator = DecisionTreeClassifier()
estimator.fit(x_train, y_train)

# 5. Model evaluation.
# Predicted labels for the test split.
y_pre = estimator.predict(x_test)

# Mean accuracy on the test split.
ret = estimator.score(x_test, y_test)
print(ret)