大資料入門——使用決策樹模型預測泰坦尼克號乘客的生還情況
阿新 • • 發佈:2019-02-11
# Predict Titanic passenger survival with a decision tree.
#
# Steps: load the data set, select three features, impute missing ages,
# split into train/test sets, one-hot encode via DictVectorizer, fit a
# DecisionTreeClassifier and report its accuracy and per-class metrics.

import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import classification_report
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# --- Data inspection ---
# NOTE(review): this read requires network access; confirm the URL is still
# being served before relying on it.
titanic = pd.read_csv('http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt')
print(titanic.head())
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly instead of being wrapped in print().
titanic.info()

# --- Feature selection and preprocessing ---
# Take an explicit copy so the imputation below modifies X itself rather than
# a temporary view of `titanic` (avoids SettingWithCopyWarning and the silent
# no-op that chained in-place assignment becomes under copy-on-write).
X = titanic[['pclass', 'age', 'sex']].copy()
y = titanic['survived']
X.info()

# Replace missing ages with the mean age.
X['age'] = X['age'].fillna(X['age'].mean())
X.info()

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=33
)

# --- Feature vectorisation ---
# DictVectorizer one-hot encodes string-valued features and passes numeric
# ones through. It is fitted on the training rows only and then reused on the
# test rows. `orient='records'` is the full spelling required by current
# pandas (the abbreviation 'record' is no longer accepted).
vec = DictVectorizer(sparse=False)
X_train = vec.fit_transform(X_train.to_dict(orient='records'))
print(vec.feature_names_)
X_test = vec.transform(X_test.to_dict(orient='records'))

# --- Model training and prediction ---
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_predict = dtc.predict(X_test)

# --- Decision tree performance ---
# Mean accuracy on the held-out test set.
print(dtc.score(X_test, y_test))
# classification_report expects (y_true, y_pred) in that order; swapping them
# exchanges precision with recall and reports support for the predictions
# instead of the ground truth.
print(classification_report(y_test, y_predict, target_names=['died', 'survived']))