Titanic模型構建
阿新 • • 發佈:2019-01-08
1.因為邏輯迴歸和支援向量機對資料的要求一樣,都需要的是連續性資料。所以先使用邏輯迴歸和支援向量機
資料處理
# --- Data preprocessing ---
# Encode Sex as 0/1. Use .loc instead of chained indexing
# (train_data.Sex[...] = 0) so the assignment is guaranteed to hit the
# original frame and not a temporary copy (SettingWithCopyWarning).
train_data.loc[train_data.Sex == 'male', 'Sex'] = 0
train_data.loc[train_data.Sex == 'female', 'Sex'] = 1
test_data.loc[test_data.Sex == 'male', 'Sex'] = 0
test_data.loc[test_data.Sex == 'female', 'Sex'] = 1

# One-hot encode the nominal columns: Pclass, Embarked, title, Family, person.
NOMINAL_COLS = ['Pclass', 'Embarked', 'title', 'Family', 'person']


def _one_hot(df):
    """Return *df* with each nominal column replaced by its one-hot dummies."""
    dummies = [pd.get_dummies(df[col], prefix=col) for col in NOMINAL_COLS]
    out = pd.concat([df] + dummies, axis=1)
    out.drop(NOMINAL_COLS, axis=1, inplace=True)
    return out


df_train = _one_hot(train_data)
df_test = _one_hot(test_data)

# Standardize Age and Fare.
# BUG FIX: the original scaled 'Age' twice and never scaled 'Fare'.
# Also, fit the scaler on the training data only and reuse it (transform)
# on the test data, so test-set statistics do not leak into the features.
from sklearn.preprocessing import StandardScaler

for col in ('Age', 'Fare'):
    scaler = StandardScaler()
    df_train[col] = scaler.fit_transform(df_train[col].values.reshape(-1, 1))
    df_test[col] = scaler.transform(df_test[col].values.reshape(-1, 1))
使用邏輯迴歸做預測
# Sanity check: the test frame must expose the same dummy columns as train.
print(df_test.columns, len(df_train.columns))

# --- Logistic-regression prediction ---
# FIX: `sklearn.linear_model.logistic` and `sklearn.cross_validation` are
# removed module paths in modern scikit-learn; use the public locations.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# Feature matrix: continuous columns plus the one-hot dummies built above.
feature_cols = ['Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Age', 'isalone',
                'mother', 'ticket-same', 'Pclass_1', 'Pclass_2', 'Pclass_3',
                'Embarked_C', 'Embarked_Q', 'Embarked_S', 'title_1', 'title_2',
                'title_3', 'title_4', 'title_5', 'title_6', 'title_7',
                'title_8', 'Family_0', 'Family_1', 'Family_2',
                'person_adult-man', 'person_adult-woman', 'person_child']
X = df_train[feature_cols]
Y = df_train['Survived']

classifier = LogisticRegression()
classifier.fit(X, Y)
# 5-fold cross-validated accuracy.
scores = cross_val_score(classifier, X, Y, cv=5)
print('準確率', np.mean(scores), scores)
使用支援向量機做預測
# --- Support-vector-machine prediction ---
# Reuses the feature matrix X and labels Y prepared for logistic regression.
from sklearn import svm

svm_model = svm.SVC()
svm_model.fit(X, Y)
# 5-fold cross-validated accuracy on the same features.
svm_scores = cross_val_score(svm_model, X, Y, cv=5)
print('準確率', np.mean(svm_scores), svm_scores)
2.決策樹需要的資料為標稱資料,將資料處理好儲存到檔案中
用決策樹來做預測
# --- Decision-tree prediction ---
# Decision trees work on nominal (label-encoded) features, so load the
# separately prepared data files rather than the one-hot frames above.
train_data = pd.read_csv('./Titanic-data/task-2-train2.csv')
test_data = pd.read_csv('./Titanic-data/task-2-test2.csv')

X = train_data[['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked',
                'title', 'isalone', 'Family', 'mother', 'person',
                'ticket-same', 'age', 'fare']]
Y = train_data['Survived']

from sklearn import tree
# FIX: import cross_val_score explicitly from its current module path instead
# of relying on the removed `sklearn.cross_validation` import from an
# earlier cell.
from sklearn.model_selection import cross_val_score

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)
# 5-fold cross-validation.
scores = cross_val_score(clf, X, Y, cv=5)
print('準確率', np.mean(scores), scores)
參考資料https://blog.csdn.net/weixin_40300458/article/details/79996764?from=singlemessage