
Titanic Model Building

1. Logistic regression and support vector machines place the same requirement on the data: both need continuous (numeric) features. So we start with logistic regression and an SVM.

Data preprocessing

import pandas as pd

# Encode Sex numerically (0 = male, 1 = female); use .loc to avoid chained-assignment warnings
train_data.loc[train_data.Sex == 'male', 'Sex'] = 0
train_data.loc[train_data.Sex == 'female', 'Sex'] = 1
test_data.loc[test_data.Sex == 'male', 'Sex'] = 0
test_data.loc[test_data.Sex == 'female', 'Sex'] = 1
# One-hot encode the nominal features: Pclass, Embarked, title, Family, person
dum_pclass = pd.get_dummies(train_data.Pclass, prefix='Pclass')
dum_embarked = pd.get_dummies(train_data.Embarked, prefix='Embarked')
dum_title = pd.get_dummies(train_data.title, prefix='title')
dum_family = pd.get_dummies(train_data.Family, prefix='Family')
dum_person = pd.get_dummies(train_data.person, prefix='person')
df_train = pd.concat([train_data, dum_pclass, dum_embarked, dum_title, dum_family, dum_person], axis=1)
df_train.drop(['Pclass', 'Embarked', 'title', 'Family', 'person'], axis=1, inplace=True)
dum_pclass = pd.get_dummies(test_data.Pclass, prefix='Pclass')
dum_embarked = pd.get_dummies(test_data.Embarked, prefix='Embarked')
dum_title = pd.get_dummies(test_data.title, prefix='title')
dum_family = pd.get_dummies(test_data.Family, prefix='Family')
dum_person = pd.get_dummies(test_data.person, prefix='person')
df_test = pd.concat([test_data, dum_pclass, dum_embarked, dum_title, dum_family, dum_person], axis=1)
df_test.drop(['Pclass', 'Embarked', 'title', 'Family', 'person'], axis=1, inplace=True)
# Standardize Age and Fare: fit the scalers on the training set, then apply them to the test set
from sklearn.preprocessing import StandardScaler
age_scaler = StandardScaler()
fare_scaler = StandardScaler()
df_train['Age'] = age_scaler.fit_transform(df_train['Age'].values.reshape(-1, 1))
df_train['Fare'] = fare_scaler.fit_transform(df_train['Fare'].values.reshape(-1, 1))
df_test['Age'] = age_scaler.transform(df_test['Age'].values.reshape(-1, 1))
df_test['Fare'] = fare_scaler.transform(df_test['Fare'].values.reshape(-1, 1))
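Note that pd.get_dummies only creates columns for the categories present in each frame, so a category missing from the test set would leave df_test short of a dummy column. A small defensive check (the loop below is an addition to the original workflow, not part of the post):

# Make sure every dummy column built for df_train also exists in df_test.
missing_cols = set(df_train.columns) - set(df_test.columns) - {'Survived'}
for col in missing_cols:
    df_test[col] = 0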

Prediction with logistic regression

print(df_test.columns, len(df_train.columns))  # sanity check: both frames should expose the same feature columns
# Prediction with logistic regression
import numpy as np
from sklearn.linear_model import LogisticRegression   # sklearn.linear_model.logistic is a private module; import from linear_model
from sklearn.model_selection import cross_val_score   # sklearn.cross_validation was removed in favour of model_selection
X = df_train[['Sex', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Age', 'isalone', 'mother',
       'ticket-same', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'title_1', 'title_2', 'title_3', 'title_4',
       'title_5', 'title_6', 'title_7', 'title_8', 'Family_0', 'Family_1',
       'Family_2', 'person_adult-man', 'person_adult-woman', 'person_child']]
Y = df_train['Survived']
classifier = LogisticRegression()
classifier.fit(X, Y)
scores = cross_val_score(classifier, X, Y, cv=5)
print('Accuracy', np.mean(scores), scores)
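With the classifier fitted, the test set can be scored the same way. A minimal sketch of writing a Kaggle-style submission, assuming df_test still carries a PassengerId column and every feature column used in X (the output path is illustrative):

# Predict on the processed test set and save a submission file.
X_test = df_test[X.columns]                      # assumes df_test has every column used in X
predictions = classifier.predict(X_test)
submission = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': predictions})
submission.to_csv('./Titanic-data/submission-logistic.csv', index=False)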

Prediction with a support vector machine

# Prediction with a support vector machine
from sklearn import svm
clf = svm.SVC()
clf.fit(X, Y)
scores = cross_val_score(clf, X, Y, cv=5)
print('Accuracy', np.mean(scores), scores)
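SVC accuracy is sensitive to C and gamma, so the default settings above are only a starting point. A minimal grid-search sketch (the parameter grid is an illustrative assumption, not a tuned recommendation):

from sklearn.model_selection import GridSearchCV

# Search a small, illustrative grid of SVC hyperparameters with 5-fold CV.
param_grid = {'C': [0.1, 1, 10], 'gamma': ['scale', 0.01, 0.1]}
grid = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=5)
grid.fit(X, Y)
print('Best params', grid.best_params_, 'best CV accuracy', grid.best_score_)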

2. A decision tree, by contrast, works with nominal (categorical) data, so the data is preprocessed into that form and saved to files; a sketch of this step follows.
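The script that produced task-2-train2.csv is not shown in this post. Purely as a hypothetical sketch of the kind of step it implies, continuous columns could be binned and string columns integer-coded before saving (bin edges, column names, and paths below are assumptions):

# Hypothetical sketch only; the post's actual preprocessing script is not shown.
dt_train = train_data.copy()
dt_train['age'] = pd.cut(dt_train['Age'], bins=[0, 12, 18, 40, 60, 120], labels=False)  # bin continuous Age
dt_train['fare'] = pd.qcut(dt_train['Fare'], q=4, labels=False)                         # quartile-bin Fare
dt_train['Embarked'] = pd.factorize(dt_train['Embarked'])[0]                            # integer-code the port
dt_train.to_csv('./Titanic-data/task-2-train2.csv', index=False)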

Prediction with a decision tree

# Prediction with a decision tree
train_data = pd.read_csv('./Titanic-data/task-2-train2.csv')
test_data = pd.read_csv('./Titanic-data/task-2-test2.csv')
X = train_data[['Pclass', 'Sex', 'SibSp', 'Parch', 'Cabin', 'Embarked',
       'title', 'isalone', 'Family', 'mother', 'person', 'ticket-same', 'age',
       'fare']]
Y = train_data['Survived']
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X, Y)
scores = cross_val_score(clf, X, Y, cv=5)  # 5-fold cross-validation
print('Accuracy', np.mean(scores), scores)
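An unconstrained DecisionTreeClassifier will fit the training data almost perfectly, so the cross-validation score above is the number to trust. A short sketch of checking whether limiting the depth helps (the depth values are illustrative):

# Compare 5-fold CV accuracy for a few maximum depths.
for depth in [3, 5, 7, None]:
    clf = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    depth_scores = cross_val_score(clf, X, Y, cv=5)
    print('max_depth =', depth, 'mean accuracy =', np.mean(depth_scores))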

Reference: https://blog.csdn.net/weixin_40300458/article/details/79996764?from=singlemessage