
Kaggle Titanic: Feature Engineering and Model Ensembling

Last time we analyzed the data and saw that different features have different effects on the outcome, i.e. whether a passenger survives. Having examined the data, we now need to select features and combine them to get ready for model training. This is feature engineering.

import pandas as pd                # data analysis
import numpy as np                 # numerical computing
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import re                          # for extracting titles from names

train = pd.read_csv("C:\\Users\\Yao\\Desktop\\kaggle\\train.csv")
test = pd.read_csv("C:\\Users\\Yao\\Desktop\\kaggle\\test.csv")

Looking at the passengers' names, we can see that everyone has a title, so the title can be extracted as a feature.


def get_title(name):
    # the title is the word ending in a period, e.g. "Mr." in "Braund, Mr. Owen Harris"
    title = re.search(r' ([A-Za-z]+)\.', name)
    if title:
        return title.group(1)
    return ''

train['Title'] = train['Name'].apply(get_title)
#print(pd.crosstab(train['Title'],train['Sex']))
# collapse rare titles into one 'Rare' category and normalize French/older variants
train['Title'] = train['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
#print(train[['Title','Survived']].groupby(['Title'],as_index=False).mean())
test['Title'] = test['Name'].apply(get_title)
#print(pd.crosstab(test['Title'],test['Sex']))
test['Title'] = test['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
     'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')
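
As a quick sanity check, get_title can be tried on a single name string (a hypothetical example, assuming the usual "Surname, Title. Given names" format of the Name column):

print(get_title('Braund, Mr. Owen Harris'))   # prints: Mr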


train['familysize'] = train['Parch'] + train['SibSp'] + 1
train['alone'] = 0
train.loc[train['familysize'] == 1, 'alone'] = 1
# fill missing values with fillna: Embarked with 'S' (the most common port), Cabin with 'no'
train['Embarked'] = train['Embarked'].fillna('S')
train['Cabin'] = train['Cabin'].fillna('no')
# impute missing ages with a random forest trained on the rows where Age is known
age_df = train[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_true = age_df.loc[age_df.Age.notnull()]
age_df_null = age_df.loc[age_df.Age.isnull()]
X = age_df_true.values[:, 1:]
y = age_df_true.values[:, 0]
rfr = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
rfr.fit(X, y)
preAge = rfr.predict(age_df_null.values[:, 1:])
train.loc[train.Age.isnull(), 'Age'] = preAge

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
train_set = train.drop(drop_elements, axis=1)
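
An optional check (not in the original script) that the imputation left no missing ages behind:

assert train['Age'].isnull().sum() == 0   # every Age should now be filled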


# apply the same preprocessing to the test set
test['familysize'] = test['Parch'] + test['SibSp'] + 1
test['alone'] = 0
test.loc[test['familysize'] == 1, 'alone'] = 1
test['Embarked'] = test['Embarked'].fillna('S')
test['Cabin'] = test['Cabin'].fillna('no')
# reuse the random forest fitted on the training set to impute missing test ages
age_df = test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_null = age_df.loc[age_df.Age.isnull()]
preAge = rfr.predict(age_df_null.values[:, 1:])
test.loc[test.Age.isnull(), 'Age'] = preAge
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
test_set = test.drop(drop_elements, axis=1)

Here we sum the numbers of family members (Parch + SibSp + the passenger themselves) to get the total family size, and derive from it a flag for whether the passenger travelled alone, as another feature. With these engineered features we can move on to model training.
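
As an optional check that the new features carry signal, the survival rate can be grouped by them (a small sketch, using only columns built above):

print(train_set[['alone', 'Survived']].groupby('alone', as_index=False).mean())
print(train_set[['familysize', 'Survived']].groupby('familysize', as_index=False).mean())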


# training and prediction

from sklearn import linear_model
import sklearn.preprocessing as preprocessing
dummies_Embarked = pd.get_dummies(train_set['Embarked'], prefix='Embarked')

dummies_Sex = pd.get_dummies(train_set['Sex'], prefix='Sex')

dummies_Pclass = pd.get_dummies(train_set['Pclass'], prefix='Pclass')

dummies_Title = pd.get_dummies(train_set['Title'], prefix='Title')

# join all one-hot columns (including Pclass, which was built above but left out
# of the original concat) and drop the original categorical columns
df = pd.concat([train_set, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Title], axis=1)
df.drop(['Sex', 'Embarked', 'Pclass', 'Title'], axis=1, inplace=True)

train_np = df.values   # .as_matrix() was removed from pandas; .values is the replacement

The get_dummies function turns a categorical feature into one-hot columns, which is more convenient for model training; the new columns are then joined back onto the frame with concat.
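
A toy illustration of what get_dummies produces (hypothetical data, not from the dataset):

toy = pd.Series(['S', 'C', 'Q', 'S'], name='Embarked')
print(pd.get_dummies(toy, prefix='Embarked'))
# each category becomes its own 0/1 column: Embarked_C, Embarked_Q, Embarked_S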


y = train_np[:, 0]
X = train_np[:, 1:]

# the l1 penalty requires a solver that supports it, e.g. liblinear
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', solver='liblinear', tol=1e-6)
clf.fit(X, y)
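
Optionally, the learned coefficients can be lined up with the feature names for a rough sense of each feature's influence (this assumes df's first column is Survived, as constructed above):

coef_table = pd.DataFrame({'feature': df.columns[1:], 'coef': clf.coef_[0]})
print(coef_table.sort_values('coef'))   # with the l1 penalty, some coefficients will be exactly 0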


dummies_Embarked = pd.get_dummies(test_set['Embarked'], prefix='Embarked')

dummies_Sex = pd.get_dummies(test_set['Sex'], prefix='Sex')

dummies_Pclass = pd.get_dummies(test_set['Pclass'], prefix='Pclass')

dummies_Title = pd.get_dummies(test_set['Title'], prefix='Title')

bf = pd.concat([test_set, dummies_Embarked, dummies_Sex, dummies_Pclass, dummies_Title], axis=1)
bf.drop(['Sex', 'Embarked', 'Pclass', 'Title'], axis=1, inplace=True)
bf = bf.fillna(50)   # crude fill for the single missing Fare value in the test set
test_np = bf.values


predictions = clf.predict(test_np)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv('feature_predictions.csv', index=False)

Next, we use StratifiedShuffleSplit to split the data, train several different models, and compare their results.

# trying out several different models
import matplotlib.pyplot as plt
import xgboost as xgb

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression

classifiers=[
        KNeighborsClassifier(3),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        SVC(probability=True),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
        xgb.XGBClassifier()
        ]

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
x = train_np[:, 1:]
y = train_np[:, 0]
accuracy = np.zeros(len(classifiers))
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf_num = 0
    for clf in classifiers:
        clf_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        accuracy[clf_num] += (y_test == clf.predict(x_test)).mean()
        clf_num += 1
accuracy = accuracy / 10   # average over the 10 splits
plt.bar(np.arange(len(classifiers)), accuracy, width=0.5, color='b')
plt.xlabel('Algo')
plt.ylabel('Accuracy')
plt.xticks(np.arange(len(classifiers)) + 0.25,
           ('KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB',
            'LDA', 'QDA', 'LR', 'xgb'))
plt.show()
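
It can also help to print the averaged accuracies next to the classifier names instead of reading them off the bar chart (a small addition, using only variables defined above):

for name, acc in zip([c.__class__.__name__ for c in classifiers], accuracy):
    print('{:<32s}{:.4f}'.format(name, acc))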

Each model achieves a different accuracy.

Finally, we can ensemble the models to improve accuracy.


# model ensembling
import matplotlib.pylab as pyl



sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
x = train_np[:, 1:]
y = train_np[:, 0]
x1_test = np.zeros((test.shape[0], len(classifiers)))   # accumulated test-set predictions per model
accuracy = np.zeros(len(classifiers))
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf_num = 0
    for clf in classifiers:
        clf_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        x1_test[:, clf_num] += clf.predict(test_np)
        accuracy[clf_num] += (y_test == clf.predict(x_test)).mean()
        clf_num += 1
accuracy = accuracy / 10
x1_test = x1_test / 10   # each column is now a model's average prediction in [0, 1]
plt.bar(np.arange(len(classifiers)), accuracy, width=0.5, color='b')
plt.xlabel('Algo')
plt.ylabel('Accuracy')
plt.xticks(np.arange(len(classifiers)) + 0.25,
           ('KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB',
            'LDA', 'QDA', 'LR', 'xgb'))
plt.show()


# correlation heatmap of the models' test-set predictions
pyl.pcolor(np.corrcoef(x1_test.T), cmap='Blues')
pyl.colorbar()
pyl.xticks(np.arange(0.5, 11.5),
           ['KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB', 'LDA', 'QDA', 'LR', 'xgb'])
pyl.yticks(np.arange(0.5, 11.5),
           ['KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB', 'LDA', 'QDA', 'LR', 'xgb'])
pyl.show()

# average the first 10 models' predictions (xgb, column 10, is left out here)
index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
linear_prediction = x1_test[:, index].mean(axis=1)
linear_prediction[linear_prediction >= 0.5] = 1
linear_prediction[linear_prediction < 0.5] = 0
mixRe = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                      'Survived': linear_prediction.astype(np.int32)})
mixRe.to_csv('mix2.csv', index=False)

The final submission scores around 78% accuracy, good enough for the top 50% of the leaderboard. On this basis you can continue the feature engineering, and when ensembling you can compute the correlations between the models and analyze their errors to keep improving the result, for example along the lines of the sketch below.
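
Here is a minimal sketch of correlation-aware averaging that keeps a model only if its predictions are not too correlated with one already selected; the 0.95 threshold is an arbitrary illustrative choice, not a value from the original experiment:

corr = np.corrcoef(x1_test.T)                  # model-by-model prediction correlations
selected = []
for i in range(x1_test.shape[1]):
    if all(corr[i, j] < 0.95 for j in selected):
        selected.append(i)                     # keep only models that add diversity
mixed = x1_test[:, selected].mean(axis=1)
mixed = (mixed >= 0.5).astype(np.int32)        # threshold the averaged vote

Whether this beats the plain average depends on how diverse the models actually are, which is exactly what the correlation heatmap above is meant to show.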