Kaggle Titanic: Feature Engineering and Model Ensembling
阿新 • Published: 2019-02-05
Last time we explored the data and saw that different features have different effects on the outcome, i.e. on whether a passenger survived. After inspecting the data, we need to select features and combine them to prepare for model training. This is feature engineering.
import pandas as pd                # data analysis
import numpy as np                 # numerical computing
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
import re                          # used below to extract titles from names

train = pd.read_csv("C:\\Users\\Yao\\Desktop\\kaggle\\train.csv")
test = pd.read_csv("C:\\Users\\Yao\\Desktop\\kaggle\\test.csv")
Looking at the names, we can see that each passenger carries a title (Mr, Mrs, Miss, ...), so we can extract the title as a feature:
def get_title(name):
    # the title is the word ending in a period, e.g. "Mr." in "Braund, Mr. Owen Harris"
    title = re.search(r' ([A-Za-z]+)\.', name)
    if title:
        return title.group(1)
    return ''

train['Title'] = train['Name'].apply(get_title)
#print(pd.crosstab(train['Title'], train['Sex']))
# collapse rare titles into a single 'Rare' category and normalise French variants
train['Title'] = train['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
train['Title'] = train['Title'].replace('Mlle', 'Miss')
train['Title'] = train['Title'].replace('Ms', 'Miss')
train['Title'] = train['Title'].replace('Mme', 'Mrs')
#print(train[['Title','Survived']].groupby(['Title'], as_index=False).mean())

test['Title'] = test['Name'].apply(get_title)
#print(pd.crosstab(test['Title'], test['Sex']))
test['Title'] = test['Title'].replace(['Lady', 'Countess', 'Capt', 'Col',
        'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
test['Title'] = test['Title'].replace('Mlle', 'Miss')
test['Title'] = test['Title'].replace('Ms', 'Miss')
test['Title'] = test['Title'].replace('Mme', 'Mrs')

# family size and an 'alone' indicator
train['familysize'] = train['Parch'] + train['SibSp'] + 1
train['alone'] = 0
train.loc[train['familysize'] == 1, 'alone'] = 1

# fill missing values with fillna: 'S' for Embarked, a placeholder for Cabin
train['Embarked'] = train['Embarked'].fillna('S')
train['Cabin'] = train['Cabin'].fillna('no')

# predict missing ages from the other numeric columns with a random forest
age_df = train[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_true = age_df.loc[(age_df.Age.notnull())]
age_df_null = age_df.loc[(age_df.Age.isnull())]
X = age_df_true.values[:, 1:]
y = age_df_true.values[:, 0]
rfr = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
rfr.fit(X, y)
preAge = rfr.predict(age_df_null.values[:, 1:])
train.loc[train.Age.isnull(), 'Age'] = preAge

drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
train_set = train.drop(drop_elements, axis=1)

# apply the same processing to the test set, reusing the fitted regressor
test['familysize'] = test['Parch'] + test['SibSp'] + 1
test['alone'] = 0
test.loc[test['familysize'] == 1, 'alone'] = 1
test['Embarked'] = test['Embarked'].fillna('S')
test['Cabin'] = test['Cabin'].fillna('no')
age_df = test[['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']]
age_df_null = age_df.loc[(age_df.Age.isnull())]
preAge = rfr.predict(age_df_null.values[:, 1:])
test.loc[test.Age.isnull(), 'Age'] = preAge
drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
test_set = test.drop(drop_elements, axis=1)
Here we sum Parch and SibSp (plus the passenger themselves) to get the family size, and derive from it whether the passenger travelled alone as another feature. With these engineered features in hand, we can move on to model training.
# training and prediction
from sklearn import linear_model
import sklearn.preprocessing as preprocessing

dummies_Embarked = pd.get_dummies(train_set['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(train_set['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(train_set['Pclass'], prefix='Pclass')  # built but not concatenated below: Pclass stays numeric
dummies_Title = pd.get_dummies(train_set['Title'], prefix='Title')
df = pd.concat([train_set, dummies_Embarked, dummies_Sex, dummies_Title], axis=1)
df.drop(['Sex', 'Embarked', 'Title'], axis=1, inplace=True)
train_np = df.values  # as_matrix() was removed in newer pandas; .values is the equivalent
get_dummies turns a categorical feature into one-hot columns, which is what the models need; concat then joins the dummy columns back onto the frame.
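To make the one-hot encoding concrete, here is a tiny standalone illustration. The toy column is made up for demonstration and is not part of the dataset; note that recent pandas versions return boolean rather than 0/1 columns.

demo = pd.DataFrame({'Embarked': ['S', 'C', 'Q', 'S']})
# one 0/1 (or boolean) column per category: Embarked_C, Embarked_Q, Embarked_S
print(pd.get_dummies(demo['Embarked'], prefix='Embarked'))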
y = train_np[:, 0]   # first column of df is Survived
X = train_np[:, 1:]
# note: newer scikit-learn requires solver='liblinear' for an l1 penalty
clf = linear_model.LogisticRegression(C=1.0, penalty='l1', tol=1e-6, solver='liblinear')
clf.fit(X, y)
dummies_Embarked = pd.get_dummies(test_set['Embarked'], prefix='Embarked')
dummies_Sex = pd.get_dummies(test_set['Sex'], prefix='Sex')
dummies_Pclass = pd.get_dummies(test_set['Pclass'], prefix='Pclass')
dummies_Title = pd.get_dummies(test_set['Title'], prefix='Title')
bf = pd.concat([test_set, dummies_Embarked, dummies_Sex, dummies_Title], axis=1)
bf.drop(['Sex', 'Embarked', 'Title'], axis=1, inplace=True)
bf = bf.fillna(50)  # the test set still has one missing Fare; fill any remaining NaN
test_np = bf.values
predictions = clf.predict(test_np)
result = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                       'Survived': predictions.astype(np.int32)})
result.to_csv('feature_predictions.csv', index=False)
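Before moving on, it can help to see which features the logistic regression actually relies on. The check below is my own addition rather than part of the original post; it is a minimal sketch that pairs each feature column of df (skipping Survived) with its learned coefficient:

# pair feature names with the l1-regularised coefficients (zeros mean the feature was dropped)
coef_df = pd.DataFrame({'feature': df.columns[1:], 'coef': clf.coef_[0]})
print(coef_df.sort_values('coef'))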
Next we try several different models: StratifiedShuffleSplit splits the data into stratified train/validation folds, each model is trained on every fold, and we compare the results.
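As a quick sanity check (my own addition, reusing the X and y defined above), one can verify that StratifiedShuffleSplit keeps the survival rate nearly identical in both folds:

from sklearn.model_selection import StratifiedShuffleSplit

sss_check = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)
for tr_idx, te_idx in sss_check.split(X, y):
    # stratification preserves the class balance across folds
    print('train survival rate:', y[tr_idx].mean())
    print('val   survival rate:', y[te_idx].mean())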
# trying several different classifiers
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    SVC(probability=True),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    xgb.XGBClassifier()
]
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
x = train_np[:, 1:]
y = train_np[:, 0]
accuracy = np.zeros(len(classifiers))
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf_num = 0
    for clf in classifiers:
        clf_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        accuracy[clf_num] += (y_test == clf.predict(x_test)).mean()
        clf_num += 1
accuracy = accuracy / 10  # average over the 10 splits
plt.bar(np.arange(len(classifiers)), accuracy, width=0.5, color='b')
plt.xlabel('Algorithm')
plt.ylabel('Accuracy')
plt.xticks(np.arange(len(classifiers)) + 0.25,
           ('KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB',
            'LDA', 'QDA', 'LR', 'xgb'))
plt.show()
Each classifier achieves a different accuracy (bar chart above).
Finally, we can ensemble the models to improve accuracy.
# model ensembling
import matplotlib.pylab as pyl
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
x = train_np[:, 1:]
y = train_np[:, 0]
# one column per classifier, accumulating its predictions for the test passengers
x1_test = np.zeros((test.shape[0], len(classifiers)))
accuracy = np.zeros(len(classifiers))
for train_index, test_index in sss.split(x, y):
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf_num = 0
    for clf in classifiers:
        clf_name = clf.__class__.__name__
        clf.fit(x_train, y_train)
        # accumulate each model's test-set predictions across the 10 splits
        x1_test[:, clf_num] += clf.predict(test_np)
        accuracy[clf_num] += (y_test == clf.predict(x_test)).mean()
        clf_num += 1
accuracy = accuracy / 10
x1_test = x1_test / 10  # per-model average prediction in [0, 1] for each test passenger
plt.bar(np.arange(len(classifiers)), accuracy, width=0.5, color='b')
plt.xlabel('Algorithm')
plt.ylabel('Accuracy')
plt.xticks(np.arange(len(classifiers)) + 0.25,
           ('KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB',
            'LDA', 'QDA', 'LR', 'xgb'))
# heatmap of the correlation between the models' averaged test-set predictions
pyl.pcolor(np.corrcoef(x1_test.T), cmap='Blues')
pyl.colorbar()
pyl.xticks(np.arange(0.5, 11.5),
           ['KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB', 'LDA', 'QDA', 'LR', 'xgb'])
pyl.yticks(np.arange(0.5, 11.5),
           ['KNN', 'DT', 'RF', 'SVC', 'AdaB', 'GBC', 'GNB', 'LDA', 'QDA', 'LR', 'xgb'])
pyl.show()
index = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]  # average the first ten models (xgb, index 10, is left out)
linear_prediction = x1_test[:, index].mean(axis=1)
linear_prediction[linear_prediction >= 0.5] = 1
linear_prediction[linear_prediction < 0.5] = 0
mixRe = pd.DataFrame({'PassengerId': test['PassengerId'].values,
                      'Survived': linear_prediction.astype(np.int32)})
mixRe.to_csv('mix2.csv', index=False)
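As an alternative to the hand-rolled averaging above, scikit-learn's VotingClassifier wraps the same idea. This is only a sketch, assuming all eleven classifiers are combined with soft voting (they all support predict_proba, which soft voting requires); it is not the method used in the original post:

from sklearn.ensemble import VotingClassifier

# each estimator needs a unique name; 'soft' voting averages predicted probabilities
voter = VotingClassifier(
    estimators=[('m%d' % i, c) for i, c in enumerate(classifiers)],
    voting='soft')
voter.fit(x, y)
voting_predictions = voter.predict(test_np)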
The final submission scores about 78% accuracy, enough for the top 50% of the leaderboard. From here one can keep iterating on the feature engineering; when ensembling, computing the correlation between models and analyzing their errors are good ways to keep improving the result.
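One concrete way to act on the correlation idea: averaging highly correlated models adds little diversity, so one could greedily keep only models whose predictions are not too correlated with those already kept. This is a minimal sketch; the 0.9 cutoff is an arbitrary assumption, not a tuned value:

corr = np.corrcoef(x1_test.T)
keep = [0]  # start from the first model
for j in range(1, x1_test.shape[1]):
    if all(corr[j, k] < 0.9 for k in keep):  # 0.9 is an arbitrary cutoff
        keep.append(j)
subset_prediction = x1_test[:, keep].mean(axis=1)
print('models kept:', keep)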