1. 程式人生 > 實用技巧 > 資料科學技術與應用第五章機器學習建模分析

資料科學技術與應用第五章機器學習建模分析

基於Keras建立深度神經網路模型,在bankpep資料集上訓練神經網路分類模型,將訓練模型的耗時以及模型效能,與XGBoost、SVM、樸素貝葉斯等方法進行比較。

"""Compare training time and test performance of several classifiers on the
bankpep data set: decision tree, three naive-Bayes variants, SVM, gradient
boosting, XGBoost, random forest, and a Keras dense neural network."""

import datetime

import numpy
import pandas
import xgboost
import matplotlib.pyplot as plt
from sklearn import metrics, model_selection, naive_bayes, preprocessing, svm, tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.utils import np_utils
# Retained from the original script; only needed for the optional graphviz
# export of the decision-tree diagram (see note in main()).
from graphviz import Source
from IPython.display import Image


def load_bankpep(path='data/bankpep.csv'):
    """Read and encode bankpep.csv; return x_train, x_test, y_train, y_test.

    Adjust *path* to wherever bankpep.csv is stored.  YES/NO columns and sex
    become 0/1; region and children are one-hot encoded.
    """
    df = pandas.read_csv(path, index_col='id')
    # Binary YES/NO columns -> 1/0.
    for feature in ['married', 'car', 'save_act', 'current_act', 'mortgage', 'pep']:
        df.loc[df[feature] == 'YES', feature] = 1
        df.loc[df[feature] == 'NO', feature] = 0
    df.loc[df['sex'] == 'FEMALE', 'sex'] = 1
    df.loc[df['sex'] == 'MALE', 'sex'] = 0
    # One-hot encode the multi-valued categorical columns.
    dumm_region = pandas.get_dummies(df['region'], prefix='region')
    dumm_child = pandas.get_dummies(df['children'], prefix='children')
    df = df.drop(['region', 'children'], axis=1)
    df = df.join([dumm_region, dumm_child], how='outer')
    x = df.drop(['pep'], axis=1).values.astype(float)
    # x = preprocessing.scale(x)  # NOTE: scaling would help SVM but would
    #                             # break MultinomialNB (negative features)
    y = df['pep'].values.astype(int)
    return model_selection.train_test_split(x, y, test_size=0.2, random_state=1)


def evaluate_classifier(name, clf, x_train, x_test, y_train, y_test):
    """Fit *clf* and print train/test accuracy, report, and confusion matrix.

    Returns (test_accuracy, elapsed) where elapsed is a datetime.timedelta
    covering fit + prediction + reporting, matching the original timing scope.
    """
    print(name)
    start = datetime.datetime.now()
    clf.fit(x_train, y_train)
    for tag, xs, ys in (('train', x_train, y_train), ('test', x_test, y_test)):
        pred = clf.predict(xs)
        print(f'{tag}_{name}')
        print(clf.score(xs, ys))
        print(metrics.classification_report(ys, pred))
        print(metrics.confusion_matrix(ys, pred))
    score = clf.score(x_test, y_test)
    elapsed = datetime.datetime.now() - start
    print("time:", elapsed)
    return score, elapsed


def evaluate_keras(x_train, x_test, y_train, y_test):
    """Train a small dense network; return (test_loss, test_accuracy, elapsed)."""
    print('Keras')
    start = datetime.datetime.now()
    net = Sequential()
    # Input width follows the actual feature count instead of a hard-coded 16,
    # so the network still works if the encoding produces a different width.
    net.add(Dense(units=16, input_shape=(x_train.shape[1],)))
    net.add(Activation('relu'))
    net.add(Dense(100))
    net.add(Activation('relu'))
    net.add(Dense(2))
    net.add(Activation('softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=['binary_accuracy'])
    # Two-class one-hot targets for the softmax output layer.
    y_train_ohe = np_utils.to_categorical(y_train, 2)
    y_test_ohe = np_utils.to_categorical(y_test, 2)
    net.fit(x_train, y_train_ohe, epochs=25, batch_size=1, verbose=2,
            validation_data=(x_test, y_test_ohe))
    loss, accuracy = net.evaluate(x_test, y_test_ohe)
    print(loss, accuracy)
    elapsed = datetime.datetime.now() - start
    print("time:", elapsed)
    return loss, accuracy, elapsed


def main():
    """Run every model on bankpep and plot accuracy/time per classifier."""
    x_train, x_test, y_train, y_test = load_bankpep()
    # Classifiers to compare, in the original report order.
    classifiers = [
        ('tree', tree.DecisionTreeClassifier()),
        ('MultinomialNB', naive_bayes.MultinomialNB()),
        ('GaussianNB', naive_bayes.GaussianNB()),
        ('BernoulliNB', naive_bayes.BernoulliNB()),
        ('SVM', svm.SVC()),
        ('GBM', GradientBoostingClassifier()),
        ('XGBoost', xgboost.XGBClassifier()),
        ('RFC', RandomForestClassifier()),
    ]
    # name -> (test accuracy, elapsed); replaces the fragile eval() lookups
    # of the original (`eval(i+"_score")`, `eval("time_"+i)`).
    results = {
        name: evaluate_classifier(name, clf, x_train, x_test, y_train, y_test)
        for name, clf in classifiers
    }
    # Optional: export the decision-tree diagram with graphviz, e.g.
    #   graph = Source(tree.export_graphviz(fitted_tree, out_file=None,
    #                                       feature_names=..., class_names=...))
    #   open('mooc_5.2_tree.png', 'wb').write(graph.pipe(format='png'))
    loss, accuracy, time_keras = evaluate_keras(x_train, x_test, y_train, y_test)

    datas = [[score, elapsed.total_seconds()] for score, elapsed in results.values()]
    df_results = pandas.DataFrame(datas, columns=['Score', 'Time'],
                                  index=list(results))
    print(df_results)
    print('Keras', loss, accuracy, time_keras.total_seconds())
    df_results.plot()
    plt.grid()
    plt.show()


if __name__ == '__main__':
    main()

輸出結果:

                  Score      Time
tree           0.775000  0.081810
MultinomialNB  0.666667  0.009974
GaussianNB     0.700000  0.008011
BernoulliNB    0.741667  0.009941
SVM            0.566667  0.027959
GBM            0.825000  0.100698
XGBoost        0.816667  0.153870
RFC            0.833333  0.282304
Keras 0.6881586909294128 0.550000011920929 13.049028