用交叉驗證調整KNN模型的引數
阿新 • • 發佈:2019-02-03
"""Tune the ``k`` of a KNN classifier with 10-fold cross-validation.

The script loads a CSV file, drops rows with missing values, encodes the
"male"/"female" strings as 1/0, standardizes the features, searches
k = 1..30 by cross-validated accuracy on the training split, plots the
scores, and finally reports the accuracy of the best model on the
held-out test split.
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


def inspect_data(file_root):
    """Load a CSV file and print a short overview of it.

    Args:
        file_root: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    dataframe = pd.read_csv(file_root)
    print("資料基本資訊:")
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in print() would emit an extra "None" line.
    dataframe.info()
    print("資料有%i行,%i列" % (dataframe.shape[0], dataframe.shape[1]))
    print("資料預覽:")
    print(dataframe.head())
    return dataframe


def processing_missing_data(dataframe):
    """Drop every row that contains at least one missing value.

    Args:
        dataframe: The ``pandas.DataFrame`` to clean.

    Returns:
        The same object when it has no missing values, otherwise a new
        DataFrame without the incomplete rows.
    """
    if dataframe.isnull().values.any():
        # Alternative strategy: dataframe.fillna(0) instead of dropping rows.
        dataframe = dataframe.dropna()
    return dataframe


# Load the data.
dataframe = pd.read_csv("H:/pythonfigure/voice.csv")

# Handle missing data.
dataframe = processing_missing_data(dataframe)

# Encode the string labels as integers.
dataframe = dataframe.replace({"male": 1, "female": 0})

# Features are all columns but the last; the target is the last column.
# ``.ix`` was removed in pandas 1.0, so positional ``.iloc`` is used instead.
x = dataframe.iloc[:, :-1]
# Cast explicitly: newer pandas no longer downcasts an object column to int
# after ``replace``, and the classifier needs a proper discrete label dtype.
y = dataframe.iloc[:, -1].astype(int)

# Standardize the features (zero mean, unit variance per column).
# NOTE(review): the scaling statistics are computed on the full dataset before
# the split below, so the test rows influence the training features. Consider
# fitting a scaler on the training split only (e.g. inside a Pipeline).
x = preprocessing.scale(x)

# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=1 / 3., random_state=5
)

# Cross-validate every candidate k on the training split.
k_range = range(1, 31)
cv_score = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, x_train, y_train, cv=10, scoring="accuracy")
    score_mean = scores.mean()
    cv_score.append(score_mean)
    print(k, score_mean)

# Map the position of the best score back to its k through k_range itself, so
# the result stays correct if the range's start or step is ever changed.
best_k = k_range[int(np.argmax(cv_score))]
print("最優的k是%i" % (best_k))

plt.plot(k_range, cv_score)
plt.xlabel("k")
plt.ylabel("score")
plt.show()

# Train the final model with the best k and evaluate it on the test split.
knn_model = KNeighborsClassifier(n_neighbors=best_k)
knn_model.fit(x_train, y_train)
print("模型準確率:", knn_model.score(x_test, y_test))