02 貝葉斯演算法 - 案例一 - 鳶尾花資料分類
阿新 • • 發佈:2018-12-19
常規操作:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.naive_bayes import GaussianNB, MultinomialNB  # Gaussian and multinomial naive Bayes
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Font settings so that Chinese labels (and the minus sign) are not rendered
# as garbled characters in the figures.
mpl.rcParams['font.sans-serif'] = [u'SimHei']
mpl.rcParams['axes.unicode_minus'] = False

# Feature names in English and Chinese, indexed by column position:
# sepal length, sepal width, petal length, petal width.
iris_feature_E = 'sepal length', 'sepal width', 'petal length', 'petal width'
iris_feature_C = u'花萼長度', u'花萼寬度', u'花瓣長度', u'花瓣寬度'
iris_class = 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'

# Column indices of the two features used for modelling and plotting.
features = [2, 3]

# Load the data: columns 0-3 are read as features, column 4 as the class label.
path = './datas/iris.data'  # path to the data file
data = pd.read_csv(path, header=None)
x = data[list(range(4))]
x = x[features]
y = pd.Categorical(data[4]).codes  # encode the class labels as integer codes (0, 1, 2)
print("總樣本數目:%d;特徵屬性數目:%d" % x.shape)
總樣本數目:150;特徵屬性數目:2
資料分割,形成模型訓練資料和測試資料
# Split the samples into a training part (80% of the rows) and a test part.
# random_state is fixed so the split is the same on every run.
x_train1, x_test1, y_train1, y_test1 = train_test_split(
    x,
    y,
    train_size=0.8,
    random_state=14,
)
# Working aliases; they refer to the very same objects as the *1 names.
x_train, y_train = x_train1, y_train1
x_test, y_test = x_test1, y_test1
print("訓練資料集樣本數目:%d, 測試資料集樣本數目:%d" % (len(x_train), len(x_test)))
訓練資料集樣本數目:120, 測試資料集樣本數目:30
高斯貝葉斯模型構建
# Gaussian naive Bayes classifier behind two preprocessing steps.
clf = Pipeline(steps=[
    # Standardize each feature (zero mean, unit variance).
    ('sc', StandardScaler()),
    ('poly', PolynomialFeatures(degree=1)),
    # Note: MultinomialNB would require non-negative feature values.
    ('clf', GaussianNB()),
])
# Fit the whole pipeline on the training split.
clf.fit(x_train, y_train)
Pipeline(memory=None,
steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('clf', GaussianNB(priors=None))])
計算預測值並計算準確率
# Predict on both splits first, then report each accuracy as a percentage.
y_train_hat = clf.predict(x_train)
y_test_hat = clf.predict(x_test)
print('訓練集準確度: %.2f%%' % (accuracy_score(y_train, y_train_hat) * 100))
print('測試集準確度:%.2f%%' % (accuracy_score(y_test, y_test_hat) * 100))
訓練集準確度: 95.83%
測試集準確度:96.67%
產生區域圖
# Build a dense grid over the two-feature plane and classify every grid point,
# so that the decision regions can be drawn as a coloured background.
N, M = 500, 500  # number of grid samples along the horizontal / vertical axis

# Bounding box that covers the training and the test samples together
# (element-wise min/max of the per-column extremes of both splits).
x1_min, x2_min = np.minimum(x_train.min(), x_test.min())
x1_max, x2_max = np.maximum(x_train.max(), x_test.max())

t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)  # M controls the vertical resolution
x1, x2 = np.meshgrid(t1, t2)  # grid sample points
x_show = np.column_stack((x1.ravel(), x2.ravel()))  # one (x1, x2) row per grid point

# Light colours for the predicted regions, dark colours for the samples.
cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])

# Predicted class for every grid point, reshaped back to the grid layout.
y_show_hat = clf.predict(x_show).reshape(x1.shape)
畫圖
# Draw the predicted regions as background, then the training samples
# (default marker) and the test samples (triangles) on top.
plt.figure(facecolor='w')
# vmin/vmax pin the colour scale to the full range of class codes (one colour
# per class), so a layer in which some class happens to be absent is still
# coloured consistently with the other layers.
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light, vmin=0, vmax=cm_light.N - 1)
plt.scatter(x_train[features[0]], x_train[features[1]], c=y_train, edgecolors='k', s=50,
            cmap=cm_dark, vmin=0, vmax=cm_dark.N - 1)
plt.scatter(x_test[features[0]], x_test[features[1]], c=y_test, marker='^', edgecolors='k', s=120,
            cmap=cm_dark, vmin=0, vmax=cm_dark.N - 1)
plt.xlabel(iris_feature_C[features[0]], fontsize=13)
plt.ylabel(iris_feature_C[features[1]], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
# The title reports the test-set accuracy as a percentage.
plt.title(u'GaussianNB對鳶尾花資料的分類結果, 正確率:%.3f%%' % (100 * accuracy_score(y_test, y_test_hat)), fontsize=18)
plt.grid(True)
plt.show()