
Machine Learning: A Naive Bayes Classifier for Binary Classification (Bernoulli Model), with Code and a Hands-on Project

1. Building the Naive Bayes Classifier
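Before reading the code, it helps to write down the model the class implements. For binary features x^(j) ∈ {0, 1}, the Bernoulli Naive Bayes classifier picks the class with the largest log posterior, and estimates both probabilities with Laplace smoothing. The following formulas simply restate what the code below computes:

\hat{y} = \arg\max_{c_k} \Big[ \log P(y = c_k) + \sum_{j=1}^{n} \log P\big(x^{(j)} \mid y = c_k\big) \Big]

P(y = c_k) = \frac{N_{c_k} + \alpha}{N + K\alpha}, \qquad
P\big(x^{(j)} = 1 \mid y = c_k\big) = \frac{N_{c_k}^{(j)} + \alpha}{N_{c_k} + 2\alpha}

where N is the number of training samples, K the number of classes, N_{c_k} the number of samples in class c_k, and N_{c_k}^{(j)} the number of class-c_k samples whose j-th feature equals 1.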

import numpy as np

class BernoulliNaiveBayes:

    def __init__(self, alpha=1.):
        # Smoothing coefficient, defaults to 1 (Laplace smoothing)
        self.alpha = alpha

    def _class_prior_proba_log(self, y, classes):
        '''Compute the log prior probability log P(y=c_k) for every class.'''
        # Count the number of samples in each class
        c_count = np.count_nonzero(y == classes[:, None], axis=1)
        # Prior probability of each class (with Laplace smoothing)
        p = (c_count + self.alpha) / (len(y) + len(classes) * self.alpha)
        return np.log(p)

    def _conditional_proba_log(self, X, y, classes):
        '''Compute the log conditional probabilities log P(x^(j)|y=c_k).'''
        _, n = X.shape
        K = len(classes)
        # P_log holds the logs of the two conditional probabilities:
        # P_log[0] stores all log(P(x^(j)=0|y=c_k))
        # P_log[1] stores all log(P(x^(j)=1|y=c_k))
        P_log = np.empty((2, K, n))
        # Iterate over every class c_k
        for k, c in enumerate(classes):
            # Select the instances belonging to class c_k
            X_c = X[y == c]
            # Count, per feature, the instances whose feature value is 1
            count1 = np.count_nonzero(X_c, axis=0)
            # Conditional probability P(x^(j)=1|y=c_k) (with Laplace smoothing)
            p1 = (count1 + self.alpha) / (len(X_c) + 2 * self.alpha)
            # Store log(P(x^(j)=0|y=c_k)) and log(P(x^(j)=1|y=c_k))
            P_log[0, k] = np.log(1 - p1)
            P_log[1, k] = np.log(p1)
        return P_log

    def train(self, X_train, y_train):
        '''Train the model.'''
        # Collect all class labels
        self.classes = np.unique(y_train)
        # Compute and store the logs of all prior probabilities
        self.pp_log = self._class_prior_proba_log(y_train, self.classes)
        # Compute and store the logs of all conditional probabilities
        self.cp_log = self._conditional_proba_log(X_train, y_train, self.classes)

    def _predict(self, x):
        '''Predict the class of a single instance.'''
        K = len(self.classes)
        p_log = np.empty(K)
        # Boolean indices of the features whose value is 1 and 0, respectively
        idx1 = x == 1
        idx0 = ~idx1
        # Iterate over every class c_k
        for k in range(K):
            # Log of the numerator of the posterior probability P(c_k|x)
            p_log[k] = self.pp_log[k] + np.sum(self.cp_log[0, k][idx0]) \
                       + np.sum(self.cp_log[1, k][idx1])
        # Return the class with the largest posterior probability
        # (indexing into self.classes, so non-0/1 labels also work)
        return self.classes[np.argmax(p_log)]

    def predict(self, X):
        '''Predict the class of every instance in X.'''
        # Apply _predict to each row of X and collect the results
        return np.apply_along_axis(self._predict, axis=1, arr=X)
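As a quick sanity check before moving to real data, the classifier can be exercised on a tiny hand-crafted dataset. The toy arrays below are made up for illustration; in the first class feature 0 is always on, in the second it is always off, so the expected predictions are easy to verify by hand:

import numpy as np

# Hypothetical toy data: 6 instances, 3 binary features, 2 classes
X_toy = np.array([[1, 0, 1],
                  [1, 1, 1],
                  [1, 0, 0],
                  [0, 1, 0],
                  [0, 0, 1],
                  [0, 1, 1]])
y_toy = np.array([1, 1, 1, 0, 0, 0])

clf = BernoulliNaiveBayes()
clf.train(X_toy, y_toy)
print(clf.predict(np.array([[1, 0, 1], [0, 1, 0]])))  # expected: [1 0]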

2. Obtaining the Dataset

http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/
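The file used below is spambase.data from that directory. If you prefer to fetch it programmatically, here is a minimal sketch using Python's standard library; the destination path is an assumption, so adjust it to your environment:

import urllib.request

# Direct link to the raw data file in the UCI repository
url = ('http://archive.ics.uci.edu/ml/machine-learning-databases/'
       'spambase/spambase.data')
# Hypothetical local destination; adjust to match the path used below
urllib.request.urlretrieve(url, 'spambase.data')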

3. Loading and Transforming the Data

import numpy as np

# Load the Spambase data (comma-separated values)
data = np.loadtxt('F:/python_test/data/spambase.data', delimiter=',')
print(data)

# The first 48 columns are word-frequency features
X = data[:, :48]
# Binarize: any positive frequency becomes 1, zero stays 0
X = np.where(X > 0, 1, 0)
print(X)

# The last column is the label (1 = spam, 0 = not spam)
y = data[:, -1].astype('int')
print(y)
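The np.where step is what makes the data "Bernoulli": each of the 48 word-frequency features is collapsed to a present/absent indicator. A tiny standalone example (the array below is made up):

import numpy as np

freqs = np.array([[0.0, 0.21, 0.0],
                  [1.5, 0.0, 0.07]])
# Any positive frequency becomes 1 (word present); zero stays 0 (absent)
print(np.where(freqs > 0, 1, 0))
# [[0 1 0]
#  [1 0 1]]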

4. Model Fitting, Prediction, and Accuracy

Single training run

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 30% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

clf = BernoulliNaiveBayes()
clf.train(X_train, y_train)

y_pred = clf.predict(X_test)
print(y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
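As an optional cross-check (not part of the original walkthrough), scikit-learn's built-in BernoulliNB, trained on the same split with the same smoothing coefficient, should report a comparable accuracy:

from sklearn.naive_bayes import BernoulliNB

sk_clf = BernoulliNB(alpha=1.0)
sk_clf.fit(X_train, y_train)
print(accuracy_score(y_test, sk_clf.predict(X_test)))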

Training repeatedly on different random splits barely changes the accuracy, which suggests that the Naive Bayes classifier needs only a small number of samples to learn most of the feature distribution.

def test(X, y, test_size, N):
    '''Average accuracy over N random train/test splits.'''
    acc = np.empty(N)
    for i in range(N):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
        clf = BernoulliNaiveBayes()
        clf.train(X_train, y_train)
        y_pred = clf.predict(X_test)
        acc[i] = accuracy_score(y_test, y_pred)
    return np.mean(acc)

# Test-set fractions from 0.3 to 0.9
sizes = np.arange(0.3, 1, 0.1)
print(sizes)
acc = [test(X, y, test_size, 100) for test_size in sizes]
print(acc)

import matplotlib.pyplot as plt

plt.plot(sizes, acc, linestyle='--', color='red')
plt.ylim([0.87, 0.88])
plt.xlabel('test_size / (test_size + train_size)')
plt.ylabel('accuracy')
plt.title('Accuracy trend')
plt.show()