1. 程式人生 > 機器學習演算法之樸素貝葉斯

機器學習演算法之樸素貝葉斯

樸素貝葉斯法是基於貝葉斯定理與特徵條件獨立假設的分類方法。

演算法的核心思想就是比較概率的大小,認定概率大的類別為所屬類別

下面是公式推導

根據貝葉斯定理,後驗概率為 $P(c \mid x) = \dfrac{P(x \mid c)\,P(c)}{P(x)}$;
在特徵條件獨立假設下,$P(x \mid c) = \prod_{i} P(x_i \mid c)$,
因此分類決策為 $\hat{c} = \arg\max_{c} P(c)\prod_{i} P(x_i \mid c)$。
下面是樸素貝葉斯的python程式碼實現

import numpy as np
from functools import reduce

def loadDataSet():
    """
    Build the toy training corpus.

    :return: postingList: list of tokenized posts,
             classVec: class labels (1 = abusive, 0 = not abusive)
    """
    posts = [
        ['my', 'dog', 'has', 'flea', 'probelms', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    labels = [0, 1, 0, 1, 0, 1]
    return posts, labels

def setOfWords2Vec(vocabList, inputSet):
    """
    Vectorize a token list against the vocabulary (set-of-words model).

    :param vocabList: vocabulary list
    :param inputSet: list of tokens from one document
    :return: 0/1 document vector, one slot per vocabulary word
    """
    # One slot per vocabulary entry, all zero until a token is seen.
    vec = [0] * len(vocabList)
    for token in inputSet:
        try:
            vec[vocabList.index(token)] = 1
        except ValueError:
            # Token is outside the vocabulary; warn and skip it.
            print("the word:%s is not in my Vocabulary!" % token)
    return vec

def createVocabList(dataSet):
    """
    Collect the set of unique tokens across all documents.

    :param dataSet: list of tokenized documents
    :return: list of distinct vocabulary words
    """
    vocab = set()
    for doc in dataSet:
        vocab.update(doc)  # merge this document's tokens into the set
    return list(vocab)

def trainNB0(trainMatrix, trainCategory):
    """
    Train the naive Bayes classifier with Laplace smoothing.

    :param trainMatrix: list of 0/1 word vectors, one per document
    :param trainCategory: class labels (1 = abusive, 0 = not abusive)
    :return: p0Vect - log conditional probabilities log P(w|class 0)
             p1Vect - log conditional probabilities log P(w|class 1)
             pAbusive - prior probability P(class 1)
    """
    numTrainDocs = len(trainMatrix)       # number of training documents
    numWords = len(trainMatrix[0])        # vocabulary size
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: start word counts at 1 and denominators at 2 so
    # no conditional probability is ever exactly zero.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:         # accumulate abusive-class counts
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:                             # accumulate non-abusive counts
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Return LOG probabilities: classifyNB sums these element-wise and adds
    # the log prior (log(A*B) = log A + log B), which both matches that
    # contract and prevents floating-point underflow on long documents.
    # The original returned raw probabilities here, which made classifyNB's
    # log-space comparison incorrect.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive

def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """
    Classify a word vector with the trained naive Bayes model.

    :param vec2Classify: 0/1 word vector of the document to classify
    :param p0Vec: log conditional probability vector for class 0 (not abusive)
    :param p1Vec: log conditional probability vector for class 1 (abusive)
    :param pClass1: prior probability of class 1 (abusive)
    :return: 1 - abusive, 0 - not abusive
    """
    # Work in log space to avoid underflow: the product of per-word
    # probabilities becomes a sum of log terms plus the log prior.
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    print("p1:", p1)
    print("p0:", p0)  # original mislabeled this score as "p2:"
    # The class with the larger log posterior wins.
    if p1 > p0:
        return 1
    else:
        return 0

def testingNB():
    """
    Smoke-test the naive Bayes classifier on the toy corpus.

    :return: None
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    # Vectorize every training document against the vocabulary.
    trainMat = [setOfWords2Vec(myVocabList, doc) for doc in listOPosts]
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)  # train the model
    # Classify two hand-picked test documents and report the verdict.
    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
        if classifyNB(thisDoc, p0V, p1V, pAb):
            print(testEntry, "屬於侮辱類")
        else:
            print(testEntry, "不屬於侮辱類")

# Run the demo only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    testingNB()

執行結果