機器學習演算法之樸素貝葉斯
阿新 • 發佈:2019-02-11
樸素貝葉斯法是基於貝葉斯定理與特徵條件獨立假設的分類方法。
演算法的核心思想就是比較概率的大小,認定概率大的類別為所屬類別
下面是公式推導
下面是樸素貝葉斯的python程式碼實現
import numpy as np
from functools import reduce


def loadDataSet():
    """Create a toy training set.

    Returns:
        postingList: list of tokenized documents (lists of words).
        classVec: class labels for each document, 1 = abusive, 0 = not abusive.
    """
    postingList = [['my', 'dog', 'has', 'flea', 'probelms', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def setOfWords2Vec(vocabList, inputSet):
    """Convert a token list into a set-of-words vector over vocabList.

    :param vocabList: vocabulary (list of unique words)
    :param inputSet: tokenized document
    :return: 0/1 document vector, 1 marking that the vocabulary word occurs
    """
    returnVec = [0] * len(vocabList)  # one slot per vocabulary word, all 0
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word:%s is not in my Vocabulary!" % word)
    return returnVec


def createVocabList(dataSet):
    """Build the vocabulary: the set of unique words across all documents.

    :param dataSet: list of tokenized documents
    :return: list of unique words (order is unspecified, from set union)
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet |= set(document)  # union with this document's words
    return list(vocabSet)


def trainNB0(trainMatrix, trainCategory):
    """Train the naive Bayes classifier.

    :param trainMatrix: list of 0/1 document vectors (one per document)
    :param trainCategory: class label per document (1 = abusive, 0 = not)
    :return: (p0Vect, p1Vect, pAbusive) where
             p0Vect - log conditional word probabilities for class 0,
             p1Vect - log conditional word probabilities for class 1,
             pAbusive - prior probability that a document is abusive
    """
    numTrainDocs = len(trainMatrix)       # number of training documents
    numWords = len(trainMatrix[0])        # vocabulary size
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # Laplace smoothing: word counts start at 1, denominators at 2,
    # so unseen words never yield a zero probability.
    p0Num = np.ones(numWords)
    p1Num = np.ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:  # accumulate counts for the abusive class
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:                      # accumulate counts for the non-abusive class
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # BUG FIX: take the log here. classifyNB sums these vectors together with
    # np.log(pClass1), which is only valid for log-probabilities; raw
    # probabilities would both be mathematically wrong and underflow when
    # many small factors are multiplied.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one document vector with the trained naive Bayes model.

    :param vec2Classify: 0/1 word vector of the document to classify
    :param p0Vec: log conditional probabilities for class 0 (not abusive)
    :param p1Vec: log conditional probabilities for class 1 (abusive)
    :param pClass1: prior probability of the abusive class
    :return: 1 if classified abusive, 0 otherwise
    """
    # log P(c|w) ∝ sum of log word likelihoods + log prior (log a*b = log a + log b)
    p1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    print("p1:", p1)
    print("p0:", p0)  # was mislabeled "p2:" in the original
    if p1 > p0:
        return 1
    else:
        return 0


def testingNB():
    """Smoke-test the classifier on two hand-picked sentences.

    :return: None
    """
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList, postinDoc))
    p0V, p1V, pAb = trainNB0(trainMat, listClasses)  # train
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))  # vectorize sample
    if classifyNB(thisDoc, p0V, p1V, pAb):
        print(testEntry, "屬於侮辱類")
    else:
        print(testEntry, "不屬於侮辱類")
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))  # vectorize sample
    if classifyNB(thisDoc, p0V, p1V, pAb):
        print(testEntry, "屬於侮辱類")
    else:
        print(testEntry, "不屬於侮辱類")


if __name__ == '__main__':
    testingNB()
執行結果