機器學習實戰——文字分類
阿新 • • 發佈:2018-12-26
樸素貝葉斯
優點:在資料較少的情況下仍然有效,可以處理多類別問題。
缺點:對輸入資料的準備方式較為敏感。
適用資料型別:標稱型資料
核心思想:選擇高概率對應的類別。
條件概率:
程式碼:
from numpy import * def loadDataSet(): postingList=[['my','dog','has','flea',\ 'probelms','help','please'], ['maybe','not','take','him',\ 'to','dog','park','stupid'], ['my','dalmation','is','so','cute',\ 'I','love','him'], ['stop','posting','stupid','worthless','garbage'], ['mr','licks','ate','my','steak','how',\ 'to','stop','him'], ['quit','buying','worthless','dog','food','stupid']] classVec=[0,1,0,1,0,1] #0正常,1侮辱,對應postingList的屬性 return postingList,classVec def createVocabList(dataSet): vocabSet=set([]) #建立一個空集 for document in dataSet: vocabSet=vocabSet|set(document) #建立兩個集合的並集 return list(vocabSet) def setOfWords2Vec(vocabList,inputSet): #vocabList詞彙表,inputSet輸入的文件 returnVec=[0]*len(vocabList) for word in inputSet: if word in vocabList: returnVec[vocabList.index(word)]+=1 else: print("the word: %s is not in my Vocabulary" % word) return returnVec def trainNB0(trainMatrix,trainCategory): numTrainDocs=len(trainMatrix) numWords=len(trainMatrix[0]) pAbusive=sum(trainCategory)/float(numTrainDocs) #侮辱性文件的概率 p0Num=ones(numWords);p1Num=ones(numWords) p0Denom=2.0;p1Denom=2.0 for i in range(numTrainDocs): if trainCategory[i]==1: p1Num+=trainMatrix[i] p1Denom+=sum(trainMatrix[i]) else: p0Num+=trainMatrix[i] p0Denom+=sum(trainMatrix[i]) p1Vect=log(p1Num/p1Denom) p0Vect=log(p0Num/p0Denom) return p0Vect,p1Vect,pAbusive def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1): p1=sum(vec2Classify*p1Vec)+log(pClass1) #計算相乘後的概率 p0=sum(vec2Classify*p0Vec)+log(1.0-pClass1) if p1>p0: return 1 else: return 0 def testingNB(): listOPosts,listClasses=loadDataSet() myVocabList=createVocabList(listOPosts) trainMat=[] for postoinDoc in listOPosts: trainMat.append(setOfWords2Vec(myVocabList,postoinDoc)) p0V,p1V,pAb=trainNB0(array(trainMat),array(listClasses)) testEntry=['love','my','dalmation'] thisDoc=array(setOfWords2Vec(myVocabList,testEntry)) print(testEntry,'classified as:',classifyNB(thisDoc,p0V,p1V,pAb)) testEntry=['stupid','garbage'] thisDoc = array(setOfWords2Vec(myVocabList, testEntry)) print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb)) if __name__ == '__main__': testingNB() # listOPosts,listClasses=loadDataSet() # myVocabList=createVocabList(listOPosts) #單詞列表 # trainMat=[] # for postoinDoc in listOPosts: # trainMat.append(setOfWords2Vec(myVocabList,postoinDoc)) #句子分詞後的單詞在單詞列表中出現的矩陣 # # print(trainMat) # p0V,p1V,pAb=trainNB0(trainMat,listClasses) # print(p0V) # print(p1V) # print(pAb) # print(myVocabList) # print(setOfWords2Vec(myVocabList,listOPosts[0])) # print(setOfWords2Vec(myVocabList, listOPosts[3]))