《機器學習實戰》基於樸素貝葉斯分類演算法構建文字分類器的Python實現
阿新 • • 發佈:2019-01-01
Python程式碼實現:
# encoding: utf-8
"""Naive Bayes text classifier for a tiny abusive / normal post corpus."""
# The star import is kept so the module namespace stays as callers know it;
# this code relies on it for ``asarray``, ``log`` and ``ones``.
from numpy import *


def loadDataSet():
    """Return the built-in toy corpus and its labels.

    Returns:
        A ``(postingList, classVec)`` pair: six tokenised posts and a
        parallel list of labels where 1 means abusive and 0 means normal.
    """
    postingList = [
        ['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
        ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
        ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
        ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
        ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
        ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid'],
    ]
    classVec = [0, 1, 0, 1, 0, 1]  # 1 = abusive, 0 = normal
    return postingList, classVec


def createVocabList(dataSet):
    """Return a list of the unique words found across all documents.

    Args:
        dataSet: iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding every distinct token once. The order is whatever the
        underlying set yields, so it is not guaranteed to be stable between
        interpreter runs.
    """
    vocabSet = set()
    for document in dataSet:
        vocabSet.update(document)  # in-place union with this document
    return list(vocabSet)


def _wordIndex(vocabList):
    """Map each word to the position of its first occurrence in vocabList."""
    index = {}
    for position, word in enumerate(vocabList):
        # setdefault keeps the first position, matching list.index().
        index.setdefault(word, position)
    return index


def setOfWords2Vec(vocabList, inputSet):
    """Convert a document to a set-of-words (presence / absence) vector.

    Args:
        vocabList: list of vocabulary words.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list as long as ``vocabList`` with 1 where the word occurs in the
        document and 0 elsewhere. A message is printed for every token that
        is not in the vocabulary.
    """
    index = _wordIndex(vocabList)
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in index:
            returnVec[index[word]] = 1
        else:
            print("the word:%s is not in my Vocabulary" % word)
    return returnVec


def bagOfWords2VecMN(vocabList, inputSet):
    """Convert a document to a bag-of-words (occurrence count) vector.

    Args:
        vocabList: list of vocabulary words.
        inputSet: iterable of tokens making up the document.

    Returns:
        A list as long as ``vocabList`` holding how many times each word
        occurs in the document. Tokens outside the vocabulary are ignored.
    """
    index = _wordIndex(vocabList)  # one dict lookup per token, not a list scan
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in index:
            returnVec[index[word]] += 1
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Estimate the naive Bayes parameters from vectorised documents.

    Args:
        trainMatrix: sequence of equal-length word vectors, one per document.
        trainCategory: sequence of labels parallel to ``trainMatrix``;
            a label equal to 1 is class 1, anything else is class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the per-word log conditional
        probabilities for class 0 and class 1, and the fraction of
        documents labelled 1.
    """
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = asarray(trainCategory).sum() / float(numTrainDocs)

    # Counts start at 1 and denominators at 2 so that a word never seen in a
    # class does not get probability zero (and log(0)).
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0

    for i, row in enumerate(trainMatrix):
        counts = asarray(row)
        if trainCategory[i] == 1:
            p1Num += counts
            p1Denom += counts.sum()
        else:
            p0Num += counts
            p0Denom += counts.sum()

    # Work in log space so the classifier can add terms instead of
    # multiplying many small probabilities.
    p1Vect = log(p1Num / p1Denom)
    p0Vect = log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Classify one word vector with the trained parameters.

    Args:
        vec2Classify: word vector of the document (list or array).
        p0Vec: per-word log probabilities for class 0.
        p1Vec: per-word log probabilities for class 1.
        pClass1: prior probability of class 1.

    Returns:
        1 if the class-1 log score is strictly greater, otherwise 0.
    """
    features = asarray(vec2Classify)
    p1 = (features * p1Vec).sum() + log(pClass1)
    p0 = (features * p0Vec).sum() + log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def testingNB():
    """Train on the built-in corpus and print the class of two sample posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(asarray(trainMat), asarray(listClasses))

    for testEntry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        thisDoc = asarray(setOfWords2Vec(myVocabList, testEntry))
        # Single-argument print() emits the same text on Python 2 and 3.
        print("%s classified as: %s" % (testEntry, classifyNB(thisDoc, p0V, p1V, pAb)))