
Machine Learning in Action: Naive Bayes Classification

Preparing the data: building word vectors from text

Sample data used by the early test functions

def loadDataSet():
    '''Create some experimental sample documents.'''
    postingList = [['my','dog','has','flea','problems','help','please'],
                   ['maybe','not','take','him','to','dog','park','stupid'],
                   ['my','dalmation','is','so','cute','I','love','him'],
                   ['stop','posting','stupid','worthless','garbage'],
                   ['mr','licks','ate','my','steak','how','to','stop','him'],
                   ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]    # 0 = normal post, 1 = abusive
    return postingList,classVec

def createVocabList(dataSet):
    '''Return a list of all the unique words that appear in any document.'''
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)    # union of the two sets
    return list(vocabSet)

Word-list-to-vector conversion function

def setOfWords2Vec(vocabList,inputSet):
    '''Take the vocabulary list and a document; return the document's vector.'''
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: {} is not in my Vocabulary".format(word))
    return returnVec

Testing the above

listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
myVocabList
['so', 'buying', 'please', 'has', 'dalmation', 'my', 'cute', 'quit', 'love',
 'stupid', 'park', 'not', 'how', 'flea', 'problems', 'licks', 'food', 'stop',
 'help', 'him', 'ate', 'maybe', 'take', 'I', 'worthless', 'to', 'steak', 'mr',
 'is', 'garbage', 'posting', 'dog']
setOfWords2Vec(myVocabList,listOPosts[0])
[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
 0, 0, 0, 0, 0, 1]

Training the algorithm: computing probabilities from word vectors

Naive Bayes classifier training function

from numpy import *

def trainNB0(trainMatrix,trainCategory):
    '''Take the document matrix and the vector of per-document class labels.
    Return two vectors of conditional probabilities [element i is P(w_i | c),
    i = 1,2,...,number of words] and the prior probability of class 1.'''
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)   # prior of class 1 (abusive); adjust here for non-binary problems
    #p0Num = zeros(numWords);p1Num = zeros(numWords)
    #p0Denom = 0.0;p1Denom = 0.0           # naive initialization
    p0Num = ones(numWords);p1Num = ones(numWords)
    p0Denom = 2.0;p1Denom = 2.0            # Laplace smoothing: avoids zero probabilities
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    #p1Vect = p1Num/p1Denom
    #p0Vect = p0Num/p0Denom
    p1Vect = log(p1Num/p1Denom)        # take natural logs so products of tiny probabilities don't underflow
    p0Vect = log(p0Num/p0Denom)
    return p0Vect,p1Vect,pAbusive
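
To see why the logarithm matters, here is a minimal sketch (not from the original post) contrasting a direct product of many small probabilities with a sum of their logs:

import math

probs = [1e-5] * 80                        # 80 small independent word probabilities

product = 1.0
for p in probs:
    product *= p                           # the true value 1e-400 underflows float64 to 0.0

logSum = sum(math.log(p) for p in probs)   # stays representable

print(product)   # 0.0
print(logSum)    # about -921.03, still comparable between classes

Because log is monotonic, comparing log scores gives the same decision as comparing the raw products would, without the underflow.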

Testing the above

trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
p0v,p1v,pAb = trainNB0(trainMat,listClasses)
pAb
0.5
p0v
array([-2.56494936, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -1.87180218, -2.56494936, -3.25809654, -2.56494936, -3.25809654,
       -3.25809654, -3.25809654, -2.56494936, -2.56494936, -2.56494936,
       -2.56494936, -3.25809654, -2.56494936, -2.56494936, -2.15948425,
       -2.56494936, -3.25809654, -3.25809654, -2.56494936, -3.25809654,
       -2.56494936, -2.56494936, -2.56494936, -2.56494936, -3.25809654,
       -3.25809654, -2.56494936])
p1v
array([-3.04452244, -2.35137526, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -3.04452244, -2.35137526, -3.04452244, -1.65822808,
       -2.35137526, -2.35137526, -3.04452244, -3.04452244, -3.04452244,
       -3.04452244, -2.35137526, -2.35137526, -3.04452244, -2.35137526,
       -3.04452244, -2.35137526, -2.35137526, -3.04452244, -1.94591015,
       -2.35137526, -3.04452244, -3.04452244, -3.04452244, -2.35137526,
       -2.35137526, -1.94591015])

Testing the algorithm

Naive Bayes classification function

def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    '''Take the vector to classify and the trained parameters; return the class.'''
    # Element-wise multiply, then sum. Adding replaces multiplying because we took
    # logs. Since vec2Classify holds 0s and 1s, the multiplication selects exactly
    # the conditional probabilities (trained parameters) of the words that appear
    # in this document; words that don't appear contribute 0.
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
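
A tiny worked sketch of the comparison classifyNB performs, using made-up numbers rather than the trained parameters:

from numpy import array, log

p0VecDemo = log(array([0.4, 0.1, 0.5]))    # hypothetical P(w_i | class 0)
p1VecDemo = log(array([0.1, 0.6, 0.3]))    # hypothetical P(w_i | class 1)
doc = array([0, 1, 1])                     # document contains words 2 and 3

score0 = sum(doc * p0VecDemo) + log(0.5)   # log P(doc|c0) + log P(c0), about -3.69
score1 = sum(doc * p1VecDemo) + log(0.5)   # about -2.41, larger, so class 1 wins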

Test

testEntry = ['love','my','dalmation']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print("{0} classified as: {1}".format(testEntry,classifyNB(thisDoc,p0v,p1v,pAb)))
testEntry = ['stupid','garbage']
thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
print("{0} classified as: {1}".format(testEntry,classifyNB(thisDoc,p0v,p1v,pAb)))
['love', 'my', 'dalmation'] classified as: 0
['stupid', 'garbage'] classified as: 1

Using the bag-of-words model

The set-of-words model is what setOfWords2Vec() above implements; in the bag-of-words model, each word can appear multiple times (see the short comparison after the function below).

def bagOfWords2VecMN(vocabList,inputSet):
    '''Take the vocabulary list and a document; return the document's vector.'''
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1    # count occurrences instead of marking presence
    return returnVec
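
A quick sketch, using a made-up vocabulary and document, of how the two models differ on repeated words:

vocab = ['dog', 'stupid', 'my']            # toy vocabulary, for illustration only
doc = ['stupid', 'dog', 'stupid']

print(setOfWords2Vec(vocab, doc))          # [1, 1, 0], presence only
print(bagOfWords2VecMN(vocab, doc))        # [1, 2, 0], occurrence counts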

Example 1: filtering spam email with Naive Bayes

Preparing the data: file parsing, building a word list from the text documents

def textParse(bigString):
    '''Parse a text file into a list of lowercase tokens.'''
    import re
    listOfTokens = re.split(r'\W+',bigString)   # split on runs of non-word characters
                                                # (r'\W*' matches empty strings and raises a FutureWarning)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
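
For instance, on a made-up string (not from the original email corpus):

sample = "Hi there!! Mr. X, visit http://example.com ASAP."
print(textParse(sample))
# ['there', 'visit', 'http', 'example', 'com', 'asap']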

Automated spam email testing

def spamTest():
    docList = [];classList = [];fullText = []
    for i in range(1,26):
        # load and parse the text files
        wordList = textParse(open(r'E:\DataMining\Project\MLBook\機器學習實戰原始碼\machinelearninginaction\Ch04\email\spam\{}.txt'.
                                  format(i)).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(r'E:\DataMining\Project\MLBook\機器學習實戰原始碼\machinelearninginaction\Ch04\email\ham\{}.txt'.
                                  format(i),encoding='gb18030',errors='ignore').read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)  # unique words across all documents
    trainingSet = list(range(50));testSet = []
    for i in range(10):
        # randomly build the test set (hold-out cross-validation)
        randomIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randomIndex])
        del(trainingSet[randomIndex])
    trainMat = [];trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0v,p1v,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        # classify the held-out test set
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0v,p1v,pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is {}".format(float(errorCount)/len(testSet)))
    return float(errorCount)/len(testSet)
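
The hard-coded Windows paths only work on the author's machine. A sketch of a more portable variant, assuming (hypothetically) that the Ch04 email corpus sits next to the script:

import os

base = os.path.join('machinelearninginaction', 'Ch04', 'email')

def emailPath(label, i):
    '''Build the path to the i-th email with the given label ('spam' or 'ham').'''
    return os.path.join(base, label, '{}.txt'.format(i))

print(emailPath('spam', 1))   # e.g. machinelearninginaction/Ch04/email/spam/1.txt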

Repeating 10 times and taking the mean error rate

errorPercent = 0.0
for i in range(10):
    errorPercent += spamTest()
print("the average error percent is : {}%".format(errorPercent/10 * 100))
the error rate is 0.1
the error rate is 0.0
the error rate is 0.0
the error rate is 0.2
the error rate is 0.0
the error rate is 0.0
the error rate is 0.1
the error rate is 0.1
the error rate is 0.0
the error rate is 0.0
the average error percent is : 5.0%

Example 2: using a Naive Bayes classifier to tell which region a local post came from

This just exercises the classifier by finding high-frequency words; a serious analysis would also use stop-word removal, part-of-speech analysis, and so on (see the stop-word sketch after localWords below).

Built on feedparser, the RSS feed reading library

def calcMostFreq(vocabList,fullText):
    '''Count how often each vocabulary word occurs in the full text;
       return the 30 most frequent words.'''
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(),key=operator.itemgetter(1),reverse=True)
    return sortedFreq[:30]
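
An equivalent sketch using collections.Counter from the standard library, which makes one counting pass instead of scanning fullText once per vocabulary word:

from collections import Counter

def calcMostFreqCounter(vocabList, fullText):
    '''Same result as calcMostFreq, built in a single pass.'''
    vocab = set(vocabList)                     # O(1) membership tests
    counts = Counter(tok for tok in fullText if tok in vocab)
    return counts.most_common(30)              # list of (word, count) pairs

Ties may come out in a different order than the operator.itemgetter version, but the counts are identical.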

RSS feed classifier function

def localWords(feed1,feed0):
    '''Take two RSS feeds; train and test a classifier on their entries.'''
    import feedparser   # RSS feed reading library
    docList = [];classList = [];fullText = []
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)
    for pairW in top30Words:
        # remove the most frequent words
        if pairW[0] in vocabList:
            vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen));testSet = []
    #print("minLen is : {}".format(minLen))
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        #print("randIndex is : {}".format(randIndex))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = [];trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))   # use the bag-of-words model
        trainClasses.append(classList[docIndex])
    p0v,p1v,pSpam = trainNB0(array(trainMat),array(trainClasses))    # train
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0v,p1v,pSpam) != classList[docIndex]:
            errorCount += 1
    print('the error rate is : {}'.format(float(errorCount)/len(testSet)))
    return vocabList,p0v,p1v
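
As noted above, a serious analysis would filter stop words instead of just deleting the 30 most frequent tokens. A minimal sketch, assuming a small hand-picked stop-word set (a real list would come from a resource such as NLTK):

STOP_WORDS = {'the', 'and', 'for', 'that', 'with', 'this', 'you', 'are'}   # illustrative only

def removeStopWords(vocabList):
    '''Return the vocabulary with stop words filtered out.'''
    return [word for word in vocabList if word not in STOP_WORDS]

Inside localWords, the top-30 removal loop could then be replaced by vocabList = removeStopWords(vocabList).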

Loading the RSS feeds and testing

import feedparser
ny = feedparser.parse('https://newyork.craigslist.org/d/activity-partners/search/act?format=rss')
sf = feedparser.parse('https://sfbay.craigslist.org/d/activity-partners/search/act?format=rss')    # the feeds' contents change constantly
vocabList,pSF,pNY = localWords(ny,sf)
the error rate is : 0.35


Analyzing the data: showing the words most associated with each region's posts (at the time I tested)

def getTopWords(ny,sf):
    '''Train on the two feeds, then print, for each class, every word whose
       conditional log probability exceeds -5.0, sorted most probable first.'''
    import operator
    vocabList,p0v,p1v = localWords(ny,sf)
    topNY = [];topSF = []
    for i in range(len(p0v)):
        if p0v[i] > -5.0:
            topSF.append((vocabList[i],p0v[i]))
        if p1v[i] > -5.0:
            topNY.append((vocabList[i],p1v[i]))
    sortedSF = sorted(topSF,key = lambda pair: pair[1],reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY,key = lambda pair: pair[1],reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
getTopWords(ny,sf)
the error rate is : 0.3
SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**
really
abou
join
years
maybe
whom
one
wood
games
working
hang
fitness
early
two
also
know
june
past
level
could
but
NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**
channel
need
lady
great
our
shorter
make
little
attire
call
attend
youtube
things
participate
area
has
hair
help
got
funds
butterfly
social
vide
extra
submit
shiny
outgoing
brooklyn
there
how
long
etc
new
afternoon
noon
conversation
watching
hurry
walks
29th
youtu
back
does
dinner
moments
seeking
paddy
around
people
number
restaurant
put
couple
singers
weekends
maybe
share
when
must
love
full
name
live
then
5twfhtidasa
videos
humor
crowded
friend
articulate
info
pastime
working
starter
black
sports
show
those
considered
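
The -5.0 cutoff in getTopWords is arbitrary, so the number of words printed varies between runs. A sketch of a variant that instead keeps a fixed number of top words per class (topN is a hypothetical parameter, not from the original post):

def getTopNWords(ny, sf, topN=10):
    '''Print the topN highest-probability words for each class.'''
    vocabList, p0v, p1v = localWords(ny, sf)
    # Pair each word with its log probability and sort descending.
    sortedSF = sorted(zip(vocabList, p0v), key=lambda pair: pair[1], reverse=True)
    sortedNY = sorted(zip(vocabList, p1v), key=lambda pair: pair[1], reverse=True)
    print("SF top words:", [w for w, p in sortedSF[:topN]])
    print("NY top words:", [w for w, p in sortedNY[:topN]])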