1. 程式人生 > >樸素貝葉斯分類numpy版本——深度學習

樸素貝葉斯分類numpy版本——深度學習

#!python3
#這裡使用樸素貝葉斯演算法
#即貝葉斯演算法的簡化版
from numpy import *
 
def loadDataSet():
    postingList=[['my','dog','had','flea',\
                  'problems','help','please'],
                 ['mybe','not','take','him',\
                  'to','dog','park','stupid'],
                 ['my','dalmation','is','so','cute',\
                  'I','love','him'],
                 ['stop','posting','stupid','worthless','garbage'],
                 ['mr','licks','ate','my','steak','how',\
                  'to','stop','him'],
                 ['quit','buying', 'worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]        #1 代表侮辱性文字,0代表正常言論
    return postingList,classVec
 
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)
 
def setOfWords2Vec(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" %word)
    return returnVec
 
#計算每個類別中的文件數目
#對每篇訓練文件:
#   對每個類別:
#       如果詞條出現在文件中————》增加該詞條的計數值
#       增加所有詞條的計數值
#   對每個類別:
#       對每個詞條:
#           將該詞條的數目初一總詞條數得到條件概率
#   返回每個類別的條件概率
 
def trainNB0(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pabusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = zeros(numWords);p1Num = zeros(numWords)
    p0Denom = 0.0;p1Denom = 0.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom
    return p0Vect,p1Vect,pabusive
 
def classifyNB(vec2Classify, p0Vec, p1Vec,pClass1):
    p1 = sum(vec2Classify * p1Vec) + log(pClass1)
    p0 = sum(vec2Classify * p0Vec) + log(1.0-pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
 
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat=[]
    for postingDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNB0(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList,testEntry))
    print (testEntry,'classified as: ',classifyNB(thisDoc,p0V,p1V\
                                                  ,pAb))
#文件詞袋模型
def bagOfWord2VecMN(vocabList, inputSet):
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
                   
#檔案解析及完整的垃圾郵件測試函式
def textParse(bigString):
    import re
    listOfTokens = re.split(r'\w*',bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) >2 2]
 
def spamTest():
    docList=[];classList = [];fullText = []
    for i in range(1,26):
        wordList = textParse(open('email/span/%d.txt'%i).read())
        docList.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt'%i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = range(50);testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = [];trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList, docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNB0(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList, docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) !=\
           classList[docIndex]:
            errorCount += 1
    print ("the error rate is:",float(errorCount)/len(testSet))
         
#Rss源分類器及高頻詞去除函式
def clacMostFreq(vocabList,fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(),key = operator.itemgetter(1),reverse=True)
    return sortedFreq[:30]
 
def localWords(feed1,feed0):
    import feedparser
    docList = [];classList = [];fullText = []
    minLen = min(len(feed1['entries']),len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)