Machine Learning (3): Classification Methods Based on Probability Theory: Naive Bayes
阿新 • Published 2018-12-14
Overview
- Advantages: remains effective with small amounts of data; can handle multi-class problems.
- Disadvantages: sensitive to how the input data is prepared.
- Data type: nominal (categorical) values.
The core idea of Bayesian decision theory: choose the decision with the highest probability.
Classifying with conditional probabilities
For a data point (x, y):
- If p(c1 | x, y) > p(c2 | x, y), then the point belongs to class c1.
- If p(c2 | x, y) > p(c1 | x, y), then the point belongs to class c2.
Using Bayes' rule, p(c | x, y) = p(x, y | c) p(c) / p(x, y), the unknown probability on the left can be computed from the three probability values on the right that are already known.
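To make the decision rule concrete, here is a minimal numeric sketch in Python; the two classes, the probability values, and the helper name bayes_posterior are all made up for illustration:

# A minimal sketch of the Bayes-rule decision, with made-up numbers.
# p(c|x) = p(x|c) * p(c) / p(x); the class with the larger posterior wins.

def bayes_posterior(likelihood, prior, evidence):
    """Combine the three known probabilities into the unknown posterior."""
    return likelihood * prior / evidence

# Hypothetical values: how often feature x appears in each class, and the class priors.
p_x_given_c1, p_c1 = 0.20, 0.70   # p(x|c1), p(c1)
p_x_given_c2, p_c2 = 0.60, 0.30   # p(x|c2), p(c2)
p_x = p_x_given_c1 * p_c1 + p_x_given_c2 * p_c2   # total probability of x

post1 = bayes_posterior(p_x_given_c1, p_c1, p_x)
post2 = bayes_posterior(p_x_given_c2, p_c2, p_x)
print('class 1' if post1 > post2 else 'class 2')  # pick the higher posterior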
Classifying documents with naive Bayes
The two assumptions behind naive Bayes are:
1. The probability of a feature (word) appearing is independent of which words it is adjacent to.
Statistically, if each feature on its own requires N samples to estimate, then a vocabulary of 1000 features would require N^1000 samples. If the features are assumed to be mutually independent, the required number of samples drops from N^1000 to 1000 × N. This is exactly the independence assumption: the probability of one word appearing does not depend on its neighbors (see the sketch after this list).
2. Every feature is equally important.
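Under the independence assumption, the document probability factors into a product of per-word probabilities, p(w1, w2, ..., wn | c) = p(w1 | c) · p(w2 | c) · ... · p(wn | c), which is what lets the sample requirement drop to 1000 × N. A minimal sketch, with made-up word probabilities rather than values estimated from real data:

import math

# Hypothetical per-word conditional probabilities p(word | class) for one class.
p_word_given_class = {'quick': 0.10, 'money': 0.05, 'casino': 0.02}

# Naive Bayes: p(doc | class) = product of p(word | class) over the words in the doc.
doc = ['quick', 'money', 'casino']
log_p = sum(math.log(p_word_given_class[w]) for w in doc)  # summing logs avoids underflow
print(math.exp(log_p))  # ~1e-4: the factored document likelihood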
Code:
from sqlite3 import dbapi2 as sqlite  # pysqlite2 in the original; sqlite3 is its standard-library successor
import re
import math


def getwords(doc):
    splitter = re.compile('\\W+')  # '\\W*' in the original; '+' avoids splitting on empty matches
    print(doc)
    # Split the words by non-alpha characters
    words = [s.lower() for s in splitter.split(doc) if len(s) > 2 and len(s) < 20]
    # Return the unique set of words only
    return dict([(w, 1) for w in words])


class classifier:
    def __init__(self, getfeatures, filename=None):
        # Counts of feature/category combinations
        self.fc = {}
        # Counts of documents in each category
        self.cc = {}
        self.getfeatures = getfeatures

    def setdb(self, dbfile):
        self.con = sqlite.connect(dbfile)
        self.con.execute('create table if not exists fc(feature,category,count)')
        self.con.execute('create table if not exists cc(category,count)')

    def incf(self, f, cat):
        count = self.fcount(f, cat)
        if count == 0:
            self.con.execute("insert into fc values ('%s','%s',1)" % (f, cat))
        else:
            self.con.execute(
                "update fc set count=%d where feature='%s' and category='%s'"
                % (count + 1, f, cat))

    def fcount(self, f, cat):
        res = self.con.execute(
            'select count from fc where feature="%s" and category="%s"'
            % (f, cat)).fetchone()
        if res is None:
            return 0
        else:
            return float(res[0])

    def incc(self, cat):
        count = self.catcount(cat)
        if count == 0:
            self.con.execute("insert into cc values ('%s',1)" % (cat))
        else:
            self.con.execute("update cc set count=%d where category='%s'"
                             % (count + 1, cat))

    def catcount(self, cat):
        res = self.con.execute('select count from cc where category="%s"'
                               % (cat)).fetchone()
        if res is None:
            return 0
        else:
            return float(res[0])

    def categories(self):
        cur = self.con.execute('select category from cc')
        return [d[0] for d in cur]

    def totalcount(self):
        res = self.con.execute('select sum(count) from cc').fetchone()
        if res is None:
            return 0
        return res[0]

    def train(self, item, cat):
        features = self.getfeatures(item)
        # Increment the count for every feature with this category
        for f in features:
            self.incf(f, cat)
        # Increment the count for this category
        self.incc(cat)
        self.con.commit()

    def fprob(self, f, cat):
        if self.catcount(cat) == 0:
            return 0
        # The total number of times this feature appeared in this
        # category divided by the total number of items in this category
        return self.fcount(f, cat) / self.catcount(cat)

    def weightedprob(self, f, cat, prf, weight=1.0, ap=0.5):
        # Calculate current probability
        basicprob = prf(f, cat)
        # Count the number of times this feature has appeared in
        # all categories
        totals = sum([self.fcount(f, c) for c in self.categories()])
        # Calculate the weighted average
        bp = ((weight * ap) + (totals * basicprob)) / (weight + totals)
        return bp


class naivebayes(classifier):
    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.thresholds = {}

    def docprob(self, item, cat):
        features = self.getfeatures(item)
        # Multiply the probabilities of all the features together
        p = 1
        for f in features:
            p *= self.weightedprob(f, cat, self.fprob)
        return p

    def prob(self, item, cat):
        catprob = self.catcount(cat) / self.totalcount()
        docprob = self.docprob(item, cat)
        return docprob * catprob

    def setthreshold(self, cat, t):
        self.thresholds[cat] = t

    def getthreshold(self, cat):
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    def classify(self, item, default=None):
        probs = {}
        # Find the category with the highest probability
        max = 0.0
        for cat in self.categories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > max:
                max = probs[cat]
                best = cat
        # Make sure the probability exceeds threshold*next best
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best


class fisherclassifier(classifier):
    def cprob(self, f, cat):
        # The frequency of this feature in this category
        clf = self.fprob(f, cat)
        if clf == 0:
            return 0
        # The frequency of this feature in all the categories
        freqsum = sum([self.fprob(f, c) for c in self.categories()])
        # The probability is the frequency in this category divided by
        # the overall frequency
        p = clf / (freqsum)
        return p

    def fisherprob(self, item, cat):
        # Multiply all the probabilities together
        p = 1
        features = self.getfeatures(item)
        for f in features:
            p *= (self.weightedprob(f, cat, self.cprob))
        # Take the natural log and multiply by -2
        fscore = -2 * math.log(p)
        # Use the inverse chi2 function to get a probability
        return self.invchi2(fscore, len(features) * 2)

    def invchi2(self, chi, df):
        m = chi / 2.0
        sum = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            sum += term
        return min(sum, 1.0)

    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.minimums = {}

    def setminimum(self, cat, min):
        self.minimums[cat] = min

    def getminimum(self, cat):
        if cat not in self.minimums:
            return 0
        return self.minimums[cat]

    def classify(self, item, default=None):
        # Loop through looking for the best result
        best = default
        max = 0.0
        for c in self.categories():
            p = self.fisherprob(item, c)
            # Make sure it exceeds its minimum
            if p > self.getminimum(c) and p > max:
                best = c
                max = p
        return best


def sampletrain(cl):
    cl.train('Nobody owns the water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at the online casino', 'bad')
    cl.train('the quick brown fox jumps', 'good')
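A short usage sketch, assuming the listing above has already been executed in the same session; the database file name test.db and the default label 'unknown' are placeholders:

# Hypothetical session: train the naive Bayes classifier on the sample data
# defined in sampletrain(), then classify two new phrases.
cl = naivebayes(getwords)
cl.setdb('test.db')        # SQLite file where the counts are persisted
sampletrain(cl)            # the five tiny training documents defined above
print(cl.classify('quick rabbit', default='unknown'))   # 'good' on this training data
print(cl.classify('quick money', default='unknown'))    # 'bad' on this training data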