以logistic Regression為例實現多類別分類及Python實現
阿新 • • 發佈:2019-02-11
1.第一種多分類方法為一對所有(One-versus-all,OVA):每次把其中一類當作正類、其餘各類當作負類,為每一類訓練一個二元分類器。這種方法簡單有效,而且在使用 logistic regression 這類能輸出概率值、概率大小可以互相比較的分類器時,類邊界其實是個有範圍的值,可以增加正確率。而且當K(類別數量)很大時,通過投票的方式解決了一部分不平衡性問題。
可知,最佳情況下準確率有所提高:
參考:http://blog.sina.com.cn/s/blog_5eef0840010147pa.html
# -*- coding: utf-8 -*-
"""One-vs-rest multi-class classification of the Iris data set.

One binary logistic-regression classifier is trained per class (that class
against every other class).  A test sample is assigned to the class whose
classifier gives it the highest score.

``logisticRegression`` and ``sigmoid`` are not defined in this script; they
are presumably provided by the local ``logisticRegression`` module.
"""
from logisticRegression import *
from numpy import *
import operator


def loadDataSet(filename):
    """Read a comma-separated data file into feature rows and labels.

    The last field of every line is the class label; the fields before it
    are parsed as floats.  The number of feature fields is taken from the
    first non-blank line.  Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple.  ``dataMat`` is a list of rows,
        each ``[1, f1, f2, ...]``; ``labelMat`` is the list of label
        strings in file order.

    Raises:
        ValueError: if a feature field cannot be parsed as a float.
        IndexError: if a line has fewer fields than the first one.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Data files often end with an empty line; float('') would
                # otherwise raise here.
                continue
            curLine = line.split(',')
            if numFeat is None:
                numFeat = len(curLine) - 1
            lineArr = [float(curLine[i]) for i in range(numFeat)]
            # The leading 1 is the constant feature x0 (bias term).
            dataMat.append([1] + lineArr)
            labelMat.append(curLine[-1])
    return dataMat, labelMat


def binaryLabels(labels, positive):
    """Return a column matrix holding 1 where a label equals *positive*, else 0."""
    return mat([1 if label == positive else 0 for label in labels]).T


def classify(sample, weights):
    """Return the index of the class that wins the vote for one sample.

    Args:
        sample: one row of the test matrix.
        weights: one trained weight vector per class, in class order.

    Returns:
        Index of the largest vote; on an exact tie the lowest index wins
        because ``list.index`` returns the first match.
    """
    voteResult = [0] * len(weights)
    for i, weight in enumerate(weights):
        prob = float(sigmoid(sample * weight))
        if prob > 0.5 and prob <= 1:
            # With so few classes, whole votes would tie easily, so the
            # probability itself is added to the vote.
            voteResult[i] = voteResult[i] + 1 + prob
        elif prob >= 0 and prob <= 0.5:
            voteResult[i] = voteResult[i] - 1 + prob
        else:
            # Reached only when prob is outside [0, 1] or is NaN.
            print('Properbility wrong!')
    return voteResult.index(max(voteResult))


# Iris has three classes: Iris-setosa, Iris-versicolor and Iris-virginica.
categorylabels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
opts = {'alpha': 0.01, 'maxIter': 100, 'optimizeType': 'smoothStocGradDescent'}

# Training: one "this class vs. the rest" classifier per class.
dataMat, labelMat = loadDataSet('train.txt')
dataMat = mat(dataMat)  # converted once; it does not change between classes
weight1 = [logisticRegression(dataMat, binaryLabels(labelMat, name), opts)
           for name in categorylabels]

# Testing: h[j] is the predicted class index of test sample j.
dataMat, labelMat = loadDataSet('test.txt')
dataMat = mat(dataMat)
h = [classify(dataMat[j], weight1) for j in range(len(labelMat))]
print(h)

# Turn the label names into class indices 0, 1 or 2.  A label that is not in
# categorylabels raises ValueError instead of silently shifting the list out
# of step with h.
labelMat2 = [categorylabels.index(label) for label in labelMat]

# Accuracy = 1 - (misclassified samples / total samples).
error = 0.0
for predicted, actual in zip(h, labelMat2):
    if predicted != actual:
        error = error + 1
pro = 1 - error / len(labelMat)
print(pro)
沒有優化的情況下的準確率:(原文此處的結果截圖已缺失)
2.第二種多分類方法為所有對所有(All-versus-all,AVA),也就是每次對一對類別學習一個分類器(one vs one at a time)。假定有M類,那麼要構建M(M-1)/2個二元分類器。每一個分類器都使用它應該區分的兩個類的元組來訓練。為了對未知元組分類,所有的分類器投票表決,該元組被指派到得票數最多的類。一般來說,‘所有對所有’優於‘一對所有’:它解決了不平衡性,但是會佔用更大的空間。下面的程式主要修改了訓練過程:
# -*- coding: utf-8 -*-
"""One-vs-one (all-versus-all) multi-class classification of the Iris data set.

One binary logistic-regression classifier is trained for every pair of
classes, using only the samples of those two classes.  Each classifier then
casts one vote per test sample, and the class with the most votes wins.

``logisticRegression`` and ``sigmoid`` are not defined in this script; they
are presumably provided by the local ``logisticRegression`` module.
"""
from logisticRegression import *
from numpy import *
import operator


def loadDataSet(filename):
    """Read a comma-separated data file into feature rows and labels.

    The last field of every line is the class label; the fields before it
    are parsed as floats.  The number of feature fields is taken from the
    first non-blank line.  Blank lines are skipped.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(dataMat, labelMat)`` tuple.  ``dataMat`` is a list of rows,
        each ``[1, f1, f2, ...]``; ``labelMat`` is the list of label
        strings in file order.

    Raises:
        ValueError: if a feature field cannot be parsed as a float.
        IndexError: if a line has fewer fields than the first one.
    """
    dataMat = []
    labelMat = []
    numFeat = None
    # The context manager closes the file even if parsing fails.
    with open(filename) as fr:
        for line in fr:
            line = line.strip()
            if not line:
                # Data files often end with an empty line; float('') would
                # otherwise raise here.
                continue
            curLine = line.split(',')
            if numFeat is None:
                numFeat = len(curLine) - 1
            lineArr = [float(curLine[i]) for i in range(numFeat)]
            # The leading 1 is the constant feature x0 (bias term).
            dataMat.append([1] + lineArr)
            labelMat.append(curLine[-1])
    return dataMat, labelMat


# Iris has three classes: Iris-setosa, Iris-versicolor and Iris-virginica.
categorylabels = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
opts = {'alpha': 0.01, 'maxIter': 50, 'optimizeType': 'smoothStocGradDescent'}

# One (positive, negative) pair of class indices per classifier: classifier k
# is trained to output 1 for classPairs[k][0] and 0 for classPairs[k][1].
classPairs = [(0, 1), (1, 2), (2, 0)]


def trainPair(dataMat, labelMat, positive, negative):
    """Train the binary classifier that separates two classes.

    Rows are selected by label rather than by position, so the training
    file does not have to be sorted by class or hold a fixed number of
    samples per class.

    Args:
        dataMat: training features as a numpy matrix, one sample per row.
        labelMat: label strings, one per row of ``dataMat``.
        positive: index into ``categorylabels`` of the class labelled 1.
        negative: index into ``categorylabels`` of the class labelled 0.

    Returns:
        Whatever ``logisticRegression`` returns for the selected samples
        (used below as the weight vector of the classifier).
    """
    positiveName = categorylabels[positive]
    negativeName = categorylabels[negative]
    rows = [j for j, label in enumerate(labelMat)
            if label == positiveName or label == negativeName]
    pairLabels = [1 if labelMat[j] == positiveName else 0 for j in rows]
    # Labels are passed as a column matrix, the same form the one-vs-rest
    # version of this script hands to logisticRegression.
    return logisticRegression(dataMat[rows, :], mat(pairLabels).T, opts)


def classify(sample, weights):
    """Return the index of the class that collects the most pairwise votes.

    Args:
        sample: one row of the test matrix.
        weights: trained weight vectors, in the order of ``classPairs``.

    Returns:
        Index of the class with the most votes.  When every class gets one
        vote, the lowest index wins because ``list.index`` returns the
        first match.
    """
    voteResult = [0, 0, 0]
    for (positive, negative), weight in zip(classPairs, weights):
        prob = float(sigmoid(sample * weight))
        if prob > 0.5 and prob <= 1:
            voteResult[positive] = voteResult[positive] + 1
        elif prob >= 0 and prob <= 0.5:
            voteResult[negative] = voteResult[negative] + 1
        else:
            # Reached only when prob is outside [0, 1] or is NaN.
            print('Properbility wrong!')
    return voteResult.index(max(voteResult))


# Training: one classifier per pair of classes.
dataMat, labelMat = loadDataSet('train.txt')
dataMat = mat(dataMat)
weight1 = [trainPair(dataMat, labelMat, positive, negative)
           for positive, negative in classPairs]

# Testing: h[j] is the predicted class index of test sample j.
dataMat, labelMat = loadDataSet('test.txt')
dataMat = mat(dataMat)
h = [classify(dataMat[j], weight1) for j in range(len(labelMat))]
print(h)

# Turn the label names into class indices 0, 1 or 2.  A label that is not in
# categorylabels raises ValueError instead of silently shifting the list out
# of step with h.
labelMat2 = [categorylabels.index(label) for label in labelMat]

# Accuracy = 1 - (misclassified samples / total samples).
error = 0.0
for predicted, actual in zip(h, labelMat2):
    if predicted != actual:
        error = error + 1
pro = 1 - error / len(labelMat)
print(pro)
可知,最佳情況下準確率有所提高:
參考:http://blog.sina.com.cn/s/blog_5eef0840010147pa.html