1. 程式人生 > >機器學習演算法與Python實踐之邏輯迴歸(Logistic Regression)(二)

機器學習演算法與Python實踐之邏輯迴歸(Logistic Regression)(二)

#!/usr/bin/python
# -*- coding:utf-8 -*-
import numpy as np
from numpy import *
import matplotlib.pyplot as plt
#處理資料函式
def loadDataSet():
    dataMat=[]
    labelMat=[]
    fr=open('C:\\Users\\root\\Desktop\\2017machinelearning\\machinelearninginaction-master\\machinelearninginaction-master\\Ch05\\testSet.txt')
    for line in fr.readlines():
        lineArr=line.strip().split()
        dataMat.append([1.0,float(lineArr[0]),float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat,labelMat
def Sigmoid(x):
    f=1.0/(1+exp(-x))
    return  f
#梯度函式
def gradAscent(dataMat,labelMat):
    datamatri=mat(dataMat)
    labelMatri=mat(labelMat).transpose() #轉化為列向量
    m,n=shape(datamatri)
    alpha=0.001
    maxCycles=500
    weights=ones((n,1))
    #迴圈迭代次數maxCycles
    for i in range(maxCycles):
        # print i
        h=Sigmoid(datamatri*weights) #矩陣相乘 初始值
        error=(labelMatri-h) #錯誤數\
        weights=weights+alpha*datamatri.transpose()*error
    return weights
#隨機梯度函式
def stocgradAscent0(dataMat,labelMat):
    # datamatri=mat(dataMat)
    # labelMatri=mat(labelMat).transpose() #轉化為列向量
    m,n=shape(dataMat)
    alpha=0.01
    # maxCycles=500
    weights=ones(n)
    print n
    # print weights
    #迴圈迭代次數maxCycles
    for i in range(m):
        h=Sigmoid(sum(dataMat[i]*weights)) #矩陣相乘 初始值
        error=labelMat[i]-h #錯誤數\
        weights=weights+alpha*error*dataMat[i]
    return weights
#改進的隨機梯度函式
def stocGradAscent1(dataMatri,classLabels,numIter=150):
    m,n=shape(dataMatri)
    weights=ones(n)
    for j in range(numIter):
        dataIndex=range(m)
        for i in range(m):
            alpha=4/(1.0+j+i)+0.0001
            randIndex=int(random.uniform(0,len(dataIndex)))
            h=Sigmoid(sum(dataMatri[randIndex]*weights))
            error=classLabels[randIndex]-h
            weights=weights+alpha*error*dataMatri[randIndex]
            del(dataIndex[randIndex])
    return weights

#畫圖測試
def plotBestFit(weights):
    # weights=wei.getA()
    dataMat,labelMat=loadDataSet()
    dataArr=array(dataMat)
    # labelMat=array(labelMat)
    # print type(labelMat[0][0])
    n=shape(dataArr)[0]  #取行數
    xcord1=[]
    xcord2 = []
    ycord1=[]
    ycord2 = []
    for i in range(n):
        if int(labelMat[i])==1:
            xcord1.append(dataArr[i,1])
            ycord1.append(dataArr[i,2])
            # print int(labelMat[i][0]) == 1
        else:
            # print int ( labelMat[i][0] ) == 1
            xcord2.append(dataArr[i,1])
            ycord2.append(dataArr[i,2])
    # weights = gradAscent(dataMat,labelMat)
    fig=plt.figure()
    ax=fig.add_subplot(111)
    ax.scatter(xcord1,ycord1,c='red')
    ax.scatter(xcord2,ycord2,c='g')
    x=arange(-3.0,3.0,0.1)
    y=(-weights[0]-weights[1]*x)/weights[2]
    ax.plot(x,y.reshape(-1,1))
    plt.show()
#梯度函式
a,b=loadDataSet()
weights=gradAscent(a,b)
plotBestFit(weights)
#隨機梯度函式
a,b=loadDataSet()
weights=stocgradAscent0(array(a),b)
plotBestFit(weights)
##改進的隨機梯度函式
a,b=loadDataSet()
weights=stocgradAscent0(array(a),b)
plotBestFit(weights)
#例項分析從疝氣病病症預測病罵的死亡率
def classficatinon(Inter,weights):
    a=sum(Inter*weights)
    if Sigmoid(a)>0.5:
        b=1
    else:
        b=0
    return  b
def colicTest():
    frTrain=open('C:\\Users\\root\\Desktop\\2017machinelearning\\machinelearninginaction-master\\machinelearninginaction-master\\Ch05\\horseColicTraining.txt')
    frTest=open('C:\\Users\\root\\Desktop\\2017machinelearning\\machinelearninginaction-master\\machinelearninginaction-master\\Ch05\\horseColicTest.txt')
    trainingSet=[]
    trainingLabels=[]
    for line in frTrain.readlines():
        currLine=line.strip().split('\t')
        lineArr=[]#特徵向量
        for i in range(21):
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))#訓練集標籤
    trainWeiht=stocGradAscent1(array(trainingSet),trainingLabels,1)#訓練出迴歸係數
    errorCount=0
    numTestVec=0.0
    for line in frTest.readlines():
        numTestVec+=1.0
        currLine = line.strip ().split ( '\t' )
        lineArr = []  # 特徵向量(
        for i in range ( 21 ):
            lineArr.append ( float ( currLine[i] ) )
        if int(classficatinon(array(lineArr),trainWeiht))!=int(currLine[21]):
            errorCount+=1
        errorRate=(float(errorCount)/numTestVec)
        print "錯誤率為 %f" % errorRate
    return errorRate
def multiTest():
    numTests=1
    errorSum=0.0
    for k in range(numTests):
        errorSum+=colicTest()
    print "after %d 迭代後平均錯誤率為:%f" %(numTests,errorSum/float(numTests))
# colicTest()
multiTest()
#梯度函式

 

#隨機梯度函式

##改進的隨機梯度函式

 

#例項分析從疝氣病病症預測病罵的死亡率

錯誤率為 0.288136
錯誤率為 0.300000
錯誤率為 0.295082
錯誤率為 0.290323
錯誤率為 0.285714
錯誤率為 0.281250
錯誤率為 0.276923
錯誤率為 0.287879
錯誤率為 0.298507
after 10 迭代後平均錯誤率為:0.338806