建立邏輯迴歸(LogisticRegression)二分類器
已知資料集 testSet.txt 中資料格式如下:
設第一列特徵為x1,第二列特徵為x2,第三列標籤為z
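每一個特徵都乘上一個迴歸係數w(另設 $x_0 = 1$,對應常數項 $w_0$),則有 $z = w_0 x_0 + w_1 x_1 + w_2 x_2$
用向量表示法,可記為 $z = W^T X$。將z代入Sigmoid函式中,得:$\sigma(z) = \dfrac{1}{1 + e^{-z}}$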
Sigmoid函式由於其影象特點,可以很方便的執行二分類的任務,大於0.5的資料歸為一類,小於0.5的資料歸為另一類.
Sigmoid函式:
import matplotlib.pyplot as plt
import numpy as np
def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))`` of ``z``.

    ``np.exp`` is applied to ``-z``, so array inputs are handled elementwise.
    """
    exp_neg_z = np.exp(-z)
    return 1 / (1 + exp_neg_z)
# Sample z from -5 up to (but excluding) 5 in steps of 0.3 and draw the
# sigmoid curve as a red line on a single 12x4 figure.
nums = np.arange(-5, 5, step=0.3)
curve = sigmoid(nums)
fig = plt.figure(figsize=(12, 4))
ax = fig.add_subplot(1, 1, 1)
ax.plot(nums, curve, 'r')
plt.show()
梯度上升法:
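記梯度為 $\nabla$,則函式 $f(x, y)$ 的梯度為 $\nabla f(x, y) = \begin{pmatrix} \dfrac{\partial f(x, y)}{\partial x} \\ \dfrac{\partial f(x, y)}{\partial y} \end{pmatrix}$
梯度代表了函式變化的方向,記 $\alpha$ 為函式變化的大小,也稱“步長”,則梯度上升演算法的迭代公式為:$w := w + \alpha \nabla_w f(w)$
對應的梯度下降的迭代公式為 $w := w - \alpha \nabla_w f(w)$。由此,我們就可以通過梯度上升法來尋找最佳的迴歸係數。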
使用 Matplotlib 繪出資料點:
import TxtToNumpy
# Load the samples and their labels, split the first two columns by class
# label, and draw one scatter series per class.
dataMat, labelList = TxtToNumpy.TxtToNumpy("testSet.txt")

labelled_rows = list(zip(dataMat, labelList))
type0_x = [row[0] for row, label in labelled_rows if label == 0]
type0_y = [row[1] for row, label in labelled_rows if label == 0]
type1_x = [row[0] for row, label in labelled_rows if label == 1]
type1_y = [row[1] for row, label in labelled_rows if label == 1]

fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot(1, 1, 1)
# Class 0 is drawn in red, class 1 in blue.
type0 = ax.scatter(type0_x, type0_y, s=30, c='r')
type1 = ax.scatter(type1_x, type1_y, s=30, c='b')
ax.set_xlabel("X1")
ax.set_ylabel("X2")
ax.legend((type0, type1), ("Class 0", "Class 1"), loc=0)
plt.show()
TxtToNumpy.py 模組:
from numpy import *
def TxtToNumpy(filename):
    """Read a tab-separated text file into a numeric array and a label list.

    Each non-blank line is stripped and split on tab characters. The first
    three fields are stored as one row of the returned array, and the last
    field is converted with ``int`` and appended to the label list.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(dataMat, labelList)``: ``dataMat`` is an array with one
        row per non-blank line and three columns, ``labelList`` is a list of
        ints with one entry per non-blank line.

    Raises:
        ValueError: If a field cannot be converted to a number, or a line
            has a number of fields that does not fit a three-column row.
    """
    # The context manager closes the file even if parsing fails part-way.
    with open(filename) as file:
        # Blank lines (e.g. a trailing newline at the end of the file)
        # carry no sample, so they are skipped instead of crashing.
        rows = [line.strip().split('\t') for line in file if line.strip()]

    dataMat = zeros((len(rows), 3))
    labelList = []
    for index, line_list in enumerate(rows):
        # Assigning the string fields into the float array converts them.
        dataMat[index, :] = line_list[0:3]
        labelList.append(int(line_list[-1]))
    return dataMat, labelList
if __name__ == "__main__":
    # Reached only when this file is executed directly, not when imported.
    print("Code Run As A Program")
畫出決策邊界:
①批處理梯度上升法求權重,進而畫出決策邊界:
批處理梯度上升法求權重時,每次更新迴歸係數都需要遍歷整個資料集,因此準確度也最高,但計算複雜度也非常高。
BpGradientAscent.py 模組:
# coding: utf-8
#Batch Processing Gradient Ascent
import numpy as np
import matplotlib.pyplot as plt
# Read the samples and labels stored in a txt file into two lists.
def loadDataSet(filename):
    """Load whitespace-separated samples and integer labels from a text file.

    Each non-blank line is split on whitespace. The first two fields are
    parsed as floats and stored behind a constant ``1.0`` (the X0 term of
    ``z = W^T X``); the third field is parsed as an int label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(dataList, labelList)``: ``dataList`` is a list of
        ``[1.0, x1, x2]`` rows and ``labelList`` the matching list of ints.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If a field cannot be parsed as a number.
    """
    dataList = []
    labelList = []
    # The context manager guarantees the file is closed, even on a parse error.
    with open(filename) as fr:
        for line in fr:
            # Split the line into its individual fields.
            lineArr = line.split()
            if not lineArr:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            # The leading 1.0 is X0, so that weights[0] acts as the intercept.
            dataList.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelList.append(int(lineArr[2]))
    return dataList, labelList
# Sigmoid function, used for classification.
def sigmoid(z):
    """Return ``1.0 / (1 + exp(-z))``; array inputs are handled elementwise."""
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator
# Batch gradient ascent for the weights W; alpha is the step size and
# maxCycles the maximum number of gradient-ascent iterations.
def bpGradientAscent(filename, alpha=0.001, maxCycles=500):
    """Compute logistic-regression weights with batch gradient ascent.

    Every iteration evaluates the sigmoid on the whole data set returned by
    ``loadDataSet`` and moves the weights along ``X^T (labels - sigmoid(XW))``
    scaled by ``alpha``.

    Args:
        filename: Path handed to ``loadDataSet``.
        alpha: Step size of each update.
        maxCycles: Number of update iterations to run.

    Returns:
        A plain ndarray of shape ``(n, 1)``, where ``n`` is the number of
        columns per sample. With ``maxCycles=0`` the initial all-ones weights
        are returned.
    """
    dataList, labelList = loadDataSet(filename)
    # Plain ndarrays replace np.matrix: np.dot gives the same matrix products
    # and the result no longer needs (or depends on) matrix.getA().
    dataMatrix = np.array(dataList, dtype=float)
    # Column vector of labels so it lines up with the (m, 1) sigmoid output.
    labelMatrix = np.array(labelList, dtype=float).reshape(-1, 1)
    n = np.shape(dataMatrix)[1]
    weights = np.ones((n, 1))
    for _ in range(maxCycles):
        sig = sigmoid(np.dot(dataMatrix, weights))
        error = labelMatrix - sig
        weights = weights + alpha * np.dot(dataMatrix.transpose(), error)
    return weights
# Draw the decision boundary.
def decisionBoundary(weights, filename):
    """Scatter-plot both classes and overlay the line given by ``weights``.

    Args:
        weights: Indexable holding at least three entries; the plotted line
            is ``x2 = (-weights[0] - weights[1] * x1) / weights[2]``.
        filename: Path handed to ``loadDataSet`` to obtain samples and labels.
    """
    samples, labels = loadDataSet(filename)
    labelled = list(zip(samples, labels))

    # Column 0 of each sample is the constant 1.0; columns 1 and 2 are plotted.
    type0_x = [row[1] for row, label in labelled if label == 0]
    type0_y = [row[2] for row, label in labelled if label == 0]
    type1_x = [row[1] for row, label in labelled if label == 1]
    type1_y = [row[2] for row, label in labelled if label == 1]

    fig = plt.figure(figsize=(8, 4))
    ax = fig.add_subplot(1, 1, 1)
    type0 = ax.scatter(type0_x, type0_y, s=30, c='r')
    type1 = ax.scatter(type1_x, type1_y, s=30, c='b')

    # Points on the boundary satisfy w0 + w1*x1 + w2*x2 = 0; solve for x2.
    x1 = np.arange(-4.5, 4.5, 0.1)
    w0, w1, w2 = weights[0], weights[1], weights[2]
    x2 = (-w0 - w1 * x1) / w2

    ax.set_xlabel("X1")
    ax.set_ylabel("X2")
    ax.legend((type0, type1), ("Class 0", "Class 1"), loc=0)
    ax.plot(x1, x2)
    plt.show()
if __name__ == "__main__":
    # Reached only when this file is executed directly, not when imported.
    print("Code Run as a Program!")
呼叫該 BpGradientAscent.py 模組:
import matplotlib.pyplot as plt
import numpy as np
import BpGradientAscent
# Fit the weights with batch gradient ascent, then plot the boundary over
# the same data file.
weights = BpGradientAscent.bpGradientAscent("testSet.txt")
BpGradientAscent.decisionBoundary(weights, "testSet.txt")
得到決策邊界(藍線):
②小批量隨機梯度上升法求權重,進而畫出決策邊界:
小批量隨機梯度上升法求權重時,每次更新迴歸係數只需要選取一部分資料,準確度相對於批處理梯度上升法有所降低,但計算複雜度相對也降低很多,可以通過調整步長和最大迭代次數來提高決策邊界的準確度。
SbsGradientAscent.py 模組:
# coding: utf-8
#Small Batch Stochastic Gradient Ascent
import numpy as np
import matplotlib.pyplot as plt
# Read the samples and labels stored in a txt file into two lists.
def loadDataSet(filename):
    """Load whitespace-separated samples and integer labels from a text file.

    Each non-blank line is split on whitespace. The first two fields are
    parsed as floats and stored behind a constant ``1.0`` (the X0 term of
    ``z = W^T X``); the third field is parsed as an int label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(dataList, labelList)``: ``dataList`` is a list of
        ``[1.0, x1, x2]`` rows and ``labelList`` the matching list of ints.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
        ValueError: If a field cannot be parsed as a number.
    """
    dataList = []
    labelList = []
    # The context manager guarantees the file is closed, even on a parse error.
    with open(filename) as fr:
        for line in fr:
            # Split the line into its individual fields.
            lineArr = line.split()
            if not lineArr:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            # The leading 1.0 is X0, so that weights[0] acts as the intercept.
            dataList.append([1.0, float(lineArr[0]), float(lineArr[1])])
            labelList.append(int(lineArr[2]))
    return dataList, labelList
# Sigmoid function, used for classification.
def sigmoid(z):
    """Return ``1.0 / (1 + exp(-z))``; array inputs are handled elementwise."""
    shifted_exp = 1 + np.exp(-z)
    return 1.0 / shifted_exp
# Stochastic gradient ascent for the weights; maxCycles is the maximum
# number of passes over the data set.
def sbsGradientAscent(filename, maxCycles=300):
    """Compute logistic-regression weights with stochastic gradient ascent.

    Each pass visits every sample exactly once in random order, updating the
    weights from one sample at a time with a step size that shrinks as the
    pass index ``i`` and the within-pass step ``j`` grow.

    Args:
        filename: Path handed to ``loadDataSet``.
        maxCycles: Number of passes over the data set.

    Returns:
        A 1-D ndarray with one weight per column of a sample.
    """
    dataList, labelList = loadDataSet(filename)
    dataArr = np.array(dataList)
    m, n = np.shape(dataArr)
    weights = np.ones(n)
    for i in range(maxCycles):
        # Pool of sample indices not yet used in this pass. It must be a real
        # list: deleting from a temporary ``list(range(m))`` copy removes
        # nothing, which silently turns this into sampling with replacement.
        dataIndex = list(range(m))
        for j in range(m):
            # alpha is the step size; the 0.001 floor keeps it from reaching 0.
            alpha = 4 / (1.0 + i + j) + 0.001
            # uniform() draws from [0, len(dataIndex)); int() makes a position.
            randomIndex = int(np.random.uniform(0, len(dataIndex)))
            # Translate the position in the pool into the actual sample index.
            sampleIndex = dataIndex[randomIndex]
            sample = dataArr[sampleIndex]
            error = labelList[sampleIndex] - sigmoid(np.sum(sample * weights))
            weights = weights + alpha * error * sample
            # Remove the sample just used so it is not drawn again this pass.
            del dataIndex[randomIndex]
    return weights
def decisionBoundary(weights, filename):
    """Scatter-plot both classes and overlay the line given by ``weights``.

    Args:
        weights: Indexable holding at least three entries; the plotted line
            is ``x2 = (-weights[0] - weights[1] * x1) / weights[2]``.
        filename: Path handed to ``loadDataSet`` to obtain samples and labels.
    """
    samples, labels = loadDataSet(filename)

    # Group the samples by label first, then pull out the plotted columns
    # (column 0 of each sample is the constant 1.0 and is not drawn).
    class0_rows = [row for row, label in zip(samples, labels) if label == 0]
    class1_rows = [row for row, label in zip(samples, labels) if label == 1]
    type0_x = [row[1] for row in class0_rows]
    type0_y = [row[2] for row in class0_rows]
    type1_x = [row[1] for row in class1_rows]
    type1_y = [row[2] for row in class1_rows]

    fig = plt.figure(figsize=(8, 4))
    ax = fig.add_subplot(1, 1, 1)
    type0 = ax.scatter(type0_x, type0_y, s=30, c='r')
    type1 = ax.scatter(type1_x, type1_y, s=30, c='b')

    # Points on the boundary satisfy w0 + w1*x1 + w2*x2 = 0; solve for x2.
    x1 = np.arange(-4.5, 4.5, 0.1)
    numerator = -weights[0] - weights[1] * x1
    x2 = numerator / weights[2]

    ax.set_xlabel("X1")
    ax.set_ylabel("X2")
    ax.legend((type0, type1), ("Class 0", "Class 1"), loc=0)
    ax.plot(x1, x2)
    plt.show()
if __name__ == "__main__":
    # Reached only when this file is executed directly, not when imported.
    print("Code Run as a Program!")
呼叫 SbsGradientAscent.py 模組:
import matplotlib.pyplot as plt
import numpy as np
import SbsGradientAscent
# Fit the weights with stochastic gradient ascent, then plot the boundary
# over the same data file.
weights = SbsGradientAscent.sbsGradientAscent("testSet.txt")
SbsGradientAscent.decisionBoundary(weights, "testSet.txt")
得到決策邊界(藍線):
BpGradientAscent.py 模組 和 SbsGradientAscent.py 模組的不同之處在於其中的 bpGradientAscent()函式和 sbsGradientAscent()函式 不同,分別表示 批處理梯度上升求權重 和 小批量隨機梯度上升求權重
bpGradientAscent()函式:
# Batch gradient ascent for the weights W; alpha is the step size and
# maxCycles the maximum number of gradient-ascent iterations.
def bpGradientAscent(filename, alpha=0.001, maxCycles=500):
    """Compute logistic-regression weights with batch gradient ascent.

    Every iteration evaluates the sigmoid on the whole data set returned by
    ``loadDataSet`` and moves the weights along ``X^T (labels - sigmoid(XW))``
    scaled by ``alpha``.

    Args:
        filename: Path handed to ``loadDataSet``.
        alpha: Step size of each update.
        maxCycles: Number of update iterations to run.

    Returns:
        A plain ndarray of shape ``(n, 1)``, where ``n`` is the number of
        columns per sample. With ``maxCycles=0`` the initial all-ones weights
        are returned.
    """
    dataList, labelList = loadDataSet(filename)
    # Plain ndarrays replace np.matrix: np.dot gives the same matrix products
    # and the result no longer needs (or depends on) matrix.getA().
    dataMatrix = np.array(dataList, dtype=float)
    # Column vector of labels so it lines up with the (m, 1) sigmoid output.
    labelMatrix = np.array(labelList, dtype=float).reshape(-1, 1)
    n = np.shape(dataMatrix)[1]
    weights = np.ones((n, 1))
    for _ in range(maxCycles):
        sig = sigmoid(np.dot(dataMatrix, weights))
        error = labelMatrix - sig
        weights = weights + alpha * np.dot(dataMatrix.transpose(), error)
    return weights
sbsGradientAscent()函式:
# Stochastic gradient ascent for the weights; maxCycles is the maximum
# number of passes over the data set.
def sbsGradientAscent(filename, maxCycles=300):
    """Compute logistic-regression weights with stochastic gradient ascent.

    Each pass visits every sample exactly once in random order, updating the
    weights from one sample at a time with a step size that shrinks as the
    pass index ``i`` and the within-pass step ``j`` grow.

    Args:
        filename: Path handed to ``loadDataSet``.
        maxCycles: Number of passes over the data set.

    Returns:
        A 1-D ndarray with one weight per column of a sample.
    """
    dataList, labelList = loadDataSet(filename)
    dataArr = np.array(dataList)
    m, n = np.shape(dataArr)
    weights = np.ones(n)
    for i in range(maxCycles):
        # Pool of sample indices not yet used in this pass. It must be a real
        # list: deleting from a temporary ``list(range(m))`` copy removes
        # nothing, which silently turns this into sampling with replacement.
        dataIndex = list(range(m))
        for j in range(m):
            # alpha is the step size; the 0.001 floor keeps it from reaching 0.
            alpha = 4 / (1.0 + i + j) + 0.001
            # uniform() draws from [0, len(dataIndex)); int() makes a position.
            randomIndex = int(np.random.uniform(0, len(dataIndex)))
            # Translate the position in the pool into the actual sample index.
            sampleIndex = dataIndex[randomIndex]
            sample = dataArr[sampleIndex]
            error = labelList[sampleIndex] - sigmoid(np.sum(sample * weights))
            weights = weights + alpha * error * sample
            # Remove the sample just used so it is not drawn again this pass.
            del dataIndex[randomIndex]
    return weights
最終得到的決策邊界(藍線)為:
bpGradientAscent()函式:
sbsGradientAscent()函式: