1. 程式人生 > >logistic 回歸(線性和非線性)

logistic 回歸(線性和非線性)

num itl [1] tex sns RKE rec 損失函數 pos

一:線性logistic 回歸

代碼如下:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
import seaborn as sns

def sigmoid(h):
    """Return the logistic function 1 / (1 + exp(-h)), applied element-wise."""
    return 1 / (1 + np.exp(-h))


def cost(theta, X, y):
    """Return the mean cross-entropy loss of the logistic model.

    theta: parameter vector with one entry per column of X.
    X: design matrix, one row per sample.
    y: column of 0/1 labels, one row per sample.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    hypothesis = sigmoid(X * theta.T)
    part1 = np.multiply(-y, np.log(hypothesis))
    part2 = np.multiply((1 - y), np.log(1 - hypothesis))
    return np.sum(part1 - part2) / len(X)


def gradient(theta, X, y):
    """Return the gradient of `cost` with respect to theta as a flat array.

    Takes the same arguments as `cost`. The result has one entry per
    parameter; it is flattened because the matrix product below yields a
    1 x n matrix, while the optimizer works with a 1-D parameter vector.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    error = sigmoid(X * theta.T) - y
    grad = error.T * X / len(X)
    return np.asarray(grad).ravel()


def predict(theta, X):
    """Return a list of 0/1 labels: 1 where the predicted probability exceeds 0.5."""
    theta = np.matrix(theta)
    X = np.matrix(X)

    probability = np.asarray(sigmoid(X * theta.T)).ravel()
    return [1 if p > 0.5 else 0 for p in probability]


if __name__ == '__main__':
    # Load the data set: two exam scores and a 0/1 admission label per row.
    path = 'ex2data1.txt'
    data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])

    # Split the samples into the positive and negative classes.
    positive = data[data['Admitted'].isin([1])]
    negative = data[data['Admitted'].isin([0])]

    # Disabled: scatter plot of the raw data distribution.
    # fig, ax = plt.subplots(figsize=(12, 8))
    # ax.scatter(positive['Exam 1'], positive['Exam 2'], s=60, c='b', marker='o', label='Admitted')
    # ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='UnAdmitted')
    # ax.legend()
    # ax.set_xlabel('Exam 1 Score')
    # ax.set_ylabel('Exam 2 Score')
    # plt.show()

    # Disabled: visual check of the sigmoid function.
    # nums = np.arange(-10, 11, step=1)
    # fig, ax = plt.subplots(figsize=(12, 8))
    # ax.plot(nums, sigmoid(nums), 'k')
    # plt.show()

    # Prepend a column of ones so theta[0] acts as the intercept.
    data.insert(0, 'ones', 1)
    cols = data.shape[1]
    X = data.iloc[:, 0:cols - 1]
    y = data.iloc[:, cols - 1:cols]
    X = np.array(X.values)
    y = np.array(y.values)
    theta = np.zeros(3)  # one parameter per column of X

    # Let the truncated Newton optimizer find the best theta.
    result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
    # print(cost(result[0], X, y))

    # Evaluate the fitted theta on the training set.
    theta_min = result[0]
    predictions = predict(theta_min, X)
    correct = [1 if a == b else 0 for a, b in zip(predictions, y.ravel())]
    # Percentage of correct predictions. The previous `sum % len` was a
    # modulo, which only looked right because the sample count is 100.
    accuracy = round(100 * sum(correct) / len(correct), 2)
    print('accuracy = {0}%'.format(accuracy))  # original author reports 89% on the training set

    # Decision boundary: theta0 + theta1 * x + theta2 * y = 0, solved for y
    # after normalising theta by theta2.
    theta_temp = theta_min / theta_min[2]
    x = np.arange(130, step=0.1)
    y = -(theta_temp[0] + theta_temp[1] * x)  # NOTE: rebinds y; the labels are no longer needed here

    # Plot the original points.
    sns.set(context='notebook', style='ticks', font_scale=1.5)
    sns.lmplot(x='Exam 1', y='Exam 2', hue='Admitted', data=data,
               height=6,
               fit_reg=False,
               scatter_kws={"s": 25}
               )

    # Draw the decision boundary on top.
    plt.plot(x, y, 'grey')
    plt.xlim(0, 130)
    plt.ylim(0, 130)
    plt.title('Decision Boundary')
    plt.show()

二:非線性logistic 回歸(正則化)

代碼如下:

import pandas as pd
import numpy as np
import scipy.optimize as opt
import matplotlib.pyplot as plt


# Load the data set: two test scores and a 0/1 acceptance label per row.
path = 'ex2data2.txt'
data = pd.read_csv(path, header=None, names=['Test 1', 'Test 2', 'Accepted'])

positive = data[data['Accepted'].isin([1])]
negative = data[data['Accepted'].isin([0])]

# Disabled: scatter plot of the raw data distribution.
# fig, ax = plt.subplots(figsize=(12, 8))
# ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
# ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Unaccepted')
# ax.legend()  # shows the Accepted / Unaccepted legend
# ax.set_xlabel('Test 1 Score')
# ax.set_ylabel('Test 2 Score')
# plt.show()

degree = 5
x1 = data['Test 1']
x2 = data['Test 2']
# Insert a column of ones at column index 3 (the intercept feature).
data.insert(3, 'Ones', 1)

# Build polynomial features x1^(i-j) * x2^j for total degree i = 1..degree-1
# (highest degree 4). j must run through i inclusive; stopping at i - 1
# silently dropped the pure x2^i terms from the feature map.
for i in range(1, degree):
    for j in range(0, i + 1):
        data['F' + str(i) + str(j)] = np.power(x1, i - j) * np.power(x2, j)

# Drop the raw score columns; they are now covered by the degree-1 features.
data.drop('Test 1', axis=1, inplace=True)
data.drop('Test 2', axis=1, inplace=True)


# Sigmoid (logistic) function
def sigmoid(h):
    """Map h to 1 / (1 + exp(-h)); works element-wise on NumPy inputs."""
    denominator = 1 + np.exp(-h)
    return 1 / denominator


def cost(theta, X, y, learnRate):
    """Return the regularized mean cross-entropy loss.

    theta: parameter vector with one entry per column of X.
    X: design matrix, one row per sample.
    y: column of 0/1 labels, one row per sample.
    learnRate: weight of the squared-parameter penalty (despite the name,
        it is used here as the regularization strength). The first
        parameter is excluded from the penalty.
    """
    params = np.matrix(theta)
    features = np.matrix(X)
    labels = np.matrix(y)
    m = len(features)

    hypothesis = sigmoid(features * params.T)
    positive_term = np.multiply(-labels, np.log(hypothesis))
    negative_term = np.multiply((1 - labels), np.log(1 - hypothesis))

    # Penalize every parameter except the first one.
    penalty = (learnRate / (2 * m)) * np.sum(np.power(params[:, 1:], 2))
    return np.sum(positive_term - negative_term) / m + penalty


# Regularization strength handed to cost / gradientReg as `learnRate`.
learnRate = 1
cols = data.shape[1]

# Column 0 holds the label; every column after it is a feature.
y = np.array(data.iloc[:, 0:1])
X = np.array(data.iloc[:, 1:cols])

# Start the optimizer from all-zero parameters, one per feature column.
theta = np.zeros(X.shape[1])


# Predict labels for a data set from fitted parameters
def predict(theta, X):
    """Return a list of 0/1 labels: 1 where the predicted probability exceeds 0.5."""
    weights = np.matrix(theta)
    samples = np.matrix(X)

    labels = []
    for prob in sigmoid(samples * weights.T):
        labels.append(1 if prob > 0.5 else 0)
    return labels


def gradientReg(theta, X, y, learnRate):
    """Return the gradient of the regularized logistic loss as a flat array.

    theta: parameter vector with one entry per column of X.
    X: design matrix, one row per sample.
    y: column of 0/1 labels, one row per sample.
    learnRate: weight of the squared-parameter penalty (despite the name,
        it is used as the regularization strength, matching `cost`).

    The result has one entry per parameter. The first parameter is not
    regularized, matching `cost`, which excludes it from the penalty.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)
    m = len(X)

    error = sigmoid(X * theta.T) - y
    # Data term plus the penalty term for every parameter. The previous
    # code indexed theta with an undefined local `i` (picking up a leaked
    # module-level loop variable), so one scalar was added to all entries.
    grad = (error.T * X) / m + (learnRate / m) * theta
    # Undo the penalty for the first parameter only. `grad[0]` on a 1 x n
    # matrix selects the whole row, so index the single element explicitly.
    grad[0, 0] = grad[0, 0] - (learnRate / m) * theta[0, 0]
    # Flatten the 1 x n matrix into a 1-D vector for the optimizer.
    return np.asarray(grad).ravel()

# Fit the regularized model with the truncated Newton optimizer.
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradientReg, args=(X, y, learnRate))
print(result)

# Evaluate the fitted parameters on the training set.
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
correct = [1 if a == b else 0 for a, b in zip(predictions, np.ravel(y))]
# Percentage of correct predictions. The previous `sum % len` was a modulo,
# which yields the raw count of hits whenever it is below the sample count.
accuracy = round(100 * sum(correct) / len(correct), 2)

print('accuracy = {0}%'.format(accuracy))

logistic 回歸(線性和非線性)