[Python] Andrew Ng's Machine Learning Exercises in Python: Linear Regression
Rambling
This is mainly a memo to myself, posted on the blog for easy lookup. The gradient descent code largely follows the link at the end of this post; the normal equation code I wrote myself. It is very simple, but it counts as the "Hello World" of machine learning, so I'm a little excited.
I had already written gradient descent while working through Machine Learning in Action, but my understanding was shallow back then and it brought little joy. Now that I understand it one step better, writing it again feels like a true first time.
I'm lazy and don't like spending too much time on blog posts, so this may not be the most convenient read; still, I've tried to make the comments as complete as I could, so copying the code out and reading it should be fine.
Code
Gradient Descent
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2021/1/11
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# directory containing this script; also used to locate the data files
abs_path = os.path.dirname(os.path.abspath(sys.argv[0]))
sys.path.append(abs_path)
class GradientDescent:
    def __init__(self):
        pass

    @staticmethod
    def cost(x, y, theta):
        """
        Compute the cost function.
        :param x: feature matrix
        :param y: labels
        :param theta: parameter row vector
        :return: squared-error cost J(theta)
        """
        # squared-error cost: J = sum((X . theta^T - y)^2) / (2m)
        return np.sum(np.power(np.dot(x, theta.T) - y, 2)) / (2 * len(x))
    def gradient_descent(self, x, y, theta, alpha, epoch=1000):
        """
        Batch gradient descent.
        :param x: training set
        :param y: labels
        :param theta: initial parameters (updated in place)
        :param alpha: learning rate
        :param epoch: number of iterations
        :return: final theta and the per-iteration cost history
        """
        cost = np.zeros(epoch)  # cost history, kept for plotting later
        m = x.shape[0]  # number of training examples
        for i in range(epoch):
            # gradient descent update rule
            theta -= (alpha / m) * (x.dot(theta.T) - y).T.dot(x)
            cost[i] = self.cost(x, y, theta)
        return theta, cost
    def run(self):
        """Main entry point."""
        data = pd.read_csv(os.path.join(abs_path, 'ex1', 'ex1data1.txt'),
                           names=['Population', 'Profit'], header=None)
        # add the intercept column x_0 = 1
        data.insert(0, 'Ones', 1)
        column_num = data.shape[1]
        # split into feature matrix x and label vector y
        x = np.array(data.iloc[:, : column_num - 1].values)
        y = np.array(data.iloc[:, column_num - 1: column_num].values)
        # initialize theta to zeros
        theta = np.zeros([1, 2])
        print(self.cost(x, y, theta))  # cost before training
        final_theta, cost = self.gradient_descent(x, y, theta, 0.01)
        final_cost = self.cost(x, y, final_theta)
        print(final_theta, final_cost)
        population = np.linspace(data.Population.min(), data.Population.max(), 100)
        # evaluate the fitted line so it can be plotted below
        profit = final_theta[0, 0] + (final_theta[0, 1] * population)
        # plot with matplotlib
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(population, profit, 'r', label='Prediction')
        ax.scatter(data['Population'], data['Profit'], label='Training Data')  # raw data scatter
        ax.legend(loc=4)
        ax.set_xlabel('Population')
        ax.set_ylabel('Profit')
        ax.set_title('Predicted Profit')
        plt.show()
def main():
    gd = GradientDescent()
    gd.run()


if __name__ == '__main__':
    main()
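For reference, these are the two formulas the code above implements, in the course's usual notation with m training examples and hypothesis h_theta(x) = theta^T x (this summary is my own addition, not part of the exercise handout):

\[
J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right)^2,
\qquad
\theta_j := \theta_j - \frac{\alpha}{m} \sum_{i=1}^{m} \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}
\]

The one-line vectorized update inside gradient_descent applies the second formula to every component of theta at once.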
Normal Equation
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2021/1/12
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# directory containing this script; put it on sys.path so the sibling
# module gradient_descent.py can be imported
abs_path = os.path.dirname(os.path.abspath(sys.argv[0]))
sys.path.append(abs_path)
from gradient_descent import GradientDescent
class NormalEquation:
    def __init__(self):
        pass

    @staticmethod
    def normal_equation(x, y):
        """
        Normal equation method.
        :param x: feature matrix
        :param y: labels
        :return: theta minimizing the cost, in closed form
        """
        # theta = pinv(X^T X) . X^T . y; pinv also covers a singular X^T X
        return np.linalg.pinv(x.T.dot(x)).dot(x.T).dot(y)
    def run(self, x, y, theta):
        """Main entry point."""
        # run the gradient descent written earlier to get a theta for comparison
        gd = GradientDescent()
        final_theta, cost = gd.gradient_descent(x, y, theta, 0.4)
        print(final_theta, gd.cost(x, y, final_theta))
        # plot the learning curve over the recorded iterations
        plt.plot(range(len(cost)), cost)
        plt.show()
        # compute theta directly via the normal equation
        ne_theta = self.normal_equation(x, y)
        print(ne_theta.T, gd.cost(x, y, ne_theta.T))
def main():
    ne = NormalEquation()
    data = pd.read_csv(os.path.join(abs_path, 'ex1', 'ex1data2.txt'),
                       names=['Size', 'Bedrooms', 'Price'], header=None)
    column_num = data.shape[1]
    x = data.iloc[:, : column_num - 1]
    y = data.iloc[:, column_num - 1: column_num]
    # feature scaling: mean normalization (a min-max variant is left commented out)
    # x_normalization = (x - x.min()) / (x.max() - x.min())
    x_normalization = (x - x.mean()) / x.std()
    # add the intercept column x_0 = 1
    x_normalization.insert(0, 'Ones', 1)
    x_set = np.array(x_normalization.values)
    y_set = np.array(y.values)
    theta = np.zeros([1, column_num])
    ne.run(x_set, y_set, theta)


if __name__ == '__main__':
    main()
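As a quick sanity check (my own addition, not part of the exercise): the closed-form theta should agree with what scikit-learn's LinearRegression finds on the same normalized data. A minimal sketch, assuming scikit-learn is installed and ex1data2.txt sits in the same ex1 directory as above:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sanity check: compare the normal-equation theta against scikit-learn.
import os
import sys
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

abs_path = os.path.dirname(os.path.abspath(sys.argv[0]))
data = pd.read_csv(os.path.join(abs_path, 'ex1', 'ex1data2.txt'),
                   names=['Size', 'Bedrooms', 'Price'], header=None)
# same mean normalization as in main() above
x = (data.iloc[:, :2] - data.iloc[:, :2].mean()) / data.iloc[:, :2].std()
y = data.iloc[:, 2:].values

# closed-form solution with an explicit intercept column of ones
x_b = np.hstack([np.ones((len(x), 1)), x.values])
ne_theta = np.linalg.pinv(x_b.T.dot(x_b)).dot(x_b.T).dot(y)

# scikit-learn fits the intercept itself, so pass x without the ones column
reg = LinearRegression().fit(x.values, y)
print(ne_theta.ravel())                   # [intercept, theta_1, theta_2]
print(reg.intercept_, reg.coef_.ravel())  # should match the line above

Both solve the same least-squares problem exactly, so the two printed lines should agree up to floating-point noise; the gradient descent result will only approach them as the iteration count grows.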