林軒田 Machine Learning Foundations – Homework 3: Python source code
阿新 · Published: 2019-01-10
Hi everyone, here is my own Python reference code for Homework 3 of 林軒田's Machine Learning Foundations. I don't have much engineering experience with Python, so if you spot a mistake or know a better way to write or optimize the code, please leave a comment so we can learn from each other. Thanks.
Questions 13–15 are about linear regression for classification and feature transforms. All sample points are generated by the target function
f(x1, x2) = sign(x1^2 + x2^2 − 0.6).
13.1: Fit the data with linear regression and no feature transform, i.e., use the feature vector (1, x1, x2) directly. Run the experiment 1000 times, plot a histogram of Ein over the 1000 runs, and compute the average Ein.
### For Questions 13-15, generate a training set of N = 1000 points on X = [-1, 1] x [-1, 1] with uniform
### probability of picking each x in X. Generate simulated noise by flipping the sign of the output in a
### random 10% subset of the generated training set.
import random
import numpy as np
import matplotlib.pyplot as plt
### target function f(x1, x2) = sign(x1^2 + x2^2 - 0.6)
def target_function(x1, x2):
return (1 if (x1*x1 + x2*x2 - 0.6) >= 0 else -1)
### plot dot picture, two dimension features
def plot_dot_picture(features, labels, w=np.zeros((3, 1))):
    x1 = features[:,1]
    x2 = features[:,2]
    y = labels[:,0]
plot_size = 20
size = np.ones((len(x1)))*plot_size
size_x1 = np.ma.masked_where(y<0 , size)
size_x2 = np.ma.masked_where(y>0, size)
### plot scatter
    plt.scatter(x1, x2, s=size_x1, marker='x', c='r', label='y = +1')
    plt.scatter(x1, x2, s=size_x2, marker='o', c='b', label='y = -1')
### plot w line
x1_tmp = np.arange(-1,1,0.01)
x2_tmp = np.arange(-1,1,0.01)
x1_tmp, x2_tmp = np.meshgrid(x1_tmp, x2_tmp)
f = x1_tmp*w[1, 0] + x2_tmp*w[2, 0] + w[0, 0]
try:
        plt.contour(x1_tmp, x2_tmp, f, levels=[0])
except ValueError:
pass
plt.xlabel('X1')
plt.ylabel('X2')
plt.title('Feature scatter plot')
plt.legend()
plt.show()
### generate training data with ~10% label noise; returns (features, labels) as numpy arrays
def training_data_with_random_error(num=1000):
features = np.zeros((num, 3))
labels = np.zeros((num, 1))
points_x1 = np.array([round(random.uniform(-1, 1) ,2) for _ in range(num)])
points_x2 = np.array([round(random.uniform(-1, 1) ,2) for _ in range(num)])
for i in range(num):
features[i, 0] = 1
features[i, 1] = points_x1[i]
features[i, 2] = points_x2[i]
labels[i] = target_function(points_x1[i], points_x2[i])
        ### flip the labels of the first 10% of points to simulate noise
        ### (the points are i.i.d. uniform, so this is equivalent to a random 10% subset)
        if i < num*0.1:
            labels[i] = (1 if labels[i] < 0 else -1)
return features, labels
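### (editor's sketch, not used below) the problem statement asks for a *random* 10% subset;
### since the points are i.i.d., flipping the first 10% is statistically equivalent, but a
### literal, vectorized version could look like this (the _v2 name is made up for illustration):
def training_data_with_random_error_v2(num=1000):
    features = np.ones((num, 3))
    features[:, 1:] = np.random.uniform(-1, 1, size=(num, 2))          ### x1, x2 ~ U[-1, 1]
    labels = np.where(features[:, 1]**2 + features[:, 2]**2 - 0.6 >= 0, 1, -1).reshape(num, 1)
    flip = np.random.choice(num, int(num*0.1), replace=False)          ### random 10% subset
    labels[flip] = -labels[flip]
    return features, labels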
def error_rate(features, labels, w):
wrong = 0
for i in range(len(labels)):
if np.dot(features[i], w)*labels[i,0] < 0:
wrong += 1
return wrong/(len(labels)*1.0)
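### (editor's sketch) an equivalent vectorized 0/1 error rate:
###     np.mean(np.dot(features, w) * labels < 0)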
(features,labels) = training_data_with_random_error(1000)
plot_dot_picture(features, labels)
Because the features are two-dimensional, they are easy to visualize. The scatter plot shows that the distribution of the points roughly matches the target function.
### 13.1 (*) Carry out Linear Regression without transformation, i.e., with feature vector
### (1, x1, x2),
### to find wlin, and use wlin directly for classification. Run the experiments for 1000 times and plot
### a histogram on the classification (0/1) in-sample error (Ein). What is the average Ein over 1000
### experiments?
"""
linear regression:
model : g(x) = w^T * x
strategy : squared error
algorithm : closed form (normal equation)
result : w = (X^T X)^-1 X^T Y
"""
def linear_regression_closed_form(X, Y):
return np.linalg.inv(np.dot(X.T, X)).dot(X.T).dot(Y)
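### (editor's note) np.linalg.inv(X^T X) can be numerically fragile when X^T X is ill-conditioned;
### an equivalent, more stable solve would be
###     np.linalg.lstsq(X, Y, rcond=None)[0]   or   np.linalg.pinv(X).dot(Y)
### the explicit inverse above is kept to match the formula in the docstring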
w = linear_regression_closed_form(features, labels)
"""
plot the result of one run (just for visualization)
"""
plot_dot_picture(features, labels, w)
"""
run 1000 times, and plot histogram
"""
error_rate_array = []
for i in range(1000):
(features,labels) = training_data_with_random_error(1000)
w = linear_regression_closed_form(features, labels)
error_rate_array.append(error_rate(features, labels, w))
bins = np.arange(0,1,0.05)
plt.hist(error_rate_array, bins, rwidth=0.8, histtype='bar')
plt.title("Error rate histogram(without feature transform)")
plt.show()
### error rate, approximately 0.5
avr_err = sum(error_rate_array)/(len(error_rate_array)*1.0)
print "13.1--Linear regression for classification without feature transform:Average error--",avr_err
The figure below shows the line learned in a single run; as you can see, the fit is quite poor.
The next figure is a histogram of Ein over 1000 runs. Ein is around 0.5, which means essentially nothing was learned: the first-order (linear) model cannot fit this data set.
13.1--Linear regression for classification without feature transform:Average error-- 0.50587
### Now, transform the training data into the following nonlinear feature vector:
### (1, x1, x2, x1*x2, x1^2, x2^2)
### Find the vector w~ that corresponds to the solution of Linear Regression, and take it for classification.
"""
feature transform φ(x) = z = (1, x1, x2, x1*x2, x1^2, x2^2)
"""
def feature_transform(features):
new = np.zeros((len(features), 6))
new[:, 0:3] = features[:,:]*1
new[:, 3] = features[:, 1] * features[:, 2]
new[:, 4] = features[:, 1] * features[:, 1]
new[:, 5] = features[:, 2] * features[:, 2]
return new
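### quick sanity check of the transform (editor's sketch): for x = (1, 0.5, -0.2)
### the result should be (1, 0.5, -0.2, -0.1, 0.25, 0.04)
###     print(feature_transform(np.array([[1, 0.5, -0.2]])))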
def plot_dot_pictures(features, labels, w=np.zeros((6, 1))):
    x1 = features[:,1]
    x2 = features[:,2]
    y = labels[:,0]
plot_size = 20
size = np.ones((len(x1)))*plot_size
size_x1 = np.ma.masked_where(y<0, size)
size_x2 = np.ma.masked_where(y>0, size)
### plot scatter
    plt.scatter(x1, x2, s=size_x1, marker='x', c='r', label='y = +1')
    plt.scatter(x1, x2, s=size_x2, marker='o', c='b', label='y = -1')
### plot w line
x1_tmp = np.arange(-1,1,0.01)
x2_tmp = np.arange(-1,1,0.01)
x1_tmp, x2_tmp = np.meshgrid(x1_tmp, x2_tmp)
f = w[0, 0] + x1_tmp*w[1, 0] + x2_tmp*w[2, 0] + x1_tmp*x2_tmp*w[3, 0] \
+ x1_tmp*x1_tmp*w[4, 0] + x2_tmp*x2_tmp*w[5, 0]
try:
        plt.contour(x1_tmp, x2_tmp, f, levels=[0])
except ValueError:
pass
plt.xlabel('X1')
plt.ylabel('X2')
plt.title('Feature scatter plot')
plt.legend()
plt.show()
"""
plot the result of one run (just for visualization)
"""
(features,labels) = training_data_with_random_error(1000)
new_features = feature_transform(features)
w = linear_regression_closed_form(new_features, labels)
plot_dot_pictures(features, labels, w)
"""
run 1000 times, and plot histogram
"""
error_rate_array = []
for i in range(1000):
(features,labels) = training_data_with_random_error(1000)
new_features = feature_transform(features)
w = linear_regression_closed_form(new_features, labels)
error_rate_array.append(error_rate(new_features, labels, w))
bins = np.arange(0,1,0.05)
plt.hist(error_rate_array, bins, rwidth=0.8, histtype='bar')
plt.title("Error rate histogram(with feature transform)")
plt.show()
### error rate, approximately 0.12
avr_err = sum(error_rate_array)/(len(error_rate_array)*1.0)
print "13.2--Linear regression for classification with feature transform:Average error--",avr_err
So for the rest of Question 13 we use a second-order hypothesis: the feature transform turns the nonlinear problem into a linear one, so that linear regression can still be applied. As the figure shows, the learned boundary is quite good, with an error rate around 12% (the data set itself contains 10% noise).
13.2--Linear regression for classification with feature transform:Average error-- 0.124849
So is linear regression always a good fit for classification problems? Here is a small experiment: six points are deliberately chosen, three near [1, 1] and three near [-1, -1].
### is linear regression always good for classification? see the following example
features = np.array([[1, 1.1, 1.2], [1, 1.2,1.0], [1, 1.0, 1.0], [1, -1.1, -1.2], [1, -1.2, -1.0], [1, -1.0, -1.0]])
labels = np.array([[1],[1],[1],[-1],[-1],[-1]])
w = linear_regression_closed_form(features, labels)
"""
plot the result of one run (just for visualization)
"""
plot_dot_picture(features, labels, w)
Linear regression produces the line shown in the figure (this actually surprised me; I expected something close to the line y = x).
### what happens if we add a new point with very large x values?
features = np.array([[1, 100, 100], [1, 1.1, 1.2], [1, 1.2,1.0], [1, 1.0, 1.0], [1, -1.1, -1.2], [1, -1.2, -1.0], [1, -1.0, -1.0]])
labels = np.array([[1], [1],[1],[1],[-1],[-1],[-1]])
w = linear_regression_closed_form(features, labels)
"""
plot the result of one run (just for visualization)
"""
print(w)
plot_dot_picture(features, labels, w)
print(np.dot(features, w))
### total 7 points, 2 points error!!!!!
Now add a sample point at [100, 100], which is a perfectly reasonable point to include. The fit becomes a line roughly like y = x, yet two of the seven points are now misclassified (there used to be a figure here). A binary classifier such as the perceptron, or other classification methods, would handle this data without trouble, so linear regression is not always suitable for classification.
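To back this up, here is a small sketch of the perceptron learning algorithm (PLA) run on the same seven points; the pla helper below is only an illustration I added, not part of the homework code. Because the seven points are linearly separable, PLA classifies all of them correctly, outlier included.
### (sketch) a plain perceptron on the 7 points defined above
def pla(X, Y, max_iter=1000):
    w = np.zeros((X.shape[1], 1))
    for _ in range(max_iter):
        mistakes = 0
        for i in range(len(X)):
            if np.dot(X[i], w)[0] * Y[i, 0] <= 0:        ### misclassified (or on the boundary)
                w += (Y[i, 0] * X[i]).reshape(-1, 1)     ### PLA update: w <- w + y_n * x_n
                mistakes += 1
        if mistakes == 0:                                ### a full pass with no mistakes: converged
            break
    return w

w_pla = pla(features, labels)
print(error_rate(features, labels, w_pla))   ### 0.0 -- every point classified correctly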
### 14. (*) Run the experiment for 1000 times, and plot a histogram on w~3, the weight associated with
### x1*x2. What is the average w~3?
"""
run 1000 times, and plot histogram
"""
w3_array = []
for i in range(1000):
(features,labels) = training_data_with_random_error(1000)
new_features = feature_transform(features)
w = linear_regression_closed_form(new_features, labels)
w3_array.append(w[3,0])
bins = np.arange(-2,2,0.05)
plt.hist(w3_array, bins, rwidth=0.8, histtype='bar')
plt.title("Parameters W3(with feature transform)")
plt.show()
print "Average of W3 is: ", sum(w3_array)/(len(w3_array)*1.0)
Average of W3 is: 0.00120328875641
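The average of W3 is essentially 0, which makes sense: the target function x1^2 + x2^2 − 0.6 has no x1*x2 cross term, so the learned weight on x1*x2 should hover around zero.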
### 15. (*) Continue from Question 14, and plot a histogram on the classification Eout instead. You can
### estimate it by generating a new set of 1000 points and adding noise as before. What is the average
### Eout?
error_out = []
for i in range(1000):
    ### train on a fresh training set (with the feature transform)
    (features, labels) = training_data_with_random_error(1000)
    w = linear_regression_closed_form(feature_transform(features), labels)
    ### estimate Eout on a newly generated noisy test set of 1000 points
    (test_features, test_labels) = training_data_with_random_error(1000)
    error_out.append(error_rate(feature_transform(test_features), test_labels, w))
bins = np.arange(0, 1, 0.05)
plt.hist(error_out, bins, rwidth=0.8, histtype='bar')
plt.title("Error out(with feature transform)")
plt.show()
print "Average of Eout is: ", sum(error_out)/(len(error_out)*1.0)
Average of Eout is: 0.133649
### 18. (*) Implement the fixed learning rate gradient descent algorithm below for logistic regression,
### initialized with 0. Run the algorithm with η = 0.001 and T = 2000 on the following set for training:
### http://www.csie.ntu.edu.tw/~htlin/course/ml15fall/hw3/hw3_train.dat
### and the following set for testing:
### http://www.csie.ntu.edu.tw/~htlin/course/ml15fall/hw3/hw3_test.dat
### What is the weight vector within your g? What is the Eout(g) from your algorithm, evaluated using
### the 0/1 error on the test set?
import math
import numpy as np
"""
Read data from data file
"""
def data_load(file_path):
### open file and read lines
f = open(file_path)
try:
lines = f.readlines()
finally:
f.close()
    ### create features and labels arrays
example_num = len(lines)
    feature_dimension = len(lines[0].strip().split())  ### d feature values + 1 label per line = d + 1 columns (bias + d features)
features = np.zeros((example_num, feature_dimension))
features[:,0] = 1
labels = np.zeros((example_num, 1))
for index,line in enumerate(lines):
### items[0:-1]--features items[-1]--label
items = line.strip().split(' ')
### get features
features[index,1:] = [float(str_num) for str_num in items[0:-1]]
### get label
labels[index] = float(items[-1])
return features,labels
### gradient descent
def gradient_descent(X, Y, w):
### -YnWtXn
tmp = -Y*(np.dot(X, w))
    ### θ(-YnWtXn) = exp(tmp)/(1+exp(tmp))
### weight_matrix = np.array([math.exp(_)/(1+math.exp(_)) for _ in tmp]).reshape(len(X), 1)
weight_matrix = np.exp(tmp)/((1+np.exp(tmp))*1.0)
gradient = 1/(len(X)*1.0)*(sum(weight_matrix*-Y*X).reshape(len(w), 1))
return gradient
### stochastic gradient descent (one example at a time)
def stochastic_gradient_descent(X, Y, w):
### -YnWtXn
tmp = -Y*(np.dot(X, w))
    ### θ(-YnWtXn) = exp(tmp)/(1+exp(tmp))
###weight = math.exp(tmp[0])/((1+math.exp(tmp[0]))*1.0)
weight = np.exp(tmp)/((1+np.exp(tmp))*1.0)
gradient = weight*-Y*X
return gradient.reshape(len(gradient), 1)
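### (editor's sketch) a finite-difference check of gradient_descent against the logistic
### (cross-entropy) error it is derived from; eps and the check_gradient name are arbitrary
### choices for this illustration only
def check_gradient(X, Y, w, eps=1e-6):
    ### cross-entropy error E(w) = mean(ln(1 + exp(-y * w^T x)))
    def E(w_):
        return np.mean(np.log(1 + np.exp(-Y * np.dot(X, w_))))
    numeric = np.zeros_like(w)
    for j in range(len(w)):
        dw = np.zeros_like(w)
        dw[j, 0] = eps
        numeric[j, 0] = (E(w + dw) - E(w - dw)) / (2 * eps)        ### central difference
    return np.max(np.abs(numeric - gradient_descent(X, Y, w)))     ### should be close to 0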
### logistic regression class (my first time writing a Python class, haha)
class LogisticRegression:
    """Logistic regression trained with fixed-learning-rate (stochastic) gradient descent."""
    def __init__(self):
        pass
    ### fit model
    def fit(self, X, Y, Eta=0.001, max_iter=2000, sgd=False):
### ∂E/∂w = 1/N * ∑θ(-YnWtXn)(-YnXn)
self.__w = np.zeros((len(X[0]),1))
        if not sgd:
            for i in range(max_iter):
self.__w = self.__w - Eta*gradient_descent(X, Y, self.__w)
else:
index = 0
            for i in range(max_iter):
if (index >= len(X)):
index = 0
self.__w = self.__w - Eta*stochastic_gradient_descent(np.array(X[index]), Y[index], self.__w)
index += 1
### predict
def predict(self, X):
binary_result = np.dot(X, self.__w) >= 0
return np.array([(1 if _ > 0 else -1) for _ in binary_result]).reshape(len(X), 1)
### get vector w
def get_w(self):
return self.__w
### score(error rate)
def score(self, X, Y):
predict_Y = self.predict(X)
return sum(predict_Y != Y)/(len(Y)*1.0)
### training model
(X, Y) = data_load("hw3_train.dat")
lr = LogisticRegression()
lr.fit(X, Y, max_iter=2000)
### get weight vector
print("weight vector: ", lr.get_w())
### get 0/1 error on the test data
test_X, test_Y = data_load("hw3_test.dat")
### print("Eout: ", lr.score(test_X, test_Y))
lr.score(test_X, test_Y)
weight vector: [[ 0.01878417]
[-0.01260595]
[ 0.04084862]
[-0.03266317]
[ 0.01502334]
[-0.03667437]
[ 0.01255934]
[ 0.04815065]
[-0.02206419]
[ 0.02479605]
[ 0.06899284]
[ 0.0193719 ]
[-0.01988549]
[-0.0087049 ]
[ 0.04605863]
[ 0.05793382]
[ 0.061218 ]
[-0.04720391]
[ 0.06070375]
[-0.01610907]
[-0.03484607]]
array([ 0.475])
### 19. (*) Implement the fixed learning rate gradient descent algorithm below for logistic regression,
### initialized with 0. Run the algorithm with η = 0.01 and T = 2000 on the following set for training:
### http://www.csie.ntu.edu.tw/~htlin/course/ml15fall/hw3/hw3_train.dat
### and the following set for testing:
### http://www.csie.ntu.edu.tw/~htlin/course/ml15fall/hw3/hw3_test.dat
### What is the weight vector within your g? What is the Eout(g) from your algorithm, evaluated using
### the 0/1 error on the test set?
### training model
(X, Y) = data_load("hw3_train.dat")
lr_eta = LogisticRegression()
lr_eta.fit(X, Y, 0.01, 2000)
### get weight vector
print("weight vector: ", lr_eta.get_w())
### get 0/1 error on the test data
test_X, test_Y = data_load("hw3_test.dat")
print("Eout: ", lr_eta.score(test_X, test_Y))
weight vector: [[-0.00385379]
[-0.18914564]
[ 0.26625908]
[-0.35356593]
[ 0.04088776]
[-0.3794296 ]
[ 0.01982783]
[ 0.33391527]
[-0.26386754]
[ 0.13489328]
[ 0.4914191 ]
[ 0.08726107]
[-0.25537728]
[-0.16291797]
[ 0.30073678]
[ 0.40014954]
[ 0.43218808]
[-0.46227968]
[ 0.43230193]
[-0.20786372]
[-0.36936337]]
Eout: [ 0.22]
### 20. (*) Implement the fixed learning rate stochastic gradient descent algorithm below for logistic
### regression, initialized with 0. Instead of randomly choosing n in each iteration, please simply pick
### the example with the cyclic order n = 1, 2, ..., N, 1, 2, .... Run the algorithm with η = 0.001 and
### T = 2000 on the following set for training:
### http://www.csie.ntu.edu.tw/~htlin/course/ml15fall/hw3/hw3_train.dat
### and the following set for testing:
### http://www.csie.ntu.edu.tw/~htlin/course/ml15fall/hw3/hw3_test.dat
### What is the weight vector within your g? What is the Eout(g) from your algorithm, evaluated using
### the 0/1 error on the test set?
### training model
(X, Y) = data_load("hw3_train.dat")
lr_sgd = LogisticRegression()
lr_sgd.fit(X, Y, sgd=True, max_iter=2000)
### get weight vector
print("weight vector: ", lr_sgd.get_w())
### get 0/1 error on the test data
test_X, test_Y = data_load("hw3_test.dat")
print("Eout: ", lr_sgd.score(test_X, test_Y))
weight vector: [[ 0.01826899]
[-0.01308051]
[ 0.04072894]
[-0.03295698]
[ 0.01498363]
[-0.03691042]
[ 0.01232819]
[ 0.04791334]
[-0.02244958]
[ 0.02470544]
[ 0.06878235]
[ 0.01897378]
[-0.02032107]
[-0.00901469]
[ 0.04589259]
[ 0.05776824]
[ 0.06102487]
[-0.04756147]
[ 0.06035018]
[-0.01660574]
[-0.03509342]]
Eout: [ 0.473]
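For reference, the three runs differ only in the step size and the update rule: with η = 0.001 (Questions 18 and 20) the weights barely move within T = 2000 updates, which is presumably why Eout stays near 0.47, while the larger η = 0.01 in Question 19 gets much closer to convergence and brings Eout down to about 0.22.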