Coursera 機器學習基石作業一 Python 實現
阿新 • • 發佈:2018-11-10
機器學習基石作業一
import numpy as np


def train_matrix(path="hw1_15_train.dat.txt"):
    """Load a labelled data file and prepend a constant bias feature.

    Every non-blank line must contain the feature values followed by an
    integer label. Fields may be separated by any mix of spaces and tabs,
    so both ``a b c d<TAB>y`` and ``a b c d y`` layouts are accepted.

    Args:
        path: File to read. Defaults to the file name the script was
            originally written for.

    Returns:
        A tuple ``(X, Y, dataNum, dataDim)``. ``X`` has shape
        ``(dataNum, dataDim)`` and its first column is all ones, ``Y`` has
        shape ``(dataNum, 1)``, ``dataNum`` is the number of non-blank
        lines and ``dataDim`` is the number of features plus one.

    Raises:
        ValueError: If a field is not numeric or rows have different
            numbers of features.
    """
    rows = []
    labels = []
    with open(path, "r") as f:
        for line in f:
            # split() with no argument treats spaces and tabs alike and
            # drops the trailing newline.
            fields = line.split()
            if not fields:
                continue
            rows.append([1.0] + [float(value) for value in fields[:-1]])
            labels.append(int(fields[-1]))

    dataNum = len(rows)
    # The dimension comes from the parsed row itself, so it always matches
    # the width of X regardless of which separator the file uses.
    dataDim = len(rows[0]) if rows else 0
    X = np.array(rows, dtype=float).reshape(dataNum, dataDim)
    Y = np.array(labels, dtype=float).reshape(dataNum, 1)
    return X, Y, dataNum, dataDim


def PLA(path="hw1_15_train.dat.txt"):
    """Run the perceptron learning algorithm, visiting samples in file order.

    Starting from zero weights, the samples are scanned repeatedly; every
    sample with ``y * (w . x) <= 0`` triggers the update ``w += y * x``.
    The scan stops after the first full pass that makes no update, so the
    function does not terminate on data that is not linearly separable.

    Args:
        path: Training file forwarded to ``train_matrix``.

    Returns:
        The total number of weight updates performed.
    """
    x_train, y_train, dataNum, dataDim = train_matrix(path)
    w = np.zeros((dataDim, 1))
    count = 0
    while True:
        updated = False
        for x, y in zip(x_train, y_train[:, 0]):
            if np.dot(x, w)[0] * y <= 0:
                # reshape(-1, 1) follows the data's dimension instead of
                # assuming five columns.
                w += y * x.reshape(-1, 1)
                updated = True
                count += 1
        if not updated:
            break
    return count


if __name__ == '__main__':
    print(PLA())
import numpy as np


def train_matrix(path="hw1_15_train.dat.txt"):
    """Load a labelled data file, add a bias feature and shuffle the rows.

    Every non-blank line must contain the feature values followed by an
    integer label. Fields may be separated by any mix of spaces and tabs.

    Args:
        path: File to read. Defaults to the file name the script was
            originally written for.

    Returns:
        A tuple ``(X, Y, dataNum, dataDim)``. ``X`` has shape
        ``(dataNum, dataDim)`` with a leading column of ones and ``Y`` has
        shape ``(dataNum, 1)``; both are reordered by the same random
        permutation so each label stays with its row. ``dataNum`` is the
        number of non-blank lines and ``dataDim`` is the number of
        features plus one.

    Raises:
        ValueError: If a field is not numeric or rows have different
            numbers of features.
    """
    rows = []
    labels = []
    with open(path, "r") as f:
        for line in f:
            # split() with no argument treats spaces and tabs alike and
            # drops the trailing newline.
            fields = line.split()
            if not fields:
                continue
            rows.append([1.0] + [float(value) for value in fields[:-1]])
            labels.append(int(fields[-1]))

    dataNum = len(rows)
    # The dimension comes from the parsed row itself, so it always matches
    # the width of X regardless of which separator the file uses.
    dataDim = len(rows[0]) if rows else 0
    X = np.array(rows, dtype=float).reshape(dataNum, dataDim)
    Y = np.array(labels, dtype=float).reshape(dataNum, 1)

    # Shuffle samples and labels with one shared permutation.
    permutation = np.random.permutation(Y.shape[0])
    shuffled_dataset = X[permutation]
    shuffled_labels = Y[permutation]
    return shuffled_dataset, shuffled_labels, dataNum, dataDim


def PLA(path="hw1_15_train.dat.txt"):
    """Run the perceptron learning algorithm on a randomly ordered data set.

    Starting from zero weights, the shuffled samples are scanned
    repeatedly; every sample with ``y * (w . x) <= 0`` triggers the update
    ``w += y * x``. The scan stops after the first full pass that makes no
    update, so the function does not terminate on data that is not
    linearly separable.

    Args:
        path: Training file forwarded to ``train_matrix``.

    Returns:
        The total number of weight updates performed.
    """
    x_train, y_train, dataNum, dataDim = train_matrix(path)
    w = np.zeros((dataDim, 1))
    count = 0
    while True:
        updated = False
        for x, y in zip(x_train, y_train[:, 0]):
            if np.dot(x, w)[0] * y <= 0:
                # reshape(-1, 1) follows the data's dimension instead of
                # assuming five columns.
                w += y * x.reshape(-1, 1)
                updated = True
                count += 1
        if not updated:
            break
    return count


if __name__ == '__main__':
    # Average the update count over many independently shuffled runs.
    trials = 2000
    total_updates = 0
    for _ in range(trials):
        total_updates += PLA()
    print(total_updates / trials)
只要在上面的程式碼中更新 w 時,把修正量再乘以 0.5 即可,即 w += 0.5 * y * x。
import numpy
import copy


class Pocket(object):
    """Perceptron trainer that keeps the best weights seen so far.

    Args:
        dimension: Width of one sample including the leading bias column.
        train_count: Number of rows allocated for the training set.
        test_count: Number of rows allocated for the test set.
    """

    def __init__(self, dimension, train_count, test_count):
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def _load_matrix(self, path, count):
        """Parse ``path`` into a ``(count, dimension)`` matrix and labels.

        Each non-blank line holds the feature values followed by an integer
        label, separated by spaces and/or tabs. A constant 1 is prepended
        to every row. Rows beyond the last line of the file stay all-zero
        with label 0.

        Raises:
            IndexError: If the file has more non-blank lines than ``count``.
            ValueError: If a row does not have ``dimension - 1`` features.
        """
        X = numpy.zeros((count, self.__dimension))
        Y = numpy.zeros((count, 1))
        row = 0
        # The context manager closes the file even if parsing fails.
        with open(path, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                X[row, :] = [1.0] + [float(value) for value in fields[:-1]]
                Y[row, 0] = int(fields[-1])
                row += 1
        return X, Y

    def train_matrix(self, path):
        """Load the training file and return it in random row order.

        Returns:
            ``(samples, labels)``, both reordered by one shared permutation.
        """
        X, Y = self._load_matrix(path, self.__train_count)
        # Shuffle samples and labels together.
        permutation = numpy.random.permutation(Y.shape[0])
        return X[permutation], Y[permutation]

    def iteration(self, path, max_updates=50):
        """Train with one pass over the shuffled data, keeping the best weights.

        Samples are visited once in shuffled order. A sample with
        ``y * (w . x) <= 0`` triggers ``w += 0.5 * y * x``. After every
        update the whole training set is re-scored, and the weights are
        kept whenever they make strictly fewer mistakes than the best seen
        so far.

        Args:
            path: Training file.
            max_updates: Stop once this many updates have been made.

        Returns:
            The ``(dimension, 1)`` weight vector with the fewest training
            mistakes among those scored; all zeros if no update improved on
            ``train_count`` mistakes.
        """
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        best_w = numpy.zeros((self.__dimension, 1))
        best_count = self.__train_count
        count = 0
        for i in range(self.__train_count):
            if count >= max_updates:
                break
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # reshape(-1, 1) follows the configured dimension instead
                # of assuming five columns.
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                count += 1
                # Score the updated weights on the full training set.
                margins = numpy.dot(x_train, w)[:, 0] * y_train[:, 0]
                num = int(numpy.count_nonzero(margins <= 0))
                if num < best_count:
                    best_count = num
                    best_w = copy.deepcopy(w)
        return best_w

    def test_matrix(self, test_path):
        """Load the test file in file order (no shuffling).

        Returns:
            ``(samples, labels)`` with shapes ``(test_count, dimension)``
            and ``(test_count, 1)``.
        """
        return self._load_matrix(test_path, self.__test_count)

    def test_error(self, train_path, test_path, max_updates=50):
        """Train on ``train_path`` and return the error rate on ``test_path``.

        Args:
            train_path: Training file.
            test_path: Test file.
            max_updates: Forwarded to ``iteration``.

        Returns:
            Fraction of the ``test_count`` rows with ``y * (w . x) <= 0``.
        """
        w = self.iteration(train_path, max_updates)
        x_test, y_test = self.test_matrix(test_path)
        margins = numpy.dot(x_test, w)[:, 0] * y_test[:, 0]
        mistakes = float(numpy.count_nonzero(margins <= 0))
        return mistakes / self.__test_count


if __name__ == '__main__':
    # Average the test error over many independently shuffled runs.
    average_error_rate = 0
    for _ in range(2000):
        my_Pocket = Pocket(5, 500, 500)
        average_error_rate += my_Pocket.test_error('hw1_18_train.dat.txt', 'hw1_18_test.dat.txt')
    print(average_error_rate / 2000.0)
import numpy
import copy
class Pocket(object):
    """Perceptron trainer that stops after a fixed number of updates.

    This variant returns the weights as they stand after the last update;
    it does not keep a separate best-so-far weight vector.

    Args:
        dimension: Width of one sample including the leading bias column.
        train_count: Number of rows allocated for the training set.
        test_count: Number of rows allocated for the test set.
    """

    def __init__(self, dimension, train_count, test_count):
        self.__dimension = dimension
        self.__train_count = train_count
        self.__test_count = test_count

    def _load_matrix(self, path, count):
        """Parse ``path`` into a ``(count, dimension)`` matrix and labels.

        Each non-blank line holds the feature values followed by an integer
        label, separated by spaces and/or tabs. A constant 1 is prepended
        to every row. Rows beyond the last line of the file stay all-zero
        with label 0.

        Raises:
            IndexError: If the file has more non-blank lines than ``count``.
            ValueError: If a row does not have ``dimension - 1`` features.
        """
        X = numpy.zeros((count, self.__dimension))
        Y = numpy.zeros((count, 1))
        row = 0
        # The context manager closes the file even if parsing fails.
        with open(path, "r") as f:
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                X[row, :] = [1.0] + [float(value) for value in fields[:-1]]
                Y[row, 0] = int(fields[-1])
                row += 1
        return X, Y

    def train_matrix(self, path):
        """Load the training file and return it in random row order.

        Returns:
            ``(samples, labels)``, both reordered by one shared permutation.
        """
        X, Y = self._load_matrix(path, self.__train_count)
        # Shuffle samples and labels together.
        permutation = numpy.random.permutation(Y.shape[0])
        return X[permutation], Y[permutation]

    def iteration(self, path, max_updates=50):
        """Train with one pass over the shuffled data, capped at ``max_updates``.

        Samples are visited once in shuffled order. A sample with
        ``y * (w . x) <= 0`` triggers ``w += 0.5 * y * x``. The pass ends
        when the data runs out or ``max_updates`` updates have been made.

        Args:
            path: Training file.
            max_updates: Stop once this many updates have been made.

        Returns:
            The ``(dimension, 1)`` weight vector after the last update.
        """
        x_train, y_train = self.train_matrix(path)
        w = numpy.zeros((self.__dimension, 1))
        count = 0
        for i in range(self.__train_count):
            if count >= max_updates:
                break
            if numpy.dot(x_train[i, :], w)[0] * y_train[i, 0] <= 0:
                # reshape(-1, 1) follows the configured dimension instead
                # of assuming five columns.
                w += 0.5 * y_train[i, 0] * x_train[i, :].reshape(-1, 1)
                # One more update made.
                count += 1
        return w

    def test_matrix(self, test_path):
        """Load the test file in file order (no shuffling).

        Returns:
            ``(samples, labels)`` with shapes ``(test_count, dimension)``
            and ``(test_count, 1)``.
        """
        return self._load_matrix(test_path, self.__test_count)

    def test_error(self, train_path, test_path, max_updates=50):
        """Train on ``train_path`` and return the error rate on ``test_path``.

        Args:
            train_path: Training file.
            test_path: Test file.
            max_updates: Forwarded to ``iteration``.

        Returns:
            Fraction of the ``test_count`` rows with ``y * (w . x) <= 0``.
        """
        w = self.iteration(train_path, max_updates)
        x_test, y_test = self.test_matrix(test_path)
        margins = numpy.dot(x_test, w)[:, 0] * y_test[:, 0]
        mistakes = float(numpy.count_nonzero(margins <= 0))
        return mistakes / self.__test_count
if __name__ == '__main__':
    # Each run builds a fresh Pocket (5 columns, 500 training rows, 500 test
    # rows), trains it and measures its test error; the mean over 2000 runs
    # is printed.
    run_errors = (
        Pocket(5, 500, 500).test_error('hw1_18_train.dat.txt', 'hw1_18_test.dat.txt')
        for _ in range(2000)
    )
    print(sum(run_errors) / 2000.0)
只需要在第 18 題的程式碼上修改 count 的判斷條件,把 count == 50 改成 count == 100 即可。