Logistic Regression: Predicting Exam Admission
阿新 • Published 2018-11-28
一、The LogisticRegression imported below is a module I wrote myself; its code comes from the Boston housing price prediction post (波士頓房價預測). A sketch of what such a module might look like follows.
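The post does not list that module, so here is a minimal sketch of a LogisticRegression.py that exposes the gradDescent, cost_function, accuracy and feature_scalling names used below. It assumes standard batch gradient descent on the binary cross-entropy cost and z-score feature scaling; the sigmoid/hypothesis helpers and the epsilon guard inside cost_function are my own additions, not taken from the original post.

# LogisticRegression.py -- hypothetical sketch of the self-written module
import numpy as np

def feature_scalling(X):
    # standardize each feature column to zero mean and unit variance
    return (X - X.mean(axis=0)) / X.std(axis=0)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def hypothesis(X, W, b):
    # predicted probability of the positive class, shape (m, 1)
    return sigmoid(X.dot(W) + b)

def cost_function(X, y, W, b):
    # binary cross-entropy averaged over the m samples
    m = X.shape[0]
    h = hypothesis(X, W, b)
    eps = 1e-10  # guard against log(0)
    return -np.sum(y * np.log(h + eps) + (1 - y) * np.log(1 - h + eps)) / m

def gradDescent(X, y, W, b, alpha, maxIt):
    # batch gradient descent; returns the learned W, b and the cost at each iteration
    m = X.shape[0]
    cost_history = []
    for _ in range(maxIt):
        h = hypothesis(X, W, b)
        grad_W = X.T.dot(h - y) / m
        grad_b = np.sum(h - y) / m
        W = W - alpha * grad_W
        b = b - alpha * grad_b
        cost_history.append(cost_function(X, y, W, b))
    return W, b, cost_history

def accuracy(X, y, W, b):
    # fraction of samples whose predicted class (threshold 0.5) matches the label
    pred = (hypothesis(X, W, b) >= 0.5).astype(int)
    return np.mean(pred == y)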
二、Complete code
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from LogisticRegression import gradDescent, cost_function, accuracy, feature_scalling


def load_data():
    # names= adds column labels to the pandas DataFrame; converting to a numpy array
    # drops the row/column labels (.to_numpy() replaces the removed DataFrame.as_matrix())
    data = pd.read_csv('./data/LogiReg_data.txt', names=['exam1', 'exam2', 'label']).to_numpy()
    X = data[:, :-1]   # first two columns: the two exam scores
    y = data[:, -1:]   # last column: the admission label
    print(X)
    print(X.shape)     # (100, 2)
    print(X.shape[0])  # 100
    # np.random.permutation(X.shape[0]) returns the row indices 0..99 in random order,
    # so indexing with it shuffles the samples and labels together
    shuffle_index = np.random.permutation(X.shape[0])
    print(shuffle_index)  # the integers 0..99, randomly shuffled
    X = X[shuffle_index]
    print(X)
    y = y[shuffle_index]
    return X, y  # shuffled features X and shuffled labels y


def visualize_data(X, y):
    positive = np.where(y == 1)[0]  # indices of the positive samples
    negative = np.where(y == 0)[0]  # indices of the negative samples
    plt.scatter(X[positive, 0], X[positive, 1], s=30, c='b', marker='o', label='Admitted')
    plt.scatter(X[negative, 0], X[negative, 1], s=30, c='r', marker='o', label='Not Admitted')
    plt.legend()  # show the Admitted / Not Admitted legend
    plt.show()


def visualize_cost(ite, cost):
    # plot the cost against the iteration number; np.linspace(0, ite, ite) produces
    # one x value per recorded cost, so both arrays have length maxIt (10000 here)
    plt.plot(np.linspace(0, ite, ite), cost, linewidth=1)
    plt.title('cost history', color='r')
    plt.xlabel('iterations')
    plt.ylabel('cost J')
    plt.show()


if __name__ == '__main__':
    # Step 1. Load data
    X, y = load_data()
    # Step 2. Visualize data
    visualize_data(X, y)
    # Step 3. Train with gradient descent
    m, n = X.shape  # 100, 2
    X = feature_scalling(X)
    alpha = 0.1
    W = np.random.randn(n, 1)
    b = 0.1
    maxIt = 10000
    W, b, cost_history = gradDescent(X, y, W, b, alpha, maxIt)
    print("******************")
    print(cost_history[:20])
    visualize_cost(maxIt, cost_history)
    print("accuracy is: " + str(accuracy(X, y, W, b)))
    print("W:", W)
    print("b: ", b)
    print("******************")
三、The data
https://github.com/TolicWang/MachineLearningWithMe/blob/master/Lecture_02/data/LogiReg_data.txt
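As a quick sanity check, the file can be loaded with the same pd.read_csv call used above. This sketch assumes the file has been saved locally to ./data/ and contains the 100 comma-separated rows that the comments in the code above refer to.

# Quick look at the dataset (assumes the file was downloaded to ./data/)
import pandas as pd

df = pd.read_csv('./data/LogiReg_data.txt', names=['exam1', 'exam2', 'label'])
print(df.shape)                    # expected (100, 3): two exam scores plus the label
print(df.head())                   # first few rows
print(df['label'].value_counts())  # how many admitted (1) vs. not admitted (0)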
四、Implementation with sklearn
Code:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from LogisticRegression import feature_scalling
from sklearn.linear_model import LogisticRegression


def load_data():
    data = pd.read_csv('./data/LogiReg_data.txt', names=['exam1', 'exam2', 'label']).to_numpy()
    X = data[:, :-1]  # first two columns: the two exam scores
    y = data[:, -1:]  # last column: the admission label
    shuffle_index = np.random.permutation(X.shape[0])
    X = X[shuffle_index]
    y = y[shuffle_index]
    return X, y


def visualize_cost(ite, cost):
    plt.plot(np.linspace(0, ite, ite), cost, linewidth=1)
    plt.title('cost history', color='r')
    plt.xlabel('iterations')
    plt.ylabel('cost J')
    plt.show()


if __name__ == '__main__':
    X, y = load_data()
    X = feature_scalling(X)
    lr = LogisticRegression()
    lr.fit(X, y.ravel())  # ravel() turns the (100, 1) label column into the 1-D array sklearn expects
    print("******************")
    print("accuracy is:", lr.score(X, y.ravel()))
    print("W:{},b:{}".format(lr.coef_, lr.intercept_))
    print("******************")
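As a follow-up, the learned coefficients can be turned into a decision boundary plot. This is only a sketch assuming X, y and the fitted lr from the script above are still in scope; the plotted line is w1*x1 + w2*x2 + b = 0 in the scaled feature space.

# Optional: plot the decision boundary learned by sklearn on the scaled features
import numpy as np
import matplotlib.pyplot as plt

w1, w2 = lr.coef_[0]      # weights for the two (scaled) exam scores
b = lr.intercept_[0]

positive = np.where(y.ravel() == 1)[0]
negative = np.where(y.ravel() == 0)[0]
plt.scatter(X[positive, 0], X[positive, 1], s=30, c='b', marker='o', label='Admitted')
plt.scatter(X[negative, 0], X[negative, 1], s=30, c='r', marker='x', label='Not Admitted')

# Decision boundary: w1*x1 + w2*x2 + b = 0  =>  x2 = -(w1*x1 + b) / w2
x1 = np.linspace(X[:, 0].min(), X[:, 0].max(), 100)
x2 = -(w1 * x1 + b) / w2
plt.plot(x1, x2, 'g-', linewidth=1, label='Decision boundary')
plt.xlabel('exam1 (scaled)')
plt.ylabel('exam2 (scaled)')
plt.legend()
plt.show()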