Machine Learning with sklearn (77): Algorithm Examples (34) Regression (6) The Linear Regression Family (4) Multicollinearity: Ridge and Lasso (1) Ridge Regression
1 The Most Familiar Stranger: Multicollinearity
The ordinary least-squares solution depends on inverting the matrix X^T X, so three equivalent statements matter here:

- the necessary and sufficient condition for the inverse matrix to exist
- the necessary and sufficient condition for the determinant to be nonzero
- the necessary and sufficient condition for the matrix to be full rank

For a square matrix these are one and the same condition: the inverse exists if and only if the determinant is nonzero, if and only if the matrix has full rank. Multicollinearity among the features makes X^T X singular or nearly singular, so the least-squares solution becomes undefined or wildly unstable.

2 Ridge Regression
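To connect the conditions above to ridge regression, here are the two closed-form solutions side by side (a short addition of mine, using the series' usual notation: design matrix X, response y, identity matrix I):

```latex
% OLS requires (X^{\top}X)^{-1}, which multicollinearity destroys
\hat{w}_{\text{OLS}} = (X^{\top}X)^{-1}X^{\top}y
% Ridge adds \alpha I to the diagonal; for any \alpha > 0 the matrix
% X^{\top}X + \alpha I is positive definite, hence always invertible
\hat{w}_{\text{ridge}} = (X^{\top}X + \alpha I)^{-1}X^{\top}y
```

This is the whole trick: X^T X is positive semidefinite, so adding any positive multiple of the identity makes it positive definite and restores invertibility, at the cost of some bias in the coefficients.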
2.1 Ridge regression solves the multicollinearity problem

2.2 linear_model.Ridge

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.model_selection import train_test_split as TTS
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_california_housing as fch
import matplotlib.pyplot as plt

housevalue = fch()
X = pd.DataFrame(housevalue.data)
y = housevalue.target
X.columns = ["Median household income", "Median house age", "Average number of rooms",
             "Average number of bedrooms", "Block population", "Average occupancy",
             "Block latitude", "Block longitude"]
X.head()

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
# Restore the indices of the train and test sets
for i in [Xtrain, Xtest]:
    i.index = range(i.shape[0])

# Fit a ridge regression model
reg = Ridge(alpha=1).fit(Xtrain, Ytrain)
reg.score(Xtest, Ytest)

# Under cross-validation, how does ridge compare with plain linear regression?
alpharange = np.arange(1, 1001, 100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    regs = cross_val_score(reg, X, y, cv=5, scoring="r2").mean()
    linears = cross_val_score(linear, X, y, cv=5, scoring="r2").mean()
    ridge.append(regs)
    lr.append(linears)
plt.plot(alpharange, ridge, color="red", label="Ridge")
plt.plot(alpharange, lr, color="orange", label="LR")
plt.title("Mean")
plt.legend()
plt.show()

# Refine the learning curve: re-run the comparison above with a finer grid
alpharange = np.arange(1, 201, 10)
```
```python
# How does the model's variance change with alpha?
alpharange = np.arange(1, 1001, 100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    varR = cross_val_score(reg, X, y, cv=5, scoring="r2").var()
    varLR = cross_val_score(linear, X, y, cv=5, scoring="r2").var()
    ridge.append(varR)
    lr.append(varLR)
plt.plot(alpharange, ridge, color="red", label="Ridge")
plt.plot(alpharange, lr, color="orange", label="LR")
plt.title("Variance")
plt.legend()
plt.show()
```
```python
from sklearn.datasets import load_boston  # removed in scikit-learn 1.2; see the note below
from sklearn.model_selection import cross_val_score

X = load_boston().data
y = load_boston().target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# First look at how the variance changes
alpharange = np.arange(1, 1001, 100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    varR = cross_val_score(reg, X, y, cv=5, scoring="r2").var()
    varLR = cross_val_score(linear, X, y, cv=5, scoring="r2").var()
    ridge.append(varR)
    lr.append(varLR)
plt.plot(alpharange, ridge, color="red", label="Ridge")
plt.plot(alpharange, lr, color="orange", label="LR")
plt.title("Variance")
plt.legend()
plt.show()

# Then look at how R² changes
alpharange = np.arange(1, 1001, 100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    regs = cross_val_score(reg, X, y, cv=5, scoring="r2").mean()
    linears = cross_val_score(linear, X, y, cv=5, scoring="r2").mean()
    ridge.append(regs)
    lr.append(linears)
plt.plot(alpharange, ridge, color="red", label="Ridge")
plt.plot(alpharange, lr, color="orange", label="LR")
plt.title("Mean")
plt.legend()
plt.show()

# Refine the learning curve, this time for ridge alone
alpharange = np.arange(100, 300, 10)
ridge = []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    regs = cross_val_score(reg, X, y, cv=5, scoring="r2").mean()
    ridge.append(regs)
plt.plot(alpharange, ridge, color="red", label="Ridge")
plt.title("Mean")
plt.legend()
plt.show()
```
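A note that is not in the original post: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so the cell above no longer runs on recent versions. A minimal sketch of a substitute, assuming the "boston" dataset remains available on OpenML:

```python
from sklearn.datasets import fetch_openml

# Hypothetical drop-in replacement for load_boston on scikit-learn >= 1.2:
# fetch the same Boston housing data from OpenML instead.
boston = fetch_openml(name="boston", version=1, as_frame=False)
X = boston.data.astype(float)
y = boston.target.astype(float)
```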
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# Build a 10x10 Hilbert matrix, a classic ill-conditioned matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)

# Compute the x-axis: 200 regularization strengths
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

# Fit a model at each regularization strength and record the coefficients
coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

# Plot the results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1])  # reverse the x-axis
plt.xlabel('Regularization parameter alpha')
plt.ylabel('Coefficients w')
plt.title('Ridge trace plot for ridge regression')
plt.axis('tight')
plt.show()
```
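What the trace shows can be read straight off the closed-form solution; this two-line derivation is my addition, valid here because the Hilbert matrix is invertible (merely ill-conditioned):

```latex
% As alpha -> 0 the penalty vanishes and ridge recovers the OLS solution;
% as alpha -> infinity the penalty dominates and every coefficient shrinks to zero.
\lim_{\alpha \to 0^{+}} \hat{w}_{\text{ridge}} = (X^{\top}X)^{-1}X^{\top}y
\qquad
\lim_{\alpha \to \infty} \hat{w}_{\text{ridge}} = 0
```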
The RidgeCV class is also very easy to use. Once again we use the California housing dataset built earlier:

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import train_test_split as TTS
from sklearn.datasets import fetch_california_housing as fch
import matplotlib.pyplot as plt

housevalue = fch()
X = pd.DataFrame(housevalue.data)
y = housevalue.target
X.columns = ["Median household income", "Median house age", "Average number of rooms",
             "Average number of bedrooms", "Block population", "Average occupancy",
             "Block latitude", "Block longitude"]

Ridge_ = RidgeCV(alphas=np.arange(1, 1001, 100)
                 # , scoring="neg_mean_squared_error"
                 , store_cv_values=True
                 # , cv=5
                 ).fit(X, y)

# R² of the refitted model on the full data (not a cross-validated score)
Ridge_.score(X, y)

# Inspect all the cross-validation results: one value per sample per alpha
Ridge_.cv_values_.shape

# Averaging over samples gives the cross-validation result for each alpha
Ridge_.cv_values_.mean(axis=0)

# The regularization parameter selected as the best
Ridge_.alpha_
```
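A usage note of mine on the two commented-out options: cv_values_ comes from the default leave-one-out scheme, and scikit-learn refuses store_cv_values=True together with an explicit cv. A k-fold variant would therefore look like this sketch:

```python
# With cv set, RidgeCV cannot store per-sample results,
# so store_cv_values must be left at its default of False.
Ridge_kf = RidgeCV(alphas=np.arange(1, 1001, 100),
                   scoring="neg_mean_squared_error",
                   cv=5).fit(X, y)
Ridge_kf.alpha_  # best alpha under 5-fold cross-validation
```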