
Machine Learning with sklearn (77): Algorithm Examples (34) Regression (6) The Linear Regression Family (4) Multicollinearity: Ridge Regression and Lasso (1) Ridge Regression

1 The Most Familiar Stranger: Multicollinearity

A matrix has an inverse if and only if its determinant is nonzero, and its determinant is nonzero if and only if the matrix is full rank. These three conditions are equivalent, and multicollinearity is precisely what violates them: when features are linearly dependent, X^T X is no longer full rank, its determinant is 0, and the OLS solution w = (X^T X)^(-1) X^T y cannot be computed.
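A minimal numpy sketch (my addition, not from the original text) makes the equivalence concrete: add a column that is an exact linear combination of other features, and X^T X loses full rank while its determinant collapses to zero.

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((100, 3))
#Add a 4th column that is an exact linear combination of the first two
X = np.column_stack([X, X[:, 0] + X[:, 1]])

XtX = X.T @ X
print(np.linalg.matrix_rank(XtX)) #3, not 4: X^T X is not full rank
print(np.linalg.det(XtX)) #effectively 0, up to floating-point noise
#np.linalg.inv(XtX) is therefore meaningless: the OLS solution does not exist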

2 Ridge Regression

2.1 Ridge Regression Solves the Multicollinearity Problem

Ridge regression adds the regularization parameter α to the diagonal of X^T X, so the matrix being inverted becomes X^T X + αI. For any α > 0 this matrix is full rank and invertible, which is exactly the property that multicollinearity destroys in ordinary least squares (see the sketch below).

2.2 linear_model.Ridge
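Before turning to the class itself, here is a minimal sketch (my addition; the data is made up) showing that the ridge solution w = (X^T X + αI)^(-1) X^T y exists even for the collinear design matrix above, and that sklearn's Ridge computes exactly this system when fit_intercept=False.

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(0)
X = rng.random((100, 3))
X = np.column_stack([X, X[:, 0] + X[:, 1]]) #collinear design matrix, as above
y = rng.random(100)

alpha = 1.0
#X^T X + alpha*I is full rank for any alpha > 0, so this always has a solution
w_manual = np.linalg.solve(X.T @ X + alpha * np.eye(X.shape[1]), X.T @ y)

w_sklearn = Ridge(alpha=alpha, fit_intercept=False).fit(X, y).coef_
print(np.allclose(w_manual, w_sklearn)) #True, up to numerical tolerance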
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.model_selection import train_test_split as TTS
from sklearn.model_selection import cross_val_score
from sklearn.datasets import fetch_california_housing as fch
import matplotlib.pyplot as plt

housevalue = fch()
X = pd.DataFrame(housevalue.data)
y = housevalue.target
X.columns = ["Median household income","Median house age","Average number of rooms"
            ,"Average number of bedrooms","Block population","Average occupancy","Block latitude","Block longitude"]
X.head()
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)

#Restore the indices of the split data sets
for i in [Xtrain,Xtest]:
    i.index = range(i.shape[0])

#Fit a ridge regression model
reg = Ridge(alpha=1).fit(Xtrain,Ytrain)
reg.score(Xtest,Ytest)

#Under cross-validation, how do ridge regression results compare with linear regression?
alpharange = np.arange(1,1001,100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    regs = cross_val_score(reg,X,y,cv=5,scoring = "r2").mean()
    linears = cross_val_score(linear,X,y,cv=5,scoring = "r2").mean()
    ridge.append(regs)
    lr.append(linears)
plt.plot(alpharange,ridge,color="red",label="Ridge")
plt.plot(alpharange,lr,color="orange",label="LR")
plt.title("Mean")
plt.legend()
plt.show()

#Refine the learning curve (re-run the loop and plotting code above with this range)
alpharange = np.arange(1,201,10)
#How does the model's variance change?
alpharange = np.arange(1,1001,100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    varR = cross_val_score(reg,X,y,cv=5,scoring="r2").var()
    varLR = cross_val_score(linear,X,y,cv=5,scoring="r2").var()
    ridge.append(varR)
    lr.append(varLR)
plt.plot(alpharange,ridge,color="red",label="Ridge")
plt.plot(alpharange,lr,color="orange",label="LR")
plt.title("Variance")
plt.legend()
plt.show()
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
#Note: load_boston was removed in scikit-learn 1.2; this example requires an older version
X = load_boston().data
y = load_boston().target
Xtrain,Xtest,Ytrain,Ytest = TTS(X,y,test_size=0.3,random_state=420)

#First, look at how the variance changes
alpharange = np.arange(1,1001,100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    varR = cross_val_score(reg,X,y,cv=5,scoring="r2").var()
    varLR = cross_val_score(linear,X,y,cv=5,scoring="r2").var()
    ridge.append(varR)
    lr.append(varLR)
plt.plot(alpharange,ridge,color="red",label="Ridge")
plt.plot(alpharange,lr,color="orange",label="LR")
plt.title("Variance")
plt.legend()
plt.show()
#Look at how the R2 changes
alpharange = np.arange(1,1001,100)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    linear = LinearRegression()
    regs = cross_val_score(reg,X,y,cv=5,scoring = "r2").mean()
    linears = cross_val_score(linear,X,y,cv=5,scoring = "r2").mean()
    ridge.append(regs)
    lr.append(linears)
plt.plot(alpharange,ridge,color="red",label="Ridge")
plt.plot(alpharange,lr,color="orange",label="LR")
plt.title("Mean")
plt.legend()
plt.show()
#Refine the learning curve
alpharange = np.arange(100,300,10)
ridge, lr = [], []
for alpha in alpharange:
    reg = Ridge(alpha=alpha)
    #linear = LinearRegression()
    regs = cross_val_score(reg,X,y,cv=5,scoring = "r2").mean()
    #linears = cross_val_score(linear,X,y,cv=5,scoring = "r2").mean()
    ridge.append(regs)
    #lr.append(linears)
plt.plot(alpharange,ridge,color="red",label="Ridge")
#plt.plot(alpharange,lr,color="orange",label="LR")
plt.title("Mean")
plt.legend()
plt.show()
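A small follow-up (my addition, continuing directly from the block above): once the refined curve has been computed, the best α can be read off programmatically rather than by eye.

import numpy as np
#Assumes alpharange and ridge from the preceding block are still in scope
best_alpha = alpharange[np.argmax(ridge)]
print(best_alpha, max(ridge))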
2.3 Choosing the Best Regularization Parameter Value
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

#Create a 10x10 Hilbert matrix
X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
y = np.ones(10)

#Compute the x-axis values
n_alphas = 200
alphas = np.logspace(-10, -2, n_alphas)

#Fit a model and record the coefficients under each regularization value
coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(X, y)
    coefs.append(ridge.coef_)

#Plot the results
ax = plt.gca()
ax.plot(alphas, coefs)
ax.set_xscale('log')
ax.set_xlim(ax.get_xlim()[::-1]) #reverse the x-axis
plt.xlabel('Regularization parameter alpha')
plt.ylabel('Coefficients w')
plt.title('Ridge trace plot')
plt.axis('tight')
plt.show()
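As an aside (my addition): the Hilbert matrix is a textbook example of an ill-conditioned matrix, which is why the coefficients in the ridge trace above swing so violently as α shrinks toward zero. Its condition number makes the point:

import numpy as np

X = 1. / (np.arange(1, 11) + np.arange(0, 10)[:, np.newaxis])
print(np.linalg.cond(X)) #on the order of 1e13: tiny input noise is hugely amplified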
The RidgeCV class is also very easy to use. We again use the California housing dataset created earlier:
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV, LinearRegression
from sklearn.model_selection import train_test_split as TTS
from sklearn.datasets import fetch_california_housing as fch
import matplotlib.pyplot as plt

housevalue = fch()
X = pd.DataFrame(housevalue.data)
y = housevalue.target
X.columns = ["Median household income","Median house age","Average number of rooms"
            ,"Average number of bedrooms","Block population","Average occupancy","Block latitude","Block longitude"]

Ridge_ = RidgeCV(alphas=np.arange(1,1001,100)
                #,scoring="neg_mean_squared_error"
                ,store_cv_values=True
                #,cv=5
                ).fit(X, y)
#Ridge regression score independent of cross-validation (R2 on the full data)
Ridge_.score(X,y)

#Retrieve all of the cross-validation results
Ridge_.cv_values_.shape

#Averaging them shows the cross-validation result under each regularization value
Ridge_.cv_values_.mean(axis=0)

#Look at the best regularization parameter that was selected
Ridge_.alpha_
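One caveat worth adding (mine, based on scikit-learn's documented behavior): cv_values_ is only available when store_cv_values=True and the default efficient leave-one-out cross-validation is used; combining store_cv_values=True with an explicit cv (such as the commented-out cv=5) raises an error. A minimal sketch of the two modes (note that newer scikit-learn versions rename the parameter to store_cv_results):

import numpy as np
from sklearn.linear_model import RidgeCV
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True)

#Default mode: efficient leave-one-out CV, per-sample results are stored
loo = RidgeCV(alphas=np.arange(1,1001,100), store_cv_values=True).fit(X, y)
print(loo.alpha_, loo.cv_values_.shape) #(n_samples, n_alphas)

#k-fold mode: pass cv explicitly and leave store_cv_values off
kfold = RidgeCV(alphas=np.arange(1,1001,100), cv=5).fit(X, y)
print(kfold.alpha_)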