
PLS (Partial Least Squares) Regression Implementation

1. PLSRegression Parameters

Cross decomposition: sklearn.cross_decomposition.PLSRegression

class sklearn.cross_decomposition.PLSRegression(n_components=2, *, scale=True, max_iter=500, tol=1e-06, copy=True)

Parameters:

n_components: int, (default 2). Number of components to keep.
scale: boolean, (default True). Whether to scale the data.
max_iter: int, (default 500). Maximum number of iterations of the NIPALS inner loop (used only when algorithm="nipals").
tol: non-negative real, (default 1e-06). Tolerance used in the iterative algorithm.
copy: boolean, (default True). Whether the deflation should be done on a copy. Leave it at the default True unless you don't care about side effects.
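A minimal usage sketch showing how these parameters are passed. The synthetic data here is purely for illustration and is not from the original example:

import numpy as np
from sklearn.cross_decomposition import PLSRegression

# Synthetic data: 100 samples, 5 predictors, 1 response (illustrative only)
rng = np.random.RandomState(0)
X_demo = rng.normal(size=(100, 5))
y_demo = X_demo[:, 0] - 2 * X_demo[:, 1] + rng.normal(scale=0.1, size=100)

pls_demo = PLSRegression(n_components=2, scale=True, max_iter=500, tol=1e-06)
pls_demo.fit(X_demo, y_demo)
print(pls_demo.predict(X_demo).shape)  # (100, 1): predictions come back as a column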

Attributes:

x_weights_: array, [p, n_components]. X block weights vectors.
y_weights_: array, [q, n_components]. Y block weights vectors.
x_loadings_: array, [p, n_components]. X block loadings vectors.
y_loadings_: array, [q, n_components]. Y block loadings vectors.
x_scores_: array, [n_samples, n_components]. X scores.
y_scores_: array, [n_samples, n_components]. Y scores.
x_rotations_: array, [p, n_components]. X block to latents rotations.
y_rotations_: array, [q, n_components]. Y block to latents rotations.
coef_: array, [p, q]. Coefficients of the linear model Y = X coef_ + Err.

Matrices:

T: x_scores_, the components extracted from the independent variables (X) that best explain the response;
U: y_scores_, the components extracted from the dependent variables (Y), predicted through T;
W: x_weights_, the weights of each extracted X component;
C: y_weights_, the weights of each extracted Y component;
P: x_loadings_, the loadings of X on T; Q: y_loadings_, the loadings of Y on U.
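These relationships can be checked directly on a fitted model. A small sketch, reusing the pls_demo model fitted above; it assumes sklearn's documented behavior that transform() projects the centered, scaled X through x_rotations_ (scaling uses the sample standard deviation, ddof=1):

# T is what transform() returns for the training data
T = pls_demo.transform(X_demo)
print(np.allclose(T, pls_demo.x_scores_))  # True

# T is the (centered, scaled) X projected through x_rotations_
X_scaled = (X_demo - X_demo.mean(axis=0)) / X_demo.std(axis=0, ddof=1)
print(np.allclose(T, X_scaled @ pls_demo.x_rotations_))  # True

# X_scaled decomposes as T @ P.T plus a deflation residual E
E = X_scaled - T @ pls_demo.x_loadings_.T
print(np.linalg.norm(E))  # norm of the residual left after 2 components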

2. Worked Example

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')  # on matplotlib >= 3.6 this style is named 'seaborn-v0_8'
from sklearn.cross_decomposition import PLSRegression

### Connect to the Hive big-data platform
from pyhive import hive
conn = hive.Connection(host='172.23.52.11', port=10000, username='cp4d', database='tmp')
cursor = conn.cursor()
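The connection above is opened but never used in the original snippet. A minimal sketch of pulling a table into pandas over that connection; the query and the table name tmp.datapap are assumptions, not from the source:

# Hypothetical query: substitute the real table and columns on your platform
df_hive = pd.read_sql('SELECT * FROM tmp.datapap', conn)
print(df_hive.shape)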


### Import data
datapap = pd.read_excel('/project_data/data_asset/datapap.xlsx')
print(datapap.shape)
datapap.head(17)

# PLS model
# Read data
data = datapap
data.columns
X = data.drop(['name', 'flag', 'target_y', '時間'], axis=1).to_numpy()  # drop columns that are not predictors in the model
Y = data['target_y'].to_numpy()

# Define the PLS regression object
pls = PLSRegression(n_components=4)
# Fit the data
pls.fit(X, Y)
Ypredict = pls.predict(X).flatten()

# Coefficient of determination between actual and predicted values; the closer to 1 the better
R2Y = pls.score(X, Y)
R2Y

def _calculate_vips(model):
    t = model.x_scores_
    w = model.x_weights_
    q = model.y_loadings_
    p, h = w.shape
    vips = np.zeros((p,))  # np.zeros() initializes a zero vector
    # np.matmul(a, b) is matrix multiplication; np.diag() extracts the diagonal
    # of a matrix (given a 1-D array it builds a diagonal matrix instead)
    s = np.diag(np.matmul(np.matmul(np.matmul(t.T, t), q.T), q)).reshape(h, -1)
    total_s = np.sum(s)
    for i in range(p):
        # np.linalg.norm() computes the norm: the square root of the sum of squared elements
        weight = np.array([(w[i, j] / np.linalg.norm(w[:, j]))**2 for j in range(h)])
        vips[i] = np.sqrt(p * (np.matmul(s.T, weight)) / total_s)  # s.T is the transpose of s
    return vips

_calculate_vips(pls).shape
data.columns[1:]

# Variable importance analysis: rank variables by their influence on y;
# a VIP score above 1 is generally considered influential
df_vip = pd.DataFrame()
df_vip['X'] = data.drop(['name', 'flag', 'target_y', '時間'], axis=1).columns
df_vip['vip'] = _calculate_vips(pls)

# Visualize the VIP scores
plt.figure(figsize=(8, 8))
vip = df_vip.sort_values(by='vip', ascending=True).tail(30)
plt.barh(vip.X, vip.vip, height=0.5)
plt.title('VIP')

# Plot actual vs. predicted values over the samples
plt.figure(figsize=(15, 8))
length = range(len(Y))
plt.plot(length, Y, marker='o', label='target_y')
plt.plot(length, Ypredict, marker='o', label='target_y_predict')
plt.legend()

# Top-15 variables by VIP together with their regression coefficients
df_vip['coef'] = pls.coef_.flatten()
df_vip = df_vip.sort_values(by='vip', ascending=False).round(4).head(15)
df_vip
pls.coef_.flatten().round(4)

# Scatter plot of actual vs. predicted values with a fitted trend line
x = Y
y = Ypredict
parameter = np.polyfit(x, y, 1)
p = np.poly1d(parameter)
plt.xlabel('Variables')
plt.ylabel('target_y_predict')
plt.scatter(x, y)
plt.plot(x, p(x), color='g')
plt.title('Actual vs Predict for target_y')
# x.corr(y)
plt.text(min(x), min(x) + 0.1, 'R = ' + np.corrcoef(x, y)[0, 1].round(2).astype(str), fontsize=16)
plt.show()

# Score plot of the first two X components, split at target_y = 4.05
x_score = pd.DataFrame(pls.x_scores_, columns=['factor_1', 'factor_2', 'factor_3', 'factor_4'])
x_score['name'] = data['name']
x_score['target_y'] = data['target_y']
# plt.scatter(x_score.factor_1, x_score.factor_2)
# seaborn >= 0.12 requires keyword arguments for x and y
sns.scatterplot(x='factor_1', y='factor_2', data=x_score[x_score['target_y'] < 4.05])
sns.scatterplot(x='factor_1', y='factor_2', data=x_score[x_score['target_y'] > 4.05])
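For reference, _calculate_vips implements the standard VIP (Variable Importance in Projection) score. With p predictors, h components, weight vectors w_k, score vectors t_k, and Y-loadings q_k, the single-response form that the code computes is

$$\mathrm{VIP}_j=\sqrt{\frac{p\sum_{k=1}^{h} q_k^2\,(\mathbf{t}_k^{\top}\mathbf{t}_k)\,\bigl(w_{jk}/\lVert\mathbf{w}_k\rVert\bigr)^2}{\sum_{k=1}^{h} q_k^2\,(\mathbf{t}_k^{\top}\mathbf{t}_k)}}$$

The diag(TᵀT QᵀQ) term in the code reduces to the per-component q_k² t_kᵀt_k because PLS score vectors are orthogonal.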
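The example fixes n_components=4 without justification. A hedged sketch of how the component count could be chosen by cross-validation instead; this is standard scikit-learn usage, not part of the original script, and assumes X and Y from the code above with enough samples and features per fold:

from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score

# For a regressor, cross_val_score reports R^2 by default
for n in range(1, 9):
    r2 = cross_val_score(PLSRegression(n_components=n), X, Y, cv=5).mean()
    print(f'n_components={n}: mean CV R^2 = {r2:.3f}')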