線性迴歸及RANSAC異常值清除演算法案例
阿新 • • 發佈:2018-11-20
線性迴歸及RANSAC異常值清除演算法案例
1、常規線性迴歸
import os

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor


def lin_regplot(X, y, model):
    """Scatter the samples, overlay the model's predictions, then save and show the figure.

    Args:
        X: Feature values; used as the x-axis and passed to ``model.predict``.
        y: Target values plotted against ``X``.
        model: Fitted estimator exposing ``predict``.

    The figure is written to ``result/Linear.png`` before being displayed.
    """
    plt.scatter(X, y, c='blue')
    plt.plot(X, model.predict(X), color='red')
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs('result', exist_ok=True)
    plt.savefig('result/Linear.png')
    plt.show()


# --- Ordinary linear regression model ---
# Load the data.
df = pd.read_csv('dataset/boston.csv', sep=',')
# NOTE(review): this assigns 13 names, so the CSV must have exactly 13 columns.
# The classic Boston housing data also carries a 'B' column -- confirm against the file.
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
              'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'MEDV']

X = df[['RM']].values
y = df[['MEDV']].values

slr = LinearRegression()
slr.fit(X, y)

# y is 2-D here, so coef_ and intercept_ come back as arrays. Pull the scalars
# out explicitly instead of relying on implicit array-to-float conversion,
# which NumPy has deprecated for arrays with ndim > 0.
print("Slope: %.3f" % np.ravel(slr.coef_)[0])
print("intercept: %.3f" % np.ravel(slr.intercept_)[0])
lin_regplot(X, y, slr)
視覺化:
2、RANSAC異常值清除後線性迴歸
# --- Robust linear regression: let RANSAC separate inliers from outliers ---
# Continues from the previous snippet: X, y, np, plt, LinearRegression and
# RANSACRegressor must already be defined.
# The legacy `residual_metric` argument is intentionally not passed; see the
# RANSACRegressor documentation for the `loss` parameter that replaced it.
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

# Visualisation: split the samples using the mask computed during fitting.
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Evaluate the fitted line on an evenly spaced grid of x values.
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

plt.scatter(X[inlier_mask], y[inlier_mask],
            c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='lightgreen', marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $ 1000 [MEDV]')
# Without this call the label= values above are never drawn.
plt.legend(loc='upper left')
plt.savefig('result/ransac.png')
plt.show()
視覺化:
3、所有程式碼
import os

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RANSACRegressor


def lin_regplot(X, y, model):
    """Scatter the samples, overlay the model's predictions, then save and show the figure.

    Args:
        X: Feature values; used as the x-axis and passed to ``model.predict``.
        y: Target values plotted against ``X``.
        model: Fitted estimator exposing ``predict``.

    The figure is written to ``result/Linear.png`` before being displayed.
    """
    plt.scatter(X, y, c='blue')
    plt.plot(X, model.predict(X), color='red')
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs('result', exist_ok=True)
    plt.savefig('result/Linear.png')
    plt.show()


# --- Ordinary linear regression model ---
# Load the data.
df = pd.read_csv('dataset/boston.csv', sep=',')
# NOTE(review): this assigns 13 names, so the CSV must have exactly 13 columns.
# The classic Boston housing data also carries a 'B' column -- confirm against the file.
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
              'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'MEDV']

X = df[['RM']].values
y = df[['MEDV']].values

slr = LinearRegression()
slr.fit(X, y)

# y is 2-D here, so coef_ and intercept_ come back as arrays. Pull the scalars
# out explicitly instead of relying on implicit array-to-float conversion,
# which NumPy has deprecated for arrays with ndim > 0.
print("Slope: %.3f" % np.ravel(slr.coef_)[0])
print("intercept: %.3f" % np.ravel(slr.intercept_)[0])
lin_regplot(X, y, slr)

# --- Robust linear regression: let RANSAC separate inliers from outliers ---
# The legacy `residual_metric` argument is intentionally not passed; see the
# RANSACRegressor documentation for the `loss` parameter that replaced it.
ransac = RANSACRegressor(LinearRegression(),
                         max_trials=100,
                         min_samples=50,
                         residual_threshold=5.0,
                         random_state=0)
ransac.fit(X, y)

# Visualisation: split the samples using the mask computed during fitting.
inlier_mask = ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Evaluate the fitted line on an evenly spaced grid of x values.
line_X = np.arange(3, 10, 1)
line_y_ransac = ransac.predict(line_X[:, np.newaxis])

plt.scatter(X[inlier_mask], y[inlier_mask],
            c='blue', marker='o', label='Inliers')
plt.scatter(X[outlier_mask], y[outlier_mask],
            c='lightgreen', marker='s', label='Outliers')
plt.plot(line_X, line_y_ransac, color='red')
plt.xlabel('Average number of rooms [RM]')
plt.ylabel('Price in $ 1000 [MEDV]')
# Without this call the label= values above are never drawn.
plt.legend(loc='upper left')
os.makedirs('result', exist_ok=True)
plt.savefig('result/ransac.png')
plt.show()