房價預測《進階版,測試》
阿新 • • 發佈:2017-10-23
rest 哪些 tle blog model lln one atp feature
#coding=utf8
# House-price regression experiments: load the train/test CSVs, one-hot
# encode and standardise the features, then compare several regressors by
# their cross-validated RMSE on log1p(SalePrice).
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor


def _cv_rmse(model, X, y, cv):
    """Return the mean RMSE of `model` over `cv` cross-validation folds.

    The scorer yields negated MSE values, so they are negated back before
    the square root is taken; the per-fold RMSEs are then averaged.
    """
    fold_rmse = np.sqrt(-cross_val_score(model, X, y, cv=cv,
                                         scoring='neg_mean_squared_error'))
    return np.mean(fold_rmse)


def _plot_cv_curve(xs, scores, title):
    """Plot `scores` against `xs` on a fresh figure titled `title`.

    A new figure per curve keeps the experiments (which use unrelated
    x-axes) from being overlaid on a single set of axes.
    """
    plt.figure()
    plt.plot(xs, scores)
    plt.title(title)


# The first CSV column (the id) is not a feature; use it only as the index.
train_df = pd.read_csv('./input/train.csv', index_col=0)
test_df = pd.read_csv('./input/test.csv', index_col=0)

# Raw and log1p-transformed prices side by side (e.g. for prices.hist()).
prices = pd.DataFrame({'price': train_df['SalePrice'],
                       'log(price + 1)': np.log1p(train_df['SalePrice'])})

# The models are fitted on log1p(SalePrice); pop() also removes the target
# column from the training features.
y_train = np.log1p(train_df.pop('SalePrice'))

# Stack train and test so both go through identical preprocessing.
all_df = pd.concat((train_df, test_df), axis=0)

# Variable conversion
print(train_df.index)
print(test_df.index)

# When a categorical variable is stored as a number, the magnitude of the
# number carries a meaning the category does not have, which misleads the
# model. MSSubClass is therefore turned into a string so that it is
# one-hot encoded together with the other categorical columns.
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)

# pandas' get_dummies one-hot encodes every categorical column in one call.
all_dummy_df = pd.get_dummies(all_df)

# Handle missing values: fill each column with its own mean.
mean_cols = all_dummy_df.mean()
all_dummy_df = all_dummy_df.fillna(mean_cols)

# Standardise the numerical data. The 0/1 one-hot columns must not be
# standardised, so only the columns that were numeric in the un-encoded
# frame are selected.
numeric_cols = all_df.select_dtypes(include=[np.number]).columns
numeric_col_means = all_dummy_df.loc[:, numeric_cols].mean()
numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()
# Plain column assignment replaces the columns, so integer columns are free
# to become float after scaling.
all_dummy_df[numeric_cols] = (all_dummy_df[numeric_cols] - numeric_col_means) / numeric_col_std

# Split the processed frame back into its train and test parts by index.
dummy_train_df = all_dummy_df.loc[train_df.index]
dummy_test_df = all_dummy_df.loc[test_df.index]

# Force a float matrix: depending on the pandas version the dummy columns
# may be boolean, which would otherwise produce an object array.
X_train = dummy_train_df.values.astype(float)
X_test = dummy_test_df.values.astype(float)
print(X_train.shape)

# Ridge: regularisation strength vs. CV error.
alphas = np.logspace(-3, 2, 50)
test_scores = [_cv_rmse(Ridge(alpha), X_train, y_train, cv=10)
               for alpha in alphas]
_plot_cv_curve(alphas, test_scores, 'Alpha vs CV Error')

# Random forest: fraction of features tried per split vs. CV error.
max_features = [.1, .3, .5, .7, .9, .99]
test_scores = [
    _cv_rmse(RandomForestRegressor(n_estimators=200, max_features=max_feat),
             X_train, y_train, cv=5)
    for max_feat in max_features
]
_plot_cv_curve(max_features, test_scores, "Max Features vs CV Error")

# A slightly more advanced ensemble.
# The base estimator could be omitted to fall back on the library default,
# but according to the original author the results are worse than with the
# already-tuned estimator below; the plots can be used to verify this.
ridge = Ridge(alpha=15)

# Bagging
# The base estimator is passed positionally because its keyword name
# differs between scikit-learn releases (base_estimator vs. estimator).
params = [1, 10, 15, 20, 25, 30, 40]
test_scores = [_cv_rmse(BaggingRegressor(ridge, n_estimators=param),
                        X_train, y_train, cv=10)
               for param in params]
_plot_cv_curve(params, test_scores, "n_estimator vs CV Error")

# Boosting
params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
test_scores = [_cv_rmse(AdaBoostRegressor(ridge, n_estimators=param),
                        X_train, y_train, cv=10)
               for param in params]
_plot_cv_curve(params, test_scores, "n_estimator vs CV Error")

# XGBoost
params = [1, 2, 3, 4, 5, 6]
test_scores = [_cv_rmse(XGBRegressor(max_depth=param), X_train, y_train, cv=10)
               for param in params]
_plot_cv_curve(params, test_scores, "max_depth vs CV Error")

# Display every figure created above.
plt.show()

# Final blend of Ridge and random forest, left disabled by the original
# author (kept as an unused string literal).
"""
rf = RandomForestRegressor(n_estimators=500, max_features=.3)
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)
y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
y_final = (y_ridge + y_rf) / 2
"""
房價預測《進階版,測試》