2-3動手學習深度學習-kaggle房價預測
阿新 • 發佈:2018-12-19
# %matplotlib inline  -- IPython magic; only valid inside a Jupyter notebook,
# so it is kept as a comment to make this file importable as plain Python.
import gluonbook as gb
from mxnet import autograd, gluon, init, nd
from mxnet.gluon import data as gdata, loss as gloss, nn
import pandas as pd
import numpy as np

# Load the Kaggle house-price data.
# train_data.shape == (1460, 81); test_data.shape == (1459, 80)
train_data = pd.read_csv('../data/kaggle_house_pred_train.csv')
test_data = pd.read_csv('../data/kaggle_house_pred_test.csv')

# Drop the Id column (column 0): it uniquely identifies each training row, so
# a model could memorize it, but it cannot generalize to test samples.
# Concatenate the 79 shared feature columns of train and test so that the
# preprocessing below is applied identically to both.
# all_features.shape == (2919, 79)
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Standardize each numeric feature: x -> (x - mean) / std, with statistics
# computed over the combined train+test table (as in the original tutorial;
# NOTE(review): this leaks test-set statistics into preprocessing).
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After standardization every numeric column has mean 0, so filling missing
# values with the column mean is equivalent to filling with 0.
all_features = all_features.fillna(all_features.mean())

# One-hot encode the categorical features. E.g. if MSZoning takes the two
# values RL and RM, this drops the MSZoning column and adds indicator columns
# MSZoning_RL and MSZoning_RM; a sample whose MSZoning was RL gets
# MSZoning_RL=1 and MSZoning_RM=0.  dummy_na=True treats missing values as a
# valid category with its own indicator column.  This grows the feature count
# from 79 to 331 (all_features.shape == (2919, 331)).
all_features = pd.get_dummies(all_features, dummy_na=True)

# Convert to NDArray via .values; the first n_train rows are the train set.
n_train = train_data.shape[0]
train_features = nd.array(all_features[:n_train].values)
test_features = nd.array(all_features[n_train:].values)
train_labels = nd.array(train_data.SalePrice.values).reshape((-1, 1))

# Squared loss for the linear-regression-style model.
loss = gloss.L2Loss()


def get_net():
    """Build a small MLP: one 64-unit ReLU hidden layer and a linear output."""
    net = nn.Sequential()
    net.add(nn.Dense(64, activation='relu'), nn.Dense(1))
    net.initialize()
    return net


def log_rmse(net, train_features, train_labels):
    """Root-mean-squared error between log-predictions and log-labels.

    Predictions are clipped to [1, inf) so that log() stays finite.
    L2Loss is (1/2)(y - y_hat)^2, hence the factor of 2 before sqrt.
    Returns a Python float.
    """
    clipped_preds = nd.clip(net(train_features), 1, float('inf'))
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), train_labels.log()).mean())
    return rmse.asscalar()
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam and return per-epoch log-rmse curves.

    Returns (train_ls, test_ls); test_ls stays empty when test_labels is
    None (used when retraining on the full training set before prediction).
    """
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(
        gdata.ArrayDataset(train_features, train_labels),
        batch_size, shuffle=True)
    # Adam is less sensitive to the learning rate than plain minibatch SGD.
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls


def get_k_fold_data(k, i, X, y):
    """Return (X_train, y_train, X_valid, y_valid) for fold ``i`` of ``k``.

    Fold i is the validation slice; the remaining k-1 folds are concatenated
    into the training set.  Raises ValueError when k <= 1 (an ``assert`` was
    used originally, but asserts are stripped under ``python -O``).
    """
    if k <= 1:
        raise ValueError('k-fold cross-validation requires k > 1')
    fold_size = X.shape[0] // k  # trailing samples (< k of them) are dropped
    X_train, y_train = None, None
    X_valid, y_valid = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = nd.concat(X_train, X_part, dim=0)
            y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid


def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Run k-fold cross-validation; return (avg train rmse, avg valid rmse)."""
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        # *data unpacks into (X_train, y_train, X_valid, y_valid).
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        # Plot the learning curves of the last fold only.  The original
        # hard-coded ``i == 4``, which only fires when k == 5.
        if i == k - 1:
            gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                        range(1, num_epochs + 1), valid_ls,
                        ['train', 'valid'])
        print('fold %d, train rmse: %f, valid rmse: %f' % (
            i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k


# Hyperparameters: 5 folds, 100 epochs, lr 5, no weight decay, batch size 64.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print('%d-fold validation: avg train rmse: %f, avg valid rmse: %f'
      % (k, train_l, valid_l))


def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Retrain on the full training set, predict, and write submission.csv."""
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).asnumpy()
    # BUG FIX: the original used preds.reshape(-1, 1)[0], which keeps only
    # the FIRST prediction (row 0 of an (n, 1) array) and pads the rest of
    # the SalePrice column with NaN.  reshape(1, -1)[0] flattens to one
    # value per test row, as intended.
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)


train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)