時間序列預測之keras
阿新 • • 發佈:2018-12-24
''' 多變數時間序列預測 ''' import os os.getcwd() os.chdir('C:\\Users\\87671\\Desktop\\位元魔方') from pandas import read_csv from datetime import datetime from numpy import concatenate from matplotlib import pyplot from pandas import concat from pandas import DataFrame from sklearn import preprocessing from sklearn.metrics import mean_squared_error from keras.models import Sequential from keras.layers import Dense from keras.layers import LSTM # 把日期轉換成指定格式 年月日小時 def parse(x): return datetime.strptime(x, '%Y %m %d %H') dataset = read_csv('raw.csv', parse_dates=[['year','month','day','hour']],index_col=0,date_parser=parse) # parse_dates指定時間轉化的列 ,date_parser指定轉化的形式,index_col指定某列作為索引,一箇中括號在一起表示合併在了一起 dataset.drop('No',axis =1 ,inplace = True) # 賦予特殊的列名 dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain'] dataset.index.name = 'date' # 填充NA為0 dataset['pollution'].fillna(0,inplace = True) # 刪除前24行的資料 dataset = dataset[24:] print(dataset.head(5)) #dataset.to_csv('pollution.csv') dataset = read_csv("pollution.csv",header = 0,index_col = 0) # 畫圖,顯示每個變數的5年資料 values = dataset.values # 首先提取資料框裡面的values groups = [0,1,2,3,5,6,7] i = 1 pyplot.figure() for group in groups: pyplot.subplot(len(groups),1,i) # 子圖要幾行幾列,每個圖在第幾個 pyplot.plot(values[:,group]) pyplot.title(dataset.columns[group],y = 0.5,loc='right') i += 1 pyplot.show() ''' 明確監督問題:根據前1個小時的天氣情況x和y,預測下一個階段的y 將時間序列資料集轉化為監督學習問題 基礎知識 1.df['t-1']=df['t'].shift(1) #建立了和t之後一項的序列,同等長度,前面的第一個變成了NA df['t-1']=df['t'].shift(-1) # 提前一項,最後一項為nan ''' def series_to_supervised(data, n_in=1,n_out=1,dropnan = True): n_vars = 1 if type(data) is list else data.shape[1] df = DataFrame(data) cols,names = list(),list() # 輸入序列(t-n,...,t-1) for i in range(n_in,0,-1): cols.append(df.shift(i)) names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)] # 預測序列(t ,t+1,...,t+n) for i in range(0, n_out): # 只包括前不包括後,所以range0,1代表只有一個 cols.append(df.shift(-i)) if i == 0: names += [('var%d(t)' % (j+1)) for j in range(n_vars)] else: names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)] # 將他們放在一起 agg = concat(cols,axis=1) agg.columns = names # 去除有na的行 if dropnan: agg.dropna(inplace = True) return agg ''' 舉例 ## 單變數(就只有一個時間變數) values = [x for x in range(10)] data = series_to_supervised(values) print(data) data = series_to_supervised(values,3) # 滯後三項預測一個y print(data) data = series_to_supervised(values,2,2) # 滯後2,預測2 print(data) ## 多變數(以2個時間變數為例子) raw = DataFrame() raw['ob1'] = [x for x in range(10)] raw['ob2'] = [x for x in range(50, 60)] values = raw.values data = series_to_supervised(values) # 2個都滯後一項 print(data) ''' # 將類別變數(第5列)獨熱編碼 encoder = preprocessing.LabelEncoder() values[:,4] = encoder.fit_transform(values[:,4]) # 確保所有的data是float形式 values = values.astype('float32') # 標準化壓縮到0-1之間 scaler = preprocessing.MinMaxScaler(feature_range=(0,1)) scaled = scaler.fit_transform(values) ###########################################################################timestep=1 # 使用時間序列變監督學習的函式 reframed1 = series_to_supervised(scaled,1,1) # 其實我們只需要預測的是第一個變數,所以其他變數對應的時間序列可以刪去 reframed1.drop(reframed1.columns[[9,10,11,12,13,14,15,]], axis = 1,inplace=True) print(reframed1.head()) #前八個變數時原來資料集的X和Y,最後一個是Y ''' 現在資料準備好了,開始擬合模型 ''' #劃分訓練集測試集(這裡講第一年當做訓練集,其餘4年當做測試集) values = reframed1.values n_train_hours = 365*24 train = values[:n_train_hours,:] test = values[n_train_hours:,:] #劃分X和Y train_X,train_y = train[:,:-1],train[:,-1] test_X,test_y = test[:,:-1],test[:,-1] #X轉化成LSTM需要的格式[樣本,時間步長,特徵] train_X = train_X.reshape((train_X.shape[0],1,train_X.shape[1])) test_X = test_X.reshape((test_X.shape[0],1,test_X.shape[1])) print(train_X.shape,train_y.shape,test_X.shape,test_y.shape) # 設計網路 model = Sequential() # 50個神經元,X[1]是一個時間步長,X[2]是8個維度 model.add(LSTM(50, input_shape=(train_X.shape[1],train_X.shape[2]))) model.add(Dense(1)) # 平均絕對誤差mae model.compile(loss='mae',optimizer = 'adam') # 擬合network # 50個批量大小為72的訓練時機 history = model.fit(train_X,train_y,epochs=50,batch_size=72,validation_data=(test_X, test_y), verbose=2, shuffle=False) # plot history pyplot.plot(history.history['loss'],label='train') pyplot.plot(history.history['val_loss'],label='test') pyplot.legend() pyplot.show() # 看圖觀察可能過擬合 ''' 模型評估 ''' # make a prediction yhat = model.predict(test_X) test_X = test_X.reshape((test_X.shape[0], test_X.shape[2])) # 3W條資料,8個特徵 # invert scaling for forecast inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1) # 拼接成y對應X7個變數的格式 inv_yhat = scaler.inverse_transform(inv_yhat) inv_yhat = inv_yhat[:,0] # 只選擇第一列也就是我們真正的yhat # invert scaling for actual test_y = test_y.reshape((len(test_y), 1)) inv_y = concatenate((test_y, test_X[:, 1:]), axis=1) inv_y = scaler.inverse_transform(inv_y) inv_y = inv_y[:,0] # calculate RMSE rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) # 計算mse print('Test RMSE: %.3f' % rmse) ##################################################################################時間跨度為2呢 reframed = series_to_supervised(scaled,2,1) ''' 不能再像原來那樣刪這些編號啦,可以這樣做 train_X, train_y = train[:, 0:n_obs], train[:, -n_features] reframed.drop(reframed.columns[[9,10,11,12,13,14,15,]], axis = 1,inplace=True) ''' ''' 現在資料準備好了,開始擬合模型 ''' #劃分訓練集測試集(這裡講第一年當做訓練集,其餘4年當做測試集) values = reframed.values n_train_hours = 365*24 train = values[:n_train_hours,:] test = values[n_train_hours:,:] #劃分X和Y # 和step=1不一樣 n_hours = 2 # timestep=2 n_features = 8 n_obs = n_hours * n_features # step=2 train_X, train_y = train[:, 0:n_obs], train[:, -n_features] test_X, test_y = test[:, 0:n_obs], test[:, -n_features] # reshape input to be 3D [samples, timesteps, features] train_X = train_X.reshape((train_X.shape[0], n_hours, n_features)) #這裡可以檢查train_X和reframed各自什麼樣子,經驗證是我們需要的樣子! test_X = test_X.reshape((test_X.shape[0], n_hours, n_features)) print(train_X.shape, train_y.shape, test_X.shape, test_y.shape) ''' train_X,train_y = train[:,:-1],train[:,-1] test_X,test_y = test[:,:-1],test[:,-1] #X轉化成LSTM需要的格式[樣本,時間不長,特徵] train_X = train_X.reshape((train_X.shape[0],1,train_X.shape[1])) test_X = test_X.reshape((test_X.shape[0],1,test_X.shape[1])) ''' # 設計網路 model = Sequential() # 50個神經元,X是一個時間步長,8個維度 model.add(LSTM(50, input_shape=(train_X.shape[1],train_X.shape[2]))) model.add(Dense(1)) # 平均絕對誤差mae model.compile(loss='mae',optimizer = 'adam') # 擬合network # 50個批量大小為72的訓練時機 history = model.fit(train_X,train_y,epochs=50,batch_size=72,validation_data=(test_X, test_y), verbose=2, shuffle=False) # plot history pyplot.plot(history.history['loss'],label='train') pyplot.plot(history.history['val_loss'],label='test') pyplot.legend() pyplot.show() # 看圖觀察可能過擬合 ''' 模型評估 ''' ''' # make a prediction 返回原來的單位 trainPredict = model.predict(train_X) testPredict = model.predict(test_X) trainPredict = scaler.inverse_transform(trainPredict) trainY = scaler.inverse_transform([trainY]) testPredict = scaler.inverse_transform(testPredict) testY = scaler.inverse_transform([testY]) import math trainScore = math.sqrt(mean_squared_error(trainY[0],trainPredict[:,0])) trainScore = math.sqrt(mean_squared_error(trainY[0],trainPredict[:,0])) print('Train RMSE: %.3f' % trainScore) print('Test RMSE: %.3f' % testScore) ''' yhat = model.predict(test_X) test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]*n_hours)) # 3W條資料,16個特徵 # invert scaling for forecast inv_yhat = concatenate((yhat, test_X[:, 1:8]), axis=1) # 因為之前標準化之前是8列,所以inverse的時候也要是8列, # 當然我們有用的只是第一列,所以1:8,8:15都可以 inv_yhat = scaler.inverse_transform(inv_yhat) inv_yhat = inv_yhat[:,0] # 只選擇第一列也就是我們真正的yhat # invert scaling for actual test_y = test_y.reshape((len(test_y), 1)) inv_y = concatenate((test_y, test_X[:, 1:8]), axis=1) inv_y = scaler.inverse_transform(inv_y) inv_y = inv_y[:,0] # calculate RMSE rmse = sqrt(mean_squared_error(inv_y, inv_yhat)) # 計算mse print('Test RMSE: %.3f' % rmse) # 26.335 # 看一下預測的曲線(這裡值取了前面50個) plt.plot(inv_y[0:50],color='green') plt.plot(inv_yhat[0:50]) plt.show()