Reinforcement Learning 6 - Comparing MC and TD - Hands-On
# encoding:utf-8
import numpy as np
import matplotlib.pylab as plt

'''
Random walk problem

0 - 1 - 2 - 3 - 4 - 5 - 6
e           s           e

Terminal 0 gives reward 0, terminal 6 gives reward 1, and every intermediate
step gives reward 0. The policy picks an action from [-1, 1] with probability
0.5 each (-1 = step left, 1 = step right). Under this policy the true value
should increase with the state index.
'''
stats = range(7)
start = 3
end = [0, 6]
actions = [-1, 1]
r = 1          # discount factor
alpha = 0.5    # learning rate
echos = [5, 10, 50, 100, 500, 1000, 10000]   # numbers of updates after which v is plotted

def choose_act(stat):   # policy: step left or right with equal probability
    if np.random.rand() > 0.5:
        return 1
    else:
        return -1

v = np.zeros([len(stats)])
for i in echos:
    for j in range(i):
        act = choose_act(start)
        stat_ = start + act
        if stat_ in end:
            # only the right terminal (state 6) pays a reward of 1
            if stat_ == 6:
                v[start] += alpha * (1 + v[stat_] - v[start])
            else:
                v[start] += alpha * (v[stat_] - v[start])
            start = np.random.randint(1, 6)
        else:
            v[start] += alpha * (v[stat_] - v[start])
            start = np.random.randint(1, 6)
    plt.plot(v[1:-1])
    plt.text(stats[-4], v[-3], j + 1)
plt.xlabel('state')
plt.ylabel('v')
plt.text(1, 0.8, 'alpha = %s' % alpha)
plt.show()
As the plots show, the results improve as the learning rate grows, but at a learning rate of 0.5 the value estimates are already clearly overfitting: each update follows the latest noisy transition, so the curves fluctuate instead of settling on the true values.
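To make that comparison concrete, the TD loop above can simply be rerun once per learning rate and the resulting value curves plotted together. A minimal sketch along those lines follows; the alpha values and the step count are illustrative, not taken from the original experiment:

# Sketch: rerun the TD(0) update from the script above for several
# learning rates and compare the value estimates for states 1..5.
import numpy as np
import matplotlib.pylab as plt

def choose_act(stat):            # same 50/50 left/right policy as above
    return 1 if np.random.rand() > 0.5 else -1

for alpha in [0.05, 0.1, 0.5]:               # illustrative learning rates
    v = np.zeros(7)
    for _ in range(1000):                    # illustrative number of update steps
        start = np.random.randint(1, 6)      # state is resampled each step, as in the script above
        stat_ = start + choose_act(start)
        reward = 1 if stat_ == 6 else 0      # only the right terminal pays
        v[start] += alpha * (reward + v[stat_] - v[start])
    plt.plot(range(1, 6), v[1:-1], label='alpha = %s' % alpha)

plt.xlabel('state')
plt.ylabel('v')
plt.legend()
plt.show()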
# encoding:utf-8
from __future__ import division
__author__ = 'HP'
import numpy as np
import matplotlib.pylab as plt

stats = range(7)
end = [0, 6]
actions = [-1, 1]
r = 1   # discount factor

def choose_act(stat):   # policy: step left or right with equal probability
    if np.random.rand() > 0.5:
        return 1
    else:
        return -1

v_t = [0, 1/6, 1/3, 1/2, 2/3, 5/6, 0]   # true state values under this policy
alpha_td = [0.1, 0.15, 0.2]     # TD learning rates
alpha_mc = [0.01, 0.02, 0.04]   # MC learning rates

for c in range(3):
    # TD
    alpha = alpha_td[c]
    # v = np.random.rand(len(stats))
    # v = np.zeros(len(stats))
    v = [0.2] * len(stats)
    errors = []
    start = 3
    for j in range(100):
        act = choose_act(start)
        stat_ = start + act
        if stat_ in end:
            if stat_ == 6:
                v[start] += alpha * (1 + v[stat_] - v[start])
            else:
                v[start] += alpha * (v[stat_] - v[start])
            start = np.random.randint(1, 6)
        else:
            v[start] += alpha * (v[stat_] - v[start])
            start = stat_  # np.random.randint(1,6)
        error = np.sqrt(sum([pow(value - v_t[index], 2) for index, value in enumerate(v)]))
        errors.append(error)
    plt.plot(range(100), errors)
    index = np.random.randint(40, 100)
    plt.text(index - 3, errors[index], 'alpha_td = %s' % alpha)

    # MC
    alpha = alpha_mc[c]
    # v_mc = np.random.rand(len(stats))
    # v_mc = np.zeros(len(stats))
    v_mc = [0.2] * len(stats)
    count_mc = np.zeros(len(stats))
    errors = []
    for j in range(100):
        process = []
        start = 3  # np.random.randint(1, 6)
        while True:
            if start in end:
                process.append([start])
                break
            act = choose_act(start)
            if start == 5 and act == 1:
                r = 1
            else:
                r = 0
            process.append([start, act, r])
            start = start + act
        T = len(process[:-1])
        s_all = [i[0] for i in process[:-1]]
        s_dealed = []
        for k in range(T):
            sar = process[k]
            s = sar[0]
            if s in s_dealed:
                continue   # first-visit MC: only the first occurrence of s is used
            t = s_all.index(s)      # position of the first visit to s
            num = s_all.count(s)    # how many times s appears in this episode
            r_all = sum([i[2] for i in process[t:-1]]) / num
            v_mc[s] += alpha * (r_all - v_mc[s])
            # v_mc[s] = (v_mc[s] * count_mc[s] + r_all) / (count_mc[s] + 1)
            # count_mc[s] += 1
            s_dealed.append(s)
        error = np.sqrt(sum([pow(value - v_t[index], 2) for index, value in enumerate(v_mc)]))
        errors.append(error)
    plt.plot(range(100), errors, '.')
    index = np.random.randint(40, 100)
    plt.text(index - 3, errors[index], 'alpha_mc = %s' % alpha)

plt.xlabel('echo')
plt.ylabel('mse')
plt.show()
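The benchmark v_t = [0, 1/6, 1/3, 1/2, 2/3, 5/6, 0] used above is the exact value function of the 50/50 policy; it can be checked by solving the Bellman equations of the five non-terminal states as a small linear system. A minimal sketch with numpy (not part of the original script):

import numpy as np

# Bellman equations for the uniform random policy:
#   v(s) = 0.5 * v(s-1) + 0.5 * v(s+1) + 0.5 * [s == 5]   for s = 1..5,
# with v(0) = v(6) = 0: reward 1 is only paid on the step 5 -> 6.
A = np.zeros((5, 5))
b = np.zeros(5)
for s in range(1, 6):
    i = s - 1
    A[i, i] = 1.0
    if s - 1 >= 1:
        A[i, s - 2] -= 0.5   # move to the left neighbour
    if s + 1 <= 5:
        A[i, s] -= 0.5       # move to the right neighbour
    if s + 1 == 6:
        b[i] += 0.5          # expected reward for stepping into the right terminal

print(np.linalg.solve(A, b))  # -> [0.1667, 0.3333, 0.5, 0.6667, 0.8333]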
The random walk has a peculiarity: there are two terminals, and one of them pays a reward of 0. In the first few episodes, TD, which updates one step at a time, may happen to drift left at the start and then needs many steps before the reward at the right terminal can propagate back; MC, by contrast, learns from whole episodes, each of which ends at either the left or the right terminal, so the chance of reaching the right terminal (and thus seeing a nonzero return) early on is much higher. This is why MC visibly converges faster than TD in the first few episodes.
Overall, however, TD converges faster than MC and reaches a smaller final error, so TD is the more efficient method.
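Because each error curve above comes from a single random run, this conclusion is easier to judge if the per-episode RMS error is averaged over many independent runs. Below is a minimal sketch of that averaging; it uses a simplified per-episode variant of the two learners rather than the exact bookkeeping of the script above, and run_td, run_mc and the chosen alphas are only illustrative:

import numpy as np

V_TRUE = np.array([0, 1/6, 1/3, 1/2, 2/3, 5/6, 0])

def run_td(alpha, episodes=100):
    # One TD(0) run; returns the RMS error over states 1..5 after each episode.
    v = np.full(7, 0.2)
    v[0] = v[6] = 0.0
    errs = []
    for _ in range(episodes):
        s = 3
        while s not in (0, 6):
            s2 = s + (1 if np.random.rand() > 0.5 else -1)
            r = 1.0 if s2 == 6 else 0.0
            v[s] += alpha * (r + v[s2] - v[s])
            s = s2
        errs.append(np.sqrt(np.mean((v[1:6] - V_TRUE[1:6]) ** 2)))
    return np.array(errs)

def run_mc(alpha, episodes=100):
    # One constant-step-size MC run; the undiscounted return is just the final reward.
    v = np.full(7, 0.2)
    v[0] = v[6] = 0.0
    errs = []
    for _ in range(episodes):
        s, visited = 3, []
        while s not in (0, 6):
            visited.append(s)
            s += 1 if np.random.rand() > 0.5 else -1
        g = 1.0 if s == 6 else 0.0
        for st in set(visited):          # first-visit style: each state updated once per episode
            v[st] += alpha * (g - v[st])
        errs.append(np.sqrt(np.mean((v[1:6] - V_TRUE[1:6]) ** 2)))
    return np.array(errs)

td_avg = np.mean([run_td(0.1) for _ in range(100)], axis=0)
mc_avg = np.mean([run_mc(0.02) for _ in range(100)], axis=0)
print(td_avg[-1], mc_avg[-1])

Plotting td_avg and mc_avg then gives smooth curves from which the relative convergence speed and final error of the two methods can be read off directly.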