Reinforcement Learning -- Actor-Critic
Policy Gradient can output actions directly, including continuous actions, but it cannot update step by step; it has to wait until an episode finishes.
Q-Learning first estimates Q-values and then selects actions from them; it cannot handle continuous actions or very large action spaces, but it can update at every single step.
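As a rough illustration of the two update schedules (my own sketch, not part of the original post; the function and variable names are illustrative), Policy Gradient must accumulate a whole episode of rewards before it can weight its updates, while Q-Learning can apply a TD update right after every single transition:

import numpy as np

def discounted_returns(rewards, gamma=0.9):
    # Policy Gradient style: wait for the whole episode, then compute the
    # discounted return v_t for every step by scanning the rewards backwards.
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

def q_learning_step(q_table, s, a, r, s_, alpha=0.1, gamma=0.9):
    # Q-Learning style: one transition (s, a, r, s_) is enough for an update.
    td_target = r + gamma * np.max(q_table[s_])
    q_table[s, a] += alpha * (td_target - q_table[s, a])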
Actor-Critic in one sentence: it combines Policy Gradient (the Actor) with value Function Approximation (the Critic). The Actor chooses actions based on probabilities, the Critic scores the Actor's actions, and the Actor then adjusts its action probabilities according to the Critic's score.
Advantage of Actor-Critic: it can update at every step, which is faster than the traditional Policy Gradient.
Disadvantage of Actor-Critic: it depends on the Critic's value estimates, but the Critic is hard to train to convergence, and updating the Actor at the same time makes convergence even harder. To address this, Google DeepMind proposed an upgraded Actor-Critic, Deep Deterministic Policy Gradient (DDPG), which incorporates the strengths of DQN and alleviates the convergence problem.
Inputs to the Actor network: (s_t, a_t, TD_error).
The Actor network is almost the same as in Policy Gradient: a multi-class (softmax) network. When computing the loss, Policy Gradient weights the log-probability by v_t, where v_t is the accumulated discounted return computed from the rewards R. In this Actor, the loss weight is the TD error instead, and the TD error is computed by the Critic network.
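A minimal sketch of that difference (my own illustration, not taken from the code below; the probabilities and weights are made-up values): the only change in the actor's loss is what multiplies the log-probability.

import numpy as np

def actor_loss(action_probs, action, weight):
    # weight = v_t (discounted return) in Policy Gradient,
    # weight = td_error (from the Critic) in Actor-Critic.
    log_prob = np.log(action_probs[action])
    return -log_prob * weight   # minimizing this maximizes the weighted log-probability

probs = np.array([0.3, 0.7])            # softmax output of the policy network
loss_pg = actor_loss(probs, 1, 5.0)     # Policy Gradient: weight is the return v_t
loss_ac = actor_loss(probs, 1, 0.8)     # Actor-Critic: weight is the Critic's TD error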
Inputs to the Critic network: (s_t, v_{t+1}, r); output: TD_error.
During learning the Critic receives (s_t, r, s_{t+1}) and computes:
    v_{t+1} = network(s_{t+1})
    V_eval = network(s_t)
    TD_error = (r + gamma * v_{t+1}) - V_eval
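Put together as a small sketch (my own simplification of the Critic.learn step in the full code below; value_net is a stand-in for the value network), the Critic turns one transition (s_t, r, s_{t+1}) into a TD error, and that same TD error is handed to the Actor:

GAMMA = 0.9

def critic_td_error(value_net, s, r, s_):
    # value_net(state) returns the scalar state value V(state).
    v_next = value_net(s_)                    # v_{t+1} = network(s_{t+1})
    v_eval = value_net(s)                     # V_eval  = network(s_t)
    td_error = r + GAMMA * v_next - v_eval    # TD_error = (r + gamma*V_next) - V_eval
    # The Critic is trained to minimize td_error**2; the raw td_error is
    # returned so the Actor can use it as the weight on its log-probability loss.
    return td_error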
1 """ 2 Actor-Critic using TD-error as the Advantage, Reinforcement Learning. 3 4 The cart pole example. Policy is oscillated. 5 6 View more on my tutorial page: https://morvanzhou.github.io/tutorials/ 7 8 Using: 9 tensorflow 1.0 10 gym 0.8.0 11 """ 12 13 import numpy as np 14 import tensorflow as tf 15 import gym 16 17 np.random.seed(2) 18 tf.set_random_seed(2) # reproducible 19 20 # Superparameters 21 OUTPUT_GRAPH = False 22 MAX_EPISODE = 3000 23 DISPLAY_REWARD_THRESHOLD = 200 # renders environment if total episode reward is greater then this threshold 24 MAX_EP_STEPS = 1000 # maximum time step in one episode 25 RENDER = False # rendering wastes time 26 GAMMA = 0.9 # reward discount in TD error 27 LR_A = 0.001 # learning rate for actor 28 LR_C = 0.01 # learning rate for critic 29 30 env = gym.make('CartPole-v0') 31 env.seed(1) # reproducible 32 env = env.unwrapped 33 34 N_F = env.observation_space.shape[0] 35 N_A = env.action_space.n 36 37 38 class Actor(object): 39 def __init__(self, sess, n_features, n_actions, lr=0.001): 40 self.sess = sess 41 42 self.s = tf.placeholder(tf.float32, [1, n_features], "state") 43 self.a = tf.placeholder(tf.int32, None, "act") 44 self.td_error = tf.placeholder(tf.float32, None, "td_error") # TD_error 45 46 with tf.variable_scope('Actor'): 47 l1 = tf.layers.dense( 48 inputs=self.s, 49 units=20, # number of hidden units 50 activation=tf.nn.relu, 51 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 52 bias_initializer=tf.constant_initializer(0.1), # biases 53 name='l1' 54 ) 55 56 self.acts_prob = tf.layers.dense( 57 inputs=l1, 58 units=n_actions, # output units 59 activation=tf.nn.softmax, # get action probabilities 60 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 61 bias_initializer=tf.constant_initializer(0.1), # biases 62 name='acts_prob' 63 ) 64 65 with tf.variable_scope('exp_v'): 66 log_prob = tf.log(self.acts_prob[0, self.a]) 67 self.exp_v = tf.reduce_mean(log_prob * self.td_error) # advantage (TD_error) guided loss 68 69 with tf.variable_scope('train'): 70 self.train_op = tf.train.AdamOptimizer(lr).minimize(-self.exp_v) # minimize(-exp_v) = maximize(exp_v) 71 72 def learn(self, s, a, td): 73 s = s[np.newaxis, :] 74 feed_dict = {self.s: s, self.a: a, self.td_error: td} 75 _, exp_v = self.sess.run([self.train_op, self.exp_v], feed_dict) 76 return exp_v 77 78 def choose_action(self, s): 79 s = s[np.newaxis, :] 80 probs = self.sess.run(self.acts_prob, {self.s: s}) # get probabilities for all actions 81 return np.random.choice(np.arange(probs.shape[1]), p=probs.ravel()) # return a int 82 83 84 class Critic(object): 85 def __init__(self, sess, n_features, lr=0.01): 86 self.sess = sess 87 88 self.s = tf.placeholder(tf.float32, [1, n_features], "state") 89 self.v_ = tf.placeholder(tf.float32, [1, 1], "v_next") 90 self.r = tf.placeholder(tf.float32, None, 'r') 91 92 with tf.variable_scope('Critic'): 93 l1 = tf.layers.dense( 94 inputs=self.s, 95 units=20, # number of hidden units 96 activation=tf.nn.relu, # None 97 # have to be linear to make sure the convergence of actor. 98 # But linear approximator seems hardly learns the correct Q. 
99 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 100 bias_initializer=tf.constant_initializer(0.1), # biases 101 name='l1' 102 ) 103 104 self.v = tf.layers.dense( 105 inputs=l1, 106 units=1, # output units 107 activation=None, 108 kernel_initializer=tf.random_normal_initializer(0., .1), # weights 109 bias_initializer=tf.constant_initializer(0.1), # biases 110 name='V' 111 ) 112 113 with tf.variable_scope('squared_TD_error'): 114 self.td_error = self.r + GAMMA * self.v_ - self.v 115 self.loss = tf.square(self.td_error) # TD_error = (r+gamma*V_next) - V_eval 116 with tf.variable_scope('train'): 117 self.train_op = tf.train.AdamOptimizer(lr).minimize(self.loss) 118 119 def learn(self, s, r, s_): 120 s, s_ = s[np.newaxis, :], s_[np.newaxis, :] 121 122 v_ = self.sess.run(self.v, {self.s: s_}) 123 td_error, _ = self.sess.run([self.td_error, self.train_op], 124 {self.s: s, self.v_: v_, self.r: r}) 125 return td_error 126 127 128 sess = tf.Session() 129 130 actor = Actor(sess, n_features=N_F, n_actions=N_A, lr=LR_A) 131 critic = Critic(sess, n_features=N_F, lr=LR_C) # we need a good teacher, so the teacher should learn faster than the actor 132 133 sess.run(tf.global_variables_initializer()) 134 135 if OUTPUT_GRAPH: 136 tf.summary.FileWriter("logs/", sess.graph) 137 138 for i_episode in range(MAX_EPISODE): 139 s = env.reset() 140 t = 0 141 track_r = [] 142 while True: 143 if RENDER: env.render() 144 145 a = actor.choose_action(s) 146 147 s_, r, done, info = env.step(a) 148 149 if done: r = -20 150 151 track_r.append(r) 152 153 td_error = critic.learn(s, r, s_) # gradient = grad[r + gamma * V(s_) - V(s)] 154 actor.learn(s, a, td_error) # true_gradient = grad[logPi(s,a) * td_error] 155 156 s = s_ 157 t += 1 158 159 if done or t >= MAX_EP_STEPS: 160 ep_rs_sum = sum(track_r) 161 162 if 'running_reward' not in globals(): 163 running_reward = ep_rs_sum 164 else: 165 running_reward = running_reward * 0.95 + ep_rs_sum * 0.05 166 if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True # rendering 167 print("episode:", i_episode, " reward:", int(running_reward)) 168 break