Reinforcement Learning: A Small Q-learning Example
阿新 • Published: 2018-11-26
To get started with reinforcement learning, I first watched Morvan (莫凡)'s videos; his explanation of the Q-learning algorithm is genuinely easy to follow. Here is the video link: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/2-1-general-rl/
Below is his code, posted here so I can keep studying it. This small example is extremely useful for learning and understanding the Q-learning algorithm!
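The environment is a one-dimensional corridor of N_STATES cells with the treasure 'T' at the right end; the explorer 'o' starts at the left and only receives reward 1 when it reaches the treasure. The core of the algorithm is the standard Q-learning update rule, which the q_predict / q_target lines in rl() below implement directly (ALPHA is the learning rate and GAMMA is the discount factor):

    Q(S, A) ← Q(S, A) + ALPHA * [ R + GAMMA * max_a Q(S', a) − Q(S, A) ]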
# -*- coding: utf-8 -*-
"""
Created on Mon Oct  1 22:20:10 2018

@author:
"""
import numpy as np
import pandas as pd
import time

np.random.seed(2)   # reproducible

N_STATES = 6                  # width of the 1-D world
ACTIONS = ['left', 'right']   # available actions for the explorer
EPSILON = 0.9                 # greedy factor
ALPHA = 0.1                   # learning rate
GAMMA = 0.9                   # discount factor for future rewards
MAX_EPISODES = 13             # maximum number of episodes
FRESH_TIME = 0.3              # time interval between moves


# Q-table:
def build_q_table(n_states, actions):
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),   # q_table initialized to all zeros
        columns=actions,                      # columns are the action names
    )
    return table


# choose an action at a given state
def choose_action(state, q_table):
    state_actions = q_table.iloc[state, :]   # all action values for this state
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):
        # act non-greedily, or this state has not been explored yet
        action_name = np.random.choice(ACTIONS)
    else:
        action_name = state_actions.idxmax()   # greedy mode: action with the largest Q value
    return action_name


# environment feedback: S_, R
def get_env_feedback(S, A):
    # This is how the agent interacts with the environment
    if A == 'right':    # move right
        if S == N_STATES - 2:   # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1   # move right
            R = 0
    else:               # move left
        R = 0
        if S == 0:
            S_ = S       # reached the wall
        else:
            S_ = S - 1   # move left
    return S_, R


# environment update
def update_env(S, episode, step_counter):
    # This is how the environment gets updated
    env_list = ['-'] * (N_STATES - 1) + ['T']   # '-----T' our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode + 1, step_counter)
        print('\r{}'.format(interaction))
        time.sleep(2)
        print('\r ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


# main reinforcement-learning loop
def rl():
    q_table = build_q_table(N_STATES, ACTIONS)   # initial q_table
    for episode in range(MAX_EPISODES):          # episodes
        step_counter = 0
        S = 0                                    # starting position of the episode
        is_terminated = False                    # whether the episode has ended
        update_env(S, episode, step_counter)     # update the environment display
        while not is_terminated:
            A = choose_action(S, q_table)        # choose an action
            S_, R = get_env_feedback(S, A)       # take the action and get the environment's feedback
            q_predict = q_table.loc[S, A]        # estimated (state, action) value
            if S_ != 'terminal':
                q_target = R + GAMMA * q_table.iloc[S_, :].max()   # actual (state, action) value (episode not over)
            else:
                q_target = R                     # actual (state, action) value (episode over)
                is_terminated = True             # terminate this episode
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)   # update q_table
            S = S_                               # move the explorer to the next state
            update_env(S, episode, step_counter + 1)   # update the environment display
            step_counter += 1
    return q_table


if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)
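After training, the learned policy can be read straight out of the Q-table. Here is a minimal sketch (not part of the original post; the helper name extract_policy is my own) that picks the greedy action in each state:

def extract_policy(q_table):
    # For each state row, return the action column holding the largest Q value.
    # The last row corresponds to the goal cell 'T' and is never updated, so it stays all zeros.
    return [q_table.loc[s].idxmax() for s in range(len(q_table))]

# Usage after training, e.g. following q_table = rl():
#   print(extract_policy(q_table))   # once training converges, most entries should be 'right'

This is just a quick way to inspect what the agent has learned; the Q-table printed by the script above contains the same information.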