Binary Search and Its Variants You Must Know Before Interviews
阿新 • Published: 2020-12-08
import gym                      # import the modules
from policynet import PolicyGradient
import matplotlib.pyplot as plt
import time

DISPLAY_REWARD_THRESHOLD = 1000
RENDER = False

# create an environment: the CartPole (cart with an inverted pendulum) model
env = gym.make('CartPole-v0')
env.seed(1)
env = env.unwrapped
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,
)

# training process
for i_episode in range(85):
    observation = env.reset()
    while True:
        if RENDER:
            env.render()
        # sample an action and explore the environment
        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        # store the observation, action and reward
        RL.store_transition(observation, action, reward)
        if done:
            ep_rs_sum = sum(RL.ep_rs)
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                # exponential moving average of the episode return
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            print("episode:", i_episode, "rewards:", int(running_reward))
            # learn once per episode
            vt = RL.learn()
            if i_episode == 0:
                plt.plot(vt)
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break
        # the agent explores one step forward
        observation = observation_

# test process
for i in range(10):
    observation = env.reset()
    count = 0
    while True:
        env.render()
        # act greedily with the learned policy
        action = RL.greedy(observation)
        observation_, reward, done, info = env.step(action)
        if done:
            break
        observation = observation_
        count += 1
        # time.sleep(0.001)
    print(count)
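The script imports PolicyGradient from a policynet module that the post does not show. The only things we can infer about it are the methods the script calls (choose_action, store_transition, learn, greedy) and the ep_rs episode-reward list it exposes. Below is a minimal stand-in sketch, assuming a linear softmax policy trained with plain REINFORCE in NumPy; every internal detail here is an assumption, not the author's implementation.

import numpy as np

class PolicyGradient:
    # Stand-in for the unshown policynet.PolicyGradient (assumption, not
    # the original code): a linear softmax policy updated with REINFORCE.
    def __init__(self, n_actions, n_features, learning_rate=0.01, reward_decay=0.95):
        self.n_actions = n_actions
        self.lr = learning_rate
        self.gamma = reward_decay
        self.W = np.zeros((n_features, n_actions))   # policy weights
        self.b = np.zeros(n_actions)                 # policy bias
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

    def _probs(self, observation):
        logits = observation @ self.W + self.b
        logits -= logits.max()                       # numerical stability
        exp = np.exp(logits)
        return exp / exp.sum()

    def choose_action(self, observation):
        # sample an action from the current policy distribution
        return int(np.random.choice(self.n_actions, p=self._probs(observation)))

    def greedy(self, observation):
        # pick the most probable action (used at test time)
        return int(np.argmax(self._probs(observation)))

    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def _discount_and_norm_rewards(self):
        # discounted return G_t per step, then normalised to reduce variance
        discounted = np.zeros(len(self.ep_rs))
        running = 0.0
        for t in reversed(range(len(self.ep_rs))):
            running = running * self.gamma + self.ep_rs[t]
            discounted[t] = running
        discounted -= discounted.mean()
        discounted /= discounted.std() + 1e-8
        return discounted

    def learn(self):
        vt = self._discount_and_norm_rewards()
        # REINFORCE ascent step: grad log pi(a|s) * vt,
        # where d log softmax / d logits = one_hot(a) - probs
        for s, a, g in zip(self.ep_obs, self.ep_as, vt):
            s = np.asarray(s)
            grad_logits = -self._probs(s)
            grad_logits[a] += 1.0
            self.W += self.lr * np.outer(s, grad_logits) * g
            self.b += self.lr * grad_logits * g
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        return vt                                    # the script plots this

Saved as policynet.py, this stand-in lets the script above run end to end on CartPole-v0, though a linear policy learns more slowly than the neural-network version such tutorials typically use.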