bandit演算法原理及Python實現
阿新 • • 發佈:2019-02-05
選一個(0,1)之間較小的數epsilon
每次以概率epsilon(即產生一個[0,1)之間的隨機數,若它比epsilon小)做一件事:在所有臂中隨機選一個。否則,選擇截至當前平均收益最大的那個臂。
是不是簡單粗暴?epsilon的值可以控制對Exploit和Explore的偏好程度。越接近0,越保守(偏向Exploit、幾乎不做Explore),只想花錢不想掙錢。
程式碼:
import random

import numpy as np

from arm import Arm


def mean(values):
    """Return the arithmetic mean of ``values``.

    Raises:
        ZeroDivisionError: if ``values`` is empty.
    """
    # Multiplying by 1.0 forces true division on Python 2 as well.
    return sum(values) * 1.0 / len(values)


class EpsilonGreedyAlgorithm(object):
    """Epsilon-greedy strategy for a multi-armed bandit.

    On every step a uniform random number is drawn; when it is greater than
    ``epsilon`` the arm with the highest average observed reward is pulled
    (exploit), otherwise an arm is picked uniformly at random (explore).

    Args:
        arms: sequence of arm objects; each must provide a ``pull()`` method
            returning a numeric reward.
        epsilon: exploration threshold compared against ``random.random()``.
    """

    def __init__(self, arms, epsilon):
        self.epsilon = epsilon
        self.arms = arms
        # values[i] collects every reward observed so far from arms[i].
        self.values = [[] for _ in arms]

    def select_arm(self):
        """Choose an arm, pull it once and record the reward it returns."""
        if random.random() > self.epsilon:
            arm_idx = self.get_best_arm_idx()
        else:
            arm_idx = self.get_random_arm_idx()
        reward = self.arms[arm_idx].pull()
        self.update(arm_idx, reward)

    def update(self, arm_idx, reward):
        """Append ``reward`` to the history of the arm at ``arm_idx``."""
        self.values[arm_idx].append(reward)

    def get_best_arm_idx(self):
        """Return the index of the arm with the highest estimated reward.

        An arm that has never been pulled is estimated at 0.0. When the best
        estimate is positive, the first arm reaching it is returned. When the
        best estimate is zero or negative, one of the arms tied at that best
        estimate is picked uniformly at random; in particular, if every
        estimate is 0.0 (e.g. before any reward was seen) the pick is uniform
        over all arms.

        Raises:
            ValueError: if there are no arms.
        """
        estimates = [
            mean(rewards) if rewards else 0.0 for rewards in self.values
        ]
        # Use the true maximum instead of a running maximum seeded with 0.0,
        # so that arms with negative average reward are still ranked.
        best = max(estimates)
        if best > 0.0:
            return estimates.index(best)
        tied = [idx for idx, est in enumerate(estimates) if est == best]
        return tied[random.randrange(len(tied))]

    def get_random_arm_idx(self):
        """Return an arm index drawn uniformly at random."""
        return random.randrange(len(self.arms))


if __name__ == "__main__":
    epsilon = 0.1
    # Between 2 and 7 arms, each constructed from its own random parameter.
    ps = [random.random() for _ in range(random.randrange(2, 8))]
    arms = [Arm(p) for p in ps]
    algo = EpsilonGreedyAlgorithm(arms, epsilon=epsilon)
    for _ in range(100):
        algo.select_arm()
    total_reward = sum(sum(rewards) for rewards in algo.values)
    # A single pre-formatted argument prints identically on Python 2 and 3.
    print("reward: %s \t epsilon: %s" % (total_reward, epsilon))