1. 程式人生 > >bandit演算法原理及Python實現

bandit演算法原理及Python實現

選一個(0,1)之間較小的數epsilon

每次以概率epsilon(即產生一個[0,1)之間的隨機數,若它比epsilon小)做一件事:從所有臂中隨機選一個。否則,選擇截至當前平均收益最大的那個臂。

是不是簡單粗暴?epsilon的值可以控制對Exploit(利用)和Explore(探索)的偏好程度。epsilon越接近0,越保守:幾乎總是選當前平均收益最大的臂(Exploit),很少去嘗試其他臂(Explore),也就是只想花錢不想掙錢。

程式碼:

from arm import Arm
import random
import numpy as np


def mean(values):
    """Return the arithmetic mean of ``values`` as a float.

    ``values`` must be a non-empty sized collection of numbers; an empty
    collection raises ``ZeroDivisionError``.
    """
    total = sum(values)
    count = len(values)
    # Multiplying by 1.0 forces true division even for Python 2 integers.
    return total * 1.0 / count

class EpsilonGreedyAlgorithm(object):
    """Epsilon-greedy strategy for a multi-armed bandit.

    With probability ``epsilon`` a uniformly random arm is pulled
    (explore); otherwise the arm with the highest average observed reward
    is pulled (exploit).

    Attributes:
        epsilon: Exploration probability.
        arms: Sequence of arm objects; each must provide a ``pull()``
            method that returns a numeric reward.
        values: One list per arm holding every reward observed for that
            arm, in the same order as ``arms``.
    """

    def __init__(self, arms, epsilon):
        """Store the arms and start with an empty reward history per arm."""
        self.epsilon = epsilon
        self.arms = arms
        self.values = [[] for _ in arms]

    def select_arm(self):
        """Choose an arm, pull it once and record the reward it returns."""
        # random.random() lies in [0.0, 1.0), so "< epsilon" explores with
        # probability exactly epsilon: never for 0, always for 1.
        if random.random() < self.epsilon:
            arm_idx = self.get_random_arm_idx()
        else:
            arm_idx = self.get_best_arm_idx()

        reward = self.arms[arm_idx].pull()
        self.update(arm_idx, reward)

    def update(self, arm_idx, reward):
        """Append ``reward`` to the history of the arm at ``arm_idx``."""
        self.values[arm_idx].append(reward)

    def get_best_arm_idx(self):
        """Return the index of the arm with the highest estimated reward.

        An arm that has never been pulled is estimated at 0.0. Ties for
        the highest estimate are broken uniformly at random, so before any
        reward has been observed every arm is equally likely. Negative
        estimates are compared like any other value instead of being
        ignored.
        """
        estimates = [self._estimate(rewards) for rewards in self.values]
        if not estimates:
            # No arms at all: defer to the random picker, which raises
            # ValueError for an empty range.
            return self.get_random_arm_idx()

        best = max(estimates)
        candidates = [i for i, est in enumerate(estimates) if est == best]
        return random.choice(candidates)

    def get_random_arm_idx(self):
        """Return a uniformly random valid arm index."""
        return random.randrange(len(self.arms))

    @staticmethod
    def _estimate(rewards):
        """Return the mean of ``rewards``, or 0.0 when it is empty."""
        if not rewards:
            return 0.0
        # Multiplying by 1.0 forces true division for Python 2 integers.
        return sum(rewards) * 1.0 / len(rewards)


if __name__ == "__main__":
    # Demo: run epsilon-greedy for 100 pulls on 2-7 arms, each built from a
    # random parameter in [0, 1), then report the total reward collected.
    epsilon = 0.1
    ps = [random.random() for _ in range(random.randrange(2, 8))]
    arms = [Arm(p) for p in ps]
    algo = EpsilonGreedyAlgorithm(arms, epsilon=epsilon)
    for _ in range(100):
        algo.select_arm()
    total_reward = sum(sum(vals) for vals in algo.values)
    # A single pre-formatted argument prints the same text on Python 2 and 3.
    print("reward: %s \t epsilon: %s" % (total_reward, epsilon))