CS6700 - Tutorial 1 - Bandits
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import NamedTuple, List
class Arm(NamedTuple):
    mean: float
    std: float

class Env:
    def __init__(self, num_arms: int, mean_reward_range: tuple, std: float):
        """
        num_arms: number of bandit arms
        mean_reward_range: the mean reward of each arm lies within this range
        std: standard deviation of the reward for each arm
        """
        self.num_arms = num_arms
        self.arms = self.create_arms(num_arms, mean_reward_range, std)

    def create_arms(self, num_arms, mean_reward_range, std):
        # create_arms is referenced above but its definition did not survive
        # the export; this reconstruction assumes arm means are drawn
        # uniformly from the given range
        low, high = mean_reward_range
        return {i: Arm(mean=np.random.uniform(low, high), std=std)
                for i in range(num_arms)}

    @property
    def arm_ids(self):
        return list(self.arms.keys())

    def get_best_arm_and_expected_reward(self):
        best_arm_id = max(self.arms, key=lambda x: self.arms[x].mean)
        return best_arm_id, self.arms[best_arm_id].mean

    def get_avg_arm_reward(self):
        arm_mean_rewards = [v.mean for v in self.arms.values()]
        return np.mean(arm_mean_rewards)
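Two Env methods are used later in the notebook but their cells are not visible in this export: a way to sample a reward from an arm, and plot_arms_reward_distribution (called under Experiments). A minimal sketch, patched onto the class; the method name pull and the KDE-based plot are assumptions, with rewards taken to be Gaussian per arm:

def _pull(self, arm_id):
    # sample a reward from the chosen arm's Gaussian (assumed method name)
    arm = self.arms[arm_id]
    return np.random.normal(arm.mean, arm.std)

def _plot_arms_reward_distribution(self, num_samples=1000):
    # visualise each arm's reward distribution from Monte Carlo samples
    for arm_id, arm in self.arms.items():
        samples = np.random.normal(arm.mean, arm.std, size=num_samples)
        sns.kdeplot(samples, label=f'arm {arm_id} (mean={arm.mean:.1f})')
    plt.legend()
    plt.show()

Env.pull = _pull
Env.plot_arms_reward_distribution = _plot_arms_reward_distribution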
Policy
class BasePolicy:
    @property
    def name(self):
        return 'base_policy'

    def reset(self):
        """
        Resets the policy's internal state between runs.
        """
        pass
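The trainer cells did not survive the export, so the exact policy interface is not shown; the sketches below assume each policy exposes select_arm() and update(arm_id, reward). Reasonable defaults, patched onto BasePolicy so that stateless policies inherit them:

def _base_select_arm(self):
    # every concrete policy must choose an arm
    raise NotImplementedError

def _base_update(self, arm_id, reward):
    # default: stateless policies (e.g. random) have nothing to learn
    pass

BasePolicy.select_arm = _base_select_arm
BasePolicy.update = _base_update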
class RandomPolicy(BasePolicy):
    def __init__(self, arm_ids: List[int]):
        self.arm_ids = arm_ids

    @property
    def name(self):
        return 'random'
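RandomPolicy's action selection is also missing; a one-line sketch under the assumed interface:

def _random_select_arm(self):
    # choose uniformly at random; no value estimates are kept
    return np.random.choice(self.arm_ids)

RandomPolicy.select_arm = _random_select_arm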
class EpGreedyPolicy(BasePolicy):
    def __init__(self, epsilon: float, arm_ids: List[int]):
        self.epsilon = epsilon
        self.arm_ids = arm_ids
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    @property
    def name(self):
        return f'ep-greedy ep:{self.epsilon}'
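The selection and update rules for EpGreedyPolicy are not visible in the export. A minimal sketch, assuming the usual epsilon-greedy rule with incremental sample-average updates (reset mirrors SoftmaxPolicy's below):

def _eg_select_arm(self):
    # explore with probability epsilon, otherwise exploit the current best estimate
    if np.random.random() < self.epsilon:
        return np.random.choice(self.arm_ids)
    return max(self.Q, key=self.Q.get)

def _eg_update(self, arm_id, reward):
    # incremental sample-average update: Q_{n+1} = Q_n + (r - Q_n) / n
    self.num_pulls_per_arm[arm_id] += 1
    self.Q[arm_id] += (reward - self.Q[arm_id]) / self.num_pulls_per_arm[arm_id]

def _eg_reset(self):
    self.Q = {id: 0 for id in self.arm_ids}
    self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

EpGreedyPolicy.select_arm = _eg_select_arm
EpGreedyPolicy.update = _eg_update
EpGreedyPolicy.reset = _eg_reset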
class SoftmaxPolicy(BasePolicy):
    def __init__(self, tau, arm_ids):
        self.tau = tau
        self.arm_ids = arm_ids
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}

    @property
    def name(self):
        return f'softmax tau:{self.tau}'

    def reset(self):
        self.Q = {id: 0 for id in self.arm_ids}
        self.num_pulls_per_arm = {id: 0 for id in self.arm_ids}
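SoftmaxPolicy's selection and update cells are likewise missing. A sketch of Boltzmann exploration, sampling arms in proportion to exp(Q/tau):

def _softmax_select_arm(self):
    q = np.array([self.Q[id] for id in self.arm_ids])
    prefs = np.exp((q - q.max()) / self.tau)  # subtract max for numerical stability
    probs = prefs / prefs.sum()
    return np.random.choice(self.arm_ids, p=probs)

def _softmax_update(self, arm_id, reward):
    # same incremental sample-average update as epsilon-greedy
    self.num_pulls_per_arm[arm_id] += 1
    self.Q[arm_id] += (reward - self.Q[arm_id]) / self.num_pulls_per_arm[arm_id]

SoftmaxPolicy.select_arm = _softmax_select_arm
SoftmaxPolicy.update = _softmax_update

Small tau makes the policy nearly greedy; large tau approaches uniform random play.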
class UCB(BasePolicy):
    # your code here
    pass
Trainer
# Surviving tail of the plotting helper: reference lines for the best arm's
# expected reward (green) and the average arm reward (red); labels added so
# plt.legend() has entries to show.
_, expected_max_reward = env.get_best_arm_and_expected_reward()
ax.plot(np.arange(timesteps), [expected_max_reward] * timesteps, 'g-',
        label='expected max reward')
avg_arm_reward = env.get_avg_arm_reward()
ax.plot(np.arange(timesteps), [avg_arm_reward] * timesteps, 'r-',
        label='avg arm reward')
plt.legend(loc='lower right')
plt.show()
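Only those final lines of the trainer survived the export. A minimal end-to-end sketch of plot_reward_curve_and_print_regret (the name and signature are taken from the call under Experiments), assuming the select_arm/update/pull interface patched in above; regret is measured as the total expected reward forgone relative to always pulling the best arm:

def plot_reward_curve_and_print_regret(env, policies, timesteps, num_runs):
    fig, ax = plt.subplots()
    _, expected_max_reward = env.get_best_arm_and_expected_reward()
    for policy in policies:
        avg_rewards = np.zeros(timesteps)
        for _ in range(num_runs):
            policy.reset()
            for t in range(timesteps):
                arm_id = policy.select_arm()
                reward = env.pull(arm_id)
                policy.update(arm_id, reward)
                avg_rewards[t] += reward
        avg_rewards /= num_runs  # average reward at each timestep across runs
        regret = expected_max_reward * timesteps - avg_rewards.sum()
        print(f'{policy.name} regret: {regret:.2f}')
        ax.plot(np.arange(timesteps), avg_rewards, label=policy.name)
    # reference lines, as in the surviving snippet above
    ax.plot(np.arange(timesteps), [expected_max_reward] * timesteps, 'g-',
            label='expected max reward')
    ax.plot(np.arange(timesteps), [env.get_avg_arm_reward()] * timesteps, 'r-',
            label='avg arm reward')
    plt.legend(loc='lower right')
    plt.show()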
Experiments
seed = 42
np.random.seed(seed)

num_arms = 5
mean_reward_range = (-25, 25)
std = 2.0
env = Env(num_arms, mean_reward_range, std)  # construction of env was cut off in the export

env.plot_arms_reward_distribution()
# (figure: reward distribution of each arm)

# reconstructed from its output: the cell printed the best arm and its expected reward
print(*env.get_best_arm_and_expected_reward())
# 1 22.53571532049581

print(env.get_avg_arm_reward())
# 3.119254917081568
random_policy = RandomPolicy(env.arm_ids)
plot_reward_curve_and_print_regret(env, [random_policy], timesteps=200, num_runs=500)
Optional: Explore different values of epsilon and tau and verify how the behaviour changes.
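One way to run that sweep, reusing plot_reward_curve_and_print_regret as sketched above (the particular epsilon and tau grids are illustrative choices, not values from the original notebook):

eg_policies = [EpGreedyPolicy(epsilon, env.arm_ids) for epsilon in (0.01, 0.1, 0.3)]
plot_reward_curve_and_print_regret(env, eg_policies, timesteps=200, num_runs=500)

softmax_policies = [SoftmaxPolicy(tau, env.arm_ids) for tau in (0.1, 1.0, 10.0)]
plot_reward_curve_and_print_regret(env, softmax_policies, timesteps=200, num_runs=500)

Larger epsilon and tau push both policies toward uniform exploration; very small values commit early and can lock onto a suboptimal arm.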