-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnarmedbandit.py
39 lines (31 loc) · 1.13 KB
/
narmedbandit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""
The classic N-Armed bandit environment (not bounded)
"""
import gym
from gym import spaces
import numpy as np
class NArmedBanditEnv(gym.Env):
    """N-armed bandit environment with unbounded, normally distributed rewards.

    Every arm has a hidden true action value, drawn from a standard normal
    distribution when the environment is constructed. The environment emits
    no observations: both ``_reset`` and ``_step`` return ``None`` in the
    observation slot, and an episode is never flagged as done.

    Two modes are selected through ``type``:

    * ``'stationary'`` -- the true action values stay fixed and each reward
      is the chosen arm's true value plus standard normal noise.
    * anything else -- every arm's true value takes a standard normal
      random-walk step on each call, and the reward is the chosen arm's
      updated true value (no extra noise is added).

    NOTE(review): the underscore-prefixed ``_reset``/``_step`` names are kept
    as in the original; confirm they match the hook names expected by the
    installed gym version.
    """

    def __init__(self, arms, type):
        """Create the bandit.

        Args:
            arms: Number of arms, i.e. the size of the discrete action space.
            type: ``'stationary'`` for fixed true action values; any other
                value selects the non-stationary random-walk mode.
        """
        # `type` shadows the builtin, but the parameter name is part of the
        # public signature and is kept so keyword callers continue to work.
        self.type = type
        self.arms = arms
        # The true action values, one per arm.
        self.true_action_vals = np.random.randn(self.arms)
        self.action_space = spaces.Discrete(arms)
        self.observation_space = None  # no observations given
        # Initialised here as well as in _reset so that stepping a freshly
        # constructed environment does not fail with AttributeError.
        self.step_counter = 0

    def _reset(self):
        """Reset the step counter and return ``None`` (there is no observation).

        The true action values are left untouched by a reset.
        """
        self.step_counter = 0
        return None

    def _step(self, action):
        """Pull one arm.

        Args:
            action: Index of the arm to pull.

        Returns:
            A tuple ``(None, reward, False, {})``. The reward on the first
            step after a reset is always ``0``; no random numbers are drawn
            and the true action values are not changed on that step.
        """
        if self.step_counter == 0:
            reward = 0
        elif self.type == 'stationary':
            # Stationary: true action value plus observation noise.
            reward = self.true_action_vals[action] + np.random.standard_normal()
        else:
            # Non-stationary: every arm takes an independent random-walk step.
            # Done as one array addition so the attribute stays an ndarray
            # rather than being rebound to a Python list; a new array is
            # created instead of updating in place, so outside references to
            # the previous values are not mutated.
            self.true_action_vals = (
                self.true_action_vals + np.random.standard_normal(self.arms)
            )
            reward = self.true_action_vals[action]
        self.step_counter += 1
        return None, reward, False, {}