bandit_benchmark.py
import matplotlib.pyplot as plt
import numpy as np

from openai.agents.sampleaverage import SampleAverageActionValueAgent
from openai.envs.classic.narmedbandit import NArmedBanditEnv


def main():
    # Compare three exploration rates on the 10-armed testbed.
    mean_results = evaluate(epsilon=0)  # greedy
    mean_results_01 = evaluate(epsilon=0.1)
    mean_results_001 = evaluate(epsilon=0.01)
    # print("Results: {}".format(mean_results))
    plt.plot(mean_results, color='g', label=r'$\epsilon$ = 0 (greedy)')
    plt.plot(mean_results_01, label=r'$\epsilon$ = 0.1')
    plt.plot(mean_results_001, color='r', label=r'$\epsilon$ = 0.01')
    plt.legend(loc="lower right")
    plt.xlabel('Steps')
    plt.ylabel('Average reward')
    plt.show()


def evaluate(epsilon, bandits_count=2000, max_steps=1000):
    """Return the reward per step, averaged over `bandits_count` independent runs."""
    results = np.zeros([bandits_count, max_steps])
    for i in range(bandits_count):
        # Start each run with a fresh agent, a fresh bandit, and cleared signals,
        # so the first evaluate() call does not see a reward from the previous run.
        reward = 0
        done = False
        agent = SampleAverageActionValueAgent(num_actions=10, epsilon=epsilon)
        bandit = NArmedBanditEnv(10, 'stationary')
        bandit._reset()
        for j in range(max_steps):
            action = agent.evaluate(reward, done)
            ob, reward, done, _ = bandit._step(action)
            results[i, j] = reward
            if done:
                break
    return np.mean(results, axis=0)


if __name__ == '__main__':
    main()
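
The `SampleAverageActionValueAgent` and `NArmedBanditEnv` classes are imported from the author's `openai` fork, whose source is not shown here. As a rough guide to what the script expects, below is a minimal sketch of both classes, assuming a standard sample-average epsilon-greedy agent and a stationary 10-armed Gaussian testbed in the style of Sutton and Barto. The `evaluate(reward, done)` calling convention and the `_reset()`/`_step()` interface are inferred from the calling code above; everything else (field names, reward distribution) is an assumption, not the library's actual implementation.

import numpy as np


class SampleAverageActionValueAgent:
    """Sketch (assumed): epsilon-greedy selection with incremental
    sample-average estimates, Q(a) <- Q(a) + (r - Q(a)) / n(a)."""

    def __init__(self, num_actions, epsilon):
        self.epsilon = epsilon
        self.q = np.zeros(num_actions)       # action-value estimates
        self.counts = np.zeros(num_actions)  # pulls per action
        self.last_action = None

    def evaluate(self, reward, done):
        # Credit the previous action with the reward it produced.
        if self.last_action is not None:
            self.counts[self.last_action] += 1
            n = self.counts[self.last_action]
            self.q[self.last_action] += (reward - self.q[self.last_action]) / n
        # Explore uniformly with probability epsilon, otherwise act greedily.
        if np.random.random() < self.epsilon:
            action = np.random.randint(len(self.q))
        else:
            action = int(np.argmax(self.q))
        self.last_action = action
        return action


class NArmedBanditEnv:
    """Sketch (assumed): stationary n-armed bandit; true action values are
    drawn from N(0, 1) at reset, rewards from N(q*(a), 1) per pull."""

    def __init__(self, n, mode='stationary'):
        self.n = n
        self.mode = mode
        self.true_values = None

    def _reset(self):
        self.true_values = np.random.normal(0.0, 1.0, self.n)

    def _step(self, action):
        reward = np.random.normal(self.true_values[action], 1.0)
        return None, reward, False, {}  # observation, reward, done, info

With stand-ins like these, running the script should plot the familiar comparison in which epsilon = 0.1 tends to climb fastest early on, epsilon = 0.01 improves more slowly, and the pure greedy agent plateaus at a lower average reward.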