-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathexpSARSA_frozen.py
75 lines (57 loc) · 1.83 KB
/
expSARSA_frozen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np
import gym
# Hyperparameters of the experiment
total_episodes = 10000  # number of training episodes
max_steps = 100  # upper bound on the number of steps in one episode
alpha = 0.30  # learning rate
gamma = 0.99  # discount factor applied to the next-state value
epsilon = 0.1  # probability of taking an exploratory action

# The environment has to exist before the Q-table is allocated, because the
# table is sized from the environment's observation and action spaces.
env = gym.make('FrozenLake-v0')

# Q-table: one row per state, one column per action, all values start at zero
Q = np.zeros((env.observation_space.n, env.action_space.n))

# Running sum of every reward collected during training
total_reward = 0
# Epsilon-greedy action selection
def choose_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    A single uniform draw from [0, 1) is compared against the module-level
    ``epsilon``: below it, an action is sampled from ``env.action_space``;
    otherwise the index of the largest entry in ``Q[state, :]`` is returned
    (``np.argmax`` resolves ties in favour of the lowest index).
    """
    explore = np.random.uniform(0, 1) < epsilon
    if explore:
        return env.action_space.sample()
    return np.argmax(Q[state, :])
# Function to learn the Q-value (Expected SARSA update)
def update(state, state2, reward, action, action2, done):
    """Apply one Expected SARSA update to ``Q[state, action]`` in place.

    Args:
        state: Index of the state the action was taken in.
        state2: Index of the state reached after the action.
        reward: Reward received for the transition.
        action: Index of the action that was taken in ``state``.
        action2: Action already selected for ``state2``. It is not used by
            the update (Expected SARSA averages over all next actions) and is
            kept only so existing callers keep working.
        done: True when the episode ended with this transition; the target
            then contains no bootstrap term.

    The bootstrap term is the expected Q-value of ``state2`` under the
    epsilon-greedy policy used for acting: with probability ``1 - epsilon``
    the arg-max action is taken, and with probability ``epsilon`` an action is
    sampled from the action space (assumed uniform), which gives
    ``(1 - epsilon) * max(Q[state2]) + epsilon * mean(Q[state2])``.
    A plain mean would instead be the expectation under a purely random
    policy.
    """
    if done:
        target = reward
    else:
        next_q = Q[state2, :]
        exp_q = (1 - epsilon) * np.max(next_q) + epsilon * np.mean(next_q)
        target = reward + gamma * exp_q
    # Move the current estimate a fraction alpha towards the target.
    Q[state, action] += alpha * (target - Q[state, action])
# Building the environment
env = gym.make('FrozenLake-v0')

# Training: run the episodes, updating Q after every transition
for episode in range(total_episodes):
    # Every episode starts from a fresh reset and an initial action.
    state1 = env.reset()
    action1 = choose_action(state1)

    # An episode is cut off after max_steps transitions at the latest.
    for _ in range(max_steps):
        # Call env.render() here to watch every step of the training.

        # Take the action and observe the outcome.
        state2, reward, done, info = env.step(action1)

        # Select the follow-up action before learning from the transition.
        action2 = choose_action(state2)
        update(state1, state2, reward, action1, action2, done)

        # The successor state and action become the current ones.
        state1, action1 = state2, action2
        total_reward += reward

        # Show the final board of the episode and stop stepping.
        if done:
            env.render()
            break
# Report: number of episodes, average reward per episode, learned Q-table
print("total eps : ", total_episodes)
mean_reward = total_reward / total_episodes
print("Performance : ", mean_reward)
# Dump the full table of learned action values
print(Q)