-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathVI.py
49 lines (42 loc) · 1.27 KB
/
VI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os

import numpy as np

from maze import *
from value_plot import *
discount=0.9
iters=1000
def calcQ(state, action):
    """Return the expected one-step return (Q-value) of `action` in `state`.

    Mixes two deterministic probes of the environment: the slip
    alternative ACTMAP[action] weighted by env.slip, and the intended
    action weighted by 1 - env.slip.  Each outcome is bootstrapped from
    the global `values` table with discount factor `discount`.
    NOTE(review): assumes env.step(..., slip=False) is side-effect-free
    for the value table — confirm against Maze.
    """
    slip_r, slip_s, _ = env.step(state, ACTMAP[action], slip=False)
    main_r, main_s, _ = env.step(state, action, slip=False)
    # (reward, next_state, probability) for each possible outcome.
    outcomes = (
        (slip_r, slip_s, env.slip),
        (main_r, main_s, 1 - env.slip),
    )
    return sum(p * (r + discount * values[s]) for r, s, p in outcomes)
env = Maze()

# State-value table, Q-table, and greedy policy (one slot per state).
values = np.zeros(env.snum)
Qval = np.zeros((env.snum, env.anum))
optpolicies = np.zeros(env.snum)

# --- Value iteration: repeated Bellman-optimality backups ---
for i in range(iters):
    tmpV = np.zeros(env.snum)
    for state in range(env.snum):
        # Terminal goal states keep value 0.  The cell index is state // 8;
        # presumably each grid cell spans 8 sub-states — TODO confirm in Maze.
        if env.idx2cell[state // 8] == env.goal_pos:
            continue
        # Back up the best action value for this state.
        Vmax = float('-inf')
        for action in range(env.anum):
            Vmax = max(Vmax, calcQ(state, action))
        tmpV[state] = Vmax
    # Synchronous update: the whole sweep reads the previous `values`.
    values = np.copy(tmpV)

# Recover the full Q-table and the greedy policy from the converged values.
for state in range(env.snum):
    for action in range(env.anum):
        Qval[state, action] = calcQ(state, action)
for state in range(env.snum):
    # In-place write keeps optpolicies' original float dtype.
    optpolicies[state] = np.argmax(Qval[state, :])

# np.save does not create missing directories — make sure Results/ exists.
if not os.path.isdir('Results'):
    os.makedirs('Results')
np.save('Results/Q_Values', Qval)

# Single-argument print() calls run identically on Python 2 and 3.
print(Qval)
print("Optimal Policies --> 0 - UP; 1 - DOWN; 2 - LEFT; 3 - RIGHT")
print(optpolicies)
value_plot(Qval, env)