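"""
distMDP.py

MDP over a 2D location grid: an agent starts at (startRow, startCol) and
moves toward (endRow, endCol) using the 8 compass moves. Transitions are
deterministic; each step is rewarded with the negative Euclidean distance
from the new square to the destination, so policies that close the gap to
the goal quickly score higher.
"""
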
import mdpUtil
import math


class DistMDP(mdpUtil.MDP):
def __init__(self, locationGrid, startRow, startCol, endRow, endCol):
"""
cardValues: list of integers (face values for each card included in the deck)
multiplicity: single integer representing the number of cards with each face value
threshold: maximum number of points (i.e. sum of card values in hand) before going bust
peekCost: how much it costs to peek at the next card
"""
# Take subgrid account for +/- 0.1 mile error
self.dimCol1 = max(min(startCol, endCol), 1)
self.dimCol2 = min(max(startCol, endCol), len(locationGrid[0]) - 2)
self.dimRow1 = max(min(startRow, endRow), 1)
self.dimRow2 = min(max(startRow, endRow), len(locationGrid) - 2)
self.locationGrid = locationGrid
self.row = startRow
self.col = startCol
self.endRow = endRow
self.endCol = endCol
# print "==============GRID============"
# print self.locationGrid
# print self.row
# print self.col

    # Return the start state, represented as (current row, current col).
def startState(self):
return (self.row, self.col)

    # Given a list of valid actions, remove any action that moves onto an
    # already-visited square in the grid.
    # NOTE: the early return below disables this filter; visited squares are
    # instead treated as dead ends in succAndProbReward.
    def getUnvisitedActions(self, actions, row, col):
        return actions
        # Unreachable filtering logic, kept for reference; delete the early
        # return above to re-enable it.
        offsets = {"U": (-1, 0), "D": (1, 0), "L": (0, -1), "R": (0, 1),
                   "UL": (-1, -1), "UR": (-1, 1), "DL": (1, -1), "DR": (1, 1)}
        for move, (dRow, dCol) in offsets.items():
            if move in actions and self.locationGrid[row + dRow][col + dCol][1]:
                actions.remove(move)
        return actions

    # Return the set of actions available from |state|: up, down, left, right,
    # and the four diagonals, restricted at the boundary of the (padded)
    # subgrid. Filtering out visited squares is delegated to
    # getUnvisitedActions (currently a no-op; see the note above).
def actions(self, state):
row = state[0]
col = state[1]
# Find the special case actions - on boundary of grid
if row == self.dimRow1 - 1 or row == self.dimRow2 + 1 or col == self.dimCol1 - 1 or col == self.dimCol2 + 1:
# Top Left Corner
if row == self.dimRow1 - 1 and col == self.dimCol1 - 1:
return self.getUnvisitedActions(["D", "R", "DR"], row, col)
# Top Right Corner
if row == self.dimRow1 - 1 and col == self.dimCol2 + 1:
return self.getUnvisitedActions(["D", "L", "DL"], row, col)
# Bottom Left Corner
if row == self.dimRow2 + 1 and col == self.dimCol1 - 1:
return self.getUnvisitedActions(["U", "R", "UR"], row, col)
# Bottom Right Corner
if row == self.dimRow2 + 1 and col == self.dimCol2 + 1:
return self.getUnvisitedActions(["U", "L", "UL"], row, col)
# Top row
if row == self.dimRow1 - 1:
return self.getUnvisitedActions(["D", "L", "R", "DL", "DR"], row, col)
# Bottom row
if row == self.dimRow2 + 1:
return self.getUnvisitedActions(["U", "L", "R", "UL", "UR"], row, col)
# Left col
if col == self.dimCol1 - 1:
return self.getUnvisitedActions(["U", "D", "R", "UR", "DR"], row, col)
# Right Col
if col == self.dimCol2 + 1:
return self.getUnvisitedActions(["U", "D", "L", "UL", "DL"], row, col)
# Otherwise explore all 8 possible actions
        return self.getUnvisitedActions(["U", "D", "L", "R", "UL", "UR", "DL", "DR"], row, col)

    def isEnd(self, row, col):
        return row == self.endRow and col == self.endCol

    # Euclidean distance between two grid squares, in grid units
    def distance(self, row1, col1, row2, col2):
        return math.sqrt((row1 - row2)**2 + (col1 - col2)**2)

    # Given a |state| and |action|, return a list of (newState, prob, reward)
    # tuples corresponding to the states reachable from |state| when taking
    # |action|. Notes:
    # * If |state| is an end state, return the empty list [].
    # * Moving onto an already-visited square is a dead end: return [].
    # * Transitions are deterministic, so prob is always 1.
    # * The reward is the negative Euclidean distance from the new square to
    #   the destination, or 0 when the move reaches the destination.
    def succAndProbReward(self, state, action):
        row, col = state
        prob = 1
        # End state: already at the destination
        if row == self.endRow and col == self.endCol:
            return []
        # Map each action to its (row, col) offset
        offsets = {"U": (-1, 0), "D": (1, 0), "L": (0, -1), "R": (0, 1),
                   "UL": (-1, -1), "UR": (-1, 1), "DL": (1, -1), "DR": (1, 1)}
        dRow, dCol = offsets[action]
        newRow, newCol = row + dRow, col + dCol
        newState = (newRow, newCol)
        # Reaching the destination ends the episode with no penalty
        if self.isEnd(newRow, newCol):
            return [(newState, prob, 0)]
        # Already-visited squares are dead ends
        if self.locationGrid[newRow][newCol][1]:
            return []
        # Penalize by the straight-line distance remaining to the destination
        reward = -1 * self.distance(newRow, newCol, self.endRow, self.endCol)
        return [(newState, prob, reward)]

    # Undiscounted: every step's distance penalty counts fully.
    def discount(self):
        return 1
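

# Minimal usage sketch (not part of the original module): the grid shape and
# cell layout below are assumptions. Each square is modeled as a
# (data, visited) pair because the class only reads locationGrid[row][col][1]
# as a visited flag. Solving the MDP would go through mdpUtil, whose API is
# not shown here, so only methods defined in this file are called.
if __name__ == "__main__":
    grid = [[(None, False) for _ in range(5)] for _ in range(5)]
    mdp = DistMDP(grid, startRow=1, startCol=1, endRow=3, endCol=3)
    state = mdp.startState()
    print(state)                                # (1, 1)
    print(mdp.actions(state))                   # all 8 moves from an interior square
    print(mdp.succAndProbReward(state, "DR"))   # [((2, 2), 1, -sqrt(2))]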