l_dgn.py
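"""Entry point for training, evaluating ("watch" mode), and hyperparameter
optimization of a shared L-DGN DQN policy on the multi-agent graph environment,
built with Tianshou."""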
import argparse
import os
import pprint
import time
import warnings
from math import e, log, pow
from pathlib import Path
from typing import List, Tuple
import gymnasium
import numpy as np
import optuna
import torch
from torch.utils.tensorboard import SummaryWriter
from tianshou.data import (
VectorReplayBuffer,
PrioritizedVectorReplayBuffer
)
from tianshou.env import DummyVectorEnv, SubprocVectorEnv
from tianshou.policy import BasePolicy, DQNPolicy
from tianshou.trainer import OffpolicyTrainer
from tianshou.utils import WandbLogger
from graph_env.env.utils.constants import NUMBER_OF_FEATURES
from graph_env.env.utils.networks.l_dgn import LDGNNetwork
from graph_env.env.utils.policies.multi_agent_managers.shared_policy import MultiAgentSharedPolicy
from graph_env.env.utils.collectors.multi_agent_collector import MultiAgentCollector
from graph_env.env.utils.hyp_optimizer.offpolicy_opt import offpolicy_optimizer
from common import get_args, get_env, select_aggregator
def get_agents(
args: argparse.Namespace,
policy: BasePolicy = None,
optim: torch.optim.Optimizer = None
) -> Tuple[BasePolicy, torch.optim.Optimizer, List[str]]:
"""
    Build (or wrap the given) policy into a MultiAgentSharedPolicy and return it
    together with the optimizer and the list of agents.
"""
env = get_env(number_of_agents=args.n_agents)
    # Unwrap the Dict observation space if the environment provides one
    observation_space = (
        env.observation_space['observation']
        if isinstance(env.observation_space, gymnasium.spaces.Dict)
        else env.observation_space
    )
args.state_shape = observation_space.shape or observation_space.n
args.action_shape = env.action_space.shape or env.action_space.n
args.max_action = 1 # Not strictly used if env is discrete, but keep for consistency
if policy is None:
# Construct aggregator
aggregator = select_aggregator(args.aggregator_function)
# Q and V param
q_param = {"hidden_sizes": args.dueling_q_hidden_sizes}
v_param = {"hidden_sizes": args.dueling_v_hidden_sizes}
# Create network
net = LDGNNetwork(
NUMBER_OF_FEATURES,
args.hidden_emb,
args.action_shape,
args.num_heads,
device=args.device,
dueling_param=(q_param, v_param),
aggregator_function=aggregator
)
# Optimizer
optim = torch.optim.Adam(net.parameters(), lr=args.lr)
# Policy
policy = DQNPolicy(
model=net,
optim=optim,
discount_factor=args.gamma,
estimation_step=args.n_step,
target_update_freq=args.target_update_freq,
            action_space=env.action_space  # DQNPolicy expects a discrete action space
).to(args.device)
masp_policy = MultiAgentSharedPolicy(policy, env)
return masp_policy, optim, env.agents
def watch(args: argparse.Namespace, masp_policy: BasePolicy = None) -> None:
"""
Load a pre-trained policy (if masp_policy not given) and run it in watch mode.
"""
weights_path = os.path.join(
args.logdir, "mpr", args.algorithm, "weights", f"{args.model_name}"
)
# Create a single test environment
env = DummyVectorEnv([
lambda: get_env(
number_of_agents=args.n_agents,
is_scripted=args.mpr_policy,
is_testing=True,
dynamic_graph=args.dynamic_graph,
render_mode="human",
all_agents_source=True
)
])
# Set seeds
env.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
# Load the policy if none is provided
if masp_policy is None:
masp_policy = load_policy(weights_path, args, env)
masp_policy.policy.eval()
masp_policy.policy.set_eps(args.eps_test)
# Create collector in watch mode
collector = MultiAgentCollector(
agents_num=args.n_agents,
policy=masp_policy,
env=env,
exploration_noise=False
)
# Collect some episodes
result = collector.collect(n_episode=args.test_num * args.n_agents)
pprint.pprint(result)
time.sleep(5)
def train_agent(
args: argparse.Namespace,
masp_policy: BasePolicy = None,
optim: torch.optim.Optimizer = None,
opt_trial: optuna.Trial = None
) -> Tuple[dict, BasePolicy]:
"""
Main training loop, with optional hyperparameter optimization
(when args.optimize is True).
"""
# Build training and test envs
train_envs = SubprocVectorEnv([
lambda: get_env(
number_of_agents=args.n_agents,
dynamic_graph=args.dynamic_graph
)
for _ in range(args.training_num)
])
test_envs = SubprocVectorEnv([
lambda: get_env(
number_of_agents=args.n_agents,
dynamic_graph=args.dynamic_graph,
is_testing=True
)
])
# Set seeds
np.random.seed(args.seed)
torch.manual_seed(args.seed)
train_envs.seed(args.seed)
test_envs.seed(args.seed)
# Build/Load policy
masp_policy, optim, agents = get_agents(args, policy=masp_policy, optim=optim)
# Replay buffers
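    # buffer_num = n_envs * n_agents: one sub-buffer per (environment, agent) pair,
    # so transitions from different agents and envs are kept in separate slots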
if args.prio_buffer:
replay_buffer = PrioritizedVectorReplayBuffer(
args.buffer_size,
buffer_num=len(train_envs) * len(agents),
ignore_obs_next=True,
alpha=args.alpha,
beta=args.beta
)
else:
replay_buffer = VectorReplayBuffer(
args.buffer_size,
buffer_num=len(train_envs) * len(agents),
ignore_obs_next=True
)
# Collectors
train_collector = MultiAgentCollector(
agents_num=len(agents),
policy=masp_policy,
env=train_envs,
buffer=replay_buffer,
exploration_noise=True
)
test_collector = MultiAgentCollector(
agents_num=len(agents),
policy=masp_policy,
env=test_envs,
exploration_noise=False
)
    # Pre-collect transitions to warm up the replay buffer before training
    train_collector.reset()
    train_collector.collect(n_step=args.batch_size * args.training_num)
# Setup logger
logger, writer = None, None
log_path = os.path.join(args.logdir, "mpr", args.algorithm)
weights_path = os.path.join(log_path, "weights")
Path(weights_path).mkdir(parents=True, exist_ok=True)
if not args.optimize:
logger = WandbLogger(project="dancing_bees", name=args.model_name)
writer = SummaryWriter(log_path)
writer.add_text("args", str(args))
if logger is not None:
logger.load(writer)
def save_best_fn(pol: BasePolicy):
"""
Save the best model checkpoint.
"""
best_path = os.path.join(weights_path, f"{args.model_name}_best.pth")
print(f"Saving best model to {best_path}")
torch.save(pol.policy.state_dict(), best_path)
def train_fn(epoch: int, env_step: int):
"""
Adjust epsilon during training based on env steps.
"""
decay_factor = 1.0 - pow(
e,
(log(args.eps_train_final) / (args.exploration_fraction * args.epoch * args.step_per_epoch))
)
eps = max(args.eps_train * (1.0 - decay_factor) ** env_step, args.eps_train_final)
masp_policy.policy.set_eps(eps)
# Logging
if logger is not None and env_step % 1000 == 0:
logger.write("train/env_step", env_step, {"train/eps": eps})
def test_fn(epoch: int, env_step: int):
"""
Set epsilon to test level for evaluation.
"""
masp_policy.policy.set_eps(args.eps_test)
# Decide trainer vs. optimizer
if not args.optimize:
result = OffpolicyTrainer(
policy=masp_policy,
train_collector=train_collector,
test_collector=test_collector,
max_epoch=args.epoch,
step_per_epoch=args.step_per_epoch,
step_per_collect=args.step_per_collect,
episode_per_test=args.test_num,
batch_size=args.batch_size,
train_fn=train_fn,
test_fn=test_fn,
update_per_step=args.update_per_step,
test_in_train=False,
save_best_fn=save_best_fn,
logger=logger
).run()
else:
# hyperparameter optimization
result = offpolicy_optimizer(
policy=masp_policy,
train_collector=train_collector,
test_collector=test_collector,
max_epoch=args.epoch,
step_per_epoch=args.step_per_epoch,
step_per_collect=args.step_per_collect,
episode_per_test=args.test_num,
batch_size=args.batch_size,
train_fn=train_fn,
test_fn=test_fn,
update_per_step=args.update_per_step,
test_in_train=False,
save_best_fn=save_best_fn,
trial=opt_trial
)
# Always save the final model
last_path = os.path.join(weights_path, f"{args.model_name}_last.pth")
print(f"Saving last model to {last_path}")
torch.save(masp_policy.policy.state_dict(), last_path)
return result, masp_policy
def load_policy(path: str, args: argparse.Namespace, env: DummyVectorEnv) -> BasePolicy:
"""
Load a saved policy from a given path. If it doesn't exist, exit.
"""
    args.action_shape = 2  # assumes a discrete action space of size 2
print(f"Loading agent checkpoint from {path}")
    if not os.path.exists(path):
        print("Failed to restore policy and optimizer. Exiting.")
        exit(1)
# Construct aggregator
aggregator = select_aggregator(args.aggregator_function)
q_param = {"hidden_sizes": args.dueling_q_hidden_sizes}
v_param = {"hidden_sizes": args.dueling_v_hidden_sizes}
# Build the same network used in training
net = LDGNNetwork(
NUMBER_OF_FEATURES,
args.hidden_emb,
args.action_shape,
args.num_heads,
device=args.device,
dueling_param=(q_param, v_param),
aggregator_function=aggregator
)
optim = torch.optim.Adam(net.parameters(), lr=args.lr)
policy = DQNPolicy(
model=net,
optim=optim,
discount_factor=args.gamma,
estimation_step=args.n_step,
target_update_freq=args.target_update_freq,
action_space=env.action_space
).to(args.device)
# Wrap in MultiAgentSharedPolicy
masp_policy, _, _ = get_agents(args, policy, optim)
# Load weights
    masp_policy.policy.load_state_dict(torch.load(path, map_location=args.device))
print("Successfully restored policy and optimizer.")
return masp_policy
if __name__ == '__main__':
args = get_args()
args.algorithm = "l_dgn"
if args.watch:
watch(args)
    elif args.optimize:
        pass  # no-op placeholder for the hyperparameter optimization entry point
else:
result, masp_policy = train_agent(args)
pprint.pprint(result)
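# Example invocations (flag names are illustrative; the actual CLI is defined in
# common.get_args):
#   python l_dgn.py --n_agents 10 --epoch 50
#   python l_dgn.py --watch --model_name l_dgn_best.pth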