main.py
import sys
import os
import time
import random
from collections import deque

import numpy as np
import torch

from i2l.misc.utils import cleanup_log_dir
from i2l.misc.arguments import get_args
from i2l.networks.networks_manager import NetworksManager
from i2l.rl.rl_agent import RLAgent


def setup(args):
    # seed all sources of randomness for reproducibility
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    cleanup_log_dir(log_dir)

    torch.set_num_threads(1)
    args.device = torch.device("cuda:0" if args.cuda else "cpu")


def main():
    args = get_args()
    args.num_processes = 1  # future work: running more than one env in parallel requires code modifications

    print("== Starting I2L with the following parameters ==")
    print(vars(args))

    setup(args)

    rl_agent = RLAgent(args)
    manager = NetworksManager(args, rl_agent)

    episode_rewards = deque(maxlen=50)
    episode_lengths = deque(maxlen=50)

    start = time.time()
    num_updates = int(args.num_env_steps) // args.num_steps // args.num_processes

    for j in range(num_updates):
        # collect agent-environment interaction data
        rl_agent.collect_rollout_batch(episode_rewards, episode_lengths)

        # update wasserstein critic, discriminator, and priority buffer
        wcritic_loss, discriminator_loss = manager.update(j)

        # update actor-critic parameters with PPO
        value_loss, action_loss, dist_entropy = rl_agent.update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print("Updates {}, num timesteps {}, FPS {} \n"
                  " Last {} training episodes: mean length {:.1f}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f} \n"
                  " Wcritic_loss: {:.2f}, Discriminator_loss: {:.2f}, Entropy: {:.2f}, Value_loss: {:.2f}, Action_loss: {:.2f}\n"
                  .format(j, total_num_steps,
                          int(total_num_steps / (end - start)),
                          len(episode_rewards), np.mean(episode_lengths), np.mean(episode_rewards),
                          np.median(episode_rewards), np.min(episode_rewards),
                          np.max(episode_rewards), wcritic_loss, discriminator_loss,
                          dist_entropy, value_loss, action_loss))
            sys.stdout.flush()


if __name__ == "__main__":
    main()
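
The length of the training loop follows directly from the arithmetic in main(): num_updates is the total environment-step budget divided by the rollout length and the number of parallel processes. A short worked example is sketched below; the numbers are hypothetical settings chosen only to illustrate the formula, not repository defaults (the actual defaults come from get_args() in i2l/misc/arguments.py).

# Worked example of the update-count arithmetic used in main().
# All values here are hypothetical, for illustration only.
num_env_steps = 1_000_000   # plays the role of args.num_env_steps
num_steps = 2048            # plays the role of args.num_steps (env steps per rollout)
num_processes = 1           # fixed to 1 in main()

num_updates = int(num_env_steps) // num_steps // num_processes
print(num_updates)                           # 488 PPO updates in total
print((0 + 1) * num_processes * num_steps)   # 2048 env steps reported at the first log line

With these settings, each loop iteration advances training by num_steps environment steps, and the logging block prints every args.log_interval iterations once at least two episodes have completed.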