forked from vwxyzjn/cleanrl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocker_queue.py
executable file
·85 lines (73 loc) · 2.85 KB
/
docker_queue.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""
See https://github.com/docker/docker-py/issues/2395
At the moment, nvidia-container-toolkit still includes nvidia-container-runtime. So, you can still add nvidia-container-runtime as a runtime in /etc/docker/daemon.json:
{
"runtimes": {
"nvidia": {
"path": "nvidia-container-runtime",
"runtimeArgs": []
}
}
}
Then restart the docker service (sudo systemctl restart docker) and use runtime="nvidia" in docker-py as before.
"""
import argparse
import shlex
import time
import docker
# --- CLI configuration for the docker job queue -------------------------------
parser = argparse.ArgumentParser(description="CleanRL Docker Submission")
# Common arguments
parser.add_argument(
    "--exp-script",
    type=str,
    default="test1.sh",
    help="the file name of this experiment",
)
parser.add_argument(
    "--num-vcpus",
    type=int,
    default=16,
    help="total number of vcpus used in the host machine",
)
parser.add_argument(
    "--frequency",
    type=int,
    default=1,
    help="the number of seconds to check container update status",
)
args = parser.parse_args()

# Client for the local docker daemon, configured from the environment
# (DOCKER_HOST etc.); see module docstring for the nvidia runtime setup.
client = docker.from_env()
# Parse the experiment script into (image, env_vars, command) task triples.
# Each non-empty line is expected to look like a `docker run`-style command:
#   ... -e KEY=VAL <image> <command ...>
# Only the single token following `-e` is captured as the environment.
with open(args.exp_script) as f:
    lines = f.readlines()

tasks = []
for line in lines:
    # BUGFIX: the original `line.replace("\n", "")` discarded its result
    # (str.replace returns a new string); strip the newline for real.
    line = line.strip()
    if not line:
        continue  # blank line: skip instead of crashing on stale/undefined idx
    line_split = shlex.split(line)
    try:
        # BUGFIX: locate `-e` explicitly; the original for/break loop left
        # idx pointing at the last token (or undefined) when `-e` was absent.
        idx = line_split.index("-e")
    except ValueError:
        continue  # not a recognized task line; skip it
    env_vars = line_split[idx + 1 : idx + 2]  # the one KEY=VAL token after -e
    image = line_split[idx + 2]  # image name follows the env var
    commands = line_split[idx + 3 :]  # everything after the image is the command
    tasks.append([image, env_vars, commands])
# --- Scheduler loop -----------------------------------------------------------
# Dispatch queued tasks one at a time, pinning each container to a dedicated
# vcpu, until every task has been handed to the docker daemon.
running_containers = []  # [container, vcpu] pairs currently executing
vcpus = list(range(args.num_vcpus))  # pool of free vcpu ids

while len(tasks) != 0:
    time.sleep(args.frequency)

    # Reap exited containers and return their vcpus to the free pool.
    still_running = []
    for entry in running_containers:
        container, vcpu_id = entry
        container.reload()  # refresh cached status from the daemon
        if container.status == "exited":
            print(f"✅ task on vcpu {vcpu_id} has finished")
            vcpus += [vcpu_id]
        else:
            still_running += [entry]
    running_containers = still_running

    # Launch at most one queued task per tick, if a vcpu is available.
    if len(vcpus) != 0:
        task = tasks.pop()
        vcpu = vcpus.pop()
        # To run on GPU instead, add runtime="nvidia" (see module docstring).
        c = client.containers.run(
            image=task[0],
            environment=task[1],
            command=task[2],
            cpuset_cpus=str(vcpu),
            detach=True,
        )
        running_containers += [[c, vcpu]]
        print("========================")
        print(f"remaining tasks={len(tasks)}, running containers={len(running_containers)}")
        print(f"running on vcpu {vcpu}", task)