Commit 6c5b7af

[distributed][misc] use fork by default for mp (#5669)
1 parent 8065a7e commit 6c5b7af

File tree

3 files changed, +38 -3 lines changed

.buildkite/test-pipeline.yaml (+9)

@@ -37,6 +37,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@ steps:
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
+  # FIXIT: find out which code initialize cuda before running the test
+  # before the fix, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test
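
These CI steps pin VLLM_WORKER_MULTIPROC_METHOD=spawn because, per the FIXIT comments, some code path initializes CUDA before the test's workers are created, and forking after CUDA initialization is unsafe. The following standalone sketch (not part of this commit; the file name is illustrative and the error message describes typical PyTorch behavior) shows why spawn works in that situation while fork does not.

# repro_fork_after_cuda.py -- a minimal sketch, assuming a CUDA-capable
# machine with PyTorch installed; not part of this commit.
import multiprocessing
import os


def worker() -> None:
    import torch
    # With the "fork" start method the child inherits the parent's CUDA
    # state; PyTorch typically raises
    # "Cannot re-initialize CUDA in forked subprocess" here.
    torch.cuda.init()
    print("worker initialized CUDA")


if __name__ == "__main__":
    import torch
    torch.cuda.init()  # parent touches CUDA first -- the problematic case

    # "spawn" starts a fresh interpreter and works; "fork" does not.
    method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
    ctx = multiprocessing.get_context(method)
    p = ctx.Process(target=worker)
    p.start()
    p.join()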

vllm/distributed/device_communicators/custom_all_reduce_utils.py (+27, -1)

@@ -1,6 +1,9 @@
 import ctypes
 import json
 import os
+import pickle
+import subprocess
+import sys
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
     ids = list(range(num_dev))
     # batch of all pairs of GPUs
     batch_src, batch_tgt = zip(*list(product(ids, ids)))
-    result = can_actually_p2p(batch_src, batch_tgt)
+    # NOTE: we use `subprocess` rather than `multiprocessing` here
+    # because the caller might not have `if __name__ == "__main__":`,
+    # in that case we cannot use spawn method in multiprocessing.
+    # However, `can_actually_p2p` requires spawn method.
+    # The fix is, we use `subprocess` to call the function,
+    # where we have `if __name__ == "__main__":` in this file.
+    input_bytes = pickle.dumps((batch_src, batch_tgt))
+    returned = subprocess.run([sys.executable, __file__],
+                              input=input_bytes,
+                              capture_output=True)
+    # check if the subprocess is successful
+    try:
+        returned.check_returncode()
+    except Exception as e:
+        # wrap raised exception to provide more information
+        raise RuntimeError(
+            f"Error happened when batch testing "
+            f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+    result = pickle.loads(returned.stdout)
     for _i, _j, r in zip(batch_src, batch_tgt, result):
         cache[f"{_i}->{_j}"] = r
     with open(path, "w") as f:
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
 
 __all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))
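
The NOTE in the hunk above describes the pattern this file now uses: instead of multiprocessing (whose spawn method needs an if __name__ == "__main__": guard in the caller), the parent re-runs this very file via subprocess, sends pickled arguments over stdin, and reads the pickled result from stdout. Below is a self-contained sketch of that round-trip; expensive_check and the --worker flag are hypothetical stand-ins, not the vLLM code (the real file dispatches purely on __main__ and stdin).

# subprocess_roundtrip.py -- a hypothetical sketch of the stdin/stdout
# pickle round-trip; expensive_check and --worker are made-up names.
import pickle
import subprocess
import sys


def expensive_check(xs):
    # stand-in for can_actually_p2p: work that must run in a fresh process
    return [x * x for x in xs]


def run_in_subprocess(xs):
    payload = pickle.dumps(xs)
    # re-run this same file so the child always has a __main__ entry point
    proc = subprocess.run([sys.executable, __file__, "--worker"],
                          input=payload,
                          capture_output=True)
    proc.check_returncode()  # surface non-zero exits to the caller
    return pickle.loads(proc.stdout)


if __name__ == "__main__":
    if "--worker" in sys.argv:
        # child: read pickled args from stdin, write pickled result to stdout
        xs = pickle.loads(sys.stdin.buffer.read())
        sys.stdout.buffer.write(pickle.dumps(expensive_check(xs)))
    else:
        # parent: delegate the work to a fresh process and print the result
        print(run_in_subprocess([1, 2, 3]))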

vllm/envs.py (+2, -2)

@@ -29,7 +29,7 @@
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
 
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
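
This is the default flipped by the commit: the env var still wins, but the fallback is now "fork". A minimal sketch of how such a value is typically consumed (an assumption for illustration, not vLLM's actual worker launcher; get_mp_context and _worker are invented names) is to turn it into a multiprocessing context:

# mp_context_example.py -- illustrative only, not vLLM internals.
import multiprocessing
import os


def get_mp_context():
    # mirrors the envs.py lambda: the env var wins, "fork" is the new default
    method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
    return multiprocessing.get_context(method)


def _worker(rank: int) -> None:
    print(f"worker {rank} started")


if __name__ == "__main__":
    ctx = get_mp_context()
    procs = [ctx.Process(target=_worker, args=(i,)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

Fork avoids re-importing heavy modules in every worker, but it is unsafe once the parent has initialized CUDA, which is why the CI steps above still override the new default with spawn until the offending initialization is found.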
