1
1
import ctypes
2
2
import json
3
3
import os
4
+ import pickle
5
+ import subprocess
6
+ import sys
4
7
from itertools import product
5
8
from typing import Dict , List , Optional , Sequence
6
9
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
198
201
ids = list (range (num_dev ))
199
202
# batch of all pairs of GPUs
200
203
batch_src , batch_tgt = zip (* list (product (ids , ids )))
201
- result = can_actually_p2p (batch_src , batch_tgt )
204
+ # NOTE: we use `subprocess` rather than `multiprocessing` here
205
+ # because the caller might not have an `if __name__ == "__main__":`
206
+ # guard, in which case the spawn method of multiprocessing cannot be
207
+ # used. However, `can_actually_p2p` requires the spawn method.
208
+ # The fix is to run this file as a script via `subprocess`, since
209
+ # this file does have an `if __name__ == "__main__":` guard.
210
+ input_bytes = pickle .dumps ((batch_src , batch_tgt ))
211
+ returned = subprocess .run ([sys .executable , __file__ ],
212
+ input = input_bytes ,
213
+ capture_output = True )
214
+ # check if the subprocess is successful
215
+ try :
216
+ returned .check_returncode ()
217
+ except Exception as e :
218
+ # wrap raised exception to provide more information
219
+ raise RuntimeError (
220
+ f"Error happened when batch testing "
221
+ f"peer-to-peer access from { batch_src } to { batch_tgt } " ) from e
222
+ result = pickle .loads (returned .stdout )
202
223
for _i , _j , r in zip (batch_src , batch_tgt , result ):
203
224
cache [f"{ _i } ->{ _j } " ] = r
204
225
with open (path , "w" ) as f :
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
213
234
214
235
215
236
__all__ = ["gpu_p2p_access_check" ]
237
+
238
+ if __name__ == "__main__" :
239
+ batch_src , batch_tgt = pickle .loads (sys .stdin .buffer .read ())
240
+ result = can_actually_p2p (batch_src , batch_tgt )
241
+ sys .stdout .buffer .write (pickle .dumps (result ))
0 commit comments