Skip to content

Commit acd24a9

Browse files
authored
[BugFix] Lazily initialize `_copy_stream` to avoid torch initializing the wrong GPU device (vllm-project#8403)
1 parent 72c1353 commit acd24a9

File tree

1 file changed

+5
-2
lines changed

1 file changed

+5
-2
lines changed

vllm/worker/multi_step_model_runner.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -230,12 +230,15 @@ def __init__(self, base_model_runner: GPUModelRunnerBase, *args, **kwargs):
230230
self._base_model_runner: GPUModelRunnerBase = base_model_runner
231231

232232
self.is_multi_step = self.scheduler_config.is_multi_step
233-
# used to copy tensors from GPU to CPU asynchronously
234-
self._copy_stream = torch.cuda.Stream()
235233
self.pinned_sampled_token_ids: Optional[torch.Tensor] = None
236234

237235
self.pythonization_cache = PythonizationCache()
238236

237+
@functools.cached_property
238+
def _copy_stream(self):
239+
# used to copy tensors from GPU to CPU asynchronously
240+
return torch.cuda.Stream()
241+
239242
def make_model_input_from_broadcasted_tensor_dict(
240243
self, tensor_dict: Dict[str, Any]) -> StatefulModelInput:
241244
model_input = (StatefulModelInput.from_broadcasted_tensor_dict(

0 commit comments

Comments (0)