Commit 176f380

j0rd1smit (with carmocca, justusschock, pre-commit-ci[bot], and Borda) authored and committed Aug 27, 2022
Disable non blocking to device with MPS (#14368)
* disable non-blocking for mps due to race condition bug
* fixed typo
* fixed: unknown mps device for non arm systems
* Removed unrobust test case
* moved _MPS_DEVICES such that we used in apply_func
* Resolve circular dependencies
* Comment rewording
* changed torchElasticEnvironment to a global import
* simplified if statement to blocking device type
* Added change to CHANGELOG
* Update src/pytorch_lightning/utilities/apply_func.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* fixed mypy not detecting casting of device
* Moved check into if statement to maintain original behavior

Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
1 parent 30d51e7 commit 176f380

File tree

7 files changed: +16 -9 lines changed
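
As the commit message notes, the core change makes transfers to "cpu" and "mps" devices blocking, while other accelerators (e.g. CUDA) keep `non_blocking=True`. A minimal sketch of that decision logic, assuming only `torch` is installed (the helper name `_transfer` is illustrative, not part of the Lightning API):

```python
import torch

_BLOCKING_DEVICE_TYPES = ("cpu", "mps")


def _transfer(tensor: torch.Tensor, device) -> torch.Tensor:
    # Normalize strings such as "mps" or "cuda:0" to torch.device,
    # mirroring the new `isinstance(device, str)` branch in move_data_to_device.
    if isinstance(device, str):
        device = torch.device(device)
    kwargs = {}
    # Skip non-blocking copies for CPU, and for MPS because of the race
    # condition tracked in https://github.com/pytorch/pytorch/issues/83015.
    if device.type not in _BLOCKING_DEVICE_TYPES:
        kwargs["non_blocking"] = True
    return tensor.to(device, **kwargs)
```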
 

Diff for: src/pytorch_lightning/CHANGELOG.md (+1)

@@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed `LightningDataModule` hparams parsing ([#12806](https://github.com/PyTorchLightning/pytorch-lightning/pull/12806))
 - Reset epoch progress with batch size scaler ([#13846](https://github.com/Lightning-AI/lightning/pull/13846)
 - Fixed restoring the trainer after using `lr_find()` so that the correct LR schedule is used for the actual training ([#14113](https://github.com/Lightning-AI/lightning/pull/14113))
+- Fixed incorrect values after transferring data to a MPS device ([#13285](https://github.com/Lightning-AI/lightning/issues/13285))
 
 
 ## [1.7.3] - 2022-08-25

Diff for: src/pytorch_lightning/accelerators/cpu.py (+3 -3)

@@ -16,7 +16,7 @@
 import torch
 
 from pytorch_lightning.accelerators.accelerator import Accelerator
-from pytorch_lightning.utilities import device_parser
+from pytorch_lightning.utilities.device_parser import parse_cpu_cores
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE
 from pytorch_lightning.utilities.types import _DEVICE
@@ -42,13 +42,13 @@ def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]:
     @staticmethod
     def parse_devices(devices: Union[int, str, List[int]]) -> int:
         """Accelerator device parsing logic."""
-        devices = device_parser.parse_cpu_cores(devices)
+        devices = parse_cpu_cores(devices)
         return devices
 
     @staticmethod
     def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]:
         """Gets parallel devices for the Accelerator."""
-        devices = device_parser.parse_cpu_cores(devices)
+        devices = parse_cpu_cores(devices)
         return [torch.device("cpu")] * devices
 
     @staticmethod
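
The commit message attributes the narrower import (pulling `parse_cpu_cores` straight from `pytorch_lightning.utilities.device_parser` rather than the `pytorch_lightning.utilities` package) to resolving circular dependencies. A short usage sketch of the helper as the accelerator now calls it; the example values are illustrative:

```python
import torch

from pytorch_lightning.utilities.device_parser import parse_cpu_cores

# parse_cpu_cores normalizes the Trainer `devices` argument for the CPU
# accelerator: an integer core count is validated and returned as-is.
num_cores = parse_cpu_cores(3)

# get_parallel_devices then expands that count into CPU device handles.
parallel_devices = [torch.device("cpu")] * num_cores
```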

Diff for: src/pytorch_lightning/accelerators/hpu.py (+3 -2)

@@ -17,8 +17,9 @@
 import torch
 
 from pytorch_lightning.accelerators.accelerator import Accelerator
-from pytorch_lightning.utilities import _HPU_AVAILABLE, device_parser
+from pytorch_lightning.utilities.device_parser import parse_hpus
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
+from pytorch_lightning.utilities.imports import _HPU_AVAILABLE
 from pytorch_lightning.utilities.rank_zero import rank_zero_debug
 
 if _HPU_AVAILABLE:
@@ -61,7 +62,7 @@ def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]:
     @staticmethod
     def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]:
         """Accelerator device parsing logic."""
-        return device_parser.parse_hpus(devices)
+        return parse_hpus(devices)
 
     @staticmethod
     def get_parallel_devices(devices: int) -> List[torch.device]:

Diff for: src/pytorch_lightning/accelerators/ipu.py (+1 -1)

@@ -16,7 +16,7 @@
 import torch
 
 from pytorch_lightning.accelerators.accelerator import Accelerator
-from pytorch_lightning.utilities import _IPU_AVAILABLE
+from pytorch_lightning.utilities.imports import _IPU_AVAILABLE
 
 
 class IPUAccelerator(Accelerator):

Diff for: src/pytorch_lightning/plugins/environments/xla_environment.py (+1 -1)

@@ -15,7 +15,7 @@
 import os
 
 from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
-from pytorch_lightning.utilities import _TPU_AVAILABLE
+from pytorch_lightning.utilities.imports import _TPU_AVAILABLE
 
 if _TPU_AVAILABLE:
     import torch_xla.core.xla_env_vars as xenv

Diff for: src/pytorch_lightning/utilities/apply_func.py (+6 -2)

@@ -38,7 +38,7 @@
 Batch = type(None)
 
 
-_CPU_DEVICES = ("cpu", torch.device("cpu"))
+_BLOCKING_DEVICE_TYPES = ("cpu", "mps")
 
 
 def to_dtype_tensor(
@@ -322,6 +322,9 @@ def move_data_to_device(batch: Any, device: Union[str, torch.device]) -> Any:
        - :class:`torch.device`
     """
 
+    if isinstance(device, str):
+        device = torch.device(device)
+
     def batch_to(data: Any) -> Any:
         # try to move torchtext data first
         if _TORCHTEXT_LEGACY and isinstance(data, Batch):
@@ -342,7 +345,8 @@ def batch_to(data: Any) -> Any:
 
         kwargs = {}
         # Don't issue non-blocking transfers to CPU
-        if isinstance(data, Tensor) and device not in _CPU_DEVICES:
+        # Same with MPS due to a race condition bug: https://github.com/pytorch/pytorch/issues/83015
+        if isinstance(data, Tensor) and isinstance(device, torch.device) and device.type not in _BLOCKING_DEVICE_TYPES:
             kwargs["non_blocking"] = True
         data_output = data.to(device, **kwargs)
         if data_output is not None:
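
With the string-to-`torch.device` normalization added above, callers can keep passing plain device strings to `move_data_to_device`; a short usage sketch, assuming a PyTorch build with MPS support (the batch contents are made up for illustration):

```python
import torch

from pytorch_lightning.utilities.apply_func import move_data_to_device

batch = {"x": torch.rand(2, 3), "y": torch.tensor([0, 1])}

if torch.backends.mps.is_available():
    # "mps" is normalized to torch.device("mps") and the copy is issued as a
    # blocking transfer, sidestepping pytorch/pytorch#83015.
    batch_on_mps = move_data_to_device(batch, "mps")

if torch.cuda.is_available():
    # CUDA transfers still request non_blocking=True.
    batch_on_cuda = move_data_to_device(batch, "cuda")
```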

Diff for: src/pytorch_lightning/utilities/device_parser.py (+1)

@@ -110,6 +110,7 @@ def parse_gpu_ids(
     gpus = _normalize_parse_gpu_input_to_list(gpus, include_cuda=include_cuda, include_mps=include_mps)
     if not gpus:
         raise MisconfigurationException("GPUs requested but none are available.")
+
     if (
         TorchElasticEnvironment.detect()
         and len(gpus) != 1
