Skip to content

Commit

Permalink
Merge branch 'master' into loadams/sd-paths
Browse files Browse the repository at this point in the history
  • Loading branch information
lekurile authored Jan 9, 2024
2 parents db1473a + 75db3d7 commit ad774d3
Show file tree
Hide file tree
Showing 9 changed files with 354 additions and 8 deletions.
287 changes: 287 additions & 0 deletions accelerator/hpu_accelerator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

import os
import pkgutil
import importlib
import torch

from .abstract_accelerator import DeepSpeedAccelerator


class HPU_Accelerator(DeepSpeedAccelerator):
    """DeepSpeed accelerator backend for Intel Gaudi (HPU) devices.

    Thin adapter that maps the abstract ``DeepSpeedAccelerator`` interface
    onto ``habana_frameworks.torch.hpu`` so the rest of DeepSpeed stays
    device-agnostic. Several memory-introspection APIs are stubbed to 0/{}
    because the corresponding counters are not exposed by the HPU bridge.
    """

    def __init__(self):
        self._name = 'hpu'
        self._communication_backend_name = 'hccl'
        try:
            import habana_frameworks.torch.hpu as hpu
            # Deterministic mode keeps runs reproducible across launches.
            hpu.setDeterministic(True)
            self.hpu = hpu
        except ImportError as e:
            # Chain the original ImportError so the root cause stays visible.
            raise ValueError(
                "HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.") from e

        # Lazily resolved by is_fp16_supported(); None means "not checked yet".
        self.fp16_supported = None

    # Device APIs
    def is_synchronized_device(self):
        """HPU executes asynchronously, like CUDA."""
        return False

    def device_name(self, device_index=None):
        """Return the torch device string, e.g. 'hpu' or 'hpu:0'."""
        if device_index is None:
            return 'hpu'
        return 'hpu:{}'.format(device_index)

    def device(self, device_index=None):
        """Return a ``torch.device`` for the given (optional) index."""
        return torch.device(self.device_name(device_index))

    def set_device(self, device_index):
        self.hpu.set_device(device_index)

    def current_device(self):
        return (self.hpu.current_device())

    def current_device_name(self):
        return 'hpu:{}'.format(self.current_device())

    def device_count(self):
        return self.hpu.device_count()

    def synchronize(self, device_index=None):
        # NOTE(review): device_index is ignored; the HPU bridge synchronizes
        # the current device only.
        return self.hpu.synchronize()

    # RNG APIs
    def random(self):
        return torch.random

    def set_rng_state(self, new_state, device_index=None):
        self.hpu.random.set_rng_state(new_state)

    def get_rng_state(self, device_index=None):
        return self.hpu.random.get_rng_state()

    def manual_seed(self, seed):
        self.hpu.random.manual_seed(seed)

    def manual_seed_all(self, seed):
        self.hpu.random.manual_seed_all(seed)

    def initial_seed(self, seed):
        self.hpu.random.initial_seed(seed)

    def default_generator(self, device_index):
        return self.hpu.random.default_generators[device_index]

    # Streams/Events
    @property
    def Stream(self):
        return self.hpu.Stream

    def stream(self, stream):
        return self.hpu.stream(stream)

    def current_stream(self, device_index=None):
        return self.hpu.current_stream()

    def default_stream(self, device_index=None):
        return self.hpu.default_stream()

    @property
    def Event(self):
        # Event lives under htcore rather than the hpu module.
        import habana_frameworks.torch.core as htcore
        return htcore.hpu.Event

    # Memory management
    def empty_cache(self):
        # No-op: HPU does not expose a caching-allocator flush.
        return

    def memory_allocated(self, device_index=None):
        return self.hpu.memory_allocated()

    def max_memory_allocated(self, device_index=None):
        return self.hpu.max_memory_allocated()

    def reset_max_memory_allocated(self, device_index=None):
        return self.hpu.reset_max_memory_allocated()

    def memory_cached(self, device_index=None):
        # Cache statistics are not tracked on HPU.
        return 0

    def max_memory_cached(self, device_index=None):
        return 0

    def reset_max_memory_cached(self, device_index=None):
        return 0

    def memory_stats(self, device_index=None):
        return {}

    def reset_peak_memory_stats(self, device_index=None):
        self.hpu.reset_peak_memory_stats()

    def memory_reserved(self, device_index=None):
        return 0

    def max_memory_reserved(self, device_index=None):
        return 0

    def total_memory(self, device_index=None):
        # NOTE(review): returns 0, which makes available_memory() go
        # negative once anything is allocated — confirm intended upstream.
        return 0

    def available_memory(self, device_index=None):
        return self.total_memory(device_index) - self.memory_allocated(device_index)

    # Data types
    def is_bf16_supported(self):
        """bf16 is natively supported on all Gaudi generations."""
        return True

    def is_fp16_supported(self):
        """Query (once) and cache whether this Gaudi device supports fp16."""
        if self.fp16_supported is None:
            import habana_frameworks.torch.utils.experimental as htexp
            self.fp16_supported = htexp._is_fp16_supported()
        return self.fp16_supported

    def supported_dtypes(self):
        """Return the list of dtypes usable on this device.

        fp32 and bf16 are always available; fp16 (``torch.half``) is added
        only when the hardware reports support for it.
        """
        supported_dtypes = [torch.float, torch.bfloat16]
        if self.is_fp16_supported():
            # Bug fix: previously appended torch.bfloat16 a second time,
            # so fp16 was never reported even when supported.
            supported_dtypes.append(torch.half)
        return supported_dtypes

    # Misc
    def amp(self):
        # No torch.amp-equivalent module is exposed for HPU.
        return None

    def is_available(self):
        return self.hpu.is_available()

    def range_push(self, msg):
        # Profiler ranges are not implemented for HPU.
        return

    def range_pop(self):
        return

    def lazy_call(self, callback):
        # HPU needs no deferred-initialization hook; run immediately.
        callback()

    def communication_backend_name(self):
        return self._communication_backend_name

    def is_triton_supported(self):
        return False

    # Graph operations
    def create_graph(self):
        # Device graph capture is not supported; callers get a None graph.
        return None

    def capture_to_graph(self, graph, pool=None, stream=None):
        from deepspeed.runtime.utils import noop_context
        return noop_context()

    def replay_graph(self, graph):
        return

    # Tensor operations
    @property
    def BFloat16Tensor(self):
        return torch.hpu.BFloat16Tensor

    @property
    def ByteTensor(self):
        return torch.hpu.ByteTensor

    @property
    def DoubleTensor(self):
        return torch.hpu.DoubleTensor

    @property
    def FloatTensor(self):
        return torch.hpu.FloatTensor

    @property
    def HalfTensor(self):
        return torch.hpu.HalfTensor

    @property
    def IntTensor(self):
        return torch.hpu.IntTensor

    @property
    def LongTensor(self):
        return torch.hpu.LongTensor

    def pin_memory(self, tensor, align_bytes=1):
        # align_bytes is accepted for interface compatibility but unused.
        return tensor.pin_memory(self.device())

    def is_pinned(self, tensor):
        return tensor.is_pinned()

    def on_accelerator(self, tensor):
        """Return True when the tensor lives on an HPU device."""
        device_str = str(tensor.device)
        if device_str.startswith('hpu:'):
            return True
        else:
            return False

    def op_builder_dir(self):
        try:
            # is op_builder from deepspeed or a 3p version? this should only succeed if it's deepspeed
            # if successful this also means we're doing a local install and not JIT compile path
            from op_builder import __deepspeed__  # noqa: F401 # type: ignore
            return "op_builder.hpu"
        except ImportError:
            return "deepspeed.ops.op_builder.hpu"

    # dict that holds class name <--> class type mapping i.e.
    # 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
    # this dict will be filled at init stage
    class_dict = None

    def _lazy_init_class_dict(self):
        """Populate ``class_dict`` on first use by scanning the op_builder package."""
        if self.class_dict is not None:
            return
        else:
            self.class_dict = {}
            # begin initialize for create_op_builder()
            # put all valid class name <--> class type mapping into class_dict
            op_builder_dir = self.op_builder_dir()
            op_builder_module = importlib.import_module(op_builder_dir)
            op_builder_absolute_path = os.path.dirname(op_builder_module.__file__)
            for _, module_name, _ in pkgutil.iter_modules([op_builder_absolute_path]):
                # avoid self references,
                # skip sub_directories which contains ops for other backend(cpu, npu, etc.).
                if module_name != 'all_ops' and module_name != 'builder' and not os.path.isdir(
                        os.path.join(op_builder_absolute_path, module_name)):
                    module = importlib.import_module("{}.{}".format(op_builder_dir, module_name))
                    for member_name in module.__dir__():
                        if member_name.endswith(
                                'Builder'
                        ) and member_name != "OpBuilder" and member_name != "CPUOpBuilder" and member_name != "TorchCPUOpBuilder":  # avoid abstract classes
                            # idiomatic `not in` (was `not member_name in ...`)
                            if member_name not in self.class_dict:
                                self.class_dict[member_name] = getattr(module, member_name)
            # end initialize for create_op_builder()

    # create an instance of op builder and return, name specified by class_name
    def create_op_builder(self, class_name):
        """Instantiate and return the op builder named ``class_name``, or None."""
        self._lazy_init_class_dict()
        if class_name in self.class_dict:
            return self.class_dict[class_name]()
        else:
            return None

    # return an op builder class, name specified by class_name
    def get_op_builder(self, class_name):
        """Return the op builder class named ``class_name``.

        Falls back to ``NotImplementedBuilder`` (if registered) so callers
        get a uniform "unsupported op" error instead of None.
        """
        self._lazy_init_class_dict()
        if class_name in self.class_dict:
            return self.class_dict[class_name]
        else:
            return self.class_dict['NotImplementedBuilder'] if 'NotImplementedBuilder' in self.class_dict else None

    def build_extension(self):
        from torch.utils.cpp_extension import BuildExtension
        return BuildExtension

    def export_envs(self):
        # No environment variables need forwarding to launched workers.
        return []
19 changes: 18 additions & 1 deletion accelerator/real_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
except ImportError as e:
dsa2 = None

SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps']
SUPPORTED_ACCELERATOR_LIST = ['cuda', 'cpu', 'xpu', 'xpu.external', 'npu', 'mps', 'hpu']

ds_accelerator = None

Expand Down Expand Up @@ -92,6 +92,12 @@ def get_accelerator():
torch.mps.current_allocated_memory()
except (RuntimeError, ImportError) as e:
raise ValueError(f"MPS_Accelerator requires torch.mps, which is not installed on this system.")
elif accelerator_name == "hpu":
try:
import habana_frameworks.torch.hpu # noqa: F401
except ImportError as e:
raise ValueError(
f"HPU_Accelerator requires habana_frameworks.torch.hpu, which is not installed on this system.")
elif is_current_accelerator_supported():
raise ValueError(f'DS_ACCELERATOR must be one of {SUPPORTED_ACCELERATOR_LIST}. '
f'Value "{accelerator_name}" is not supported')
Expand Down Expand Up @@ -140,6 +146,13 @@ def get_accelerator():
accelerator_name = "mps"
except (RuntimeError, ImportError) as e:
pass
if accelerator_name is None:
try:
import habana_frameworks.torch.hpu # noqa: F401,F811

accelerator_name = "hpu"
except ImportError as e:
pass
if accelerator_name is None:
accelerator_name = "cuda"

Expand Down Expand Up @@ -169,6 +182,10 @@ def get_accelerator():
from .mps_accelerator import MPS_Accelerator

ds_accelerator = MPS_Accelerator()
elif accelerator_name == 'hpu':
from .hpu_accelerator import HPU_Accelerator

ds_accelerator = HPU_Accelerator()
_validate_accelerator(ds_accelerator)
if accel_logger is not None:
accel_logger.info(f"Setting ds_accelerator to {ds_accelerator._name} ({ds_set_method})")
Expand Down
6 changes: 3 additions & 3 deletions deepspeed/module_inject/containers/vae.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ def __init__(self):
super().__init__()
try:
import diffusers
if hasattr(diffusers.models.vae, "AutoencoderKL"):
self._orig_layer_class = diffusers.models.vae.AutoencoderKL
if hasattr(diffusers.models.autoencoders.vae, "AutoencoderKL"):
self._orig_layer_class = diffusers.models.autoencoders.vae.AutoencoderKL
else:
# Diffusers >= 0.12.0 changes location of AutoencoderKL
self._orig_layer_class = diffusers.models.autoencoder_kl.AutoencoderKL
self._orig_layer_class = diffusers.models.autoencoders.autoencoder_kl.AutoencoderKL
except ImportError:
self._orig_layer_class = None

Expand Down
9 changes: 8 additions & 1 deletion deepspeed/runtime/zero/mics.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,14 @@ def wait(self) -> None:
"""
"""
# let the current stream to op
instrument_w_nvtx(self.allgather_handle.wait)()
try:
instrument_w_nvtx(self.allgather_handle.wait)()
except RuntimeError as e:
log_dist(
f"WARNING: Runtime Error while waiting the collective all-gather, possibly due to the _IllegalWork",
ranks=[0])
log_dist(f"Error message: {e}", ranks=[0])

if self.complete:
return

Expand Down
7 changes: 6 additions & 1 deletion docs/code-docs/source/schedulers.rst
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Learning Rate Schedulers
===================

DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR`` learning rate schedulers. When using a DeepSpeed's learning rate scheduler (specified in the `ds_config.json` file), DeepSpeed calls the `step()` method of the scheduler at every training step (when `model_engine.step()` is executed). When not using a DeepSpeed's learning rate scheduler:
DeepSpeed offers implementations of ``LRRangeTest``, ``OneCycle``, ``WarmupLR``, ``WarmupDecayLR``, ``WarmupCosineLR`` learning rate schedulers. When using a DeepSpeed's learning rate scheduler (specified in the `ds_config.json` file), DeepSpeed calls the `step()` method of the scheduler at every training step (when `model_engine.step()` is executed). When not using a DeepSpeed's learning rate scheduler:
* if the schedule is supposed to execute at every training step, then the user can pass the scheduler to `deepspeed.initialize` when initializing the DeepSpeed engine and let DeepSpeed manage it for update or save/restore.
* if the schedule is supposed to execute at any other interval (e.g., training epochs), then the user should NOT pass the scheduler to DeepSpeed during initialization and must manage it explicitly.

Expand All @@ -23,3 +23,8 @@ WarmupLR
WarmupDecayLR
---------------------------
.. autoclass:: deepspeed.runtime.lr_schedules.WarmupDecayLR


WarmupCosineLR
---------------------------
.. autoclass:: deepspeed.runtime.lr_schedules.WarmupCosineLR
2 changes: 1 addition & 1 deletion requirements/requirements-sd.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
diffusers
diffusers>=0.25.0
triton>=2.1.0
Loading

0 comments on commit ad774d3

Please sign in to comment.