diff --git a/docker/ubuntu/base/Dockerfile b/docker/ubuntu/base/Dockerfile
index a5b5dd8..31612ce 100644
--- a/docker/ubuntu/base/Dockerfile
+++ b/docker/ubuntu/base/Dockerfile
@@ -10,7 +10,7 @@ ENV BOOST_VERSION 1.78.0
 ENV OPENUCX_VERSION 1.9.0
 ENV OPENMPI_VERSION 4.0.7
 ENV CONDA_VERSION 4.9.2
-ENV PYTORCH_VERSION 1.12.1
+ENV PYTORCH_VERSION 1.11.0
 ENV NCCL_VERSION 2.14.3-1
 
 SHELL ["/bin/bash", "-c"]
diff --git a/pyrannc/__init__.py b/pyrannc/__init__.py
index 1d8ac51..c79f513 100644
--- a/pyrannc/__init__.py
+++ b/pyrannc/__init__.py
@@ -1,6 +1,7 @@
 import copy
 import inspect
 import logging
+import sys
 from collections import OrderedDict
 
 import torch
@@ -8,7 +9,21 @@
 import torch.onnx.utils
 import torch.random
 
-from . import _pyrannc, utils
+from .torch_version import BUILD_TORCH_VER
+
+try:
+    from . import _pyrannc
+except ImportError as e:
+    import re
+
+    torch_ver = re.sub(r"\+.*", "", torch.__version__)
+    build_torch_ver = re.sub(r"\+.*", "", BUILD_TORCH_VER)
+    if torch_ver != build_torch_ver:
+        print("RaNNC was compiled with PyTorch {}, but the current PyTorch version is {}.".format(
+            BUILD_TORCH_VER, torch.__version__), file=sys.stderr)
+    raise e
+
+from . import utils
 from .dist_param import store_dist_param, load_dist_param, set_dist_param, get_dist_param_range, set_dist_param_dtype, \
     DistributeModelParams
 from .opt import patch_optimizer
@@ -442,6 +457,12 @@ def zero_grad(self):
         super().zero_grad()
 
     def get_param(self, name, amp_master_param=False):
+        r"""
+        Gets a parameter tensor specified by ``name``.
+
+        :param name: Name of a parameter.
+        :param amp_master_param: Gets the Apex amp master parameter if ``True``.
+        """
         if name not in self.name_to_pid or name not in self.name_to_param:
             raise RuntimeError("No parameter found: {}".format(name))
 
@@ -450,6 +471,12 @@ def get_param(self, name, amp_master_param=False):
         return self.name_to_param[name]
 
     def get_param_grad(self, name, amp_master_param=False):
+        r"""
+        Gets the gradient of a parameter tensor specified by ``name``.
+
+        :param name: Name of a parameter.
+        :param amp_master_param: Gets the Apex amp master gradient if ``True``.
+        """
         if name not in self.name_to_pid or name not in self.name_to_param:
             raise RuntimeError("No parameter found: {}".format(name))
 
@@ -482,6 +509,12 @@ def undeploy(self):
         super().undeploy()
 
     def enable_dropout(self, enable):
+        r"""
+        Enables/disables dropout layers.
+        This method is useful for evaluation because ``model.eval()`` does not work for a RaNNCModule.
+
+        :param enable: Set ``True`` to enable and ``False`` to disable dropout layers.
+        """
         if self.ready:
             super().enable_dropout(enable)
         else:
@@ -565,11 +598,14 @@ def _run_dp_dry(path):
 
 
 def recreate_all_communicators():
+    r"""
+    Destroys and recreates all communicators.
+    """
     _pyrannc.recreate_all_communicators()
 
 
 def show_deployment(path, batch_size):
-    """
+    r"""
     Show a deployment (Subgraphs and micro-batch sizes in pipeline parallelism) saved in a file.
     This is used for debugging.
diff --git a/pyrannc/torch_version.py b/pyrannc/torch_version.py
new file mode 100644
index 0000000..62212ff
--- /dev/null
+++ b/pyrannc/torch_version.py
@@ -0,0 +1,2 @@
+BUILD_TORCH_VER = "1.11.0"
+BUILD_TORCH_CUDA_VER = "11.3"
diff --git a/setup.py b/setup.py
index b3edca0..1ad56fa 100644
--- a/setup.py
+++ b/setup.py
@@ -6,6 +6,7 @@
 import sys
 from distutils.version import LooseVersion
 
+import torch
 from setuptools import setup, Extension, find_packages
 from setuptools.command.build_ext import build_ext
 
@@ -70,6 +71,10 @@ def build_extension(self, ext):
     version_nums = os.environ["CUDA_VERSION"].split(".")
     VERSION += "+cu{}{}".format(version_nums[0], version_nums[1])
 
+with open('pyrannc/torch_version.py', mode='w') as f:
+    f.write('BUILD_TORCH_VER="{}"\n'.format(torch.__version__))
+    f.write('BUILD_TORCH_CUDA_VER="{}"\n'.format(torch.version.cuda))
+
 setup(
     name='pyrannc',
     packages=find_packages(),
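Note (illustration, not part of the patch): setup.py now records the build-time PyTorch and CUDA versions in pyrannc/torch_version.py, and pyrannc/__init__.py compares the installed torch version against that record when importing the native _pyrannc extension fails, stripping the local suffix (e.g. "+cu113") before comparing. A minimal standalone sketch of that comparison, using a hypothetical helper versions_match:

import re

def versions_match(installed, built):
    # Strip local version suffixes such as "+cu113" before comparing,
    # mirroring the re.sub(r"\+.*", "", ...) calls added to pyrannc/__init__.py.
    strip = lambda v: re.sub(r"\+.*", "", v)
    return strip(installed) == strip(built)

assert versions_match("1.11.0+cu113", "1.11.0")        # same release, different CUDA build
assert not versions_match("1.12.1+cu113", "1.11.0")    # mismatch: the warning is printed before re-raising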