From 4c9643c1fe4c45232a5464f8ab4cfb5eef4bc6ee Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Wed, 17 Apr 2024 12:12:06 -0700 Subject: [PATCH 01/13] Drop python 3.8 (#544) Python 3.8 is nearing its end of life so we're no longer supporting it. [ committed by @AlyssaCote ] [ reviewed by @MattToast @mellis13 ] --- .github/workflows/release.yml | 2 +- .github/workflows/run_tests.yml | 4 +--- doc/changelog.rst | 4 ++++ doc/installation_instructions/basic.rst | 10 +++++----- pyproject.toml | 2 +- setup.cfg | 3 +-- smartsim/__init__.py | 4 ++-- smartsim/_core/_install/buildenv.py | 2 +- smartsim/_core/utils/helpers.py | 6 ++---- 9 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ad9a55e03..ad711675a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -98,7 +98,7 @@ jobs: - uses: actions/setup-python@v5 name: Install Python with: - python-version: '3.8' + python-version: '3.9' - name: Build sdist run: | diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index b1f007319..a635537d4 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -57,12 +57,10 @@ jobs: os: [macos-12, macos-14, ubuntu-20.04] # Operating systems compiler: [8] # GNU compiler version rai: [1.2.7] # Redis AI versions - py_v: ["3.8", "3.9", "3.10", "3.11"] # Python versions + py_v: ["3.9", "3.10", "3.11"] # Python versions exclude: - os: macos-14 py_v: "3.9" - - os: macos-14 - py_v: "3.8" env: SMARTSIM_REDISAI: ${{ matrix.rai }} diff --git a/doc/changelog.rst b/doc/changelog.rst index 024f4ab6a..4bf19e7ab 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Drop Python 3.8 support - Update watchdog dependency - Historical output files stored under .smartsim directory - Add option to build Torch backend without the 
Intel Math Kernel Library @@ -39,6 +40,8 @@ Description Detailed Notes +- Python 3.8 is reaching its end-of-life in October, 2024, so it will + no longer continue to be supported. (SmartSim-PR544_) - Update watchdog dependency from 3.x to 4.x, fix new type issues (SmartSim-PR540_) - The dashboard needs to display historical logs, so log files are written out under the .smartsim directory and files under the experiment @@ -95,6 +98,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR544: https://github.com/CrayLabs/SmartSim/pull/544 .. _SmartSim-PR540: https://github.com/CrayLabs/SmartSim/pull/540 .. _SmartSim-PR532: https://github.com/CrayLabs/SmartSim/pull/532 .. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538 diff --git a/doc/installation_instructions/basic.rst b/doc/installation_instructions/basic.rst index 75b099ad5..905519f6f 100644 --- a/doc/installation_instructions/basic.rst +++ b/doc/installation_instructions/basic.rst @@ -20,7 +20,7 @@ Basic The base prerequisites to install SmartSim and SmartRedis are: - - Python 3.8-3.11 + - Python 3.9-3.11 - Pip - Cmake 3.13.x (or later) - C compiler @@ -74,11 +74,11 @@ Supported Versions * - MacOS - x86_64, aarch64 - Not supported - - 3.8 - 3.11 + - 3.9 - 3.11 * - Linux - x86_64 - Nvidia - - 3.8 - 3.11 + - 3.9 - 3.11 .. note:: @@ -256,9 +256,9 @@ SmartSim does. 
* - Platform - Python Versions * - MacOS - - 3.8 - 3.11 + - 3.9 - 3.11 * - Linux - - 3.8 - 3.11 + - 3.9 - 3.11 The Python client for SmartRedis is installed through ``pip`` as follows: diff --git a/pyproject.toml b/pyproject.toml index 72cc378d4..fe87141de 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,7 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 88 -target-version = ['py38', 'py39', 'py310'] +target-version = ['py39', 'py310', 'py311'] exclude = ''' ( | \.egg diff --git a/setup.cfg b/setup.cfg index 5fdfa82ae..ba6606f7f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,6 @@ contact_email = craylabs@hpe.com license = BSD 2-Clause License keywords = scientific, ai, workflow, hpc, analysis classifiers = - Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 @@ -56,7 +55,7 @@ setup_requires = setuptools>=39.2 cmake>=3.13 include_package_data = True -python_requires = >=3.8,<3.12 +python_requires = >=3.9,<3.12 [options.packages.find] include = diff --git a/smartsim/__init__.py b/smartsim/__init__.py index 7c1fa2fe0..5e24097a5 100644 --- a/smartsim/__init__.py +++ b/smartsim/__init__.py @@ -30,8 +30,8 @@ # pylint: disable-next=useless-import-alias from .version import __version__ as __version__ -if sys.version_info < (3, 8): # pragma: no cover - sys.exit("Python 3.8 or greater must be used with SmartSim.") +if sys.version_info < (3, 9): # pragma: no cover + sys.exit("Python 3.9 or greater must be used with SmartSim.") # Main API module # pylint: disable=wrong-import-position diff --git a/smartsim/_core/_install/buildenv.py b/smartsim/_core/_install/buildenv.py index cbf29c4b5..476d0374c 100644 --- a/smartsim/_core/_install/buildenv.py +++ b/smartsim/_core/_install/buildenv.py @@ -267,7 +267,7 @@ class Versioner: """ # compatible Python version - PYTHON_MIN = Version_("3.8.0") + PYTHON_MIN = Version_("3.9.0") # Versions SMARTSIM 
= Version_(get_env("SMARTSIM_VERSION", "0.6.2")) diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index b9e79e250..d9e7c513f 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -312,11 +312,9 @@ def decode_cmd(encoded_cmd: str) -> t.List[str]: return cleaned_cmd -# TODO: Remove the ``type: ignore`` comment here when Python 3.8 support is dropped -# ``collections.abc.Collection`` is not subscriptable until Python 3.9 @t.final -class SignalInterceptionStack(collections.abc.Collection): # type: ignore[type-arg] - """Registers a stack of unique callables to be called when a signal is +class SignalInterceptionStack(collections.abc.Collection[_TSignalHandlerFn]): + """Registers a stack of callables to be called when a signal is received before calling the original signal handler. """ From f5beb410ee756c1a201b650ee7bdf4dccf3acb0b Mon Sep 17 00:00:00 2001 From: amandarichardsonn <30413257+amandarichardsonn@users.noreply.github.com> Date: Wed, 17 Apr 2024 14:27:46 -0700 Subject: [PATCH 02/13] Change default path for entities (#533) This PR makes changes to the default path for SS entities. New default path is `exp_path/entity_name/`. A path argument has also been added to create_ensemble and create_model. 
[ reviewed by @ashao @mellis13 ] [ committed by @amandarichardsonn ] --- .pylintrc | 2 +- doc/changelog.rst | 7 + doc/experiment.rst | 101 +++++++++++++ smartsim/database/orchestrator.py | 4 +- smartsim/entity/ensemble.py | 16 +- smartsim/entity/model.py | 5 +- smartsim/experiment.py | 49 +++++-- tests/backends/test_dbmodel.py | 3 - tests/full_wlm/test_generic_batch_launch.py | 3 - .../full_wlm/test_generic_orc_launch_batch.py | 4 - tests/on_wlm/test_generic_orc_launch.py | 3 - tests/on_wlm/test_simple_entity_launch.py | 1 - tests/on_wlm/test_stop.py | 1 - tests/test_experiment.py | 137 ++++++++++++++++++ tests/test_interrupt.py | 2 - tests/test_local_launch.py | 1 - tests/test_local_restart.py | 1 - tests/test_manifest.py | 6 - tests/test_multidb.py | 2 - tests/test_reconnect_orchestrator.py | 4 +- tests/test_telemetry_monitor.py | 1 - 21 files changed, 302 insertions(+), 51 deletions(-) diff --git a/.pylintrc b/.pylintrc index f2fa17bab..aa378d039 100644 --- a/.pylintrc +++ b/.pylintrc @@ -325,7 +325,7 @@ valid-metaclass-classmethod-first-arg=mcs max-args=9 # Maximum number of locals for function / method body -max-locals=20 +max-locals=25 # Maximum number of return / yield for function / method body max-returns=11 diff --git a/doc/changelog.rst b/doc/changelog.rst index 4bf19e7ab..b702d1a95 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Change default path for entities - Drop Python 3.8 support - Update watchdog dependency - Historical output files stored under .smartsim directory @@ -40,6 +41,10 @@ Description Detailed Notes +- The default path for an entity is now the path to the experiment / the + entity name. create_database and create_ensemble now have path arguments. + All path arguments are compatible with relative paths. Relative paths are + relative to the CWD. 
(SmartSim-PR533_) - Python 3.8 is reaching its end-of-life in October, 2024, so it will no longer continue to be supported. (SmartSim-PR544_) - Update watchdog dependency from 3.x to 4.x, fix new type issues (SmartSim-PR540_) @@ -54,6 +59,7 @@ Detailed Notes - Change type_extension and pydantic versions in readthedocs environment to enable docs build. (SmartSim-PR537_) - Promote devices to a dedicated Enum type throughout the SmartSim code base. + (SmartSim-PR498_) - Update the telemetry monitor to enable retrieval of metrics on a scheduled interval. Switch basic experiment tracking telemetry to default to on. Add database metric collectors. Improve telemetry monitor logging. Create @@ -98,6 +104,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR533: https://github.com/CrayLabs/SmartSim/pull/533 .. _SmartSim-PR544: https://github.com/CrayLabs/SmartSim/pull/544 .. _SmartSim-PR540: https://github.com/CrayLabs/SmartSim/pull/540 .. _SmartSim-PR532: https://github.com/CrayLabs/SmartSim/pull/532 diff --git a/doc/experiment.rst b/doc/experiment.rst index 9936f49a9..73ba08812 100644 --- a/doc/experiment.rst +++ b/doc/experiment.rst @@ -211,6 +211,107 @@ An ``Ensemble`` supports key features, including methods to: Visit the respective links for more information on each topic. +============== +File Structure +============== +When a user executes an ``Experiment`` script, it generates output folders in the system's directory. +By default, SmartSim creates a predefined file structure and assigns a path to each entity initialized. +However, users have the flexibility to customize this according to workflow needs. Please refer +to the respective :ref:`default<default_folder>` and :ref:`configure<config_folder>` sections below +for more details. + +..
note:: + Files added for symlinking, copying, or configuration will not be organized into the generated + directories unless ``Experiment.generate`` is invoked on the designated entity. + +.. _default_folder: + +Default +======= +By default, an ``Experiment`` folder is created in your current working directory, using the +specified `name` parameter during ``Experiment`` initialization. Each entity created by the +``Experiment`` generates an output folder under the ``Experiment`` directory, named after the +entity. These folders hold `.err` and `.out` files, containing execution-related information. + +For instance, consider the following Python script: + +.. code-block:: python + + from smartsim import Experiment + + exp = Experiment(name="experiment-example") + database = exp.create_database(port=6379, interface="ib0") + exp.start(database) + settings = exp.create_run_settings(exe="echo", exec_args="hello world") + model = exp.create_model(name="model-name", run_settings=settings) + ensemble = exp.create_ensemble(name="ensemble-name", run_settings=settings, replicas=2) + exp.start(model, ensemble) + exp.stop(database) + +When executed, this script creates the following directory structure in your +working directory: + +:: + + experiment-example + ├── orchestrator + │ ├── orchestrator_0.err + │ └── orchestrator_0.out + ├── model-name + │ ├── model-name.err + │ └── model-name.out + └── ensemble-name + ├── ensemble-name_0 + │ ├── ensemble-name_0.err + │ └── ensemble-name_0.out + ├── ensemble-name_1 + │ ├── ensemble-name_1.err + │ └── ensemble-name_1.out + +.. _config_folder: + +Configure +========= +Customizing the path of the ``Experiment`` and entity folders is possible by providing +either an absolute or relative path to the `path` argument during initialization. When +a relative path is provided, SmartSim executes the entity relative to the current working +directory. + +For instance, consider the following Python script: + +.. 
code-block:: python + + from smartsim import Experiment + + exp = Experiment(name="experiment-example", exp_path="absolute/path/to/experiment-folder") + database = exp.create_database(port=6379, interface="ib0") + exp.start(database) + settings = exp.create_run_settings(exe="echo", exec_args="hello world") + model = exp.create_model(name="model-name", run_settings=settings, path="./model-folder") + ensemble = exp.create_ensemble(name="ensemble-name", run_settings=settings, replicas=2, path="./ensemble-folder") + exp.start(model, ensemble) + exp.stop(database) + +When executed, this script creates the following directory structure in your +working directory: + +:: + + ├── experiment-folder + | ├── orchestrator + | │ ├── orchestrator_0.err + | │ └── orchestrator_0.out + ├── model-folder + │ ├── model-name.err + │ └── model-name.out + └── ensemble-folder + ├── ensemble-name_0 + │ ├── ensemble-name_0.err + │ └── ensemble-name_0.out + ├── ensemble-name_1 + │ ├── ensemble-name_1.err + │ └── ensemble-name_1.out + .. 
_exp_example: ======= diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index e4d3abe54..d3a917900 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -147,6 +147,7 @@ class Orchestrator(EntityList[DBNode]): def __init__( self, + path: t.Optional[str] = getcwd(), port: int = 6379, interface: t.Union[str, t.List[str]] = "lo", launcher: str = "local", @@ -204,10 +205,9 @@ def __init__( if self.launcher == "lsf": gpus_per_shard = int(kwargs.pop("gpus_per_shard", 0)) cpus_per_shard = int(kwargs.pop("cpus_per_shard", 4)) - super().__init__( name=db_identifier, - path=getcwd(), + path=str(path), port=port, interface=interface, db_nodes=db_nodes, diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index c04681149..fa757ae49 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os.path as osp import typing as t from copy import deepcopy from os import getcwd @@ -62,6 +63,7 @@ def __init__( self, name: str, params: t.Dict[str, t.Any], + path: t.Optional[str] = getcwd(), params_as_args: t.Optional[t.List[str]] = None, batch_settings: t.Optional[BatchSettings] = None, run_settings: t.Optional[RunSettings] = None, @@ -102,7 +104,7 @@ def __init__( self.batch_settings = init_default({}, batch_settings, BatchSettings) self.run_settings = init_default({}, run_settings, RunSettings) - super().__init__(name, getcwd(), perm_strat=perm_strat, **kwargs) + super().__init__(name, str(path), perm_strat=perm_strat, **kwargs) @property def models(self) -> t.Collection[Model]: @@ -137,9 +139,9 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: run_settings = deepcopy(self.run_settings) model_name = "_".join((self.name, str(i))) model = Model( - model_name, - param_set, - self.path, + name=model_name, + params=param_set, + path=osp.join(self.path, model_name), run_settings=run_settings, params_as_args=self.params_as_args, ) @@ -161,9 +163,9 @@ def _initialize_entities(self, **kwargs: t.Any) -> None: for i in range(replicas): model_name = "_".join((self.name, str(i))) model = Model( - model_name, - {}, - self.path, + name=model_name, + params={}, + path=osp.join(self.path, model_name), run_settings=deepcopy(self.run_settings), ) model.enable_key_prefixing() diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 4a2d9b5f5..516f0f9e3 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -31,6 +31,7 @@ import sys import typing as t import warnings +from os import getcwd from os import path as osp from .._core._install.builder import Device @@ -50,8 +51,8 @@ def __init__( self, name: str, params: t.Dict[str, str], - path: str, run_settings: RunSettings, + path: t.Optional[str] = getcwd(), params_as_args: t.Optional[t.List[str]] = None, batch_settings: t.Optional[BatchSettings] = None, ): @@ -74,7 +75,7 @@ def 
__init__( model as a batch job, defaults to None :type batch_settings: BatchSettings | None """ - super().__init__(name, path, run_settings) + super().__init__(name, str(path), run_settings) self.params = params self.params_as_args = params_as_args self.incoming_entities: t.List[SmartSimEntity] = [] diff --git a/smartsim/experiment.py b/smartsim/experiment.py index edfde10d7..6b0fbf4c8 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -24,6 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import os import os.path as osp import typing as t from os import environ, getcwd @@ -229,8 +230,8 @@ def start( :type kill_on_interrupt: bool, optional """ - start_manifest = Manifest(*args) + self._create_entity_dir(start_manifest) try: if summary: self._launch_summary(start_manifest) @@ -440,6 +441,7 @@ def create_ensemble( run_settings: t.Optional[base.RunSettings] = None, replicas: t.Optional[int] = None, perm_strategy: str = "all_perm", + path: t.Optional[str] = None, **kwargs: t.Any, ) -> Ensemble: """Create an ``Ensemble`` of ``Model`` instances @@ -491,10 +493,15 @@ def create_ensemble( :return: ``Ensemble`` instance :rtype: Ensemble """ + if name is None: + raise AttributeError("Entity has no name. Please set name attribute.") + path = path or osp.join(self.exp_path, name) + entity_path: str = osp.abspath(path) try: new_ensemble = Ensemble( - name, - params or {}, + name=name, + params=params or {}, + path=entity_path, batch_settings=batch_settings, run_settings=run_settings, perm_strat=perm_strategy, @@ -602,16 +609,20 @@ def create_model( :return: the created ``Model`` :rtype: Model """ - path = init_default(getcwd(), path, str) - - if path is None: - path = getcwd() + if name is None: + raise AttributeError("Entity has no name. 
Please set name attribute.") + path = path or osp.join(self.exp_path, name) + entity_path: str = osp.abspath(path) if params is None: params = {} try: new_model = Model( - name, params, path, run_settings, batch_settings=batch_settings + name=name, + params=params, + path=entity_path, + run_settings=run_settings, + batch_settings=batch_settings, ) if enable_key_prefixing: new_model.enable_key_prefixing() @@ -750,6 +761,7 @@ def create_batch_settings( def create_database( self, port: int = 6379, + path: t.Optional[str] = None, db_nodes: int = 1, batch: bool = False, hosts: t.Optional[t.Union[t.List[str], str]] = None, @@ -813,9 +825,11 @@ def create_database( """ self.append_to_db_identifier_list(db_identifier) - + path = path or osp.join(self.exp_path, db_identifier) + entity_path: str = osp.abspath(path) return Orchestrator( port=port, + path=entity_path, db_nodes=db_nodes, batch=batch, hosts=hosts, @@ -923,6 +937,23 @@ def _launch_summary(self, manifest: Manifest) -> None: logger.info(summary) + def _create_entity_dir(self, start_manifest: Manifest) -> None: + def create_entity_dir(entity: t.Union[Orchestrator, Model, Ensemble]) -> None: + if not os.path.isdir(entity.path): + os.makedirs(entity.path) + + for model in start_manifest.models: + create_entity_dir(model) + + for orch in start_manifest.dbs: + create_entity_dir(orch) + + for ensemble in start_manifest.ensembles: + create_entity_dir(ensemble) + + for member in ensemble.models: + create_entity_dir(member) + def __str__(self) -> str: return self.name diff --git a/tests/backends/test_dbmodel.py b/tests/backends/test_dbmodel.py index 3c02947e6..eb0198229 100644 --- a/tests/backends/test_dbmodel.py +++ b/tests/backends/test_dbmodel.py @@ -558,7 +558,6 @@ def test_colocated_db_model_ensemble(fileutils, test_dir, wlmutils, mlutils): # Create a third model with a colocated database colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( 
port=test_port, db_cpus=1, debug=True, ifname=test_interface ) @@ -757,7 +756,6 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): # Create colocated SmartSim Model colo_model = exp.create_model("colocated_model", colo_settings) - colo_model.set_path(test_dir) colo_model.colocate_db_tcp( port=test_port, db_cpus=1, debug=True, ifname=test_interface ) @@ -814,7 +812,6 @@ def test_colocated_db_model_errors(fileutils, test_dir, wlmutils, mlutils): colo_ensemble2 = exp.create_ensemble( "colocated_ens", run_settings=colo_settings2, replicas=2 ) - colo_ensemble2.set_path(test_dir) colo_ensemble2.add_ml_model( "cnn", "TF", diff --git a/tests/full_wlm/test_generic_batch_launch.py b/tests/full_wlm/test_generic_batch_launch.py index 3487ca81c..02316dfd1 100644 --- a/tests/full_wlm/test_generic_batch_launch.py +++ b/tests/full_wlm/test_generic_batch_launch.py @@ -63,7 +63,6 @@ def test_batch_model(fileutils, test_dir, wlmutils): model = exp.create_model( "model", path=test_dir, run_settings=run_settings, batch_settings=batch_settings ) - model.set_path(test_dir) exp.start(model, block=True) statuses = exp.get_status(model) @@ -89,7 +88,6 @@ def test_batch_ensemble(fileutils, test_dir, wlmutils): ensemble = exp.create_ensemble("batch-ens", batch_settings=batch) ensemble.add_model(M1) ensemble.add_model(M2) - ensemble.set_path(test_dir) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) @@ -110,7 +108,6 @@ def test_batch_ensemble_replicas(fileutils, test_dir, wlmutils): ensemble = exp.create_ensemble( "batch-ens-replicas", batch_settings=batch, run_settings=settings, replicas=2 ) - ensemble.set_path(test_dir) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index 293a2cdd2..ed28da878 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -61,7 
+61,6 @@ def test_launch_orc_auto_batch(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) @@ -97,7 +96,6 @@ def test_launch_cluster_orc_batch_single(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:02:00") - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) @@ -133,7 +131,6 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): orc.batch_settings.set_account(wlmutils.get_test_account()) orc.batch_settings.set_walltime("00:03:00") - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) @@ -159,7 +156,6 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): orc = exp.create_database( wlmutils.get_test_port(), db_nodes=3, batch=True, interface=network_interface ) - orc.set_path(test_dir) orc.batch_settings.set_account(wlmutils.get_test_account()) diff --git a/tests/on_wlm/test_generic_orc_launch.py b/tests/on_wlm/test_generic_orc_launch.py index f31c8a890..cacdd5be5 100644 --- a/tests/on_wlm/test_generic_orc_launch.py +++ b/tests/on_wlm/test_generic_orc_launch.py @@ -50,7 +50,6 @@ def test_launch_orc_auto(test_dir, wlmutils): single_cmd=False, hosts=wlmutils.get_test_hostlist(), ) - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) @@ -83,7 +82,6 @@ def test_launch_cluster_orc_single(test_dir, wlmutils): single_cmd=True, hosts=wlmutils.get_test_hostlist(), ) - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) @@ -116,7 +114,6 @@ def test_launch_cluster_orc_multi(test_dir, wlmutils): single_cmd=False, hosts=wlmutils.get_test_hostlist(), ) - orc.set_path(test_dir) exp.start(orc, block=True) statuses = exp.get_status(orc) diff --git a/tests/on_wlm/test_simple_entity_launch.py 
b/tests/on_wlm/test_simple_entity_launch.py index 5dacc13da..d16c81487 100644 --- a/tests/on_wlm/test_simple_entity_launch.py +++ b/tests/on_wlm/test_simple_entity_launch.py @@ -72,7 +72,6 @@ def test_ensemble(fileutils, test_dir, wlmutils): settings.set_tasks(1) ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) diff --git a/tests/on_wlm/test_stop.py b/tests/on_wlm/test_stop.py index 0c06375a4..abc7441bb 100644 --- a/tests/on_wlm/test_stop.py +++ b/tests/on_wlm/test_stop.py @@ -68,7 +68,6 @@ def test_stop_entity_list(fileutils, test_dir, wlmutils): settings.set_tasks(1) ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=False) time.sleep(5) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index c1c785d6e..32c642eb4 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -24,7 +24,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import os +import os.path as osp import pathlib +import shutil import typing as t import pytest @@ -33,6 +35,7 @@ from smartsim._core.config import CONFIG from smartsim._core.config.config import Config from smartsim._core.utils import serialize +from smartsim.database import Orchestrator from smartsim.entity import Model from smartsim.error import SmartSimError from smartsim.error.errors import SSUnsupportedError @@ -60,6 +63,20 @@ def test_model_prefix(test_dir: str) -> None: assert model._key_prefixing_enabled == True +def test_model_no_name(): + exp = Experiment("test_model_no_name") + with pytest.raises(AttributeError): + _ = exp.create_model(name=None, run_settings=RunSettings("python")) + + +def test_ensemble_no_name(): + exp = Experiment("test_ensemble_no_name") + with pytest.raises(AttributeError): + _ = exp.create_ensemble( + name=None, run_settings=RunSettings("python"), replicas=2 + ) + + def test_bad_exp_path() -> None: with pytest.raises(NotADirectoryError): exp = Experiment("test", "not-a-directory") @@ -229,3 +246,123 @@ def test_telemetry_default( def test_error_on_cobalt() -> None: with pytest.raises(SSUnsupportedError): exp = Experiment("cobalt_exp", launcher="cobalt") + + +def test_default_orch_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure the default file structure is created for Orchestrator""" + + exp_name = "default-orch-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) 
+ db = exp.create_database( + port=wlmutils.get_test_port(), interface=wlmutils.get_test_interface() + ) + exp.start(db) + orch_path = pathlib.Path(test_dir) / db.name + assert orch_path.exists() + assert db.path == str(orch_path) + + +def test_default_model_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure the default file structure is created for Model""" + + exp_name = "default-model-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) + settings = exp.create_run_settings(exe="echo", exe_args="hello") + model = exp.create_model(name="model_name", run_settings=settings) + exp.start(model) + model_path = pathlib.Path(test_dir) / model.name + assert model_path.exists() + assert model.path == str(model_path) + + +def test_default_ensemble_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure the default file structure is created for Ensemble""" + + exp_name = "default-ensemble-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) 
+ settings = exp.create_run_settings(exe="echo", exe_args="hello") + ensemble = exp.create_ensemble( + name="ensemble_name", run_settings=settings, replicas=2 + ) + exp.start(ensemble) + ensemble_path = pathlib.Path(test_dir) / ensemble.name + assert ensemble_path.exists() + assert ensemble.path == str(ensemble_path) + for member in ensemble.models: + member_path = ensemble_path / member.name + assert member_path.exists() + assert member.path == str(ensemble_path / member.name) + + +def test_user_orch_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure a relative path is used to created Orchestrator folder""" + + exp_name = "default-orch-path" + exp = Experiment(exp_name, launcher="local", exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) + db = exp.create_database( + port=wlmutils.get_test_port(), + interface=wlmutils.get_test_interface(), + path="./testing_folder1234", + ) + exp.start(db) + orch_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert orch_path.exists() + assert db.path == str(orch_path) + shutil.rmtree(orch_path) + + +def test_default_model_with_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure a relative path is used to created Model folder""" + + exp_name = "default-ensemble-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) 
+ settings = exp.create_run_settings(exe="echo", exe_args="hello") + model = exp.create_model( + name="model_name", run_settings=settings, path="./testing_folder1234" + ) + exp.start(model) + model_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert model_path.exists() + assert model.path == str(model_path) + shutil.rmtree(model_path) + + +def test_default_ensemble_with_path( + monkeypatch: pytest.MonkeyPatch, test_dir: str, wlmutils: "conftest.WLMUtils" +) -> None: + """Ensure a relative path is used to created Ensemble folder""" + + exp_name = "default-ensemble-path" + exp = Experiment(exp_name, launcher=wlmutils.get_test_launcher(), exp_path=test_dir) + monkeypatch.setattr(exp._control, "start", lambda *a, **kw: ...) + settings = exp.create_run_settings(exe="echo", exe_args="hello") + ensemble = exp.create_ensemble( + name="ensemble_name", + run_settings=settings, + path="./testing_folder1234", + replicas=2, + ) + exp.start(ensemble) + ensemble_path = pathlib.Path(osp.abspath("./testing_folder1234")) + assert ensemble_path.exists() + assert ensemble.path == str(ensemble_path) + for member in ensemble.models: + member_path = ensemble_path / member.name + assert member_path.exists() + assert member.path == str(member_path) + shutil.rmtree(ensemble_path) diff --git a/tests/test_interrupt.py b/tests/test_interrupt.py index 61dc5b8c0..c38ae0225 100644 --- a/tests/test_interrupt.py +++ b/tests/test_interrupt.py @@ -63,7 +63,6 @@ def test_interrupt_blocked_jobs(test_dir): replicas=2, run_settings=RunSettings("sleep", "100"), ) - ensemble.set_path(test_dir) num_jobs = 1 + len(ensemble) pid = os.getpid() keyboard_interrupt_thread = Thread( @@ -105,7 +104,6 @@ def test_interrupt_multi_experiment_unblocked_jobs(test_dir): replicas=2, run_settings=RunSettings("sleep", "100"), ) - ensemble.set_path(test_dir) jobs_per_experiment[i] = 1 + len(ensemble) pid = os.getpid() diff --git a/tests/test_local_launch.py b/tests/test_local_launch.py index 
fa09806b3..85687e014 100644 --- a/tests/test_local_launch.py +++ b/tests/test_local_launch.py @@ -61,7 +61,6 @@ def test_ensemble(fileutils, test_dir): settings = exp.create_run_settings("python", f"{script} --time=3") ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=True, summary=True) statuses = exp.get_status(ensemble) diff --git a/tests/test_local_restart.py b/tests/test_local_restart.py index 383e99900..2556c5597 100644 --- a/tests/test_local_restart.py +++ b/tests/test_local_restart.py @@ -65,7 +65,6 @@ def test_ensemble(fileutils, test_dir): settings = exp.create_run_settings("python", f"{script} --time=3") ensemble = exp.create_ensemble("e1", run_settings=settings, replicas=2) - ensemble.set_path(test_dir) exp.start(ensemble, block=True) statuses = exp.get_status(ensemble) diff --git a/tests/test_manifest.py b/tests/test_manifest.py index 1240d106d..c26868ebb 100644 --- a/tests/test_manifest.py +++ b/tests/test_manifest.py @@ -61,7 +61,6 @@ orc = Orchestrator() orc_1 = deepcopy(orc) orc_1.name = "orc2" -model_no_name = exp.create_model(name=None, run_settings=rs) db_script = DBScript("some-script", "def main():\n print('hello world')\n") db_model = DBModel("some-model", "TORCH", b"some-model-bytes") @@ -76,11 +75,6 @@ def test_separate(): assert manifest.dbs[0] == orc -def test_no_name(): - with pytest.raises(AttributeError): - _ = Manifest(model_no_name) - - def test_separate_type(): with pytest.raises(TypeError): _ = Manifest([1, 2, 3]) diff --git a/tests/test_multidb.py b/tests/test_multidb.py index 5a530dc97..13c8d86e7 100644 --- a/tests/test_multidb.py +++ b/tests/test_multidb.py @@ -153,7 +153,6 @@ def test_db_identifier_colo_then_standard( # Create the SmartSim Model smartsim_model = exp.create_model("colocated_model", colo_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port, @@ -326,7 +325,6 @@ def test_multidb_colo_once(fileutils, test_dir, 
wlmutils, coloutils, db_type): # Create the SmartSim Model smartsim_model = exp.create_model("smartsim_model", run_settings) - smartsim_model.set_path(test_dir) db_args = { "port": test_port + 1, diff --git a/tests/test_reconnect_orchestrator.py b/tests/test_reconnect_orchestrator.py index 12d9cfb95..6ce93c6f9 100644 --- a/tests/test_reconnect_orchestrator.py +++ b/tests/test_reconnect_orchestrator.py @@ -51,7 +51,7 @@ def test_local_orchestrator(test_dir, wlmutils): first_dir = test_dir orc = Orchestrator(port=wlmutils.get_test_port()) - orc.set_path(test_dir) + orc.set_path(osp.join(test_dir, "orchestrator")) exp.start(orc) statuses = exp.get_status(orc) @@ -69,7 +69,7 @@ def test_reconnect_local_orc(test_dir): exp_name = "test-orc-local-reconnect-2nd" exp_2 = Experiment(exp_name, launcher="local", exp_path=test_dir) - checkpoint = osp.join(first_dir, "smartsim_db.dat") + checkpoint = osp.join(first_dir, "orchestrator", "smartsim_db.dat") reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) # let statuses update once diff --git a/tests/test_telemetry_monitor.py b/tests/test_telemetry_monitor.py index c466a4b08..ebeeaee48 100644 --- a/tests/test_telemetry_monitor.py +++ b/tests/test_telemetry_monitor.py @@ -839,7 +839,6 @@ def test_telemetry_db_only_without_generate(test_dir, wlmutils, monkeypatch, con # create regular database orc = exp.create_database(port=test_port, interface=test_interface) - orc.set_path(test_dir) telemetry_output_path = pathlib.Path(test_dir) / config.telemetry_subdir try: From 0a2f53db1a237802a722223522dcc9f04d8f3509 Mon Sep 17 00:00:00 2001 From: Chris McBride <3595025+ankona@users.noreply.github.com> Date: Thu, 18 Apr 2024 18:38:22 -0400 Subject: [PATCH 03/13] Avoid setting `task_id` on a managed step in telemetry monitor (#557) Ensures that a managed step-mapping doesn't include a `task_id`. 
The telemetry monitor exhibits a defect where it logs errors from a task manager, even with only managed tasks being monitored for updates: ![image](https://github.com/CrayLabs/SmartSim/assets/3595025/84921e5b-144b-4fcd-8289-48d2504deaac) This fix modifies the telemetry monitor to not set a `task_id` when adding items to the `step_mapping` collection. This avoids triggering lookups for unmanaged processes. ![image](https://github.com/CrayLabs/SmartSim/assets/3595025/dfc7cfad-a875-45b3-91d2-fa19407c9d0c) [ committed by @ankona ] [ approved by @MattToast ] --- doc/changelog.rst | 4 ++++ smartsim/_core/utils/telemetry/telemetry.py | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index b702d1a95..50ec351a8 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Fix telemetry monitor logging errrors for task history - Change default path for entities - Drop Python 3.8 support - Update watchdog dependency @@ -41,6 +42,8 @@ Description Detailed Notes +- Ensure the telemetry monitor does not track a task_id + for a managed task. (SmartSim-PR557_) - The default path for an entity is now the path to the experiment / the entity name. create_database and create_ensemble now have path arguments. All path arguments are compatible with relative paths. Relative paths are @@ -104,6 +107,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR557: https://github.com/CrayLabs/SmartSim/pull/557 .. _SmartSim-PR533: https://github.com/CrayLabs/SmartSim/pull/533 .. _SmartSim-PR544: https://github.com/CrayLabs/SmartSim/pull/544 .. 
_SmartSim-PR540: https://github.com/CrayLabs/SmartSim/pull/540 diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index fa5e5e840..ddfc797f7 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -212,14 +212,14 @@ def process_manifest(self, manifest_path: str) -> None: # status updates but does not try to start a new copy self.job_manager.add_job( entity.name, - entity.task_id, + entity.step_id, entity, False, ) # Tell the launcher it's managed so it doesn't attempt # to look for a PID that may no longer exist self._launcher.step_mapping.add( - entity.name, entity.step_id, entity.task_id, True + entity.name, entity.step_id, "", True ) self._tracked_runs[run.timestamp] = run From 044c4bd36c19a105094261705da8f1889cb9c22c Mon Sep 17 00:00:00 2001 From: amandarichardsonn <30413257+amandarichardsonn@users.noreply.github.com> Date: Thu, 18 Apr 2024 16:37:19 -0700 Subject: [PATCH 04/13] Remove init_default function (#545) This PR removes the helper function `init_default` and instead implements traditional type narrowing. 
[ reviewed by @MattToast ] [ committed by @amandarichardsonn ] --- doc/changelog.rst | 4 ++++ smartsim/_core/utils/__init__.py | 2 +- smartsim/_core/utils/helpers.py | 12 ------------ smartsim/entity/dbobject.py | 4 ---- smartsim/entity/ensemble.py | 9 ++++----- smartsim/entity/model.py | 8 ++++---- smartsim/experiment.py | 19 +++++++++++-------- smartsim/wlm/slurm.py | 3 +-- 8 files changed, 25 insertions(+), 36 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 50ec351a8..cb0b33cd1 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Remove helper function ``init_default`` - Fix telemetry monitor logging errrors for task history - Change default path for entities - Drop Python 3.8 support @@ -42,6 +43,8 @@ Description Detailed Notes +- Remove helper function ``init_default`` and replace with traditional type + narrowing. (SmartSim-PR545_) - Ensure the telemetry monitor does not track a task_id for a managed task. (SmartSim-PR557_) - The default path for an entity is now the path to the experiment / the @@ -107,6 +110,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR545: https://github.com/CrayLabs/SmartSim/pull/545 .. _SmartSim-PR557: https://github.com/CrayLabs/SmartSim/pull/557 .. _SmartSim-PR533: https://github.com/CrayLabs/SmartSim/pull/533 .. _SmartSim-PR544: https://github.com/CrayLabs/SmartSim/pull/544 diff --git a/smartsim/_core/utils/__init__.py b/smartsim/_core/utils/__init__.py index cb9395881..0a109de95 100644 --- a/smartsim/_core/utils/__init__.py +++ b/smartsim/_core/utils/__init__.py @@ -24,5 +24,5 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from .helpers import colorize, delete_elements, init_default, installed_redisai_backends +from .helpers import colorize, delete_elements, installed_redisai_backends from .redis import check_cluster_status, create_cluster, db_is_active diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index d9e7c513f..9e1a8acf6 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -116,18 +116,6 @@ def get_base_36_repr(positive_int: int) -> str: return "".join(reversed(result)) -def init_default( - default: t.Any, - init_value: t.Any, - expected_type: t.Union[t.Type[t.Any], t.Tuple[t.Type[t.Any], ...], None] = None, -) -> t.Any: - if init_value is None: - return default - if expected_type is not None and not isinstance(init_value, expected_type): - raise TypeError(f"Argument was of type {type(init_value)}, not {expected_type}") - return init_value - - def expand_exe_path(exe: str) -> str: """Takes an executable and returns the full path to that executable diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index ff18da1cd..d30668c76 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -28,7 +28,6 @@ from pathlib import Path from .._core._install.builder import Device -from .._core.utils import init_default from ..error import SSUnsupportedError __all__ = ["DBObject", "DBModel", "DBScript"] @@ -76,9 +75,6 @@ def _check_tensor_args( inputs: t.Union[str, t.Optional[t.List[str]]], outputs: t.Union[str, t.Optional[t.List[str]]], ) -> t.Tuple[t.List[str], t.List[str]]: - inputs = init_default([], inputs, (list, str)) - outputs = init_default([], outputs, (list, str)) - if isinstance(inputs, str): inputs = [inputs] if isinstance(outputs, str): diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index fa757ae49..1f80fe71e 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -32,7 +32,6 @@ from tabulate import tabulate from 
.._core._install.builder import Device -from .._core.utils.helpers import init_default from ..error import ( EntityExistsError, SmartSimError, @@ -98,11 +97,11 @@ def __init__( :return: ``Ensemble`` instance :rtype: ``Ensemble`` """ - self.params = init_default({}, params, dict) - self.params_as_args = init_default({}, params_as_args, (list, str)) + self.params = params or {} + self.params_as_args = params_as_args or [] self._key_prefixing_enabled = True - self.batch_settings = init_default({}, batch_settings, BatchSettings) - self.run_settings = init_default({}, run_settings, RunSettings) + self.batch_settings = batch_settings + self.run_settings = run_settings super().__init__(name, str(path), perm_strat=perm_strat, **kwargs) diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 516f0f9e3..69e942ed2 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -35,7 +35,7 @@ from os import path as osp from .._core._install.builder import Device -from .._core.utils.helpers import cat_arg_and_value, init_default +from .._core.utils.helpers import cat_arg_and_value from ..error import EntityExistsError, SSUnsupportedError from ..log import get_logger from ..settings.base import BatchSettings, RunSettings @@ -164,9 +164,9 @@ def attach_generator_files( :param to_configure: input files with tagged parameters, defaults to [] :type to_configure: list, optional """ - to_copy = init_default([], to_copy, (list, str)) - to_symlink = init_default([], to_symlink, (list, str)) - to_configure = init_default([], to_configure, (list, str)) + to_copy = to_copy or [] + to_symlink = to_symlink or [] + to_configure = to_configure or [] # Check that no file collides with the parameter file written # by Generator. 
We check the basename, even though it is more diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 6b0fbf4c8..069a81540 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -36,7 +36,6 @@ from smartsim.status import SmartSimStatus from ._core import Controller, Generator, Manifest -from ._core.utils import init_default from .database import Orchestrator from .entity import ( Ensemble, @@ -160,7 +159,10 @@ def __init__( if not osp.isdir(osp.abspath(exp_path)): raise NotADirectoryError("Experiment path provided does not exist") exp_path = osp.abspath(exp_path) - self.exp_path: str = init_default(osp.join(getcwd(), name), exp_path, str) + else: + exp_path = osp.join(getcwd(), name) + + self.exp_path = exp_path if launcher == "auto": launcher = detect_launcher() @@ -495,8 +497,9 @@ def create_ensemble( """ if name is None: raise AttributeError("Entity has no name. Please set name attribute.") - path = path or osp.join(self.exp_path, name) - entity_path: str = osp.abspath(path) + check_path = path or osp.join(self.exp_path, name) + entity_path: str = osp.abspath(check_path) + try: new_ensemble = Ensemble( name=name, @@ -611,8 +614,8 @@ def create_model( """ if name is None: raise AttributeError("Entity has no name. 
Please set name attribute.") - path = path or osp.join(self.exp_path, name) - entity_path: str = osp.abspath(path) + check_path = path or osp.join(self.exp_path, name) + entity_path: str = osp.abspath(check_path) if params is None: params = {} @@ -825,8 +828,8 @@ def create_database( """ self.append_to_db_identifier_list(db_identifier) - path = path or osp.join(self.exp_path, db_identifier) - entity_path: str = osp.abspath(path) + check_path = path or osp.join(self.exp_path, db_identifier) + entity_path: str = osp.abspath(check_path) return Orchestrator( port=port, path=entity_path, diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index 098056c9e..d80b217ea 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -31,7 +31,6 @@ from .._core.launcher.slurm.slurmCommands import salloc, scancel, scontrol, sinfo from .._core.launcher.slurm.slurmParser import parse_salloc, parse_salloc_error from .._core.launcher.util.launcherUtil import ComputeNode, Partition -from .._core.utils.helpers import init_default from ..error import ( AllocationError, LauncherError, @@ -84,7 +83,7 @@ def get_allocation( "Attempted slurm function without access to slurm(salloc) at the call site" ) - options = init_default({}, options, dict) + options = options or {} salloc_args = _get_alloc_cmd(nodes, time, account, options=options) debug_msg = " ".join(salloc_args[1:]) From 04ea493fdb27f8d163ef2b9713d8d36ece30a5a8 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Fri, 19 Apr 2024 09:38:35 -0700 Subject: [PATCH 05/13] Upgrade ubuntu to 22.04 (#558) Bump ubuntu to version 22.04 [ committed by @AlyssaCote ] [ reviewed by @ashao ] --- .github/workflows/release.yml | 2 +- .github/workflows/run_tests.yml | 2 +- doc/changelog.rst | 4 ++++ docker/dev/Dockerfile | 2 +- docker/docs/dev/Dockerfile | 2 +- docker/prod/Dockerfile | 2 +- docker/testing/Dockerfile | 2 +- 7 files changed, 10 insertions(+), 6 deletions(-) diff --git 
a/.github/workflows/release.yml b/.github/workflows/release.yml index ad711675a..c58288ee5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -53,7 +53,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macos-12] + os: [ubuntu-22.04, macos-12] steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml index a635537d4..2e3463e5b 100644 --- a/.github/workflows/run_tests.yml +++ b/.github/workflows/run_tests.yml @@ -54,7 +54,7 @@ jobs: fail-fast: false matrix: subset: [backends, slow_tests, group_a, group_b] - os: [macos-12, macos-14, ubuntu-20.04] # Operating systems + os: [macos-12, macos-14, ubuntu-22.04] # Operating systems compiler: [8] # GNU compiler version rai: [1.2.7] # Redis AI versions py_v: ["3.9", "3.10", "3.11"] # Python versions diff --git a/doc/changelog.rst b/doc/changelog.rst index cb0b33cd1..8c9a69040 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Upgrade ubuntu to 22.04 - Remove helper function ``init_default`` - Fix telemetry monitor logging errrors for task history - Change default path for entities @@ -43,6 +44,8 @@ Description Detailed Notes +- After dropping support for Python 3.8, ubuntu needs to be upgraded. + (SmartSim-PR558_) - Remove helper function ``init_default`` and replace with traditional type narrowing. (SmartSim-PR545_) - Ensure the telemetry monitor does not track a task_id @@ -110,6 +113,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR558: https://github.com/CrayLabs/SmartSim/pull/558 .. _SmartSim-PR545: https://github.com/CrayLabs/SmartSim/pull/545 .. _SmartSim-PR557: https://github.com/CrayLabs/SmartSim/pull/557 .. 
_SmartSim-PR533: https://github.com/CrayLabs/SmartSim/pull/533 diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile index 877fcc821..3ab3a37f8 100644 --- a/docker/dev/Dockerfile +++ b/docker/dev/Dockerfile @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL maintainer="Cray Labs" diff --git a/docker/docs/dev/Dockerfile b/docker/docs/dev/Dockerfile index 48a9f4027..49bbb833c 100644 --- a/docker/docs/dev/Dockerfile +++ b/docker/docs/dev/Dockerfile @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL maintainer="Cray Labs" diff --git a/docker/prod/Dockerfile b/docker/prod/Dockerfile index 081b2aa72..c4e86d603 100644 --- a/docker/prod/Dockerfile +++ b/docker/prod/Dockerfile @@ -24,7 +24,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM ubuntu:20.04 +FROM ubuntu:22.04 LABEL maintainer="Cray Labs" LABEL org.opencontainers.image.source https://github.com/CrayLabs/SmartSim diff --git a/docker/testing/Dockerfile b/docker/testing/Dockerfile index 9c247c320..285a66023 100644 --- a/docker/testing/Dockerfile +++ b/docker/testing/Dockerfile @@ -26,7 +26,7 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-FROM ubuntu:21.10 +FROM ubuntu:22.04 ENV DEBIAN_FRONTEND noninteractive RUN apt update && apt install -y python3 python3-pip python-is-python3 cmake git RUN pip install torch==1.9.1 From 75118bacbf666193cb402b3d1cf3fe4098dab411 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Mon, 22 Apr 2024 07:56:34 -0700 Subject: [PATCH 06/13] Remove defensive regexp in .gitignore (#560) In this PR I removed the defensive regexp in `.gitignore` and added `test_dir` to the tests that were writing to the `cwd` instead of the `test_output` directory. [ committed by @AlyssaCote ] [ reviewed by @ankona ] --- .gitignore | 5 ----- doc/changelog.rst | 4 ++++ tests/on_wlm/test_het_job.py | 4 ++-- tests/test_launch_errors.py | 4 ++-- tests/test_model.py | 16 ++++++++++------ 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 7cf5fc857..24e061563 100644 --- a/.gitignore +++ b/.gitignore @@ -32,10 +32,5 @@ smartsim/_core/bin/*-cli # created upon install smartsim/_core/lib -**/manifest/ -**/*.err -**/*.out -**/.smartsim/* - # optional dev tools .pre-commit-config.yaml diff --git a/doc/changelog.rst b/doc/changelog.rst index 8c9a69040..e8bb9c6f5 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Remove defensive regexp in .gitignore - Upgrade ubuntu to 22.04 - Remove helper function ``init_default`` - Fix telemetry monitor logging errrors for task history @@ -44,6 +45,8 @@ Description Detailed Notes +- Remove defensive regexp in .gitignore and ensure tests write to test_output. + (SmartSim-PR560_) - After dropping support for Python 3.8, ubuntu needs to be upgraded. (SmartSim-PR558_) - Remove helper function ``init_default`` and replace with traditional type @@ -113,6 +116,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. 
(SmartSim-PR535_) +.. _SmartSim-PR560: https://github.com/CrayLabs/SmartSim/pull/560 .. _SmartSim-PR558: https://github.com/CrayLabs/SmartSim/pull/558 .. _SmartSim-PR545: https://github.com/CrayLabs/SmartSim/pull/545 .. _SmartSim-PR557: https://github.com/CrayLabs/SmartSim/pull/557 diff --git a/tests/on_wlm/test_het_job.py b/tests/on_wlm/test_het_job.py index 5a039a7c9..ea28c944f 100644 --- a/tests/on_wlm/test_het_job.py +++ b/tests/on_wlm/test_het_job.py @@ -63,11 +63,11 @@ def test_set_het_groups(monkeypatch): rs.set_het_group([4]) -def test_orch_single_cmd(monkeypatch, wlmutils): +def test_orch_single_cmd(monkeypatch, wlmutils, test_dir): """Test that single cmd is rejected in a heterogeneous job""" monkeypatch.setenv("SLURM_HET_SIZE", "1") exp_name = "test-orch-single-cmd" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, launcher="slurm", exp_path=test_dir) orc = exp.create_database( wlmutils.get_test_port(), db_nodes=3, diff --git a/tests/test_launch_errors.py b/tests/test_launch_errors.py index 4431cd31c..21b3184e5 100644 --- a/tests/test_launch_errors.py +++ b/tests/test_launch_errors.py @@ -37,9 +37,9 @@ pytestmark = pytest.mark.group_a -def test_unsupported_run_settings(): +def test_unsupported_run_settings(test_dir): exp_name = "test-unsupported-run-settings" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, launcher="slurm", exp_path=test_dir) bad_settings = JsrunSettings("echo", "hello") model = exp.create_model("bad_rs", bad_settings) diff --git a/tests/test_model.py b/tests/test_model.py index 5e336bc5e..64a68b299 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -116,8 +116,10 @@ def launch_step_nop(self, step, entity): return _monkeypatch_exp_controller -def test_model_with_batch_settings_makes_batch_step(monkeypatch_exp_controller): - exp = Experiment("experiment", launcher="slurm") +def test_model_with_batch_settings_makes_batch_step( + monkeypatch_exp_controller, test_dir +): + exp 
= Experiment("experiment", launcher="slurm", exp_path=test_dir) bs = SbatchSettings() rs = SrunSettings("python", exe_args="sleep.py") model = exp.create_model("test_model", run_settings=rs, batch_settings=bs) @@ -132,9 +134,9 @@ def test_model_with_batch_settings_makes_batch_step(monkeypatch_exp_controller): def test_model_without_batch_settings_makes_run_step( - monkeypatch, monkeypatch_exp_controller + monkeypatch, monkeypatch_exp_controller, test_dir ): - exp = Experiment("experiment", launcher="slurm") + exp = Experiment("experiment", launcher="slurm", exp_path=test_dir) rs = SrunSettings("python", exe_args="sleep.py") model = exp.create_model("test_model", run_settings=rs) @@ -150,8 +152,10 @@ def test_model_without_batch_settings_makes_run_step( assert isinstance(step, SrunStep) -def test_models_batch_settings_are_ignored_in_ensemble(monkeypatch_exp_controller): - exp = Experiment("experiment", launcher="slurm") +def test_models_batch_settings_are_ignored_in_ensemble( + monkeypatch_exp_controller, test_dir +): + exp = Experiment("experiment", launcher="slurm", exp_path=test_dir) bs_1 = SbatchSettings(nodes=5) rs = SrunSettings("python", exe_args="sleep.py") model = exp.create_model("test_model", run_settings=rs, batch_settings=bs_1) From d4a78b67761f29b73883c5c1c76a9a82aa6d6c42 Mon Sep 17 00:00:00 2001 From: Matt Drozt Date: Mon, 22 Apr 2024 12:32:22 -0700 Subject: [PATCH 07/13] Fix failing and noisy WLM test (#559) Fixes: - `tests/backends/test_onnx.py::test_sklearn_onnx` - Correctly set number of tasks when not using the local launcher - Makes sure the DB is not left running on test failure - `tests/full_wlm/test_generic_orc_launch_batch.py::test_launch_cluster_orc_reconnect` - Look for pickle file under the orchestrator path rather than the test dir - Makes sure that DB is cleaned up correctly on test failure Quiets: - `tests/on_wlm/test_het_job.py` - All experiments under the test module are given an explicit test path [ committed by @MattToast ] [
reviewed by @ashao ] --- doc/changelog.rst | 6 ++ tests/backends/test_onnx.py | 31 +++++---- .../full_wlm/test_generic_orc_launch_batch.py | 67 +++++++++++++------ tests/on_wlm/test_het_job.py | 8 +-- 4 files changed, 74 insertions(+), 38 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index e8bb9c6f5..8cd60055d 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -110,6 +110,11 @@ Detailed Notes undefined. (SmartSim-PR521_) - Remove previously deprecated behavior present in test suite on machines with Slurm and Open MPI. (SmartSim-PR520_) +- Experiments in the WLM tests are given explicit paths to prevent unexpected + directory creation. Ensure database are not left open on test suite failures. + Update path to pickle file in + ``tests/full_wlm/test_generic_orc_launch_batch.py::test_launch_cluster_orc_reconnect`` + to conform with changes made in SmartSim-PR533_. (SmartSim-PR559_) - When calling ``Experiment.start`` SmartSim would register a signal handler that would capture an interrupt signal (^C) to kill any jobs launched through its ``JobManager``. This would replace the default (or user defined) signal @@ -117,6 +122,7 @@ Detailed Notes the previously registered signal handler. (SmartSim-PR535_) .. _SmartSim-PR560: https://github.com/CrayLabs/SmartSim/pull/560 +.. _SmartSim-PR559: https://github.com/CrayLabs/SmartSim/pull/559 .. _SmartSim-PR558: https://github.com/CrayLabs/SmartSim/pull/558 .. _SmartSim-PR545: https://github.com/CrayLabs/SmartSim/pull/545 .. _SmartSim-PR557: https://github.com/CrayLabs/SmartSim/pull/557 diff --git a/tests/backends/test_onnx.py b/tests/backends/test_onnx.py index 7972d1746..871c3f059 100644 --- a/tests/backends/test_onnx.py +++ b/tests/backends/test_onnx.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import os +import sys from pathlib import Path import pytest @@ -81,21 +82,25 @@ def test_sklearn_onnx(test_dir, mlutils, wlmutils): db = wlmutils.get_orchestrator(nodes=1) db.set_path(test_dir) - exp.start(db) - - run_settings = exp.create_run_settings( - "python", f"run_sklearn_onnx.py --device={test_device}" - ) - model = exp.create_model("onnx_models", run_settings) - script_dir = os.path.dirname(os.path.abspath(__file__)) - script_path = Path(script_dir, "run_sklearn_onnx.py").resolve() - model.attach_generator_files(to_copy=str(script_path)) - exp.generate(model) - - exp.start(model, block=True) + exp.start(db) + try: + run_settings = exp.create_run_settings( + sys.executable, f"run_sklearn_onnx.py --device={test_device}" + ) + if wlmutils.get_test_launcher() != "local": + run_settings.set_tasks(1) + model = exp.create_model("onnx_models", run_settings) + + script_dir = os.path.dirname(os.path.abspath(__file__)) + script_path = Path(script_dir, "run_sklearn_onnx.py").resolve() + model.attach_generator_files(to_copy=str(script_path)) + exp.generate(model) + + exp.start(model, block=True) + finally: + exp.stop(db) - exp.stop(db) # if model failed, test will fail model_status = exp.get_status(model) assert model_status[0] != SmartSimStatus.STATUS_FAILED diff --git a/tests/full_wlm/test_generic_orc_launch_batch.py b/tests/full_wlm/test_generic_orc_launch_batch.py index ed28da878..b3a0ba57b 100644 --- a/tests/full_wlm/test_generic_orc_launch_batch.py +++ b/tests/full_wlm/test_generic_orc_launch_batch.py @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import os.path as osp +import pathlib import time import pytest @@ -147,9 +148,12 @@ def test_launch_cluster_orc_batch_multi(test_dir, wlmutils): def test_launch_cluster_orc_reconnect(test_dir, wlmutils): """test reconnecting to clustered 3-node orchestrator""" + p_test_dir = pathlib.Path(test_dir) launcher = wlmutils.get_test_launcher() exp_name = "test-launch-cluster-orc-batch-reconect" - exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + exp_1_dir = p_test_dir / exp_name + exp_1_dir.mkdir() + exp = Experiment(exp_name, launcher=launcher, exp_path=str(exp_1_dir)) # batch = False to launch on existing allocation network_interface = wlmutils.get_test_interface() @@ -164,26 +168,47 @@ def test_launch_cluster_orc_reconnect(test_dir, wlmutils): exp.start(orc, block=True) statuses = exp.get_status(orc) - # don't use assert so that orc we don't leave an orphan process - if SmartSimStatus.STATUS_FAILED in statuses: + try: + assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) + except Exception: exp.stop(orc) - assert False - - exp.stop(orc) + raise exp_name = "test-orc-cluster-orc-batch-reconnect-2nd" - exp_2 = Experiment(exp_name, launcher=launcher) - - checkpoint = osp.join(test_dir, "smartsim_db.dat") - reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) - - # let statuses update once - time.sleep(5) - - statuses = exp_2.get_status(reloaded_orc) - for stat in statuses: - if stat == SmartSimStatus.STATUS_FAILED: - exp_2.stop(reloaded_orc) - assert False - - exp_2.stop(reloaded_orc) + exp_2_dir = p_test_dir / exp_name + exp_2_dir.mkdir() + exp_2 = Experiment(exp_name, launcher=launcher, exp_path=str(exp_2_dir)) + + try: + checkpoint = osp.join(orc.path, "smartsim_db.dat") + reloaded_orc = exp_2.reconnect_orchestrator(checkpoint) + + # let statuses update once + time.sleep(5) + + statuses = exp_2.get_status(reloaded_orc) + assert all(stat == SmartSimStatus.STATUS_RUNNING for stat in statuses) + except Exception: + # Something went 
wrong! Let the experiment that started the DB + # clean up the DB + exp.stop(orc) + raise + + try: + # Test experiment 2 can stop the DB + exp_2.stop(reloaded_orc) + assert all( + stat == SmartSimStatus.STATUS_CANCELLED + for stat in exp_2.get_status(reloaded_orc) + ) + except Exception: + # Something went wrong! Let the experiment that started the DB + # clean up the DB + exp.stop(orc) + raise + else: + # Ensure it is the same DB that Experiment 1 was tracking + time.sleep(5) + assert not any( + stat == SmartSimStatus.STATUS_RUNNING for stat in exp.get_status(orc) + ) diff --git a/tests/on_wlm/test_het_job.py b/tests/on_wlm/test_het_job.py index ea28c944f..aeea7b474 100644 --- a/tests/on_wlm/test_het_job.py +++ b/tests/on_wlm/test_het_job.py @@ -34,10 +34,10 @@ pytestmark = pytest.mark.skip(reason="Test is only for Slurm WLM systems") -def test_mpmd_errors(monkeypatch): +def test_mpmd_errors(monkeypatch, test_dir): monkeypatch.setenv("SLURM_HET_SIZE", "1") exp_name = "test-het-job-errors" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, exp_path=test_dir, launcher="slurm") rs: SrunSettings = exp.create_run_settings("sleep", "1", run_command="srun") rs2: SrunSettings = exp.create_run_settings("sleep", "1", run_command="srun") with pytest.raises(ValueError): @@ -49,11 +49,11 @@ def test_mpmd_errors(monkeypatch): rs.set_het_group(1) -def test_set_het_groups(monkeypatch): +def test_set_het_groups(monkeypatch, test_dir): """Test ability to set one or more het groups to run setting""" monkeypatch.setenv("SLURM_HET_SIZE", "4") exp_name = "test-set-het-group" - exp = Experiment(exp_name, launcher="slurm") + exp = Experiment(exp_name, exp_path=test_dir, launcher="slurm") rs: SrunSettings = exp.create_run_settings("sleep", "1", run_command="srun") rs.set_het_group([1]) assert rs.run_args["het-group"] == "1" From 62f2e8ce7f05a341cf425e3f04c16e5f692d701d Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: 
Tue, 23 Apr 2024 08:51:43 -0700 Subject: [PATCH 08/13] Symlink batch ensembles and batch models (#547) After testing a bunch of batch ensembles and batch models, I found that I hadn't actually symlinked the substeps in the controller. This fix should properly symlink the substeps. [ committed by @AlyssaCote ] [ reviewed by @ankona ] --- doc/changelog.rst | 3 + smartsim/_core/control/controller.py | 82 +++---- smartsim/_core/control/controller_utils.py | 77 +++++++ tests/on_wlm/test_symlinking.py | 175 ++++++++++++++ tests/test_output_files.py | 76 +----- tests/test_symlinking.py | 254 +++++++++++++++++++++ 6 files changed, 551 insertions(+), 116 deletions(-) create mode 100644 smartsim/_core/control/controller_utils.py create mode 100644 tests/on_wlm/test_symlinking.py create mode 100644 tests/test_symlinking.py diff --git a/doc/changelog.rst b/doc/changelog.rst index 8cd60055d..ef6d31012 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Fix symlinking batch ensemble and model bug - Remove defensive regexp in .gitignore - Upgrade ubuntu to 22.04 - Remove helper function ``init_default`` @@ -45,6 +46,7 @@ Description Detailed Notes +- Properly symlinks batch ensembles and batch models. (SmartSim-PR547_) - Remove defensive regexp in .gitignore and ensure tests write to test_output. (SmartSim-PR560_) - After dropping support for Python 3.8, ubuntu needs to be upgraded. @@ -121,6 +123,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR547: https://github.com/CrayLabs/SmartSim/pull/547 .. _SmartSim-PR560: https://github.com/CrayLabs/SmartSim/pull/560 .. _SmartSim-PR559: https://github.com/CrayLabs/SmartSim/pull/559 .. 
_SmartSim-PR558: https://github.com/CrayLabs/SmartSim/pull/558 diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index ae01e396c..16e52a517 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -55,7 +55,7 @@ shutdown_db_node, ) from ...database import Orchestrator -from ...entity import Ensemble, EntityList, EntitySequence, Model, SmartSimEntity +from ...entity import Ensemble, EntitySequence, Model, SmartSimEntity from ...error import ( LauncherError, SmartSimError, @@ -70,6 +70,7 @@ from ..launcher import LocalLauncher, LSFLauncher, PBSLauncher, SlurmLauncher from ..launcher.launcher import Launcher from ..utils import check_cluster_status, create_cluster, serialize +from .controller_utils import _AnonymousBatchJob, _look_up_launched_data from .job import Job from .jobmanager import JobManager from .manifest import LaunchedManifest, LaunchedManifestBuilder, Manifest @@ -376,14 +377,17 @@ def symlink_output_files( entity_out.unlink() entity_err.unlink() - try: + historical_err.touch() + historical_out.touch() + + if historical_err.exists() and historical_out.exists(): entity_out.symlink_to(historical_out) entity_err.symlink_to(historical_err) - except FileNotFoundError as fnf: + else: raise FileNotFoundError( f"Output files for {entity.name} could not be found. " "Symlinking files failed." 
- ) from fnf + ) def _launch( self, exp_name: str, exp_path: str, manifest: Manifest @@ -432,6 +436,11 @@ def _launch( steps: t.List[ t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] ] = [] + + symlink_substeps: t.List[ + t.Tuple[Step, t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]]] + ] = [] + for elist in manifest.ensembles: ens_telem_dir = manifest_builder.run_telemetry_subdirectory / "ensemble" if elist.batch: @@ -439,6 +448,11 @@ def _launch( manifest_builder.add_ensemble( elist, [(batch_step.name, step) for step in substeps] ) + + # symlink substeps to maintain directory structure + for substep, substep_entity in zip(substeps, elist.models): + symlink_substeps.append((substep, substep_entity)) + steps.append((batch_step, elist)) else: # if ensemble is to be run as separate job steps, aka not in a batch @@ -456,19 +470,26 @@ def _launch( model_telem_dir = manifest_builder.run_telemetry_subdirectory / "model" if model.batch_settings: anon_entity_list = _AnonymousBatchJob(model) - batch_step, _ = self._create_batch_job_step( + batch_step, substeps = self._create_batch_job_step( anon_entity_list, model_telem_dir ) manifest_builder.add_model(model, (batch_step.name, batch_step)) + + symlink_substeps.append((substeps[0], model)) steps.append((batch_step, model)) else: job_step = self._create_job_step(model, model_telem_dir) manifest_builder.add_model(model, (job_step.name, job_step)) steps.append((job_step, model)) - # launch steps + # launch and symlink steps for step, entity in steps: self._launch_step(step, entity) + self.symlink_output_files(step, entity) + + # symlink substeps to maintain directory structure + for substep, entity in symlink_substeps: + self.symlink_output_files(substep, entity) return manifest_builder.finalize() @@ -501,12 +522,13 @@ def _launch_orchestrator( orchestrator, [(orc_batch_step.name, step) for step in substeps] ) + self._launch_step(orc_batch_step, orchestrator) + 
self.symlink_output_files(orc_batch_step, orchestrator) + # symlink substeps to maintain directory structure for substep, substep_entity in zip(substeps, orchestrator.entities): self.symlink_output_files(substep, substep_entity) - self._launch_step(orc_batch_step, orchestrator) - # if orchestrator was run on existing allocation, locally, or in allocation else: db_steps = [ @@ -518,6 +540,7 @@ def _launch_orchestrator( ) for db_step in db_steps: self._launch_step(*db_step) + self.symlink_output_files(*db_step) # wait for orchestrator to spin up self._orchestrator_launch_wait(orchestrator) @@ -572,7 +595,6 @@ def _launch_step( if completed_job is None and ( entity.name not in self._jobs.jobs and entity.name not in self._jobs.db_jobs ): - self.symlink_output_files(job_step, entity) try: job_id = self._launcher.run(job_step) except LauncherError as e: @@ -581,10 +603,10 @@ def _launch_step( msg += f"{entity}" logger.error(msg) raise SmartSimError(f"Job step {entity.name} failed to launch") from e + # if the completed job does exist and the entity passed in is the same # that has ran and completed, relaunch the entity. 
elif completed_job is not None and completed_job.entity is entity: - self.symlink_output_files(job_step, entity) try: job_id = self._launcher.run(job_step) except LauncherError as e: @@ -593,6 +615,7 @@ def _launch_step( msg += f"{entity}" logger.error(msg) raise SmartSimError(f"Job step {entity.name} failed to launch") from e + # the entity is using a duplicate name of an existing entity in # the experiment, throw an error else: @@ -938,42 +961,3 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: cwd=str(pathlib.Path(__file__).parent.parent.parent), shell=False, ) - - -class _AnonymousBatchJob(EntityList[Model]): - @staticmethod - def _validate(model: Model) -> None: - if model.batch_settings is None: - msg = "Unable to create _AnonymousBatchJob without batch_settings" - raise SmartSimError(msg) - - def __init__(self, model: Model) -> None: - self._validate(model) - super().__init__(model.name, model.path) - self.entities = [model] - self.batch_settings = model.batch_settings - - def _initialize_entities(self, **kwargs: t.Any) -> None: ... 
- - -def _look_up_launched_data( - launcher: Launcher, -) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: - def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": - # NOTE: we cannot assume that the name of the launched step - # ``launched_step_name`` is equal to the name of the step referring to - # the entity ``step.name`` as is the case when an entity list is - # launched as a batch job - launched_step_name, step = data - launched_step_map = launcher.step_mapping[launched_step_name] - out_file, err_file = step.get_output_files() - return ( - launched_step_map.step_id, - launched_step_map.task_id, - launched_step_map.managed, - out_file, - err_file, - pathlib.Path(step.meta.get("status_dir", step.cwd)), - ) - - return _unpack_launched_data diff --git a/smartsim/_core/control/controller_utils.py b/smartsim/_core/control/controller_utils.py new file mode 100644 index 000000000..37ae9aebf --- /dev/null +++ b/smartsim/_core/control/controller_utils.py @@ -0,0 +1,77 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from __future__ import annotations + +import pathlib +import typing as t + +from ..._core.launcher.step import Step +from ...entity import EntityList, Model +from ...error import SmartSimError +from ..launcher.launcher import Launcher + +if t.TYPE_CHECKING: + from ..utils.serialize import TStepLaunchMetaData + + +class _AnonymousBatchJob(EntityList[Model]): + @staticmethod + def _validate(model: Model) -> None: + if model.batch_settings is None: + msg = "Unable to create _AnonymousBatchJob without batch_settings" + raise SmartSimError(msg) + + def __init__(self, model: Model) -> None: + self._validate(model) + super().__init__(model.name, model.path) + self.entities = [model] + self.batch_settings = model.batch_settings + + def _initialize_entities(self, **kwargs: t.Any) -> None: ... 
+ + +def _look_up_launched_data( + launcher: Launcher, +) -> t.Callable[[t.Tuple[str, Step]], "TStepLaunchMetaData"]: + def _unpack_launched_data(data: t.Tuple[str, Step]) -> "TStepLaunchMetaData": + # NOTE: we cannot assume that the name of the launched step + # ``launched_step_name`` is equal to the name of the step referring to + # the entity ``step.name`` as is the case when an entity list is + # launched as a batch job + launched_step_name, step = data + launched_step_map = launcher.step_mapping[launched_step_name] + out_file, err_file = step.get_output_files() + return ( + launched_step_map.step_id, + launched_step_map.task_id, + launched_step_map.managed, + out_file, + err_file, + pathlib.Path(step.meta.get("status_dir", step.cwd)), + ) + + return _unpack_launched_data diff --git a/tests/on_wlm/test_symlinking.py b/tests/on_wlm/test_symlinking.py new file mode 100644 index 000000000..246457d1c --- /dev/null +++ b/tests/on_wlm/test_symlinking.py @@ -0,0 +1,175 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pathlib +import time + +from smartsim import Experiment + + +def test_batch_model_and_ensemble(test_dir, wlmutils): + exp_name = "test-batch" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + rs = exp.create_run_settings("echo", ["spam", "eggs"]) + bs = exp.create_batch_settings() + + test_model = exp.create_model( + "test_model", path=test_dir, run_settings=rs, batch_settings=bs + ) + exp.generate(test_model) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.out"), True) + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.err"), False) + _should_not_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.sh")) + + test_ensemble = exp.create_ensemble( + "test_ensemble", params={}, batch_settings=bs, run_settings=rs, replicas=3 + ) + exp.generate(test_ensemble) + exp.start(test_ensemble, block=True) + + assert pathlib.Path(test_ensemble.path).exists() + for i in range(len(test_ensemble.models)): + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.out", + ), + True, + ) + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ), + False, + ) + + 
_should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def test_batch_ensemble_symlinks(test_dir, wlmutils): + exp_name = "test-batch-ensemble" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + rs = exp.create_run_settings("echo", ["spam", "eggs"]) + bs = exp.create_batch_settings() + test_ensemble = exp.create_ensemble( + "test_ensemble", params={}, batch_settings=bs, run_settings=rs, replicas=3 + ) + exp.generate(test_ensemble) + exp.start(test_ensemble, block=True) + + for i in range(len(test_ensemble.models)): + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.out", + ), + True, + ) + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ), + False, + ) + + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def test_batch_model_symlinks(test_dir, wlmutils): + exp_name = "test-batch-model" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + rs = exp.create_run_settings("echo", ["spam", "eggs"]) + bs = exp.create_batch_settings() + test_model = exp.create_model( + "test_model", path=test_dir, run_settings=rs, batch_settings=bs + ) + exp.generate(test_model) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.out"), True) + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.err"), False) + _should_not_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.sh")) + + +def test_batch_orchestrator_symlinks(test_dir, wlmutils): + exp_name = "test-batch-orc" + launcher = wlmutils.get_test_launcher() + exp = Experiment(exp_name, launcher=launcher, exp_path=test_dir) + port = 2424 + db = exp.create_database( + db_nodes=3, 
+ port=port, + batch=True, + interface=wlmutils.get_test_interface(), + single_cmd=False, + ) + exp.generate(db) + exp.start(db, block=True) + time.sleep(2) + exp.stop(db) + + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.out"), False) + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}.err"), False) + + for i in range(db.db_nodes): + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.out"), False) + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) + _should_not_be_symlinked( + pathlib.Path(db.path, f"nodes-orchestrator_{i}-{port}.conf") + ) + + +def _should_not_be_symlinked(non_linked_path: pathlib.Path): + """Helper function for assertions about paths that should NOT be symlinked""" + assert non_linked_path.exists() + assert not non_linked_path.is_symlink() + + +def _should_be_symlinked(linked_path: pathlib.Path, open_file: bool): + """Helper function for assertions about paths that SHOULD be symlinked""" + assert linked_path.exists() + assert linked_path.is_symlink() + # ensure the source file exists + assert pathlib.Path(os.readlink(linked_path)).exists() + if open_file: + with open(pathlib.Path(os.readlink(linked_path)), "r") as file: + log_contents = file.read() + assert "spam eggs" in log_contents diff --git a/tests/test_output_files.py b/tests/test_output_files.py index 987fa9068..f3830051c 100644 --- a/tests/test_output_files.py +++ b/tests/test_output_files.py @@ -31,7 +31,7 @@ from smartsim import Experiment from smartsim._core.config import CONFIG -from smartsim._core.control.controller import Controller +from smartsim._core.control.controller import Controller, _AnonymousBatchJob from smartsim._core.launcher.step import Step from smartsim.database.orchestrator import Orchestrator from smartsim.entity.ensemble import Ensemble @@ -39,6 +39,9 @@ from smartsim.settings.base import RunSettings from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings +# The tests in this file belong to the group_a 
group +pytestmark = pytest.mark.group_a + controller = Controller() slurm_controller = Controller(launcher="slurm") @@ -48,8 +51,11 @@ ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") -model = Model("test_model", {}, "", rs) -batch_model = Model("batch_test_model", {}, "", batch_rs, batch_settings=bs) +model = Model("test_model", params={}, path="", run_settings=rs) +batch_model = Model( + "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs +) +anon_batch_model = _AnonymousBatchJob(batch_model) def test_mutated_model_output(test_dir): @@ -161,67 +167,3 @@ def test_get_output_files_no_status_dir(test_dir): step = Step("mock-step", test_dir, step_settings) with pytest.raises(KeyError): out, err = step.get_output_files() - - -@pytest.mark.parametrize( - "entity", - [pytest.param(ens, id="ensemble"), pytest.param(model, id="model")], -) -def test_symlink(test_dir, entity): - """Test symlinking historical output files""" - entity.path = test_dir - if entity.type == Ensemble: - for member in ens.models: - symlink_with_create_job_step(test_dir, member) - else: - symlink_with_create_job_step(test_dir, entity) - - -def symlink_with_create_job_step(test_dir, entity): - """Function that helps cut down on repeated testing code""" - exp_dir = pathlib.Path(test_dir) - entity.path = test_dir - status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type - step = controller._create_job_step(entity, status_dir) - controller.symlink_output_files(step, entity) - assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() - assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink() - assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( - status_dir / entity.name / (entity.name + ".out") - ) - assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( - status_dir / entity.name / 
(entity.name + ".err") - ) - - -@pytest.mark.parametrize( - "entity", - [pytest.param(ens, id="ensemble"), pytest.param(orc, id="orchestrator")], -) -def test_batch_symlink(entity, test_dir): - """Test symlinking historical output files""" - exp_dir = pathlib.Path(test_dir) - entity.path = test_dir - status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type - batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir) - for step in substeps: - slurm_controller.symlink_output_files(step, entity) - assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() - assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink() - assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( - status_dir / entity.name / step.entity_name / (step.entity_name + ".out") - ) - assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( - status_dir / entity.name / step.entity_name / (step.entity_name + ".err") - ) - - -def test_symlink_error(test_dir): - """Ensure FileNotFoundError is thrown""" - bad_model = Model( - "bad_model", {}, pathlib.Path(test_dir, "badpath"), RunSettings("echo") - ) - telem_dir = pathlib.Path(test_dir, "bad_model_telemetry") - bad_step = controller._create_job_step(bad_model, telem_dir) - with pytest.raises(FileNotFoundError): - controller.symlink_output_files(bad_step, bad_model) diff --git a/tests/test_symlinking.py b/tests/test_symlinking.py new file mode 100644 index 000000000..2b70e3e9f --- /dev/null +++ b/tests/test_symlinking.py @@ -0,0 +1,254 @@ +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. 
Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import pathlib + +import pytest + +from smartsim import Experiment +from smartsim._core.config import CONFIG +from smartsim._core.control.controller import Controller, _AnonymousBatchJob +from smartsim.database.orchestrator import Orchestrator +from smartsim.entity.ensemble import Ensemble +from smartsim.entity.model import Model +from smartsim.settings.base import RunSettings +from smartsim.settings.slurmSettings import SbatchSettings, SrunSettings + +# The tests in this file belong to the group_a group +pytestmark = pytest.mark.group_a + +controller = Controller() +slurm_controller = Controller(launcher="slurm") + +rs = RunSettings("echo", ["spam", "eggs"]) +bs = SbatchSettings() +batch_rs = SrunSettings("echo", ["spam", "eggs"]) + +ens = Ensemble("ens", params={}, run_settings=rs, batch_settings=bs, replicas=3) +orc = Orchestrator(db_nodes=3, batch=True, launcher="slurm", run_command="srun") +model = Model("test_model", params={}, path="", run_settings=rs) +batch_model 
= Model( + "batch_test_model", params={}, path="", run_settings=batch_rs, batch_settings=bs +) +anon_batch_model = _AnonymousBatchJob(batch_model) + + +@pytest.mark.parametrize( + "entity", + [pytest.param(ens, id="ensemble"), pytest.param(model, id="model")], +) +def test_symlink(test_dir, entity): + """Test symlinking historical output files""" + entity.path = test_dir + if entity.type == Ensemble: + for member in ens.models: + symlink_with_create_job_step(test_dir, member) + else: + symlink_with_create_job_step(test_dir, entity) + + +def symlink_with_create_job_step(test_dir, entity): + """Function that helps cut down on repeated testing code""" + exp_dir = pathlib.Path(test_dir) + entity.path = test_dir + status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type + step = controller._create_job_step(entity, status_dir) + controller.symlink_output_files(step, entity) + assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() + assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink() + assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( + status_dir / entity.name / (entity.name + ".out") + ) + assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( + status_dir / entity.name / (entity.name + ".err") + ) + + +@pytest.mark.parametrize( + "entity", + [ + pytest.param(ens, id="ensemble"), + pytest.param(orc, id="orchestrator"), + pytest.param(anon_batch_model, id="model"), + ], +) +def test_batch_symlink(entity, test_dir): + """Test symlinking historical output files""" + exp_dir = pathlib.Path(test_dir) + entity.path = test_dir + status_dir = exp_dir / CONFIG.telemetry_subdir / entity.type + batch_step, substeps = slurm_controller._create_batch_job_step(entity, status_dir) + for step in substeps: + slurm_controller.symlink_output_files(step, entity) + assert pathlib.Path(entity.path, f"{entity.name}.out").is_symlink() + assert pathlib.Path(entity.path, f"{entity.name}.err").is_symlink() + assert 
os.readlink(pathlib.Path(entity.path, f"{entity.name}.out")) == str( + status_dir / entity.name / step.entity_name / (step.entity_name + ".out") + ) + assert os.readlink(pathlib.Path(entity.path, f"{entity.name}.err")) == str( + status_dir / entity.name / step.entity_name / (step.entity_name + ".err") + ) + + +def test_symlink_error(test_dir): + """Ensure FileNotFoundError is thrown""" + bad_model = Model( + "bad_model", + params={}, + path=pathlib.Path(test_dir, "badpath"), + run_settings=RunSettings("echo"), + ) + telem_dir = pathlib.Path(test_dir, "bad_model_telemetry") + bad_step = controller._create_job_step(bad_model, telem_dir) + with pytest.raises(FileNotFoundError): + controller.symlink_output_files(bad_step, bad_model) + + +def test_failed_model_launch_symlinks(test_dir): + exp_name = "failed-exp" + exp = Experiment(exp_name, exp_path=test_dir) + test_model = exp.create_model( + "test_model", run_settings=batch_rs, batch_settings=bs + ) + exp.generate(test_model) + with pytest.raises(TypeError): + exp.start(test_model) + + _should_not_be_symlinked(pathlib.Path(test_model.path)) + assert not pathlib.Path(test_model.path, f"{test_model.name}.out").is_symlink() + assert not pathlib.Path(test_model.path, f"{test_model.name}.err").is_symlink() + + +def test_failed_ensemble_launch_symlinks(test_dir): + exp_name = "failed-exp" + exp = Experiment(exp_name, exp_path=test_dir) + test_ensemble = exp.create_ensemble( + "test_ensemble", params={}, batch_settings=bs, run_settings=batch_rs, replicas=3 + ) + exp.generate(test_ensemble) + with pytest.raises(TypeError): + exp.start(test_ensemble) + + _should_not_be_symlinked(pathlib.Path(test_ensemble.path)) + assert not pathlib.Path( + test_ensemble.path, f"{test_ensemble.name}.out" + ).is_symlink() + assert not pathlib.Path( + test_ensemble.path, f"{test_ensemble.name}.err" + ).is_symlink() + + for i in range(len(test_ensemble.models)): + assert not pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + 
f"{test_ensemble.name}_{i}.out", + ).is_symlink() + assert not pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ).is_symlink() + + +def test_non_batch_ensemble_symlinks(test_dir): + exp_name = "test-non-batch-ensemble" + rs = RunSettings("echo", ["spam", "eggs"]) + exp = Experiment(exp_name, exp_path=test_dir) + test_ensemble = exp.create_ensemble( + "test_ensemble", params={}, run_settings=rs, replicas=3 + ) + exp.generate(test_ensemble) + exp.start(test_ensemble, block=True) + + for i in range(len(test_ensemble.models)): + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.out", + ), + True, + ) + _should_be_symlinked( + pathlib.Path( + test_ensemble.path, + f"{test_ensemble.name}_{i}", + f"{test_ensemble.name}_{i}.err", + ), + False, + ) + + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def test_non_batch_model_symlinks(test_dir): + exp_name = "test-non-batch-model" + exp = Experiment(exp_name, exp_path=test_dir) + rs = RunSettings("echo", ["spam", "eggs"]) + + test_model = exp.create_model("test_model", path=test_dir, run_settings=rs) + exp.generate(test_model) + exp.start(test_model, block=True) + + assert pathlib.Path(test_model.path).exists() + + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.out"), True) + _should_be_symlinked(pathlib.Path(test_model.path, f"{test_model.name}.err"), False) + _should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def test_non_batch_orchestrator_symlinks(test_dir): + exp = Experiment("test-non-batch-orc", exp_path=test_dir) + + db = exp.create_database(interface="lo") + exp.generate(db) + exp.start(db, block=True) + exp.stop(db) + + for i in range(db.db_nodes): + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.out"), False) + _should_be_symlinked(pathlib.Path(db.path, f"{db.name}_{i}.err"), False) + + 
_should_not_be_symlinked(pathlib.Path(exp.exp_path, "smartsim_params.txt")) + + +def _should_not_be_symlinked(non_linked_path: pathlib.Path): + """Helper function for assertions about paths that should NOT be symlinked""" + assert non_linked_path.exists() + assert not non_linked_path.is_symlink() + + +def _should_be_symlinked(linked_path: pathlib.Path, open_file: bool): + """Helper function for assertions about paths that SHOULD be symlinked""" + assert linked_path.exists() + assert linked_path.is_symlink() + # ensure the source file exists + assert pathlib.Path(os.readlink(linked_path)).exists() + if open_file: + with open(pathlib.Path(os.readlink(linked_path)), "r") as file: + log_contents = file.read() + assert "spam eggs" in log_contents From 399886bd4734fbd40890b6095e723a8e099b5761 Mon Sep 17 00:00:00 2001 From: Alyssa Cote <46540273+AlyssaCote@users.noreply.github.com> Date: Tue, 23 Apr 2024 11:19:53 -0700 Subject: [PATCH 09/13] Bump manifest.json version to 0.0.4 (#563) The `manifest.json` version needs to be bumped from `0.0.3` to `0.0.4` to match the version of SmartDashboard. [ committed by @AlyssaCote ] [ reviewed by @MattToast ] --- doc/changelog.rst | 4 ++++ smartsim/_core/utils/serialize.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index ef6d31012..01e1e6651 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Bump manifest.json to version 0.0.4 - Fix symlinking batch ensemble and model bug - Remove defensive regexp in .gitignore - Upgrade ubuntu to 22.04 @@ -46,6 +47,8 @@ Description Detailed Notes +- The manifest.json version needs to match the SmartDashboard version, which is + 0.0.4 in the upcoming release. (SmartSim-PR563_) - Properly symlinks batch ensembles and batch models. (SmartSim-PR547_) - Remove defensive regexp in .gitignore and ensure tests write to test_output. 
(SmartSim-PR560_) @@ -123,6 +126,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR563: https://github.com/CrayLabs/SmartSim/pull/563 .. _SmartSim-PR547: https://github.com/CrayLabs/SmartSim/pull/547 .. _SmartSim-PR560: https://github.com/CrayLabs/SmartSim/pull/560 .. _SmartSim-PR559: https://github.com/CrayLabs/SmartSim/pull/559 diff --git a/smartsim/_core/utils/serialize.py b/smartsim/_core/utils/serialize.py index c94d1f5ce..d4ec66eaf 100644 --- a/smartsim/_core/utils/serialize.py +++ b/smartsim/_core/utils/serialize.py @@ -77,7 +77,7 @@ def save_launch_manifest(manifest: _Manifest[TStepLaunchMetaData]) -> None: manifest_dict = { "schema info": { "schema_name": "entity manifest", - "version": "0.0.3", + "version": "0.0.4", }, "experiment": { "name": manifest.metadata.exp_name, From 05a1e0aa3701617845ca78c9f779c7836191998b Mon Sep 17 00:00:00 2001 From: amandarichardsonn <30413257+amandarichardsonn@users.noreply.github.com> Date: Wed, 24 Apr 2024 16:16:44 -0700 Subject: [PATCH 10/13] Auto-post release PR to develop (#566) This PR adds to the release.yml github workflow to autogenerate a PR that merges changes from master to develop. 
[ reviewed by @MattToast ] [ committed by @amandarichardsonn ] --- .github/workflows/release.yml | 12 ++++++++++++ doc/changelog.rst | 4 ++++ 2 files changed, 16 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index c58288ee5..4366caf28 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -124,3 +124,15 @@ jobs: user: __token__ password: ${{ secrets.PYPI }} #repository_url: https://test.pypi.org/legacy/ + + createPullRequest: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Create pull request + run: | + gh pr create -B develop -H master --title 'Merge master into develop' --body 'This PR brings develop up to date with master for release.' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/doc/changelog.rst b/doc/changelog.rst index 01e1e6651..da5fbbcd3 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Auto-post release PR to develop - Bump manifest.json to version 0.0.4 - Fix symlinking batch ensemble and model bug - Remove defensive regexp in .gitignore @@ -47,6 +48,8 @@ Description Detailed Notes +- Add to github release workflow to auto generate a pull request from master + into develop for release. (SmartSim-PR566_) - The manifest.json version needs to match the SmartDashboard version, which is 0.0.4 in the upcoming release. (SmartSim-PR563_) - Properly symlinks batch ensembles and batch models. (SmartSim-PR547_) @@ -126,6 +129,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR566: https://github.com/CrayLabs/SmartSim/pull/566 .. _SmartSim-PR563: https://github.com/CrayLabs/SmartSim/pull/563 .. _SmartSim-PR547: https://github.com/CrayLabs/SmartSim/pull/547 .. 
_SmartSim-PR560: https://github.com/CrayLabs/SmartSim/pull/560 From f5f238573e90e888f8229df334c81e9b7c9a86eb Mon Sep 17 00:00:00 2001 From: amandarichardsonn <30413257+amandarichardsonn@users.noreply.github.com> Date: Thu, 25 Apr 2024 14:10:18 -0700 Subject: [PATCH 11/13] Auto generate typehints into documentation (#561) This PR removes :type: and :rtype: directives from function docstrings as well as implements the sphinx-autodoc-typehints extension. [ reviewed by @AlyssaCote ] [ committed by @amandarichardsonn ] --- conftest.py | 3 - doc/api/smartsim_api.rst | 2 + doc/changelog.rst | 4 + doc/conf.py | 24 +- doc/requirements-doc.txt | 1 + doc/tutorials/ml_training/surrogate/fd_sim.py | 5 - smartsim/_core/_cli/utils.py | 1 - smartsim/_core/_install/builder.py | 8 - smartsim/_core/control/controller.py | 33 +-- smartsim/_core/control/job.py | 30 +-- smartsim/_core/control/jobmanager.py | 22 -- smartsim/_core/control/manifest.py | 4 - smartsim/_core/entrypoints/colocated.py | 6 - smartsim/_core/entrypoints/indirect.py | 4 - .../_core/entrypoints/telemetrymonitor.py | 10 +- smartsim/_core/generation/generator.py | 24 +- smartsim/_core/generation/modelwriter.py | 10 - smartsim/_core/launcher/colocated.py | 17 +- smartsim/_core/launcher/launcher.py | 8 - smartsim/_core/launcher/local/local.py | 8 - smartsim/_core/launcher/lsf/lsfCommands.py | 7 - smartsim/_core/launcher/lsf/lsfLauncher.py | 6 - smartsim/_core/launcher/lsf/lsfParser.py | 15 -- smartsim/_core/launcher/pbs/pbsCommands.py | 4 - smartsim/_core/launcher/pbs/pbsLauncher.py | 6 - smartsim/_core/launcher/pbs/pbsParser.py | 12 - .../_core/launcher/slurm/slurmCommands.py | 7 - .../_core/launcher/slurm/slurmLauncher.py | 8 - smartsim/_core/launcher/slurm/slurmParser.py | 13 +- smartsim/_core/launcher/step/alpsStep.py | 5 - smartsim/_core/launcher/step/lsfStep.py | 11 - smartsim/_core/launcher/step/mpiStep.py | 17 -- smartsim/_core/launcher/step/pbsStep.py | 6 - smartsim/_core/launcher/step/slurmStep.py | 11 - 
smartsim/_core/launcher/step/step.py | 1 - smartsim/_core/launcher/taskManager.py | 35 +-- smartsim/_core/launcher/util/launcherUtil.py | 4 - smartsim/_core/launcher/util/shell.py | 22 +- smartsim/_core/utils/helpers.py | 26 +-- smartsim/_core/utils/network.py | 4 - smartsim/_core/utils/redis.py | 12 - smartsim/_core/utils/telemetry/collector.py | 46 ++-- smartsim/_core/utils/telemetry/manifest.py | 19 +- smartsim/_core/utils/telemetry/sink.py | 8 +- smartsim/_core/utils/telemetry/telemetry.py | 36 ++- smartsim/_core/utils/telemetry/util.py | 13 +- smartsim/database/orchestrator.py | 69 +++--- smartsim/entity/dbnode.py | 4 - smartsim/entity/dbobject.py | 39 +--- smartsim/entity/ensemble.py | 75 ++----- smartsim/entity/entity.py | 8 +- smartsim/entity/files.py | 16 -- smartsim/entity/model.py | 102 +++------ smartsim/experiment.py | 208 ++++++------------ smartsim/log.py | 24 +- smartsim/ml/data.py | 29 +-- smartsim/ml/tf/utils.py | 6 - smartsim/settings/alpsSettings.py | 24 +- smartsim/settings/base.py | 91 ++++---- smartsim/settings/containers.py | 8 - smartsim/settings/lsfSettings.py | 55 +---- smartsim/settings/mpiSettings.py | 55 +---- smartsim/settings/palsSettings.py | 33 +-- smartsim/settings/pbsSettings.py | 30 +-- smartsim/settings/settings.py | 27 +-- smartsim/settings/slurmSettings.py | 58 +---- smartsim/wlm/__init__.py | 8 - smartsim/wlm/pbs.py | 6 +- smartsim/wlm/slurm.py | 32 +-- tests/test_config.py | 2 - 70 files changed, 362 insertions(+), 1225 deletions(-) diff --git a/conftest.py b/conftest.py index 1e9b5a141..8d6f6fb2a 100644 --- a/conftest.py +++ b/conftest.py @@ -609,11 +609,8 @@ def make_test_file( """Create a dummy file in the test output directory. :param file_name: name of file to create, e.g. 
"file.txt" - :type file_name: str :param file_dir: path - :type file_dir: str :return: String path to test output file - :rtype: str """ file_path = os.path.join(file_dir, file_name) os.makedirs(file_dir) diff --git a/doc/api/smartsim_api.rst b/doc/api/smartsim_api.rst index 045c06b1b..bb6a02b8e 100644 --- a/doc/api/smartsim_api.rst +++ b/doc/api/smartsim_api.rst @@ -26,6 +26,7 @@ Experiment Experiment.get_status Experiment.reconnect_orchestrator Experiment.summary + Experiment.telemetry .. autoclass:: Experiment :show-inheritance: @@ -407,6 +408,7 @@ Orchestrator Orchestrator.set_max_clients Orchestrator.set_max_message_size Orchestrator.set_db_conf + Orchestrator.telemetry Orchestrator ------------ diff --git a/doc/changelog.rst b/doc/changelog.rst index da5fbbcd3..210646b13 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Auto-generate type-hints into documentation - Auto-post release PR to develop - Bump manifest.json to version 0.0.4 - Fix symlinking batch ensemble and model bug @@ -48,6 +49,8 @@ Description Detailed Notes +- Add extension to auto-generate function type-hints into documentation. + (SmartSim-PR561_) - Add to github release workflow to auto generate a pull request from master into develop for release. (SmartSim-PR566_) - The manifest.json version needs to match the SmartDashboard version, which is @@ -129,6 +132,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR561: https://github.com/CrayLabs/SmartSim/pull/561 .. _SmartSim-PR566: https://github.com/CrayLabs/SmartSim/pull/566 .. _SmartSim-PR563: https://github.com/CrayLabs/SmartSim/pull/563 .. 
_SmartSim-PR547: https://github.com/CrayLabs/SmartSim/pull/547 diff --git a/doc/conf.py b/doc/conf.py index a5496df42..38f419bc1 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -14,6 +14,9 @@ import os import sys +import logging +import inspect +from sphinx.util.logging import SphinxLoggerAdapter sys.path.insert(0, os.path.abspath('.')) # -- Project information ----------------------------------------------------- @@ -39,6 +42,7 @@ # ones. extensions = [ 'sphinx.ext.autodoc', + 'sphinx_autodoc_typehints', 'sphinx.ext.autosectionlabel', 'sphinx.ext.todo', 'sphinx.ext.coverage', @@ -56,6 +60,12 @@ 'sphinx_design', 'sphinx.ext.mathjax', ] +# sphinx_autodoc_typehints configurations +always_use_bars_union = True +typehints_document_rtype = True +typehints_use_signature = True +typehints_use_signature_return = True +typehints_defaults = 'comma' autodoc_mock_imports = ["smartredis.smartredisPy"] suppress_warnings = ['autosectionlabel'] @@ -143,4 +153,16 @@ def ensure_pandoc_installed(_): def setup(app): - app.connect("builder-inited", ensure_pandoc_installed) \ No newline at end of file + app.connect("builder-inited", ensure_pandoc_installed) + + # Below code from https://github.com/sphinx-doc/sphinx/issues/10219 + def _is_sphinx_logger_adapter(obj): + return isinstance(obj, SphinxLoggerAdapter) + class ForwardReferenceFilter(logging.Filter): + def filter(self, record): + # Suppress the warning related to forward references + return "Cannot resolve forward reference in type annotations" not in record.getMessage() + + members = inspect.getmembers(app.extensions['sphinx_autodoc_typehints'].module, _is_sphinx_logger_adapter) + for _, adapter in members: + adapter.logger.addFilter(ForwardReferenceFilter()) diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 8b6d46bb9..108d4cad9 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -14,3 +14,4 @@ protobuf numpy sphinx-design pypandoc +sphinx-autodoc-typehints diff --git 
a/doc/tutorials/ml_training/surrogate/fd_sim.py b/doc/tutorials/ml_training/surrogate/fd_sim.py index 8b128a319..db68b24b2 100644 --- a/doc/tutorials/ml_training/surrogate/fd_sim.py +++ b/doc/tutorials/ml_training/surrogate/fd_sim.py @@ -18,12 +18,9 @@ def augment_batch(samples, targets): following NWHC ordering. :param samples: Samples to augment - :type samples: np.ndarray :param targets: Targets to augment - :type targets: np.ndarray :returns: Tuple of augmented samples and targets - :rtype: (np.ndarray, np.ndarray) """ batch_size = samples.shape[0] augmented_samples = np.empty((batch_size*8, *samples.shape[1:])) @@ -83,9 +80,7 @@ def simulate(steps, size): both as tensors and as augmented samples for training. :param steps: Number of simulations to run - :type steps: int :param size: lateral size of the discretized domain - :type size: int """ batch_size = 50 samples = np.zeros((batch_size,size,size,1)).astype(np.single) diff --git a/smartsim/_core/_cli/utils.py b/smartsim/_core/_cli/utils.py index 8bf0984df..1b099c248 100644 --- a/smartsim/_core/_cli/utils.py +++ b/smartsim/_core/_cli/utils.py @@ -78,7 +78,6 @@ def clean(core_path: Path, _all: bool = False) -> int: """Remove pre existing installations of ML runtimes :param _all: Remove all non-python dependencies - :type _all: bool, optional """ build_temp = core_path / ".third-party" diff --git a/smartsim/_core/_install/builder.py b/smartsim/_core/_install/builder.py index d0dbc5a6a..55b1e90b9 100644 --- a/smartsim/_core/_install/builder.py +++ b/smartsim/_core/_install/builder.py @@ -63,7 +63,6 @@ def expand_exe_path(exe: str) -> str: """Takes an executable and returns the full path to that executable :param exe: executable or file - :type exe: str :raises TypeError: if file is not an executable :raises FileNotFoundError: if executable cannot be found """ @@ -283,9 +282,7 @@ def build_from_git( ) -> None: """Build Redis from git :param git_url: url from which to retrieve Redis - :type git_url: str :param 
branch: branch to checkout - :type branch: str """ # pylint: disable=too-many-locals database_name = "keydb" if "KeyDB" in git_url else "redis" @@ -534,7 +531,6 @@ def symlink_libtf(self, device: Device) -> None: """Add symbolic link to available libtensorflow in RedisAI deps. :param device: cpu or gpu - :type device: str """ rai_deps_path = sorted( self.rai_build_path.glob(os.path.join("deps", f"*{device.value}*")) @@ -591,11 +587,8 @@ def build_from_git( """Build RedisAI from git :param git_url: url from which to retrieve RedisAI - :type git_url: str :param branch: branch to checkout - :type branch: str :param device: cpu or gpu - :type device: str """ # delete previous build dir (should never be there) if self.rai_build_path.is_dir(): @@ -705,7 +698,6 @@ def _fetch_deps_for(self, device: Device) -> None: def _install_backends(self, device: Device) -> None: """Move backend libraries to smartsim/_core/lib/ :param device: cpu or cpu - :type device: str """ self.rai_install_path = self.rai_build_path.joinpath( f"install-{device.value}" diff --git a/smartsim/_core/control/controller.py b/smartsim/_core/control/controller.py index 16e52a517..0724235c4 100644 --- a/smartsim/_core/control/controller.py +++ b/smartsim/_core/control/controller.py @@ -97,7 +97,6 @@ def __init__(self, launcher: str = "local") -> None: """Initialize a Controller :param launcher: the type of launcher being used - :type launcher: str """ self._jobs = JobManager(JM_LOCK) self.init_launcher(launcher) @@ -158,11 +157,8 @@ def poll( """Poll running jobs and receive logging output of job status :param interval: number of seconds to wait before polling again - :type interval: int :param verbose: set verbosity - :type verbose: bool :param kill_on_interrupt: flag for killing jobs when SIGINT is received - :type kill_on_interrupt: bool, optional """ self._jobs.kill_on_interrupt = kill_on_interrupt to_monitor = self._jobs.jobs @@ -182,7 +178,6 @@ def finished( """Return a boolean indicating wether a job 
has finished or not :param entity: object launched by SmartSim. - :type entity: Entity | EntitySequence :returns: bool :raises ValueError: if entity has not been launched yet """ @@ -212,7 +207,6 @@ def stop_entity( the jobmanager so that the job appears as "cancelled". :param entity: entity to be stopped - :type entity: Entity | EntitySequence """ with JM_LOCK: job = self._jobs[entity.name] @@ -235,8 +229,8 @@ def stop_entity( def stop_db(self, db: Orchestrator) -> None: """Stop an orchestrator + :param db: orchestrator to be stopped - :type db: Orchestrator """ if db.batch: self.stop_entity(db) @@ -268,7 +262,6 @@ def stop_entity_list(self, entity_list: EntitySequence[SmartSimEntity]) -> None: """Stop an instance of an entity list :param entity_list: entity list to be stopped - :type entity_list: EntitySequence """ if entity_list.batch: @@ -291,10 +284,8 @@ def get_entity_status( """Get the status of an entity :param entity: entity to get status of - :type entity: SmartSimEntity | EntitySequence :raises TypeError: if not SmartSimEntity | EntitySequence :return: status of entity - :rtype: SmartSimStatus """ if not isinstance(entity, (SmartSimEntity, EntitySequence)): raise TypeError( @@ -310,10 +301,8 @@ def get_entity_list_status( :param entity_list: entity list containing entities to get statuses of - :type entity_list: EntitySequence :raises TypeError: if not EntitySequence :return: list of SmartSimStatus statuses - :rtype: list """ if not isinstance(entity_list, EntitySequence): raise TypeError( @@ -332,7 +321,6 @@ def init_launcher(self, launcher: str) -> None: and local launching :param launcher: which launcher to initialize - :type launcher: str :raises SSUnsupportedError: if a string is passed that is not a supported launcher :raises TypeError: if no launcher argument is provided. 
@@ -364,9 +352,7 @@ def symlink_output_files( under the .smartsim directory :param job_step: Job step instance - :type job_step: Step :param entity: Entity instance - :type entity: SmartSimEntity | EntitySequence[SmartSimEntity] """ historical_out, historical_err = map(pathlib.Path, job_step.get_output_files()) entity_out = pathlib.Path(entity.path) / f"{entity.name}.out" @@ -398,11 +384,8 @@ def _launch( address of the database can be given to following entities :param exp_name: The name of the launching experiment - :type exp_name: str :param exp_path: path to location of ``Experiment`` directory if generated - :type exp_path: str :param manifest: Manifest of deployables to launch - :type manifest: Manifest """ manifest_builder = LaunchedManifestBuilder[t.Tuple[str, Step]]( @@ -505,10 +488,8 @@ def _launch_orchestrator( set them in the JobManager :param orchestrator: orchestrator to launch - :type orchestrator: Orchestrator :param manifest_builder: An `LaunchedManifestBuilder` to record the names and `Step`s of the launched orchestrator - :type manifest_builder: LaunchedManifestBuilder[tuple[str, Step]] """ orchestrator.remove_stale_files() orc_telem_dir = manifest_builder.run_telemetry_subdirectory / "database" @@ -581,9 +562,7 @@ def _launch_step( """Use the launcher to launch a job step :param job_step: a job step instance - :type job_step: Step :param entity: entity instance - :type entity: SmartSimEntity :raises SmartSimError: if launch fails """ # attempt to retrieve entity name in JobManager.completed @@ -641,13 +620,10 @@ def _create_batch_job_step( """Use launcher to create batch job step :param entity_list: EntityList to launch as batch - :type entity_list: EntityList :param telemetry_dir: Path to a directory in which the batch job step may write telemetry events - :type telemetry_dir: pathlib.Path :return: batch job step instance and a list of run steps to be executed within the batch job - :rtype: tuple[Step, list[Step]] """ if not 
entity_list.batch_settings: raise ValueError( @@ -676,12 +652,9 @@ def _create_job_step( """Create job steps for all entities with the launcher :param entity: an entity to create a step for - :type entity: SmartSimEntity :param telemetry_dir: Path to a directory in which the job step may write telemetry events - :type telemetry_dir: pathlib.Path :return: the job step - :rtype: Step """ # get SSDB, SSIN, SSOUT and add to entity run settings if isinstance(entity, Model): @@ -698,7 +671,6 @@ def _prep_entity_client_env(self, entity: Model) -> None: """Retrieve all connections registered to this entity :param entity: The entity to retrieve connections from - :type entity: Model """ client_env: t.Dict[str, t.Union[str, int, float, bool]] = {} @@ -763,7 +735,6 @@ def _save_orchestrator(self, orchestrator: Orchestrator) -> None: to the orchestrator. :param orchestrator: Orchestrator configuration to be saved - :type orchestrator: Orchestrator """ dat_file = "/".join((orchestrator.path, "smartsim_db.dat")) @@ -785,7 +756,6 @@ def _orchestrator_launch_wait(self, orchestrator: Orchestrator) -> None: be launched with SSDB address :param orchestrator: orchestrator instance - :type orchestrator: Orchestrator :raises SmartSimError: if launch fails or manually stopped by user """ if orchestrator.batch: @@ -934,7 +904,6 @@ def _start_telemetry_monitor(self, exp_dir: str) -> None: of the processes launched through this controller. 
:param exp_dir: An experiment directory - :type exp_dir: str """ if ( self._telemetry_monitor is None diff --git a/smartsim/_core/control/job.py b/smartsim/_core/control/job.py index c15a0ef8f..eeefaf001 100644 --- a/smartsim/_core/control/job.py +++ b/smartsim/_core/control/job.py @@ -116,9 +116,8 @@ def _map_db_metadata(entity_dict: t.Dict[str, t.Any], entity: "JobEntity") -> No """Map DB-specific properties from a runtime manifest onto a `JobEntity` :param entity_dict: The raw dictionary deserialized from manifest JSON - :type entity_dict: Dict[str, Any] :param entity: The entity instance to modify - :type entity: JobEntity""" + """ if entity.is_db: # add collectors if they're configured to be enabled in the manifest entity.collectors = { @@ -141,13 +140,10 @@ def _map_standard_metadata( """Map universal properties from a runtime manifest onto a `JobEntity` :param entity_type: The type of the associated `SmartSimEntity` - :type entity_type: str :param entity_dict: The raw dictionary deserialized from manifest JSON - :type entity_dict: Dict[str, Any] :param entity: The entity instance to modify - :type entity: JobEntity :param exp_dir: The path to the experiment working directory - :type exp_dir: str""" + """ metadata = entity_dict["telemetry_metadata"] status_dir = pathlib.Path(metadata.get("status_dir")) @@ -167,11 +163,9 @@ def from_manifest( """Instantiate a `JobEntity` from the dictionary deserialized from manifest JSON :param entity_type: The type of the associated `SmartSimEntity` - :type entity_type: str :param entity_dict: The raw dictionary deserialized from manifest JSON - :type entity_dict: Dict[str, Any] :param exp_dir: The path to the experiment working directory - :type exp_dir: str""" + """ entity = JobEntity() cls._map_standard_metadata(entity_type, entity_dict, entity, exp_dir) @@ -198,15 +192,10 @@ def __init__( """Initialize a Job. 
:param job_name: Name of the job step - :type job_name: str :param job_id: The id associated with the job - :type job_id: str :param entity: The SmartSim entity(list) associated with the job - :type entity: SmartSimEntity | EntitySequence | JobEntity :param launcher: Launcher job was started with - :type launcher: str :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ self.name = job_name self.jid = job_id @@ -240,15 +229,10 @@ def set_status( """Set the status of a job. :param new_status: The new status of the job - :type new_status: SmartSimStatus :param raw_status: The raw status of the launcher - :type raw_status: str :param returncode: The return code for the job - :type return_code: int|None :param error: Content produced by stderr - :type error: str :param output: Content produced by stdout - :type output: str """ self.status = new_status self.raw_status = raw_status @@ -270,11 +254,8 @@ def reset( """Reset the job in order to be able to restart it. :param new_job_name: name of the new job step - :type new_job_name: str :param new_job_id: new job id to launch under - :type new_job_id: int :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ self.name = new_job_name self.jid = new_job_id @@ -291,7 +272,6 @@ def error_report(self) -> str: """A descriptive error report based on job fields :return: error report for display in terminal - :rtype: str """ warning = f"{self.ename} failed. 
See below for details \n" if self.error: @@ -311,7 +291,6 @@ def __str__(self) -> str: """Return user-readable string of the Job :returns: A user-readable string of the Job - :rtype: str """ if self.jid: job = "{}({}): {}" @@ -329,8 +308,7 @@ class History: def __init__(self, runs: int = 0) -> None: """Init a history object for a job - :param runs: number of runs so far, defaults to 0 - :type runs: int, optional + :param runs: number of runs so far """ self.runs = runs self.jids: t.Dict[int, t.Optional[str]] = {} diff --git a/smartsim/_core/control/jobmanager.py b/smartsim/_core/control/jobmanager.py index 4910b8311..1bc24cf9a 100644 --- a/smartsim/_core/control/jobmanager.py +++ b/smartsim/_core/control/jobmanager.py @@ -61,7 +61,6 @@ def __init__(self, lock: RLock, launcher: t.Optional[Launcher] = None) -> None: """Initialize a Jobmanager :param launcher: a Launcher object to manage jobs - :type: SmartSim.Launcher """ self.monitor: t.Optional[Thread] = None @@ -124,7 +123,6 @@ def move_to_completed(self, job: Job) -> None: actively monitored by the job manager :param job: job instance we are transitioning - :type job: Job """ with self._lock: self.completed[job.ename] = job @@ -141,9 +139,7 @@ def __getitem__(self, entity_name: str) -> Job: from which it was created. :param entity_name: The name of the entity of a job - :type entity_name: str :returns: the Job associated with the entity_name - :rtype: Job """ with self._lock: entities = ChainMap(self.db_jobs, self.jobs, self.completed) @@ -153,7 +149,6 @@ def __call__(self) -> t.Dict[str, Job]: """Returns dictionary all jobs for () operator :returns: Dictionary of all jobs - :rtype: dictionary """ all_jobs = {**self.jobs, **self.db_jobs} return all_jobs @@ -175,13 +170,9 @@ def add_job( """Add a job to the job manager which holds specific jobs by type. 
:param job_name: name of the job step - :type job_name: str :param job_id: job step id created by launcher - :type job_id: str :param entity: entity that was launched on job step - :type entity: SmartSimEntity | EntitySequence :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ launcher = str(self._launcher) # all operations here should be atomic @@ -197,9 +188,7 @@ def is_finished(self, entity: SmartSimEntity) -> bool: """Detect if a job has completed :param entity: entity to check - :type entity: SmartSimEntity :return: True if finished - :rtype: bool """ with self._lock: job = self[entity.name] # locked operation @@ -243,9 +232,7 @@ def get_status( """Return the status of a job. :param entity: SmartSimEntity or EntitySequence instance - :type entity: SmartSimEntity | EntitySequence :returns: a SmartSimStatus status - :rtype: SmartSimStatus """ with self._lock: if entity.name in self.completed: @@ -261,7 +248,6 @@ def set_launcher(self, launcher: Launcher) -> None: """Set the launcher of the job manager to a specific launcher instance :param launcher: child of Launcher - :type launcher: Launcher instance """ self._launcher = launcher @@ -269,9 +255,7 @@ def query_restart(self, entity_name: str) -> bool: """See if the job just started should be restarted or not. :param entity_name: name of entity to check for a job for - :type entity_name: str :return: if job should be restarted instead of started - :rtype: bool """ if entity_name in self.completed: return True @@ -288,13 +272,9 @@ def restart_job( ready to launch again. 
:param job_name: new job step name - :type job_name: str :param job_id: new job id - :type job_id: str :param entity_name: name of the entity of the job - :type entity_name: str :param is_task: process monitored by TaskManager (True) or the WLM (True) - :type is_task: bool """ with self._lock: @@ -312,7 +292,6 @@ def get_db_host_addresses(self) -> t.Dict[str, t.List[str]]: for corresponding database identifiers :return: dictionary of host ip addresses - :rtype: Dict[str, list] """ address_dict: t.Dict[str, t.List[str]] = {} @@ -334,7 +313,6 @@ def set_db_hosts(self, orchestrator: Orchestrator) -> None: """Set the DB hosts in db_jobs so future entities can query this :param orchestrator: orchestrator instance - :type orchestrator: Orchestrator """ # should only be called during launch in the controller diff --git a/smartsim/_core/control/manifest.py b/smartsim/_core/control/manifest.py index f16d49abb..97fc0ba8e 100644 --- a/smartsim/_core/control/manifest.py +++ b/smartsim/_core/control/manifest.py @@ -68,7 +68,6 @@ def dbs(self) -> t.List[Orchestrator]: :raises SmartSimError: if user added to databases to manifest :return: List of orchestrator instances - :rtype: list[Orchestrator] """ dbs = [item for item in self._deployables if isinstance(item, Orchestrator)] return dbs @@ -78,7 +77,6 @@ def models(self) -> t.List[Model]: """Return Model instances in Manifest :return: model instances - :rtype: List[Model] """ _models: t.List[Model] = [ item for item in self._deployables if isinstance(item, Model) @@ -90,7 +88,6 @@ def ensembles(self) -> t.List[Ensemble]: """Return Ensemble instances in Manifest :return: list of ensembles - :rtype: List[Ensemble] """ return [e for e in self._deployables if isinstance(e, Ensemble)] @@ -100,7 +97,6 @@ def all_entity_lists(self) -> t.List[EntitySequence[SmartSimEntity]]: exceptional ones like Orchestrator :return: list of entity lists - :rtype: List[EntitySequence[SmartSimEntity]] """ _all_entity_lists: 
t.List[EntitySequence[SmartSimEntity]] = list(self.ensembles) diff --git a/smartsim/_core/entrypoints/colocated.py b/smartsim/_core/entrypoints/colocated.py index b8f2157f9..508251fe0 100644 --- a/smartsim/_core/entrypoints/colocated.py +++ b/smartsim/_core/entrypoints/colocated.py @@ -62,11 +62,8 @@ def launch_db_model(client: Client, db_model: t.List[str]) -> str: """Parse options to launch model on local cluster :param client: SmartRedis client connected to local DB - :type client: Client :param db_model: List of arguments defining the model - :type db_model: List[str] :return: Name of model - :rtype: str """ parser = argparse.ArgumentParser("Set ML model on DB") parser.add_argument("--name", type=str) @@ -129,11 +126,8 @@ def launch_db_script(client: Client, db_script: t.List[str]) -> str: """Parse options to launch script on local cluster :param client: SmartRedis client connected to local DB - :type client: Client :param db_model: List of arguments defining the script - :type db_model: List[str] :return: Name of model - :rtype: str """ parser = argparse.ArgumentParser("Set script on DB") parser.add_argument("--name", type=str) diff --git a/smartsim/_core/entrypoints/indirect.py b/smartsim/_core/entrypoints/indirect.py index 2f67be020..1f445ac4a 100644 --- a/smartsim/_core/entrypoints/indirect.py +++ b/smartsim/_core/entrypoints/indirect.py @@ -60,14 +60,10 @@ def main( to achieve the same result. :param cmd: a base64 encoded cmd to execute - :type cmd: str :param entity_type: `SmartSimEntity` entity class. 
Valid values include: orchestrator, dbnode, ensemble, model - :type entity_type: str :param cwd: working directory to execute the cmd from - :type cwd: str :param status_dir: path to the output directory for status updates - :type status_dir: str """ global STEP_PID # pylint: disable=global-statement proxy_pid = os.getpid() diff --git a/smartsim/_core/entrypoints/telemetrymonitor.py b/smartsim/_core/entrypoints/telemetrymonitor.py index 7610de0c5..27582ac77 100644 --- a/smartsim/_core/entrypoints/telemetrymonitor.py +++ b/smartsim/_core/entrypoints/telemetrymonitor.py @@ -53,7 +53,7 @@ def register_signal_handlers( """Register a signal handling function for all termination events :param handle_signal: the function to execute when a term signal is received - :type handle_signal: Callable[[int, FrameType | None], None]""" + """ # NOTE: omitting kill because it is not catchable term_signals = [signal.SIGINT, signal.SIGQUIT, signal.SIGTERM, signal.SIGABRT] for signal_num in term_signals: @@ -64,7 +64,7 @@ def get_parser() -> argparse.ArgumentParser: """Instantiate a parser to process command line arguments :returns: An argument parser ready to accept required telemetry monitor parameters - :rtype: argparse.ArgumentParser""" + """ arg_parser = argparse.ArgumentParser(description="SmartSim Telemetry Monitor") arg_parser.add_argument( "-exp_dir", @@ -98,7 +98,7 @@ def parse_arguments() -> TelemetryMonitorArgs: of TelemetryMonitorArgs populated with the CLI inputs :returns: `TelemetryMonitorArgs` instance populated with command line arguments - :rtype: TelemetryMonitorArgs""" + """ parser = get_parser() parsed_args = parser.parse_args() return TelemetryMonitorArgs( @@ -114,11 +114,9 @@ def configure_logger(logger_: logging.Logger, log_level_: int, exp_dir: str) -> target output file path passed as an argument to the entrypoint :param logger_: logger to configure - :type logger_: logging.Logger :param log_level_: log level to apply to the python logging system - :type 
log_level_: logging._Level :param exp_dir: root path to experiment outputs - :type exp_dir: str""" + """ logger_.setLevel(log_level_) logger_.propagate = False diff --git a/smartsim/_core/generation/generator.py b/smartsim/_core/generation/generator.py index 502753df7..624a43379 100644 --- a/smartsim/_core/generation/generator.py +++ b/smartsim/_core/generation/generator.py @@ -64,11 +64,8 @@ def __init__( collision between entities. :param gen_path: Path in which files need to be generated - :type gen_path: str - :param overwrite: toggle entity replacement, defaults to False - :type overwrite: bool, optional + :param overwrite: toggle entity replacement :param verbose: Whether generation information should be logged to std out - :type verbose: bool, optional """ self._writer = ModelWriter() self.gen_path = gen_path @@ -82,7 +79,6 @@ def log_file(self) -> str: of all generated entities. :returns: path to file with parameter settings - :rtype: str """ return join(self.gen_path, "smartsim_params.txt") @@ -129,11 +125,7 @@ def set_tag(self, tag: str, regex: t.Optional[str] = None) -> None: :param tag: A string of characters that signify the string to be changed. Defaults to ``;`` - :type tag: str - - :param regex: full regex for the modelwriter to search for, - defaults to None - :type regex: str | None + :param regex: full regex for the modelwriter to search for """ self._writer.set_tag(tag, regex) @@ -167,7 +159,6 @@ def _gen_orc_dir(self, orchestrator_list: t.List[Orchestrator]) -> None: configuration files for the orchestrator. 
:param orchestrator: Orchestrator instance - :type orchestrator: Orchestrator | None """ # Loop through orchestrators for orchestrator in orchestrator_list: @@ -183,7 +174,6 @@ def _gen_entity_list_dir(self, entity_lists: t.List[Ensemble]) -> None: """Generate directories for Ensemble instances :param entity_lists: list of Ensemble instances - :type entity_lists: list """ if not entity_lists: @@ -209,9 +199,7 @@ def _gen_entity_dirs( """Generate directories for Entity instances :param entities: list of Model instances - :type entities: list[Model] - :param entity_list: Ensemble instance, defaults to None - :type entity_list: Ensemble | None + :param entity_list: Ensemble instance :raises EntityExistsError: if a directory already exists for an entity by that name """ @@ -247,7 +235,6 @@ def _write_tagged_entity_files(self, entity: Model) -> None: an Ensemble. :param entity: a Model instance - :type entity: Model """ if entity.files: to_write = [] @@ -258,7 +245,6 @@ def _build_tagged_files(tagged: TaggedFilesHierarchy) -> None: :param tagged: a TaggedFileHierarchy to be built as a directory structure - :type tagged: TaggedFilesHierarchy """ for file in tagged.files: dst_path = path.join(entity.path, tagged.base, path.basename(file)) @@ -291,9 +277,7 @@ def _log_params( and what values were set to the parameters :param entity: the model being generated - :type entity: Model :param files_to_params: a dict connecting each file to its parameter settings - :type files_to_params: t.Dict[str, t.Dict[str, str]] """ used_params: t.Dict[str, str] = {} file_to_tables: t.Dict[str, str] = {} @@ -333,7 +317,6 @@ def _copy_entity_files(entity: Model) -> None: """Copy the entity files and directories attached to this entity. :param entity: Model - :type entity: Model """ if entity.files: for to_copy in entity.files.copy: @@ -348,7 +331,6 @@ def _link_entity_files(entity: Model) -> None: """Symlink the entity files attached to this entity. 
:param entity: Model - :type entity: Model """ if entity.files: for to_link in entity.files.link: diff --git a/smartsim/_core/generation/modelwriter.py b/smartsim/_core/generation/modelwriter.py index ced232b6a..2998d4e35 100644 --- a/smartsim/_core/generation/modelwriter.py +++ b/smartsim/_core/generation/modelwriter.py @@ -48,10 +48,8 @@ def set_tag(self, tag: str, regex: t.Optional[str] = None) -> None: :param tag: tag for the modelwriter to search for, defaults to semi-colon e.g. ";" - :type tag: str :param regex: full regex for the modelwriter to search for, defaults to "(;.+;)" - :type regex: str, optional """ if regex: self.regex = regex @@ -69,13 +67,9 @@ def configure_tagged_model_files( instance. :param tagged_files: list of paths to tagged files - :type model: list[str] :param params: model parameters - :type params: dict[str, str] :param make_missing_tags_fatal: raise an error if a tag is missing - :type make_missing_tags_fatal: bool :returns: A dict connecting each file to its parameter settings - :rtype: dict[str,dict[str,str]] """ files_to_tags: t.Dict[str, t.Dict[str, str]] = {} for tagged_file in tagged_files: @@ -90,7 +84,6 @@ def _set_lines(self, file_path: str) -> None: """Set the lines for the modelwrtter to iterate over :param file_path: path to the newly created and tagged file - :type file_path: str :raises ParameterWriterError: if the newly created file cannot be read """ try: @@ -118,12 +111,9 @@ def _replace_tags( model. 
The tag defaults to ";" :param model: The model instance - :type model: Model :param make_fatal: (Optional) Set to True to force a fatal error if a tag is not matched - :type make_fatal: bool :returns: A dict of parameter names and values set for the file - :rtype: dict[str,str] """ edited = [] unused_tags: t.DefaultDict[str, t.List[int]] = collections.defaultdict(list) diff --git a/smartsim/_core/launcher/colocated.py b/smartsim/_core/launcher/colocated.py index 738ca4a00..03540ce0f 100644 --- a/smartsim/_core/launcher/colocated.py +++ b/smartsim/_core/launcher/colocated.py @@ -42,11 +42,8 @@ def write_colocated_launch_script( is created for this entity. :param file_name: name of the script to write - :type file_name: str :param db_log: log file for the db - :type db_log: str :param colocated_settings: db settings from entity run_settings - :type colocated_settings: dict[str, Any] """ colocated_cmd = _build_colocated_wrapper_cmd(db_log, **colocated_settings) @@ -93,21 +90,13 @@ def _build_colocated_wrapper_cmd( """Build the command use to run a colocated DB application :param db_log: log file for the db - :type db_log: str - :param cpus: db cpus, defaults to 1 - :type cpus: int, optional - :param rai_args: redisai args, defaults to None - :type rai_args: dict[str, str], optional - :param extra_db_args: extra redis args, defaults to None - :type extra_db_args: dict[str, str], optional + :param cpus: db cpus + :param rai_args: redisai args + :param extra_db_args: extra redis args :param port: port to bind DB to - :type port: int :param ifname: network interface(s) to bind DB to - :type ifname: str | list[str], optional :param db_cpu_list: The list of CPUs that the database should be limited to - :type db_cpu_list: str, optional :return: the command to run - :rtype: str """ # pylint: disable=too-many-locals diff --git a/smartsim/_core/launcher/launcher.py b/smartsim/_core/launcher/launcher.py index 80000c22f..6ae20ae62 100644 --- a/smartsim/_core/launcher/launcher.py 
+++ b/smartsim/_core/launcher/launcher.py @@ -94,15 +94,11 @@ def create_step( """Create a WLM job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param step_settings: batch or run settings for entity - :type step_settings: BatchSettings | RunSettings :raises SSUnsupportedError: if batch or run settings type isnt supported :raises LauncherError: if step creation fails :return: step instance - :rtype: Step """ try: step_class = self.supported_rs[type(step_settings)] @@ -129,9 +125,7 @@ def get_step_update( """Get update for a list of job steps :param step_names: list of job steps to get updates for - :type step_names: list[str] :return: list of name, job update tuples - :rtype: list[(str, StepInfo)] """ updates: t.List[t.Tuple[str, t.Union[StepInfo, None]]] = [] @@ -162,9 +156,7 @@ def _get_unmanaged_step_update( """Get step updates for Popen managed jobs :param task_ids: task id to check - :type task_ids: list[str] :return: list of step updates - :rtype: list[StepInfo] """ updates = [] for task_id in task_ids: diff --git a/smartsim/_core/launcher/local/local.py b/smartsim/_core/launcher/local/local.py index 96778ec0d..ffcb84f23 100644 --- a/smartsim/_core/launcher/local/local.py +++ b/smartsim/_core/launcher/local/local.py @@ -59,9 +59,7 @@ def get_step_update( """Get status updates of each job step name provided :param step_names: list of step_names - :type step_names: list[str] :return: list of tuples for update - :rtype: list[tuple[str, StepInfo | None]] """ # step ids are process ids of the tasks # as there is no WLM intermediary @@ -78,9 +76,7 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: """Return the address of nodes assigned to the step :param step_names: list of step_names - :type step_names: list[str] :return: list of node addresses - :rtype: list[list[str]] TODO: Use socket to find the actual Lo address? 
""" @@ -92,9 +88,7 @@ def run(self, step: Step) -> str: files will be written to the entity path. :param step: LocalStep instance to run - :type step: LocalStep :return: task_id of the newly created step - :rtype: str """ if not self.task_manager.actively_monitoring: self.task_manager.start() @@ -118,9 +112,7 @@ def stop(self, step_name: str) -> UnmanagedStepInfo: """Stop a job step :param step_name: name of the step to be stopped - :type step_name: str :return: a UnmanagedStepInfo instance - :rtype: UnmanagedStepInfo """ # step_id is task_id for local. Naming for consistency step_id = self.step_mapping[step_name].task_id diff --git a/smartsim/_core/launcher/lsf/lsfCommands.py b/smartsim/_core/launcher/lsf/lsfCommands.py index d6d0ee031..cb92587c1 100644 --- a/smartsim/_core/launcher/lsf/lsfCommands.py +++ b/smartsim/_core/launcher/lsf/lsfCommands.py @@ -33,7 +33,6 @@ def bjobs(args: t.List[str]) -> t.Tuple[str, str]: """Calls LSF bjobs with args :param args: List of command arguments - :type args: List of str :returns: Output and error of bjobs """ cmd = ["bjobs"] + args @@ -47,9 +46,7 @@ def bkill(args: t.List[str]) -> t.Tuple[int, str, str]: returncode is also supplied in this function. :param args: list of command arguments - :type args: list of str :return: returncode, output and error - :rtype: (int, str, str) """ cmd = ["bkill"] + args returncode, out, error = execute_cmd(cmd) @@ -62,9 +59,7 @@ def jskill(args: t.List[str]) -> t.Tuple[int, str, str]: returncode is also supplied in this function. 
:param args: list of command arguments - :type args: list of str :return: returncode, output and error - :rtype: (int, str, str) """ cmd = ["jskill"] + args @@ -76,9 +71,7 @@ def jslist(args: t.List[str]) -> t.Tuple[str, str]: """Calls LSF jslist with args :param args: List of command arguments - :type args: List of str :returns: Output and error of jslist - :rtype: (str, str) """ cmd = ["jslist"] + args _, out, err = execute_cmd(cmd) diff --git a/smartsim/_core/launcher/lsf/lsfLauncher.py b/smartsim/_core/launcher/lsf/lsfLauncher.py index bfa560c2d..e0ad808ed 100644 --- a/smartsim/_core/launcher/lsf/lsfLauncher.py +++ b/smartsim/_core/launcher/lsf/lsfLauncher.py @@ -91,10 +91,8 @@ def run(self, step: Step) -> t.Optional[str]: """Run a job step through LSF :param step: a job step instance - :type step: Step :raises LauncherError: if launch fails :return: job step id if job is managed - :rtype: str """ if not self.task_manager.actively_monitoring: self.task_manager.start() @@ -134,9 +132,7 @@ def stop(self, step_name: str) -> StepInfo: """Stop/cancel a job step :param step_name: name of the job to stop - :type step_name: str :return: update for job due to cancel - :rtype: StepInfo """ stepmap = self.step_mapping[step_name] if stepmap.managed: @@ -185,9 +181,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids - :type step_ids: list[str] :return: list of updates for managed jobs - :rtype: list[StepInfo] """ updates: t.List[StepInfo] = [] diff --git a/smartsim/_core/launcher/lsf/lsfParser.py b/smartsim/_core/launcher/lsf/lsfParser.py index 33837d2bd..c3272fa99 100644 --- a/smartsim/_core/launcher/lsf/lsfParser.py +++ b/smartsim/_core/launcher/lsf/lsfParser.py @@ -31,9 +31,7 @@ def parse_bsub(output: str) -> str: """Parse bsub output and return job id. 
:param output: stdout of bsub command - :type output: str :returns: job id - :rtype: str """ for line in output.split("\n"): if line.startswith("Job"): @@ -45,9 +43,7 @@ def parse_bsub_error(output: str) -> str: """Parse and return error output of a failed bsub command. :param output: stderr of qsub command - :type output: str :returns: error message - :rtype: str """ # Search for first non-empty line error_lines = [] @@ -77,11 +73,8 @@ def parse_jslist_stepid(output: str, step_id: str) -> t.Tuple[str, t.Optional[st options to obtain step status :param output: output of the bjobs command - :type output: str :param step_id: allocation id or job step id - :type step_id: str :return: status and return code - :rtype: (str, str) """ result: t.Tuple[str, t.Optional[str]] = ("NOTFOUND", None) @@ -101,11 +94,8 @@ def parse_bjobs_jobid(output: str, job_id: str) -> str: to obtain job status. :param output: output of the bjobs command - :type output: str :param job_id: allocation id or job step id - :type job_id: str :return: status - :rtype: str """ result = "NOTFOUND" for line in output.split("\n"): @@ -126,9 +116,7 @@ def parse_bjobs_nodes(output: str) -> t.List[str]: a job in a list with the duplicates removed. 
:param output: output of the `bjobs -w` command - :type output: str :return: compute nodes of the allocation or job - :rtype: list of str """ nodes = [] @@ -146,11 +134,8 @@ def parse_max_step_id_from_jslist(output: str) -> t.Optional[str]: properly returned :param output: output bjobs - :type output: str :param step_name: the name of the step to query - :type step_name: str :return: the step_id - :rtype: str """ max_step_id = None diff --git a/smartsim/_core/launcher/pbs/pbsCommands.py b/smartsim/_core/launcher/pbs/pbsCommands.py index f738ef1f8..989af93be 100644 --- a/smartsim/_core/launcher/pbs/pbsCommands.py +++ b/smartsim/_core/launcher/pbs/pbsCommands.py @@ -33,7 +33,6 @@ def qstat(args: t.List[str]) -> t.Tuple[str, str]: """Calls PBS qstat with args :param args: List of command arguments - :type args: List of str :returns: Output and error of qstat """ cmd = ["qstat"] + args @@ -45,7 +44,6 @@ def qsub(args: t.List[str]) -> t.Tuple[str, str]: """Calls PBS qsub with args :param args: List of command arguments - :type args: List of str :returns: Output and error of salloc """ cmd = ["qsub"] + args @@ -59,9 +57,7 @@ def qdel(args: t.List[str]) -> t.Tuple[int, str, str]: returncode is also supplied in this function. 
:param args: list of command arguments - :type args: list of str :return: output and error - :rtype: str """ cmd = ["qdel"] + args returncode, out, error = execute_cmd(cmd) diff --git a/smartsim/_core/launcher/pbs/pbsLauncher.py b/smartsim/_core/launcher/pbs/pbsLauncher.py index bb1b46d46..e01cbae08 100644 --- a/smartsim/_core/launcher/pbs/pbsLauncher.py +++ b/smartsim/_core/launcher/pbs/pbsLauncher.py @@ -88,10 +88,8 @@ def run(self, step: Step) -> t.Optional[str]: """Run a job step through PBSPro :param step: a job step instance - :type step: Step :raises LauncherError: if launch fails :return: job step id if job is managed - :rtype: str """ if not self.task_manager.actively_monitoring: self.task_manager.start() @@ -131,9 +129,7 @@ def stop(self, step_name: str) -> StepInfo: """Stop/cancel a job step :param step_name: name of the job to stop - :type step_name: str :return: update for job due to cancel - :rtype: StepInfo """ stepmap = self.step_mapping[step_name] if stepmap.managed: @@ -180,9 +176,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids - :type step_ids: list[str] :return: list of updates for managed jobs - :rtype: list[StepInfo] """ updates: t.List[StepInfo] = [] diff --git a/smartsim/_core/launcher/pbs/pbsParser.py b/smartsim/_core/launcher/pbs/pbsParser.py index 362577595..dcb5a3ef5 100644 --- a/smartsim/_core/launcher/pbs/pbsParser.py +++ b/smartsim/_core/launcher/pbs/pbsParser.py @@ -33,9 +33,7 @@ def parse_qsub(output: str) -> str: output is the job id itself. :param output: stdout of qsub command - :type output: str :returns: job id - :rtype: str """ return output @@ -44,9 +42,7 @@ def parse_qsub_error(output: str) -> str: """Parse and return error output of a failed qsub command. 
:param output: stderr of qsub command - :type output: str :returns: error message - :rtype: str """ # look for error first for line in output.split("\n"): @@ -66,11 +62,8 @@ def parse_qstat_jobid(output: str, job_id: str) -> str: to obtain job status. :param output: output of the qstat command - :type output: str :param job_id: allocation id or job step id - :type job_id: str :return: status - :rtype: str """ result = "NOTFOUND" for line in output.split("\n"): @@ -93,9 +86,7 @@ def parse_qstat_nodes(output: str) -> t.List[str]: The `output` parameter must be in JSON format. :param output: output of the qstat command in JSON format - :type output: str :return: compute nodes of the allocation or job - :rtype: list of str """ nodes: t.List[str] = [] out_json = load_and_clean_json(output) @@ -116,11 +107,8 @@ def parse_step_id_from_qstat(output: str, step_name: str) -> t.Optional[str]: """Parse and return the step id from a qstat command :param output: output qstat - :type output: str :param step_name: the name of the step to query - :type step_name: str :return: the step_id - :rtype: str """ step_id: t.Optional[str] = None out_json = load_and_clean_json(output) diff --git a/smartsim/_core/launcher/slurm/slurmCommands.py b/smartsim/_core/launcher/slurm/slurmCommands.py index 2e37f1d79..839826297 100644 --- a/smartsim/_core/launcher/slurm/slurmCommands.py +++ b/smartsim/_core/launcher/slurm/slurmCommands.py @@ -38,7 +38,6 @@ def sstat(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] """Calls sstat with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sstat """ _, out, err = _execute_slurm_cmd("sstat", args, raise_on_err=raise_on_err) @@ -49,7 +48,6 @@ def sacct(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] """Calls sacct with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sacct """ _, out, err = 
_execute_slurm_cmd("sacct", args, raise_on_err=raise_on_err) @@ -60,7 +58,6 @@ def salloc(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str """Calls slurm salloc with args :param args: List of command arguments - :type args: List of str :returns: Output and error of salloc """ _, out, err = _execute_slurm_cmd("salloc", args, raise_on_err=raise_on_err) @@ -71,7 +68,6 @@ def sinfo(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, str] """Calls slurm sinfo with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sinfo """ _, out, err = _execute_slurm_cmd("sinfo", args, raise_on_err=raise_on_err) @@ -82,7 +78,6 @@ def scontrol(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[str, s """Calls slurm scontrol with args :param args: List of command arguments - :type args: List of str :returns: Output and error of sinfo """ _, out, err = _execute_slurm_cmd("scontrol", args, raise_on_err=raise_on_err) @@ -95,9 +90,7 @@ def scancel(args: t.List[str], *, raise_on_err: bool = False) -> t.Tuple[int, st returncode is also supplied in this function. 
:param args: list of command arguments - :type args: list of str :return: output and error - :rtype: str """ return _execute_slurm_cmd("scancel", args, raise_on_err=raise_on_err) diff --git a/smartsim/_core/launcher/slurm/slurmLauncher.py b/smartsim/_core/launcher/slurm/slurmLauncher.py index a25e62806..2e4102391 100644 --- a/smartsim/_core/launcher/slurm/slurmLauncher.py +++ b/smartsim/_core/launcher/slurm/slurmLauncher.py @@ -100,10 +100,8 @@ def get_step_nodes(self, step_names: t.List[str]) -> t.List[t.List[str]]: would return nid00034 :param step_names: list of job step names - :type step_names: list[str] :raises LauncherError: if nodelist aquisition fails :return: list of hostnames - :rtype: list[str] """ _, step_ids = self.step_mapping.get_ids(step_names, managed=True) step_str = _create_step_id_str([val for val in step_ids if val is not None]) @@ -122,10 +120,8 @@ def run(self, step: Step) -> t.Optional[str]: """Run a job step through Slurm :param step: a job step instance - :type step: Step :raises LauncherError: if launch fails :return: job step id if job is managed - :rtype: str """ self.check_for_slurm() if not self.task_manager.actively_monitoring: @@ -175,9 +171,7 @@ def stop(self, step_name: str) -> StepInfo: """Step a job step :param step_name: name of the job to stop - :type step_name: str :return: update for job due to cancel - :rtype: StepInfo """ stepmap = self.step_mapping[step_name] if stepmap.managed: @@ -257,9 +251,7 @@ def _get_managed_step_update(self, step_ids: t.List[str]) -> t.List[StepInfo]: """Get step updates for WLM managed jobs :param step_ids: list of job step ids - :type step_ids: list[str] :return: list of updates for managed jobs - :rtype: list[StepInfo] """ step_str = _create_step_id_str(step_ids) sacct_out, _ = sacct( diff --git a/smartsim/_core/launcher/slurm/slurmParser.py b/smartsim/_core/launcher/slurm/slurmParser.py index ede687eb6..4ec187f19 100644 --- a/smartsim/_core/launcher/slurm/slurmParser.py +++ 
b/smartsim/_core/launcher/slurm/slurmParser.py @@ -43,9 +43,7 @@ def parse_salloc_error(output: str) -> t.Optional[str]: """Parse and return error output of a failed salloc command :param output: stderr output of salloc command - :type output: str :return: error message - :rtype: str """ salloc = which("salloc") # look for error first @@ -73,10 +71,9 @@ def jobid_exact_match(parsed_id: str, job_id: str) -> bool: the prefix of another job_id, like 1 and 11 or 1.1 and 1.10. Works with job id or step id (i.e. with or without a '.' in the id) + :param parsed_id: the id read from the line - :type paserd_id: str :param job_id: the id to check for equality - :type job_id: str """ if "." in job_id: return parsed_id == job_id @@ -88,11 +85,8 @@ def parse_sacct(output: str, job_id: str) -> t.Tuple[str, t.Optional[str]]: """Parse and return output of the sacct command :param output: output of the sacct command - :type output: str :param job_id: allocation id or job step id - :type job_id: str :return: status and returncode - :rtype: tuple """ result: t.Tuple[str, t.Optional[str]] = ("PENDING", None) for line in output.split("\n"): @@ -113,9 +107,7 @@ def parse_sstat_nodes(output: str, job_id: str) -> t.List[str]: a job in a list with the duplicates removed. 
:param output: output of the sstat command - :type output: str :return: compute nodes of the allocation or job - :rtype: list of str """ nodes = [] for line in output.split("\n"): @@ -134,11 +126,8 @@ def parse_step_id_from_sacct(output: str, step_name: str) -> t.Optional[str]: :param output: output of sacct --noheader -p --format=jobname,jobid --job - :type output: str :param step_name: the name of the step to query - :type step_name: str :return: the step_id - :rtype: str """ step_id = None for line in output.split("\n"): diff --git a/smartsim/_core/launcher/step/alpsStep.py b/smartsim/_core/launcher/step/alpsStep.py index 61ca5eee8..eb7903af9 100644 --- a/smartsim/_core/launcher/step/alpsStep.py +++ b/smartsim/_core/launcher/step/alpsStep.py @@ -42,11 +42,8 @@ def __init__(self, name: str, cwd: str, run_settings: AprunSettings) -> None: """Initialize a ALPS aprun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: AprunSettings """ super().__init__(name, cwd, run_settings) self.alloc: t.Optional[str] = None @@ -65,7 +62,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ aprun = self.run_settings.run_command if not aprun: @@ -122,7 +118,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ if self._get_mpmd(): return self._make_mpmd() diff --git a/smartsim/_core/launcher/step/lsfStep.py b/smartsim/_core/launcher/step/lsfStep.py index 1c88dadb8..0cb921e19 100644 --- a/smartsim/_core/launcher/step/lsfStep.py +++ b/smartsim/_core/launcher/step/lsfStep.py @@ -42,11 +42,8 @@ def __init__(self, name: str, cwd: str, batch_settings: BsubBatchSettings) -> No """Initialize a LSF bsub step :param name: name of the entity to launch - :type name: str :param cwd: path to launch dir - 
:type cwd: str :param batch_settings: batch settings for entity - :type batch_settings: BsubBatchSettings """ super().__init__(name, cwd, batch_settings) self.step_cmds: t.List[t.List[str]] = [] @@ -57,7 +54,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the launch command for the batch :return: launch command for the batch - :rtype: list[str] """ script = self._write_script() return [self.batch_settings.batch_cmd, script] @@ -66,7 +62,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. SrunStep - :type step: Step """ launch_cmd = step.get_launch_cmd() self.step_cmds.append(launch_cmd) @@ -76,7 +71,6 @@ def _write_script(self) -> str: """Write the batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() @@ -113,11 +107,8 @@ def __init__(self, name: str, cwd: str, run_settings: RunSettings): """Initialize a LSF jsrun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: RunSettings """ super().__init__(name, cwd, run_settings) self.alloc: t.Optional[str] = None @@ -155,7 +146,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ jsrun = self.run_settings.run_command if not jsrun: @@ -223,7 +213,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ exe = self.run_settings.exe args = self.run_settings._exe_args # pylint: disable=protected-access diff --git a/smartsim/_core/launcher/step/mpiStep.py b/smartsim/_core/launcher/step/mpiStep.py index 785d55e92..767486462 100644 --- a/smartsim/_core/launcher/step/mpiStep.py +++ b/smartsim/_core/launcher/step/mpiStep.py @@ -43,11 +43,8 @@ def __init__(self, name: str, 
cwd: str, run_settings: RunSettings) -> None: """Initialize a job step conforming to the MPI standard :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: RunSettings """ super().__init__(name, cwd, run_settings) @@ -64,7 +61,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ run_cmd = self.run_settings.run_command if not run_cmd: @@ -130,7 +126,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ if self._get_mpmd(): return self._make_mpmd() @@ -161,14 +156,10 @@ def __init__(self, name: str, cwd: str, run_settings: MpiexecSettings) -> None: """Initialize an mpiexec job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: MpiexecSettings :param default_run_command: The default command to launch an MPI application - :type default_run_command: str, optional """ super().__init__(name, cwd, run_settings) @@ -179,14 +170,10 @@ def __init__(self, name: str, cwd: str, run_settings: MpirunSettings) -> None: """Initialize an mpirun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: MpirunSettings :param default_run_command: The default command to launch an MPI application - :type default_run_command: str, optional """ super().__init__(name, cwd, run_settings) @@ -197,14 +184,10 @@ def __init__(self, name: str, cwd: str, run_settings: OrterunSettings) -> None: """Initialize an orterun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run 
settings for entity - :type run_settings: OrterunSettings :param default_run_command: The default command to launch an MPI application - :type default_run_command: str, optional """ super().__init__(name, cwd, run_settings) diff --git a/smartsim/_core/launcher/step/pbsStep.py b/smartsim/_core/launcher/step/pbsStep.py index 65dac3225..82a91aaa4 100644 --- a/smartsim/_core/launcher/step/pbsStep.py +++ b/smartsim/_core/launcher/step/pbsStep.py @@ -38,11 +38,8 @@ def __init__(self, name: str, cwd: str, batch_settings: QsubBatchSettings) -> No """Initialize a PBSpro qsub step :param name: name of the entity to launch - :type name: str :param cwd: path to launch dir - :type cwd: str :param batch_settings: batch settings for entity - :type batch_settings: QsubBatchSettings """ super().__init__(name, cwd, batch_settings) self.step_cmds: t.List[t.List[str]] = [] @@ -53,7 +50,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the launch command for the batch :return: launch command for the batch - :rtype: list[str] """ script = self._write_script() return [self.batch_settings.batch_cmd, script] @@ -62,7 +58,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. 
SrunStep - :type step: Step """ launch_cmd = step.get_launch_cmd() self.step_cmds.append(launch_cmd) @@ -72,7 +67,6 @@ def _write_script(self) -> str: """Write the batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() diff --git a/smartsim/_core/launcher/step/slurmStep.py b/smartsim/_core/launcher/step/slurmStep.py index 7baab891b..83f39cf09 100644 --- a/smartsim/_core/launcher/step/slurmStep.py +++ b/smartsim/_core/launcher/step/slurmStep.py @@ -42,11 +42,8 @@ def __init__(self, name: str, cwd: str, batch_settings: SbatchSettings) -> None: """Initialize a Slurm Sbatch step :param name: name of the entity to launch - :type name: str :param cwd: path to launch dir - :type cwd: str :param batch_settings: batch settings for entity - :type batch_settings: SbatchSettings """ super().__init__(name, cwd, batch_settings) self.step_cmds: t.List[t.List[str]] = [] @@ -57,7 +54,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the launch command for the batch :return: launch command for the batch - :rtype: list[str] """ script = self._write_script() return [self.batch_settings.batch_cmd, "--parsable", script] @@ -66,7 +62,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. 
SrunStep - :type step: Step """ launch_cmd = ["cd", step.cwd, ";"] launch_cmd += step.get_launch_cmd() @@ -77,7 +72,6 @@ def _write_script(self) -> str: """Write the batch script :return: batch script path after writing - :rtype: str """ batch_script = self.get_step_file(ending=".sh") output, error = self.get_output_files() @@ -108,11 +102,8 @@ def __init__(self, name: str, cwd: str, run_settings: SrunSettings) -> None: """Initialize a srun job step :param name: name of the entity to be launched - :type name: str :param cwd: path to launch dir - :type cwd: str :param run_settings: run settings for entity - :type run_settings: SrunSettings """ super().__init__(name, cwd, run_settings) self.alloc: t.Optional[str] = None @@ -125,7 +116,6 @@ def get_launch_cmd(self) -> t.List[str]: """Get the command to launch this step :return: launch command - :rtype: list[str] """ srun = self.run_settings.run_command if not srun: @@ -206,7 +196,6 @@ def _build_exe(self) -> t.List[str]: """Build the executable for this step :return: executable list - :rtype: list[str] """ if self._get_mpmd(): return self._make_mpmd() diff --git a/smartsim/_core/launcher/step/step.py b/smartsim/_core/launcher/step/step.py index deeef6b73..a32685b53 100644 --- a/smartsim/_core/launcher/step/step.py +++ b/smartsim/_core/launcher/step/step.py @@ -126,7 +126,6 @@ def add_to_batch(self, step: Step) -> None: """Add a job step to this batch :param step: a job step instance e.g. 
SrunStep - :type step: Step """ raise SmartSimError("add_to_batch not implemented for this step type") diff --git a/smartsim/_core/launcher/taskManager.py b/smartsim/_core/launcher/taskManager.py index 84123944e..60f097da6 100644 --- a/smartsim/_core/launcher/taskManager.py +++ b/smartsim/_core/launcher/taskManager.py @@ -114,17 +114,11 @@ def start_task( by a workload manager :param cmd_list: command to run - :type cmd_list: list[str] :param cwd: current working directory - :type cwd: str :param env: environment to launch with - :type env: dict[str, str], optional. If None, calling environment is inherited - :param out: output file, defaults to PIPE - :type out: file, optional - :param err: error file, defaults to PIPE - :type err: file, optional + :param out: output file + :param err: error file :return: task id - :rtype: int """ with self._lock: proc = execute_async_cmd(cmd_list, cwd, env=env, out=out, err=err) @@ -150,15 +144,10 @@ def start_and_wait( This is primarily used for batch job launches :param cmd_list: command to run - :type cmd_list: list[str] :param cwd: current working directory - :type cwd: str :param env: environment to launch with - :type env: dict[str, str], optional - :param timeout: time to wait, defaults to None - :type timeout: int, optional + :param timeout: time to wait :return: returncode, output, and err - :rtype: int, str, str """ returncode, out, err = execute_cmd(cmd_list, cwd=cwd, env=env, timeout=timeout) if VERBOSE_TM: @@ -169,7 +158,6 @@ def add_existing(self, task_id: int) -> None: """Add existing task to be managed by the TaskManager :param task_id: task id of existing task - :type task_id: str :raises LauncherError: If task cannot be found """ with self._lock: @@ -186,7 +174,6 @@ def remove_task(self, task_id: str) -> None: """Remove a task from the TaskManager :param task_id: id of the task to remove - :type task_id: str """ with self._lock: if VERBOSE_TM: @@ -210,9 +197,7 @@ def get_task_update( """Get the update of a task 
:param task_id: task id - :type task_id: str :return: status, returncode, output, error - :rtype: str, int, str, str """ with self._lock: try: @@ -251,13 +236,9 @@ def add_task_history( Add a task to record its future returncode, output and error :param task_id: id of the task - :type task_id: str :param returncode: returncode - :type returncode: int, defaults to None - :param out: output, defaults to None - :type out: str, optional - :param err: output, defaults to None - :type err: str, optional + :param out: output + :param err: output """ self.task_history[task_id] = (returncode, out, err) @@ -278,7 +259,6 @@ def __init__(self, process: psutil.Process) -> None: """Initialize a task :param process: Popen object - :type process: psutil.Process """ self.process = process self.pid = str(self.process.pid) @@ -287,7 +267,6 @@ def check_status(self) -> t.Optional[int]: """Ping the job and return the returncode if finished :return: returncode if finished otherwise None - :rtype: int """ if self.owned and isinstance(self.process, psutil.Popen): poll_result = self.process.poll() @@ -302,7 +281,6 @@ def get_io(self) -> t.Tuple[t.Optional[str], t.Optional[str]]: """Get the IO from the subprocess :return: output and error from the Popen - :rtype: str, str """ # Process class does not implement communicate if not self.owned or not isinstance(self.process, psutil.Popen): @@ -335,8 +313,7 @@ def kill_callback(proc: psutil.Process) -> None: def terminate(self, timeout: int = 10) -> None: """Terminate a this process and all children. 
- :param timeout: time to wait for task death, defaults to 10 - :type timeout: int, optional + :param timeout: time to wait for task death """ def terminate_callback(proc: psutil.Process) -> None: diff --git a/smartsim/_core/launcher/util/launcherUtil.py b/smartsim/_core/launcher/util/launcherUtil.py index a24d69e49..1a6ec5d83 100644 --- a/smartsim/_core/launcher/util/launcherUtil.py +++ b/smartsim/_core/launcher/util/launcherUtil.py @@ -38,9 +38,7 @@ def __init__( """Initialize a ComputeNode :param node_name: the name of the node - :type node_name: str :param node_ppn: the number of ppn - :type node_ppn: int """ self.name: t.Optional[str] = node_name self.ppn: t.Optional[int] = node_ppn @@ -52,7 +50,6 @@ def _is_valid_node(self) -> bool: and ppn being not None. :returns: True if valid, false otherwise - :rtype: bool """ if self.name is None: return False @@ -80,7 +77,6 @@ def _is_valid_partition(self) -> bool: and each ComputeNode being valid :returns: True if valid, false otherwise - :rtype: bool """ if self.name is None: return False diff --git a/smartsim/_core/launcher/util/shell.py b/smartsim/_core/launcher/util/shell.py index c747bacbc..a2b5bc76b 100644 --- a/smartsim/_core/launcher/util/shell.py +++ b/smartsim/_core/launcher/util/shell.py @@ -49,22 +49,14 @@ def execute_cmd( """Execute a command locally :param cmd_list: list of command with arguments - :type cmd_list: list of str - :param shell: run in system shell, defaults to False - :type shell: bool, optional - :param cwd: current working directory, defaults to None - :type cwd: str, optional - :param env: environment to launcher process with, - defaults to None (current env) - :type env: dict[str, str], optional - :param proc_input: input to the process, defaults to "" - :type proc_input: str, optional - :param timeout: timeout of the process, defaults to None - :type timeout: int, optional + :param shell: run in system shell + :param cwd: current working directory + :param env: environment to launcher 
process with + :param proc_input: input to the process + :param timeout: timeout of the process :raises ShellError: if timeout of process was exceeded :raises ShellError: if child process raises an error :return: returncode, output, and error of the process - :rtype: tuple of (int, str, str) """ if VERBOSE_SHELL: source = "shell" if shell else "Popen" @@ -106,13 +98,9 @@ def execute_async_cmd( popen subprocess object wrapped with psutil. :param cmd_list: list of command with arguments - :type cmd_list: list of str :param cwd: current working directory - :type cwd: str :param env: environment variables to set - :type env: dict[str, str] :return: the subprocess object - :rtype: psutil.Popen """ if VERBOSE_SHELL: logger.debug(f"Executing command: {' '.join(cmd_list)}") diff --git a/smartsim/_core/utils/helpers.py b/smartsim/_core/utils/helpers.py index 9e1a8acf6..4c68eaa01 100644 --- a/smartsim/_core/utils/helpers.py +++ b/smartsim/_core/utils/helpers.py @@ -51,11 +51,8 @@ def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: """Unpack the unformatted database identifier and format for env variable suffix using the token :param db_id: the unformatted database identifier eg. identifier_1 - :type db_id: str :param token: character to use to construct the db suffix - :type token: str :return: db id suffix and formatted db_id e.g. 
("_identifier_1", "identifier_1") - :rtype: (str, str) """ if db_id == "orchestrator": @@ -66,10 +63,9 @@ def unpack_db_identifier(db_id: str, token: str) -> t.Tuple[str, str]: def unpack_colo_db_identifier(db_id: str) -> str: """Create database identifier suffix for colocated database + :param db_id: the unformatted database identifier - :type db_id: str :return: db suffix - :rtype: str """ return "_" + db_id if db_id else "" @@ -100,10 +96,9 @@ def fmt_dict(value: t.Dict[str, t.Any]) -> str: def get_base_36_repr(positive_int: int) -> str: """Converts a positive integer to its base 36 representation + :param positive_int: the positive integer to convert - :type positive_int: int :return: base 36 representation of the given positive int - :rtype: str """ digits = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" result = [] @@ -120,7 +115,6 @@ def expand_exe_path(exe: str) -> str: """Takes an executable and returns the full path to that executable :param exe: executable or file - :type exe: str :raises TypeError: if file is not an executable :raises FileNotFoundError: if executable cannot be found """ @@ -182,9 +176,7 @@ def colorize( def delete_elements(dictionary: t.Dict[str, t.Any], key_list: t.List[str]) -> None: """Delete elements from a dictionary. :param dictionary: the dictionary from which the elements must be deleted. - :type dictionary: dict :param key_list: the list of keys to delete from the dictionary. - :type key: any """ for key in key_list: if key in dictionary: @@ -209,9 +201,7 @@ def cat_arg_and_value(arg_name: str, value: str) -> str: `-arg_name=value` (i.e., `-a val`) :param arg_name: the command line argument name - :type arg_name: str :param value: the command line argument value - :type value: str """ if arg_name.startswith("--"): @@ -255,10 +245,8 @@ def installed_redisai_backends( the backend directories (`redisai_tensorflow`, `redisai_torch`, `redisai_onnxruntime`, or `redisai_tflite`). 
- :param backends_path: path containing backends, defaults to None - :type backends_path: str, optional + :param backends_path: path containing backends :return: list of installed RedisAI backends - :rtype: set[str] """ # import here to avoid circular import base_path = redis_install_base(backends_path) @@ -321,9 +309,7 @@ def __init__( signal number via the `get` factory method. :param signalnum: The signal number to intercept - :type signalnum: int :param callbacks: A iterable of functions to call upon receiving the signal - :type callbacks: t.Iterable[_TSignalHandlerFn] | None """ self._callbacks = list(callbacks) if callbacks else [] self._original = signal.signal(signalnum, self) @@ -333,7 +319,6 @@ def __call__(self, signalnum: int, frame: t.Optional["FrameType"]) -> None: End by calling the originally registered signal hander (if present). :param frame: The current stack frame - :type frame: FrameType | None """ for fn in self: fn(signalnum, frame) @@ -356,9 +341,7 @@ def get(cls, signalnum: int) -> "SignalInterceptionStack": :param signalnum: The singal number of the signal interception stack should be registered - :type signalnum: int :returns: The existing or created signal interception stack - :rtype: SignalInterceptionStack """ handler = signal.getsignal(signalnum) if isinstance(handler, cls): @@ -369,7 +352,6 @@ def push(self, fn: _TSignalHandlerFn) -> None: """Add a callback to the signal interception stack. :param fn: A callable to add to the unique signal stack - :type fn: _TSignalHandlerFn """ self._callbacks.append(fn) @@ -378,10 +360,8 @@ def push_unique(self, fn: _TSignalHandlerFn) -> bool: callback is not already present. 
:param fn: A callable to add to the unique signal stack - :type fn: _TSignalHandlerFn :returns: True if the callback was added, False if the callback was already present - :rtype: bool """ if did_push := fn not in self: self.push(fn) diff --git a/smartsim/_core/utils/network.py b/smartsim/_core/utils/network.py index 69eeb3e1b..f568597df 100644 --- a/smartsim/_core/utils/network.py +++ b/smartsim/_core/utils/network.py @@ -37,9 +37,7 @@ def get_ip_from_host(host: str) -> str: """Return the IP address for the interconnect. :param host: hostname of the compute node e.g. nid00004 - :type host: str :returns: ip of host - :rtype: str """ ip_address = socket.gethostbyname(host) return ip_address @@ -50,11 +48,9 @@ def get_ip_from_interface(interface: str) -> str: # pragma: no cover """Get IPV4 address of a network interface :param interface: interface name - :type interface: str :raises ValueError: if the interface does not exist :raises ValueError: if interface does not have an IPV4 address :return: ip address of interface - :rtype: str """ net_if_addrs = psutil.net_if_addrs() if interface not in net_if_addrs: diff --git a/smartsim/_core/utils/redis.py b/smartsim/_core/utils/redis.py index e3500c093..41ee69cc4 100644 --- a/smartsim/_core/utils/redis.py +++ b/smartsim/_core/utils/redis.py @@ -53,9 +53,7 @@ def create_cluster(hosts: t.List[str], ports: t.List[int]) -> None: # cov-wlm needs to occur manually which is not often. 
:param hosts: List of hostnames to connect to - :type hosts: List[str] :param ports: List of ports for each hostname - :type ports: List[int] :raises SmartSimError: if cluster creation fails """ ip_list = [] @@ -85,11 +83,8 @@ def check_cluster_status( """Check that a Redis/KeyDB cluster is up and running :param hosts: List of hostnames to connect to - :type hosts: List[str] :param ports: List of ports for each hostname - :type ports: List[int] :param trials: number of attempts to verify cluster status - :type trials: int, optional :raises SmartSimError: If cluster status cannot be verified """ @@ -129,13 +124,9 @@ def db_is_active(hosts: t.List[str], ports: t.List[int], num_shards: int) -> boo just ping DB. :param hosts: list of hosts - :type hosts: list[str] :param ports: list of ports - :type ports: list[int] :param num_shards: Number of DB shards - :type num_shards: int :return: Whether DB is running - :rtype: bool """ # if single shard if num_shards < 2: @@ -229,11 +220,8 @@ def shutdown_db_node(host_ip: str, port: int) -> t.Tuple[int, str, str]: # cov- will take care of this automatically. 
:param host_ip: IP of host to connect to - :type hosts: str :param ports: Port to which node is listening - :type ports: int :return: returncode, output, and error of the process - :rtype: tuple of (int, str, str) """ redis_cli = CONFIG.database_cli cmd = [redis_cli, "-h", host_ip, "-p", str(port), "shutdown"] diff --git a/smartsim/_core/utils/telemetry/collector.py b/smartsim/_core/utils/telemetry/collector.py index 9371390da..178126dec 100644 --- a/smartsim/_core/utils/telemetry/collector.py +++ b/smartsim/_core/utils/telemetry/collector.py @@ -49,17 +49,15 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: """Initialize the collector :param entity: entity to collect metrics on - :type entity: JobEntity :param sink: destination to write collected information - :type sink: Sink""" + """ self._entity = entity self._sink = sink self._enabled = True @property def enabled(self) -> bool: - """Boolean indicating if the collector should perform data collection - :rtype: bool""" + """Boolean indicating if the collector should perform data collection""" return self._entity.telemetry_on @enabled.setter @@ -69,15 +67,14 @@ def enabled(self, value: bool) -> None: @property def entity(self) -> JobEntity: """The `JobEntity` for which data is collected - :return: the entity - :rtype: JobEntity""" + :return: the entity""" return self._entity @property def sink(self) -> Sink: """The sink where collected data is written :return: the sink - :rtype: Sink""" + """ return self._sink @abc.abstractmethod @@ -99,9 +96,7 @@ class _DBAddress: def __init__(self, host: str, port: int) -> None: """Initialize the instance :param host: host address for database connections - :type host: str :param port: port number for database connections - :type port: int """ self.host = host.strip() if host else "" self.port = port @@ -126,9 +121,8 @@ def __init__(self, entity: JobEntity, sink: Sink) -> None: """Initialize the `DBCollector` :param entity: entity with metadata about the resource to 
monitor - :type entity: JobEntity :param sink: destination to write collected information - :type sink: Sink""" + """ super().__init__(entity, sink) self._client: t.Optional[redisa.Redis[bytes]] = None self._address = _DBAddress( @@ -174,7 +168,7 @@ async def _perform_collection( adding extraneous base class code to differentiate the results :return: an iterable containing individual metric collection results - :rtype: Sequence[Tuple[Union[int, float, str], ...]]""" + """ async def collect(self) -> None: """Execute database metric collection if the collector is enabled. Writes @@ -220,7 +214,7 @@ async def _check_db(self) -> bool: """Check if the target database is reachable. :return: `True` if connection succeeds, `False` otherwise. - :rtype: bool""" + """ try: if self._client: return await self._client.ping() @@ -249,7 +243,7 @@ async def _perform_collection( :return: an iterable containing individual metric collection results in the format `(timestamp,used_memory,used_memory_peak,total_system_memory)` - :rtype: Sequence[Tuple[int, float, float, float]]""" + """ if self._client is None: return [] @@ -285,7 +279,7 @@ async def _perform_collection( :return: an iterable containing individual metric collection results in the format `(timestamp,client_id,address)` - :rtype: Sequence[Tuple[Union[int, str, str], ...]]""" + """ if self._client is None: return [] @@ -322,7 +316,7 @@ async def _perform_collection( :return: an iterable containing individual metric collection results in the format `(timestamp,num_clients)` - :rtype: Sequence[Tuple[int, int]]""" + """ if self._client is None: return [] @@ -345,9 +339,7 @@ class CollectorManager: def __init__(self, timeout_ms: int = 1000) -> None: """Initialize the `CollectorManager` without collectors - :param timeout_ms: maximum time (in ms) allowed for `Collector.collect`, - defaults to 1000ms - :type timeout_ms: int + :param timeout_ms: maximum time (in ms) allowed for `Collector.collect` """ # A lookup table to hold a 
list of registered collectors per entity self._collectors: t.Dict[str, t.List[Collector]] = collections.defaultdict(list) @@ -362,7 +354,7 @@ def add(self, collector: Collector) -> None: """Add a collector to the monitored set :param collector: `Collector` instance to monitor - :type collector: Collector""" + """ entity_name = collector.entity.name registered_collectors = self._collectors[entity_name] @@ -378,7 +370,7 @@ def add_all(self, collectors: t.Sequence[Collector]) -> None: """Add multiple collectors to the monitored set :param collectors: a collection of `Collectors` to monitor - :type collectors: Sequence[Collector]""" + """ for collector in collectors: self.add(collector) @@ -387,7 +379,7 @@ async def remove_all(self, entities: t.Sequence[JobEntity]) -> None: :param entities: a collection of `JobEntity` instances that will no longer have registered collectors - :type entities: Sequence[JobEntity]""" + """ if not entities: return @@ -398,7 +390,7 @@ async def remove(self, entity: JobEntity) -> None: """Remove all collectors registered to the supplied entity :param entities: `JobEntity` that will no longer have registered collectors - :type entities: JobEntity""" + """ registered = self._collectors.pop(entity.name, []) if not registered: return @@ -443,7 +435,7 @@ def all_collectors(self) -> t.Sequence[Collector]: """Get a list of all registered collectors :return: a collection of registered collectors for all entities - :rtype: Sequence[Collector]""" + """ # flatten and return all the lists-of-collectors that are registered collectors = itertools.chain.from_iterable(self._collectors.values()) return [collector for collector in collectors if collector.enabled] @@ -453,7 +445,7 @@ def dead_collectors(self) -> t.Sequence[Collector]: """Get a list of all disabled collectors :return: a collection of disabled collectors for all entities - :rtype: Sequence[Collector]""" + """ collectors = itertools.chain.from_iterable(self._collectors.values()) return [collector 
for collector in collectors if not collector.enabled] @@ -462,7 +454,7 @@ def register_collectors(self, entity: JobEntity) -> None: :param entity: a `JobEntity` instance that will have all configured collectors registered for collection. Configuration is found in the `RuntimeManifest` - :type entity: JobEntity""" + """ collectors: t.List[Collector] = [] # ONLY db telemetry is implemented at this time. This resolver must @@ -485,6 +477,6 @@ def register_all_collectors(self, entities: t.Sequence[JobEntity]) -> None: """Find all configured collectors for the entity and register them :param entities: entities to call `register_collectors` for - :type entities: Sequence[JobEntity]""" + """ for entity in entities: self.register_collectors(entity) diff --git a/smartsim/_core/utils/telemetry/manifest.py b/smartsim/_core/utils/telemetry/manifest.py index f4cca3a4c..e72a18fa0 100644 --- a/smartsim/_core/utils/telemetry/manifest.py +++ b/smartsim/_core/utils/telemetry/manifest.py @@ -57,7 +57,7 @@ def flatten( :param filter_fn: optional boolean filter that returns True for entities to include in the result - :type filter_fn: (optional) Callable[[JobEntity], bool]""" + """ entities = self.models + self.orchestrators + self.ensembles if filter_fn: entities = [entity for entity in entities if filter_fn(entity)] @@ -72,13 +72,10 @@ def load_entity( """Map entity data persisted in a manifest file to an object :param entity_type: type of the associated `SmartSimEntity` - :type entity_type: str :param entity_dict: raw dictionary deserialized from manifest JSON - :type entity_dict: Dict[str, Any] :param exp_dir: root path to experiment outputs - :type exp_dir: pathlib.Path :return: list of loaded `JobEntity` instances - :rtype: List[JobEntity]""" + """ entities = [] # an entity w/parent keys must create entities for the items that it @@ -108,13 +105,10 @@ def load_entities( """Map a collection of entity data persisted in a manifest file to an object :param entity_type: type of the 
associated `SmartSimEntity` - :type entity_type: str :param run: raw dictionary containing `Run` data deserialized from JSON - :type run: Dict[str, Any] :param exp_dir: root path to experiment outputs - :type exp_dir: pathlib.Path :return: list of loaded `JobEntity` instances - :rtype: Dict[str, List[JobEntity]]""" + """ persisted: t.Dict[str, t.List[JobEntity]] = { "model": [], "orchestrator": [], @@ -131,11 +125,9 @@ def load_run(raw_run: t.Dict[str, t.Any], exp_dir: pathlib.Path) -> "Run": """Map run data persisted in a manifest file to an object :param runs: raw dictionary containing `Run` data deserialized from JSON - :type runs: Dict[str, Any] :param exp_dir: root path to experiment outputs - :type exp_dir: pathlib.Path :return: populated `Run` instance - :rtype: Run""" + """ # create an output mapping to hold the deserialized entities run_entities: t.Dict[str, t.List[JobEntity]] = { @@ -189,11 +181,8 @@ def load_manifest(file_path: str) -> t.Optional["RuntimeManifest"]: """Load a persisted manifest and return the content :param file_path: path to the manifest file to load - :type file_path: str - :return: deserialized `RuntimeManifest` if the manifest file is found, otherwise None - :rtype: RuntimeManifest|None """ manifest_dict: t.Optional[t.Dict[str, t.Any]] = None try_count, max_attempts = 1, 5 diff --git a/smartsim/_core/utils/telemetry/sink.py b/smartsim/_core/utils/telemetry/sink.py index 73c2d075a..afea791ea 100644 --- a/smartsim/_core/utils/telemetry/sink.py +++ b/smartsim/_core/utils/telemetry/sink.py @@ -40,7 +40,7 @@ async def save(self, *args: t.Any) -> None: """Save the args passed to this method to the underlying sink :param args: variadic list of values to save - :type args: Any""" + """ class FileSink(Sink): @@ -50,7 +50,7 @@ def __init__(self, path: str) -> None: """Initialize the FileSink :param filename: path to a file backing this `Sink` - :type filename: str""" + """ super().__init__() self._check_init(path) self._path = 
pathlib.Path(path) @@ -61,7 +61,7 @@ def _check_init(filename: str) -> None: if an invalid filename is passed :param filename: path to a file backing this `Sink` - :type filename: str""" + """ if not filename: raise ValueError("No filename provided to FileSink") @@ -70,7 +70,7 @@ def path(self) -> pathlib.Path: """The path to the file this FileSink writes :return: path to a file backing this `Sink` - :rtype: pathlib.Path""" + """ return self._path async def save(self, *args: t.Any) -> None: diff --git a/smartsim/_core/utils/telemetry/telemetry.py b/smartsim/_core/utils/telemetry/telemetry.py index ddfc797f7..7b1288341 100644 --- a/smartsim/_core/utils/telemetry/telemetry.py +++ b/smartsim/_core/utils/telemetry/telemetry.py @@ -79,18 +79,14 @@ def __init__( :param pattern: a pattern that identifies the files whose events are of interest by matching their name - :type pattern: str :param ignore_patterns: a pattern that identifies the files whose events should be ignored - :type ignore_patterns: Optional[List[str]] :param ignore_directories: set to `True` to avoid directory events - :type ignore_directories: bool :param case_sensitive: set to `True` to require case sensitivity in resource names in order to match input patterns - :type case_sensitive: bool :param timeout_ms: maximum duration (in ms) of a call to the event loop prior to cancelling tasks - :type timeout_ms: int""" + """ super().__init__( [pattern], ignore_patterns, ignore_directories, case_sensitive ) # type: ignore @@ -112,7 +108,7 @@ def tracked_jobs(self) -> t.Sequence[JobEntity]: """The collection of `JobEntity` that are actively being monitored :return: the collection - :rtype: Sequence[JobEntity]""" + """ return list(self._tracked_jobs.values()) def init_launcher(self, launcher: str) -> None: @@ -121,10 +117,10 @@ def init_launcher(self, launcher: str) -> None: and local launching :param launcher: the name of the workload manager used by the experiment - :type launcher: str :raises ValueError: if a 
string is passed that is not a supported launcher - :raises TypeError: if no launcher argument is provided.""" + :raises TypeError: if no launcher argument is provided. + """ if not launcher: raise TypeError("Must provide a 'launcher' argument") @@ -146,7 +142,7 @@ def set_launcher(self, launcher: str) -> None: """Initialize all required dependencies :param launcher: the name of the workload manager used by the experiment - :type launcher: str""" + """ self.init_launcher(launcher) self.init_job_manager() @@ -156,7 +152,7 @@ def process_manifest(self, manifest_path: str) -> None: and registered collectors :param manifest_path: full path to the manifest file - :type manifest_path: str""" + """ try: # it is possible to read the manifest prior to a completed # write due to no access locking mechanism. log the issue @@ -227,7 +223,7 @@ def on_modified(self, event: FileSystemEvent) -> None: """Event handler for when a file or directory is modified. :param event: event representing file/directory modification. - :type event: FileSystemEvent""" + """ super().on_modified(event) logger.debug(f"Processing manifest modified @ {event.src_path}") self.process_manifest(event.src_path) @@ -236,7 +232,7 @@ def on_created(self, event: FileSystemEvent) -> None: """Event handler for when a file or directory is created. :param event: event representing file/directory creation. - :type event: FileSystemEvent""" + """ super().on_created(event) logger.debug(f"processing manifest created @ {event.src_path}") self.process_manifest(event.src_path) @@ -251,11 +247,9 @@ async def _to_completed( stop monitoring for updates during timesteps. 
:param timestamp: current timestamp for event logging - :type timestamp: int :param entity: running SmartSim Job - :type entity: JobEntity :param step_info: `StepInfo` received when requesting a Job status update - :type step_info: StepInfo""" + """ # remember completed entities to ignore them after manifest updates inactive_entity = self._tracked_jobs.pop(entity.key) if entity.key not in self._completed_jobs: @@ -289,7 +283,7 @@ async def on_timestep(self, timestamp: int) -> None: monitored entities :param timestamp: current timestamp for event logging - :type timestamp: int""" + """ if not self._launcher: return @@ -334,13 +328,10 @@ def __init__( """Initialize the instance with inputs and defaults :param exp_dir: root path to experiment outputs - :type exp_dir: str :param frequency: desired frequency of metric & status updates (in seconds) - :type frequency: int :param frequency: cooldown period (in seconds) before automatic shutdown - :type frequency: int :param log_level: log level to apply to python logging - :type log_level: logging._Level""" + """ self.exp_dir: str = exp_dir self.frequency: int = frequency # freq in seconds self.cooldown: int = cooldown # cooldown in seconds @@ -432,7 +423,6 @@ def __init__(self, telemetry_monitor_args: TelemetryMonitorArgs): """Initialize the telemetry monitor instance :param telemetry_monitor_args: configuration for the telemetry monitor - :type telemetry_monitor_args: TelemetryMonitorArgs """ self._observer: BaseObserver = Observer() """an observer object that triggers the action handler""" @@ -454,7 +444,7 @@ def _can_shutdown(self) -> bool: are stored in the job manager :return: return True if capable of automatically shutting down - :rtype: bool""" + """ managed_jobs = ( list(self._action_handler.job_manager.jobs.values()) if self._action_handler @@ -534,7 +524,7 @@ async def run(self) -> int: will poll for telemetry data :return: return code for the process - :rtype: int""" + """ logger.info("Executing telemetry 
monitor") logger.info(f"Polling frequency: {self._args.frequency}s") logger.info(f"Experiment directory: {self._experiment_dir}") diff --git a/smartsim/_core/utils/telemetry/util.py b/smartsim/_core/utils/telemetry/util.py index 4eb61a79b..78f9c6db0 100644 --- a/smartsim/_core/utils/telemetry/util.py +++ b/smartsim/_core/utils/telemetry/util.py @@ -52,22 +52,15 @@ def write_event( Does not overwrite existing records. :param timestamp: when the event occurred - :type timestamp: str :param task_id: the task_id of a managed task - :type task_id: int|str :param step_id: the step_id of an unmanaged task - :type step_id: str :param entity_type: the SmartSimEntity subtype (e.g. `orchestrator`, `ensemble`, `model`, `dbnode`, ...) - :type entity_type: str :param event_type: the event subtype - :type event_type: _EventClass :param status_dir: path where the SmartSimEntity outputs are written - :type status_dir: pathlib.Path :param detail: (optional) additional information to write with the event - :type detail: str :param return_code: (optional) the return code of a completed task - :type return_code: int|None""" + """ tgt_path = status_dir / f"{event_type}.json" tgt_path.parent.mkdir(parents=True, exist_ok=True) @@ -109,10 +102,8 @@ def map_return_code(step_info: StepInfo) -> t.Optional[int]: and does not yet have a return code. 
:param step_info: step information produced by job manager status update queries - :type step_info: StepInfo - :return: a return code if the step is finished, otherwise None - :rtype: int""" + """ rc_map = {s: 1 for s in TERMINAL_STATUSES} # return `1` for all terminal statuses rc_map.update( {SmartSimStatus.STATUS_COMPLETED: os.EX_OK} diff --git a/smartsim/database/orchestrator.py b/smartsim/database/orchestrator.py index d3a917900..d95ae465b 100644 --- a/smartsim/database/orchestrator.py +++ b/smartsim/database/orchestrator.py @@ -166,23 +166,32 @@ def __init__( db_identifier: str = "orchestrator", **kwargs: t.Any, ) -> None: - """Initialize an Orchestrator reference for local launch - - :param port: TCP/IP port, defaults to 6379 - :type port: int, optional - :param interface: network interface(s), defaults to "lo" - :type interface: str, list[str], optional + """Initialize an ``Orchestrator`` reference for local launch Extra configurations for RedisAI See https://oss.redis.com/redisai/configuration/ + :param path: path to location of ``Orchestrator`` directory + :param port: TCP/IP port + :param interface: network interface(s) + :param launcher: type of launcher being used, options are "slurm", "pbs", + "lsf", or "local". If set to "auto", + an attempt will be made to find an available launcher + on the system. 
+ :param run_command: specify launch binary or detect automatically + :param db_nodes: number of database shards + :param batch: run as a batch workload + :param hosts: specify hosts to launch on + :param account: account to run batch on + :param time: walltime for batch 'HH:MM:SS' format + :param alloc: allocation to launch database on + :param single_cmd: run all shards with one (MPMD) command :param threads_per_queue: threads per GPU device - :type threads_per_queue: int, optional :param inter_op_threads: threads across CPU operations - :type inter_op_threads: int, optional :param intra_op_threads: threads per CPU operation - :type intra_op_threads: int, optional + :param db_identifier: an identifier to distinguish this orchestrator in + multiple-database experiments """ self.launcher, self.run_command = _autodetect(launcher, run_command) _check_run_command(self.launcher, self.run_command) @@ -266,18 +275,16 @@ def db_identifier(self) -> str: """Return the DB identifier, which is common to a DB and all of its nodes :return: DB identifier - :rtype: str """ return self.name @property def num_shards(self) -> int: - """Return the number of DB shards contained in the orchestrator. + """Return the number of DB shards contained in the Orchestrator. This might differ from the number of ``DBNode`` objects, as each ``DBNode`` may start more than one shard (e.g. with MPMD). - :returns: num_shards - :rtype: int + :returns: the number of DB shards contained in the Orchestrator """ return sum(node.num_shards for node in self.entities) @@ -289,19 +296,17 @@ def db_nodes(self) -> int: an alias to the ``num_shards`` attribute. :returns: Number of database nodes - :rtype: int """ return self.num_shards @property def hosts(self) -> t.List[str]: - """Return the hostnames of orchestrator instance hosts + """Return the hostnames of Orchestrator instance hosts Note that this will only be populated after the orchestrator has been launched by SmartSim. 
- :return: hostnames - :rtype: list[str] + :return: the hostnames of Orchestrator instance hosts """ if not self._hosts: self._hosts = self._get_db_hosts() @@ -312,7 +317,7 @@ def telemetry(self) -> TelemetryConfiguration: """Return the telemetry configuration for this entity. :returns: configuration of telemetry for this entity - :rtype: TelemetryConfiguration""" + """ return self._telemetry_cfg def reset_hosts(self) -> None: @@ -334,7 +339,6 @@ def get_address(self) -> t.List[str]: """Return database addresses :return: addresses - :rtype: list[str] :raises SmartSimError: If database address cannot be found or is not active """ @@ -354,7 +358,6 @@ def is_active(self) -> bool: """Check if the database is active :return: True if database is active, False otherwise - :rtype: bool """ if not self._hosts: return False @@ -367,7 +370,6 @@ def _rai_module(self) -> t.Tuple[str, ...]: :return: Tuple of args to pass to the orchestrator exe to load and configure the RedisAI - :rtype: tuple[str] """ module = ["--loadmodule", CONFIG.redisai] if self.queue_threads: @@ -393,7 +395,6 @@ def set_cpus(self, num_cpus: int) -> None: compute threads, background threads, and network I/O. :param num_cpus: number of cpus to set - :type num_cpus: int """ if self.batch: if self.launcher == "pbs": @@ -417,7 +418,6 @@ def set_walltime(self, walltime: str) -> None: Note: This will only effect orchestrators launched as a batch :param walltime: amount of time e.g. 
10 hours is 10:00:00 - :type walltime: str :raises SmartSimError: if orchestrator isn't launching as batch """ if not self.batch: @@ -430,7 +430,6 @@ def set_hosts(self, host_list: t.Union[t.List[str], str]) -> None: """Specify the hosts for the ``Orchestrator`` to launch on :param host_list: list of host (compute node names) - :type host_list: str, list[str] :raises TypeError: if wrong type """ if isinstance(host_list, str): @@ -474,9 +473,7 @@ def set_batch_arg(self, arg: str, value: t.Optional[str] = None) -> None: by SmartSim and will not be allowed to be set. :param arg: batch argument to set e.g. "exclusive" - :type arg: str :param value: batch param - set to None if no param value - :type value: str | None :raises SmartSimError: if orchestrator not launching as batch """ if not hasattr(self, "batch_settings") or not self.batch_settings: @@ -499,9 +496,7 @@ def set_run_arg(self, arg: str, value: t.Optional[str] = None) -> None: For example, "n", "N", etc. :param arg: run argument to set - :type arg: str :param value: run parameter - set to None if no parameter value - :type value: str | None """ if arg in self._reserved_run_args[type(self.entities[0].run_settings)]: logger.warning( @@ -522,7 +517,6 @@ def enable_checkpoints(self, frequency: int) -> None: after 900 seconds if there is at least 1 change to the dataset. :param frequency: the given number of seconds before the DB saves - :type frequency: int """ self.set_db_conf("save", f"{frequency} 1") @@ -531,15 +525,15 @@ def set_max_memory(self, mem: str) -> None: Setting max memory to zero also results in no memory limit. Once a limit is surpassed, keys will be removed according to the eviction strategy. 
The specified memory size is case insensitive and supports the typical forms of: - 1k => 1000 bytes - 1kb => 1024 bytes - 1m => 1000000 bytes - 1mb => 1024*1024 bytes - 1g => 1000000000 bytes + + 1k => 1000 bytes \n + 1kb => 1024 bytes \n + 1m => 1000000 bytes \n + 1mb => 1024*1024 bytes \n + 1g => 1000000000 bytes \n 1gb => 1024*1024*1024 bytes :param mem: the desired max memory size e.g. 3gb - :type mem: str :raises SmartSimError: If 'mem' is an invalid memory value :raises SmartSimError: If database is not active """ @@ -551,7 +545,6 @@ def set_eviction_strategy(self, strategy: str) -> None: :param strategy: The max memory policy to use e.g. "volatile-lru", "allkeys-lru", etc. - :type strategy: str :raises SmartSimError: If 'strategy' is an invalid maxmemory policy :raises SmartSimError: If database is not active """ @@ -564,7 +557,6 @@ def set_max_clients(self, clients: int = 50_000) -> None: incoming and another outgoing. :param clients: the maximum number of connected clients - :type clients: int, optional """ self.set_db_conf("maxclients", str(clients)) @@ -577,7 +569,6 @@ def set_max_message_size(self, size: int = 1_073_741_824) -> None: to 1gb, use 1024*1024*1024. :param size: maximum message size in bytes - :type size: int, optional """ self.set_db_conf("proto-max-bulk-len", str(size)) @@ -588,9 +579,7 @@ def set_db_conf(self, key: str, value: str) -> None: will take effect starting with the next command executed. 
:param key: the configuration parameter - :type key: str :param value: the database configuration parameter's new value - :type value: str """ if self.is_active(): addresses = [] diff --git a/smartsim/entity/dbnode.py b/smartsim/entity/dbnode.py index 9b67687f0..485bbcd88 100644 --- a/smartsim/entity/dbnode.py +++ b/smartsim/entity/dbnode.py @@ -146,9 +146,7 @@ def _get_cluster_conf_filenames(self, port: int) -> t.List[str]: # cov-lsf This function should bu used if and only if ``_mpmd==True`` :param port: port number - :type port: int :return: the dbnode configuration file name - :rtype: str """ if self.num_shards == 1: return [f"nodes-{self.name}-{port}.conf"] @@ -188,7 +186,6 @@ def get_launched_shard_info(self) -> "t.List[LaunchedShardData]": :raises SmartSimError: if all shard info could not be found :return: The found launched shard info - :rtype: list[LaunchedShardData] """ ips: "t.List[LaunchedShardData]" = [] trials = CONFIG.database_file_parse_trials @@ -225,7 +222,6 @@ def _parse_db_hosts(self) -> t.List[str]: :raises SmartSimError: if host/ip could not be found :return: ip addresses | hostnames - :rtype: list[str] """ return list({shard.hostname for shard in self.get_launched_shard_info()}) diff --git a/smartsim/entity/dbobject.py b/smartsim/entity/dbobject.py index d30668c76..0f834d253 100644 --- a/smartsim/entity/dbobject.py +++ b/smartsim/entity/dbobject.py @@ -110,9 +110,7 @@ def _enumerate_devices(self) -> t.List[str]: """Enumerate devices for a DBObject :param dbobject: DBObject to enumerate - :type dbobject: DBObject :return: list of device names - :rtype: list[str] """ if self.device == "GPU" and self.devices_per_node > 1: @@ -175,17 +173,11 @@ def __init__( must be provided :param name: key to store script under - :type name: str :param script: TorchScript code - :type script: str, optional - :param script_path: path to TorchScript code, defaults to None - :type script_path: str, optional - :param device: device for script execution, defaults 
to "CPU" - :type device: str, optional + :param script_path: path to TorchScript code + :param device: device for script execution :param devices_per_node: number of devices to store the script on - :type devices_per_node: int :param first_device: first devices to store the script on - :type first_device: int """ super().__init__( name, script, script_path, device, devices_per_node, first_device @@ -235,31 +227,18 @@ def __init__( must be provided :param name: key to store model under - :type name: str :param model: model in memory - :type model: str, optional :param model_file: serialized model - :type model_file: file path to model, optional :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str - :param device: name of device for execution, defaults to "CPU" - :type device: str, optional + :param device: name of device for execution :param devices_per_node: number of devices to store the model on - :type devices_per_node: int :param first_device: The first device to store the model on - :type first_device: int - :param batch_size: batch size for execution, defaults to 0 - :type batch_size: int, optional - :param min_batch_size: minimum batch size for model execution, defaults to 0 - :type min_batch_size: int, optional - :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 - :type min_batch_timeout: int, optional - :param tag: additional tag for model information, defaults to "" - :type tag: str, optional - :param inputs: model inputs (TF only), defaults to None - :type inputs: list[str], optional - :param outputs: model outupts (TF only), defaults to None - :type outputs: list[str], optional + :param batch_size: batch size for execution + :param min_batch_size: minimum batch size for model execution + :param min_batch_timeout: time to wait for minimum batch size + :param tag: additional tag for model information + :param inputs: model inputs (TF only) + :param outputs: model outputs (TF only) """
super().__init__( name, model, model_file, device, devices_per_node, first_device diff --git a/smartsim/entity/ensemble.py b/smartsim/entity/ensemble.py index 1f80fe71e..ed971c6ae 100644 --- a/smartsim/entity/ensemble.py +++ b/smartsim/entity/ensemble.py @@ -75,27 +75,19 @@ def __init__( parameters to the permutation strategy. :param name: name of the ensemble - :type name: str :param params: parameters to expand into ``Model`` members - :type params: dict[str, Any] :param params_as_args: list of params that should be used as command line arguments to the ``Model`` member executables and not written to generator files - :type params_as_args: list[str] :param batch_settings: describes settings for ``Ensemble`` as batch workload - :type batch_settings: BatchSettings, optional :param run_settings: describes how each ``Model`` should be executed - :type run_settings: RunSettings, optional :param replicas: number of ``Model`` replicas to create - a keyword argument of kwargs - :type replicas: int, optional :param perm_strategy: strategy for expanding ``params`` into ``Model`` instances from params argument options are "all_perm", "step", "random" - or a callable function. Defaults to "all_perm". - :type perm_strategy: str + or a callable function. 
:return: ``Ensemble`` instance - :rtype: ``Ensemble`` """ self.params = params or {} self.params_as_args = params_as_args or [] @@ -190,7 +182,6 @@ def add_model(self, model: Model) -> None: """Add a model to this ensemble :param model: model instance to be added - :type model: Model :raises TypeError: if model is not an instance of ``Model`` :raises EntityExistsError: if model already exists in this ensemble """ @@ -221,7 +212,6 @@ def register_incoming_entity(self, incoming_entity: SmartSimEntity) -> None: Only python clients can have multiple incoming connections :param incoming_entity: The entity that data will be received from - :type incoming_entity: SmartSimEntity """ for model in self.models: model.register_incoming_entity(incoming_entity) @@ -237,7 +227,6 @@ def query_key_prefixing(self) -> bool: """Inquire as to whether each model within the ensemble will prefix their keys :returns: True if all models have key prefixing enabled, False otherwise - :rtype: bool """ return all(model.query_key_prefixing() for model in self.models) @@ -263,12 +252,9 @@ def attach_generator_files( would like to change. The tag is settable but defaults to a semicolon e.g. THERMO = ;10; - :param to_copy: files to copy, defaults to [] - :type to_copy: list, optional - :param to_symlink: files to symlink, defaults to [] - :type to_symlink: list, optional - :param to_configure: input files with tagged parameters, defaults to [] - :type to_configure: list, optional + :param to_copy: files to copy + :param to_symlink: files to symlink + :param to_configure: input files with tagged parameters """ for model in self.models: model.attach_generator_files( @@ -281,7 +267,6 @@ def attached_files_table(self) -> str: attached to models belonging to this ensemble. :returns: A table of all files attached to all models - :rtype: str """ if not self.models: return "The ensemble is empty, no files to show." 
@@ -304,10 +289,8 @@ def _set_strategy(strategy: str) -> StrategyFunction: the ensemble :param strategy: name of the strategy or callable function - :type strategy: str :raises SSUnsupportedError: if str name is not supported :return: strategy function - :rtype: callable """ if strategy == "all_perm": return create_all_permutations @@ -327,7 +310,6 @@ def _read_model_parameters(self) -> t.Tuple[t.List[str], t.List[t.List[str]]]: :raises TypeError: if params are of the wrong type :return: param names and values for permutation strategy - :rtype: tuple[list, list] """ if not isinstance(self.params, dict): @@ -378,33 +360,19 @@ def add_ml_model( must be provided :param name: key to store model under - :type name: str :param model: model in memory - :type model: str | bytes | None :param model_path: serialized model - :type model_path: file path to model :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str - :param device: name of device for execution, defaults to "CPU" - :type device: str, optional - :param devices_per_node: number of GPUs per node in multiGPU nodes, - defaults to 1 - :type devices_per_node: int, optional + :param device: name of device for execution + :param devices_per_node: number of GPUs per node in multiGPU nodes :param first_device: first device in multi-GPU nodes to use for execution, defaults to 0; ignored if devices_per_node is 1 - :type first_device: int, optional - :param batch_size: batch size for execution, defaults to 0 - :type batch_size: int, optional - :param min_batch_size: minimum batch size for model execution, defaults to 0 - :type min_batch_size: int, optional - :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 - :type min_batch_timeout: int, optional - :param tag: additional tag for model information, defaults to "" - :type tag: str, optional - :param inputs: model inputs (TF only), defaults to None - :type inputs: list[str], optional - :param outputs: model outupts (TF 
only), defaults to None - :type outputs: list[str], optional + :param batch_size: batch size for execution + :param min_batch_size: minimum batch size for model execution + :param min_batch_timeout: time to wait for minimum batch size + :param tag: additional tag for model information + :param inputs: model inputs (TF only) + :param outputs: model outputs (TF only) """ db_model = DBModel( name=name, @@ -462,17 +430,11 @@ def add_script( must be provided :param name: key to store script under - :type name: str :param script: TorchScript code - :type script: str, optional :param script_path: path to TorchScript code - :type script_path: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, optional + :param device: device for script execution :param devices_per_node: number of devices on each host - :type devices_per_node: int :param first_device: first device to use on each host - :type first_device: int """ db_script = DBScript( name=name, @@ -523,15 +485,10 @@ def add_function( being stored on nodes M through M + N - 1. :param name: key to store function under - :type name: str :param function: TorchScript code - :type function: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, optional + :param device: device for script execution :param devices_per_node: number of devices on each host - :type devices_per_node: int :param first_device: first device to use on each host - :type first_device: int """ db_script = DBScript( name=name, @@ -567,9 +524,7 @@ def _extend_entity_db_models(model: Model, db_models: t.List[DBModel]) -> None: found. Otherwise, it appends the given list of DBModels to the Ensemble. :param model: SmartSim Model object. - :type model: Model :param db_models: List of DBModels to append to the Ensemble.
- :type db_models: t.List[DBModel] """ for add_ml_model in db_models: dupe = next( @@ -597,9 +552,7 @@ def _extend_entity_db_scripts(model: Model, db_scripts: t.List[DBScript]) -> Non Ensemble. :param model: SmartSim Model object. - :type model: Model :param db_scripts: List of DBScripts to append to the Ensemble. - :type db_scripts: t.List[DBScript] """ for add_script in db_scripts: dupe = next( diff --git a/smartsim/entity/entity.py b/smartsim/entity/entity.py index 7e08534ae..012a76744 100644 --- a/smartsim/entity/entity.py +++ b/smartsim/entity/entity.py @@ -43,7 +43,6 @@ def telemetry(self) -> TelemetryConfiguration: # Return the telemetry configuration for this entity. # :returns: Configuration object indicating the configuration # status of telemetry for this entity - # :rtype: TelemetryConfiguration return self._telemetry_producer ``` @@ -55,7 +54,7 @@ def __init__(self, enabled: bool = False) -> None: """Initialize the telemetry producer and immediately call the `_on_enable` hook. :param enabled: flag indicating the initial state of telemetry - :type enabled: bool""" + """ self._is_on = enabled if self._is_on: @@ -68,7 +67,7 @@ def is_enabled(self) -> bool: """Boolean flag indicating if telemetry is currently enabled :returns: `True` if enabled, `False` otherwise - :rtype: bool""" + """ return self._is_on def enable(self) -> None: @@ -101,12 +100,9 @@ def __init__( share these attributes. 
:param name: Name of the entity - :type name: str :param path: path to output, error, and configuration files - :type path: str :param run_settings: Launcher settings specified in the experiment entity - :type run_settings: dict """ self.name = name self.run_settings = run_settings diff --git a/smartsim/entity/files.py b/smartsim/entity/files.py index 9c282b94e..d00e946e2 100644 --- a/smartsim/entity/files.py +++ b/smartsim/entity/files.py @@ -58,13 +58,10 @@ def __init__( """Initialize an EntityFiles instance :param tagged: tagged files for model configuration - :type tagged: list of str :param copy: files or directories to copy into model or node directories - :type copy: list of str :param symlink: files to symlink into model or node directories - :type symlink: list of str """ self.tagged = tagged or [] self.copy = copy or [] @@ -102,12 +99,9 @@ def _type_check_files( """Check the type of the files provided by the user. :param file_list: either tagged, copy, or symlink files - :type file_list: list of str :param file_type: name of the file type e.g. "tagged" - :type file_type: str :raises TypeError: if incorrect type is provided by user :return: file list provided - :rtype: list of str """ if file_list: if not isinstance(file_list, list): @@ -128,10 +122,8 @@ def _check_path(file_path: str) -> str: the directory or file and create a full path. 
:param file_path: path to a specific file or directory - :type file_path: str :raises FileNotFoundError: if file or directory does not exist :return: full path to file or directory - :rtype: str """ full_path = path.abspath(file_path) if path.isfile(full_path): @@ -183,12 +175,10 @@ def __init__(self, parent: t.Optional[t.Any] = None, subdir_name: str = "") -> N :param parent: The parent hierarchy of the new hierarchy, must be None if creating a root hierarchy, must be provided if creating a subhierachy - :type parent: TaggedFilesHierarchy | None, optional :param subdir_name: Name of subdirectory representd by the new hierarchy, must be "" if creating a root hierarchy, must be any valid dir name if subhierarchy, invalid names are ".", ".." or contain path seperators - :type subdir_name: str, optional :raises ValueError: if given a subdir_name without a parent, if given a parent without a subdir_name, or if the subdir_name is invalid @@ -232,15 +222,12 @@ def from_list_paths( :param path_list: list of absolute paths to tagged files or dirs containing tagged files - :type path_list: list[str] :param dir_contents_to_base: When a top level dir is encountered, if this value is truthy, files in the dir are put into the base hierarchy level. 
Otherwise, a new sub level is created for the dir - :type dir_contents_to_base: bool :return: A built tagged file hierarchy for the given files - :rtype: TaggedFilesHierarchy """ tagged_file_hierarchy = cls() if dir_contents_to_base: @@ -261,7 +248,6 @@ def _add_file(self, file: str) -> None: """Add a file to the current level in the file hierarchy :param file: absoute path to a file to add to the hierarchy - :type file: str """ self.files.add(file) @@ -271,7 +257,6 @@ def _add_dir(self, dir_path: str) -> None: the new level sub level tagged file hierarchy :param dir: absoute path to a dir to add to the hierarchy - :type dir: str """ tagged_file_hierarchy = TaggedFilesHierarchy(self, path.basename(dir_path)) # pylint: disable-next=protected-access @@ -285,7 +270,6 @@ def _add_paths(self, paths: t.List[str]) -> None: TaggedFilesHierarchy. :param paths: list of paths to files or dirs to add to the hierarchy - :type paths: list[str] :raises ValueError: if link to dir is found :raises FileNotFoundError: if path does not exist """ diff --git a/smartsim/entity/model.py b/smartsim/entity/model.py index 69e942ed2..3f78e042c 100644 --- a/smartsim/entity/model.py +++ b/smartsim/entity/model.py @@ -59,21 +59,15 @@ def __init__( """Initialize a ``Model`` :param name: name of the model - :type name: str :param params: model parameters for writing into configuration files or to be passed as command line arguments to executable. 
- :type params: dict :param path: path to output, error, and configuration files - :type path: str :param run_settings: launcher settings specified in the experiment - :type run_settings: RunSettings :param params_as_args: list of parameters which have to be interpreted as command line arguments to be added to run_settings - :type params_as_args: list[str] :param batch_settings: Launcher settings for running the individual - model as a batch job, defaults to None - :type batch_settings: BatchSettings | None + model as a batch job """ super().__init__(name, str(path), run_settings) self.params = params @@ -87,17 +81,26 @@ def __init__( @property def db_models(self) -> t.Iterable[DBModel]: - """Return an immutable collection of attached models""" + """Retrieve an immutable collection of attached models + + :return: Return an immutable collection of attached models + """ return (model for model in self._db_models) @property def db_scripts(self) -> t.Iterable[DBScript]: - """Return an immutable collection attached of scripts""" + """Retrieve an immutable collection of attached scripts + + :return: Return an immutable collection of attached scripts + """ return (script for script in self._db_scripts) @property def colocated(self) -> bool: - """Return True if this Model will run with a colocated Orchestrator""" + """Return True if this Model will run with a colocated Orchestrator + + :return: Return True if the Model will run with a colocated Orchestrator + """ return bool(self.run_settings.colocated_db_settings) def register_incoming_entity(self, incoming_entity: SmartSimEntity) -> None: @@ -108,7 +111,6 @@ def register_incoming_entity(self, incoming_entity: SmartSimEntity) -> None: with that entity :param incoming_entity: The entity that data will be received from - :type incoming_entity: SmartSimEntity :raises SmartSimError: if incoming entity has already been registered """ if incoming_entity.name in [ @@ -130,7 +132,10 @@ def disable_key_prefixing(self) -> None:
self._key_prefixing_enabled = False def query_key_prefixing(self) -> bool: - """Inquire as to whether this entity will prefix its keys with its name""" + """Inquire as to whether this entity will prefix its keys with its name + + :return: Return True if entity will prefix its keys with its name + """ return self._key_prefixing_enabled def attach_generator_files( @@ -157,12 +162,9 @@ def attach_generator_files( would like to change. The tag is settable but defaults to a semicolon e.g. THERMO = ;10; - :param to_copy: files to copy, defaults to [] - :type to_copy: list, optional - :param to_symlink: files to symlink, defaults to [] - :type to_symlink: list, optional - :param to_configure: input files with tagged parameters, defaults to [] - :type to_configure: list, optional + :param to_copy: files to copy + :param to_symlink: files to symlink + :param to_configure: input files with tagged parameters """ to_copy = to_copy or [] to_symlink = to_symlink or [] @@ -187,7 +189,6 @@ def attached_files_table(self) -> str: """Return a list of attached files as a plain text table :returns: String version of table - :rtype: str """ if not self.files: return "No file attached to this model." @@ -241,18 +242,12 @@ def colocate_db_uds( Generally these don't need to be changed. :param unix_socket: path to where the socket file will be created - :type unix_socket: str, optional :param socket_permissions: permissions for the socketfile - :type socket_permissions: int, optional - :param db_cpus: number of cpus to use for orchestrator, defaults to 1 - :type db_cpus: int, optional + :param db_cpus: number of cpus to use for orchestrator :param custom_pinning: CPUs to pin the orchestrator to. 
Passing an empty iterable disables pinning - :type custom_pinning: iterable of ints or iterable of ints, optional :param debug: launch Model with extra debug information about the colocated db - :type debug: bool, optional :param kwargs: additional keyword arguments to pass to the orchestrator database - :type kwargs: dict, optional """ if not re.match(r"^[a-zA-Z0-9.:\,_\-/]*$", unix_socket): @@ -307,20 +302,13 @@ def colocate_db_tcp( Generally these don't need to be changed. - :param port: port to use for orchestrator database, defaults to 6379 - :type port: int, optional - :param ifname: interface to use for orchestrator, defaults to "lo" - :type ifname: str | list[str], optional - :param db_cpus: number of cpus to use for orchestrator, defaults to 1 - :type db_cpus: int, optional + :param port: port to use for orchestrator database + :param ifname: interface to use for orchestrator + :param db_cpus: number of cpus to use for orchestrator :param custom_pinning: CPUs to pin the orchestrator to. 
Passing an empty iterable disables pinning - :type custom_pinning: iterable of ints or iterable of ints, optional :param debug: launch Model with extra debug information about the colocated db - :type debug: bool, optional :param kwargs: additional keyword arguments to pass to the orchestrator database - :type kwargs: dict, optional - """ tcp_options = {"port": port, "ifname": ifname} @@ -504,35 +492,22 @@ def add_ml_model( must be provided :param name: key to store model under - :type name: str :param backend: name of the backend (TORCH, TF, TFLITE, ONNX) - :type backend: str :param model: A model in memory (only supported for non-colocated orchestrators) - :type model: byte string, optional :param model_path: serialized model - :type model_path: file path to model - :param device: name of device for execution, defaults to "CPU" - :type device: str, optional + :param device: name of device for execution :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type devices_per_node: int :param first_device: The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. 
- :type first_device: int - :param batch_size: batch size for execution, defaults to 0 - :type batch_size: int, optional - :param min_batch_size: minimum batch size for model execution, defaults to 0 - :type min_batch_size: int, optional - :param min_batch_timeout: time to wait for minimum batch size, defaults to 0 - :type min_batch_timeout: int, optional - :param tag: additional tag for model information, defaults to "" - :type tag: str, optional - :param inputs: model inputs (TF only), defaults to None - :type inputs: list[str], optional - :param outputs: model outupts (TF only), defaults to None - :type outputs: list[str], optional + :param batch_size: batch size for execution + :param min_batch_size: minimum batch size for model execution + :param min_batch_timeout: time to wait for minimum batch size + :param tag: additional tag for model information + :param inputs: model inputs (TF only) + :param outputs: model outputs (TF only) """ db_model = DBModel( name=name, @@ -578,21 +553,15 @@ def add_script( must be provided :param name: key to store script under - :type name: str :param script: TorchScript code (only supported for non-colocated orchestrators) - :type script: str, optional :param script_path: path to TorchScript code - :type script_path: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, optional + :param device: device for script execution :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type devices_per_node: int :param first_device: The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type first_device: int """ db_script = DBScript( name=name, @@ -627,19 +596,14 @@ def add_function( in the model being stored in the first N devices of type ``device``.
:param name: key to store function under - :type name: str :param function: TorchScript function code - :type function: str, optional - :param device: device for script execution, defaults to "CPU" - :type device: str, optional + :param device: device for script execution :param devices_per_node: The number of GPU devices available on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type devices_per_node: int :param first_device: The first GPU device to use on the host. This parameter only applies to GPU devices and will be ignored if device is specified as CPU. - :type first_device: int """ db_script = DBScript( name=name, diff --git a/smartsim/experiment.py b/smartsim/experiment.py index 069a81540..9f230b1a9 100644 --- a/smartsim/experiment.py +++ b/smartsim/experiment.py @@ -80,9 +80,7 @@ def _on_disable(self) -> None: # pylint: disable=no-self-use class Experiment: - """Experiments are the Python user interface for SmartSim. - - Experiment is a factory class that creates stages of a workflow + """Experiment is a factory class that creates stages of a workflow and manages their execution. The instances created by an Experiment represent executable code @@ -104,7 +102,7 @@ def __init__( exp_path: t.Optional[str] = None, launcher: str = "local", ): - """Initialize an Experiment instance + """Initialize an Experiment instance. 
With the default settings, the Experiment will use the local launcher, which will start all Experiment created @@ -125,10 +123,10 @@ def __init__( exp = Experiment(name="my_exp", launcher="slurm") - If you wish your driver script and Experiment to be run across + If you want your Experiment driver script to be run across multiple system with different schedulers (workload managers) - you can also use the `auto` argument to have the Experiment guess - which launcher to use based on system installed binaries and libraries + you can also use the `auto` argument to have the Experiment detect + which launcher to use based on system installed binaries and libraries. .. highlight:: python .. code-block:: python @@ -142,15 +140,11 @@ def __init__( from the Experiment. :param name: name for the ``Experiment`` - :type name: str - :param exp_path: path to location of ``Experiment`` directory if generated - :type exp_path: str, optional + :param exp_path: path to location of ``Experiment`` directory :param launcher: type of launcher being used, options are "slurm", "pbs", "lsf", or "local". If set to "auto", an attempt will be made to find an available launcher on the system. - Defaults to "local" - :type launcher: str, optional """ self.name = name if exp_path: @@ -196,7 +190,7 @@ def start( model = exp.create_model("my_model", settings) exp.start(model) - Multiple instance can also be passed to the start method + Multiple entity instances can also be passed to the start method at once no matter which type of instance they are. These will all be launched together. @@ -222,15 +216,10 @@ def start( zombie processes will need to be manually killed. 
:param block: block execution until all non-database - jobs are finished, defaults to True - :type block: bool, optional - :param summary: print a launch summary prior to launch, - defaults to False - :type summary: bool, optional + jobs are finished + :param summary: print a launch summary prior to launch :param kill_on_interrupt: flag for killing jobs when ^C (SIGINT) signal is received. - - :type kill_on_interrupt: bool, optional """ start_manifest = Manifest(*args) self._create_entity_dir(start_manifest) @@ -271,6 +260,7 @@ def stop( # multiple exp.stop(model_1, model_2, db, ensemble) + :param args: One or more SmartSimEntity or EntitySequence objects. :raises TypeError: if wrong type :raises SmartSimError: if stop request fails """ @@ -297,8 +287,8 @@ def generate( ) -> None: """Generate the file structure for an ``Experiment`` - ``Experiment.generate`` creates directories for each instance - passed to organize Experiments that launch many instances. + ``Experiment.generate`` creates directories for each entity + passed to organize Experiments that launch many entities. If files or directories are attached to ``Model`` objects using ``Model.attach_generator_files()``, those files or @@ -309,12 +299,8 @@ def generate( can all be passed as arguments to the generate method. :param tag: tag used in `to_configure` generator files - :type tag: str, optional - :param overwrite: overwrite existing folders and contents, - defaults to False - :type overwrite: bool, optional + :param overwrite: overwrite existing folders and contents :param verbose: log parameter settings to std out - :type verbose: bool """ try: generator = Generator(self.exp_path, overwrite=overwrite, verbose=verbose) @@ -354,14 +340,10 @@ def poll( that all jobs launched by this experiment will be killed, and the zombie processes will need to be manually killed. 
- :param interval: frequency (in seconds) of logging to stdout, - defaults to 10 seconds - :type interval: int, optional - :param verbose: set verbosity, defaults to True - :type verbose: bool, optional + :param interval: frequency (in seconds) of logging to stdout + :param verbose: set verbosity :param kill_on_interrupt: flag for killing jobs when SIGINT is received - :type kill_on_interrupt: bool, optional - :raises SmartSimError: + :raises SmartSimError: if poll request fails """ try: self._control.poll(interval, verbose, kill_on_interrupt=kill_on_interrupt) @@ -381,9 +363,7 @@ def finished(self, entity: SmartSimEntity) -> bool: by the user. :param entity: object launched by this ``Experiment`` - :type entity: Model | Ensemble - :returns: True if job has completed, False otherwise - :rtype: bool + :returns: True if the job has finished, False otherwise :raises SmartSimError: if entity has not been launched by this ``Experiment`` """ @@ -397,7 +377,7 @@ def finished(self, entity: SmartSimEntity) -> bool: def get_status( self, *args: t.Union[SmartSimEntity, EntitySequence[SmartSimEntity]] ) -> t.List[SmartSimStatus]: - """Query the status of launched instances + """Query the status of launched entity instances Return a smartsim.status string representing the status of the launched instance. @@ -419,7 +399,6 @@ def get_status( assert all(complete) :returns: status of the instances passed as arguments - :rtype: list[str] :raises SmartSimError: if status retrieval fails """ try: @@ -452,7 +431,7 @@ def create_ensemble( if using a non-local launcher. e.g. 
slurm Ensembles require one of the following combinations - of arguments + of arguments: - ``run_settings`` and ``params`` - ``run_settings`` and ``replicas`` @@ -461,39 +440,32 @@ def create_ensemble( - ``batch_settings``, ``run_settings``, and ``replicas`` If given solely batch settings, an empty ensemble - will be created that models can be added to manually + will be created that Models can be added to manually through ``Ensemble.add_model()``. - The entire ensemble will launch as one batch. + The entire Ensemble will launch as one batch. Provided batch and run settings, either ``params`` or ``replicas`` must be passed and the entire ensemble will launch as a single batch. Provided solely run settings, either ``params`` - or ``replicas`` must be passed and the ensemble members + or ``replicas`` must be passed and the Ensemble members will each launch sequentially. The kwargs argument can be used to pass custom input parameters to the permutation strategy. - :param name: name of the ensemble - :type name: str + :param name: name of the ``Ensemble`` :param params: parameters to expand into ``Model`` members - :type params: dict[str, Any] :param batch_settings: describes settings for ``Ensemble`` as batch workload - :type batch_settings: BatchSettings :param run_settings: describes how each ``Model`` should be executed - :type run_settings: RunSettings :param replicas: number of replicas to create - :type replicas: int :param perm_strategy: strategy for expanding ``params`` into ``Model`` instances from params argument options are "all_perm", "step", "random" - or a callable function. Default is "all_perm". - :type perm_strategy: str, optional + or a callable function. :raises SmartSimError: if initialization fails :return: ``Ensemble`` instance - :rtype: Ensemble """ if name is None: raise AttributeError("Entity has no name. 
Please set name attribute.") @@ -536,27 +508,27 @@ def create_model( ``Model`` instances can be launched sequentially, as a batch job, or as a group by adding them into an ``Ensemble``. - All models require a reference to run settings to specify which + All ``Models`` require a reference to run settings to specify which executable to launch as well provide options for how to launch the executable with the underlying WLM. Furthermore, batch a - reference to a batch settings can be added to launch the model - as a batch job through ``Experiment.start``. If a model with + reference to a batch settings can be added to launch the ``Model`` + as a batch job through ``Experiment.start``. If a ``Model`` with a reference to a set of batch settings is added to a larger entity with its own set of batch settings (for e.g. an ``Ensemble``) the batch settings of the larger entity will take - precedence and the batch setting of the model will be + precedence and the batch setting of the ``Model`` will be strategically ignored. Parameters supplied in the `params` argument can be written into - configuration files supplied at runtime to the model through + configuration files supplied at runtime to the ``Model`` through ``Model.attach_generator_files``. `params` can also be turned into executable arguments by calling ``Model.params_to_args`` By default, ``Model`` instances will be executed in the - current working directory if no `path` argument is supplied. + exp_path/model_name directory if no `path` argument is supplied. If a ``Model`` instance is passed to ``Experiment.generate``, a directory within the ``Experiment`` directory will be created - to house the input and output files from the model. + to house the input and output files from the ``Model``. Example initialization of a ``Model`` instance @@ -592,25 +564,16 @@ def create_model( deprecated, but remains as an alias for ``Model.colocate_db_tcp`` for backward compatibility. 
- :param name: name of the model - :type name: str + :param name: name of the ``Model`` :param run_settings: defines how ``Model`` should be run - :type run_settings: RunSettings - :param params: model parameters for writing into configuration files - :type params: dict, optional - :param path: path to where the model should be executed at runtime - :type path: str, optional - :param enable_key_prefixing: If True, data sent to the Orchestrator + :param params: ``Model`` parameters for writing into configuration files + :param path: path to where the ``Model`` should be executed at runtime + :param enable_key_prefixing: If True, data sent to the ``Orchestrator`` using SmartRedis from this ``Model`` will be prefixed with the ``Model`` name. - Default is True. - :type enable_key_prefixing: bool, optional - :param batch_settings: Settings to run model individually as a batch job, - defaults to None - :type batch_settings: BatchSettings | None + :param batch_settings: Settings to run ``Model`` individually as a batch job. :raises SmartSimError: if initialization fails :return: the created ``Model`` - :rtype: Model """ if name is None: raise AttributeError("Entity has no name. Please set name attribute.") @@ -648,7 +611,7 @@ def create_run_settings( """Create a ``RunSettings`` instance. run_command="auto" will attempt to automatically - match a run command on the system with a RunSettings + match a run command on the system with a ``RunSettings`` class in SmartSim. If found, the class corresponding to that run_command will be created and returned. @@ -669,19 +632,12 @@ class in SmartSim. 
If found, the class corresponding - jsrun (LSF) :param run_command: command to run the executable - :type run_command: str :param exe: executable to run - :type exe: str :param exe_args: arguments to pass to the executable - :type exe_args: list[str], optional :param run_args: arguments to pass to the ``run_command`` - :type run_args: dict[str, t.Union[int, str, float, None]], optional :param env_vars: environment variables to pass to the executable - :type env_vars: dict[str, str], optional :param container: if execution environment is containerized - :type container: Container, optional :return: the created ``RunSettings`` - :rtype: RunSettings """ try: @@ -732,18 +688,12 @@ def create_batch_settings( batch_args=batch_args) bs.set_account("default") - :param nodes: number of nodes for batch job, defaults to 1 - :type nodes: int, optional - :param time: length of batch job, defaults to "" - :type time: str, optional - :param queue: queue or partition (if slurm), defaults to "" - :type queue: str, optional - :param account: user account name for batch system, defaults to "" - :type account: str, optional - :param batch_args: additional batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param nodes: number of nodes for batch job + :param time: length of batch job + :param queue: queue or partition (if slurm) + :param account: user account name for batch system + :param batch_args: additional batch arguments :return: a newly created BatchSettings instance - :rtype: BatchSettings :raises SmartSimError: if batch creation fails """ try: @@ -777,57 +727,44 @@ def create_database( db_identifier: str = "orchestrator", **kwargs: t.Any, ) -> Orchestrator: - """Initialize an Orchestrator database + """Initialize an ``Orchestrator`` database The ``Orchestrator`` database is a key-value store based - on Redis that can be launched together with other Experiment + on Redis that can be launched together with other ``Experiment`` created instances for 
online data storage. When launched, ``Orchestrator`` can be used to communicate data between Fortran, Python, C, and C++ applications. Machine Learning models in Pytorch, Tensorflow, and ONNX (i.e. scikit-learn) - can also be stored within the Orchestrator database where they + can also be stored within the ``Orchestrator`` database where they can be called remotely and executed on CPU or GPU where the database is hosted. To enable a SmartSim ``Model`` to communicate with the database the workload must utilize the SmartRedis clients. For more information on the database, and SmartRedis clients see the - documentation at www.craylabs.org - - :param port: TCP/IP port, defaults to 6379 - :type port: int, optional - :param db_nodes: number of database shards, defaults to 1 - :type db_nodes: int, optional - :param batch: run as a batch workload, defaults to False - :type batch: bool, optional - :param hosts: specify hosts to launch on, defaults to None - :type hosts: list[str], optional - :param run_command: specify launch binary or detect automatically, - defaults to "auto" - :type run_command: str, optional - :param interface: Network interface, defaults to "ipogif0" - :type interface: str, optional - :param account: account to run batch on, defaults to None - :type account: str, optional - :param time: walltime for batch 'HH:MM:SS' format, defaults to None - :type time: str, optional - :param queue: queue to run the batch on, defaults to None - :type queue: str, optional - :param single_cmd: run all shards with one (MPMD) command, defaults to True - :type single_cmd: bool, optional + documentation at https://www.craylabs.org/docs/smartredis.html + + :param port: TCP/IP port + :param db_nodes: number of database shards + :param batch: run as a batch workload + :param hosts: specify hosts to launch on + :param run_command: specify launch binary or detect automatically + :param interface: Network interface + :param account: account to run batch on + :param time: 
walltime for batch 'HH:MM:SS' format + :param queue: queue to run the batch on + :param single_cmd: run all shards with one (MPMD) command :param db_identifier: an identifier to distinguish this orchestrator in - multiple-database experiments, defaults to "orchestrator" - :type db_identifier: str, optional + multiple-database experiments :raises SmartSimError: if detection of launcher or of run command fails :raises SmartSimError: if user indicated an incompatible run command for the launcher - :return: Orchestrator - :rtype: Orchestrator or derived class + :return: Orchestrator or derived class """ - self.append_to_db_identifier_list(db_identifier) + self._append_to_db_identifier_list(db_identifier) check_path = path or osp.join(self.exp_path, db_identifier) entity_path: str = osp.abspath(check_path) return Orchestrator( @@ -859,7 +796,6 @@ def reconnect_orchestrator(self, checkpoint: str) -> Orchestrator: :param checkpoint: the `smartsim_db.dat` file created when an ``Orchestrator`` is launched - :type checkpoint: str """ try: orc = self._control.reload_saved_db(checkpoint) @@ -877,11 +813,8 @@ def summary(self, style: str = "github") -> str: :param style: the style in which the summary table is formatted, for a full list of styles see the table-format section of: - https://github.com/astanin/python-tabulate, - defaults to "github" - :type style: str, optional + https://github.com/astanin/python-tabulate :return: tabulate string of ``Experiment`` history - :rtype: str """ values = [] headers = [ @@ -915,11 +848,18 @@ def summary(self, style: str = "github") -> str: disable_numparse=True, ) + @property + def telemetry(self) -> TelemetryConfiguration: + """Return the telemetry configuration for this entity. + + :returns: configuration of telemetry for this entity + """ + return self._telemetry_cfg + def _launch_summary(self, manifest: Manifest) -> None: """Experiment pre-launch summary of entities that will be launched :param manifest: Manifest of deployables. 
- :type manifest: Manifest """ summary = "\n\n=== Launch Summary ===\n" @@ -960,7 +900,7 @@ def create_entity_dir(entity: t.Union[Orchestrator, Model, Ensemble]) -> None: def __str__(self) -> str: return self.name - def append_to_db_identifier_list(self, db_identifier: str) -> None: + def _append_to_db_identifier_list(self, db_identifier: str) -> None: """Check if db_identifier already exists when calling create_database""" if db_identifier in self.db_identifiers: logger.warning( @@ -970,11 +910,3 @@ def append_to_db_identifier_list(self, db_identifier: str) -> None: ) # Otherwise, add self.db_identifiers.add(db_identifier) - - @property - def telemetry(self) -> TelemetryConfiguration: - """Return the telemetry configuration for this entity. - - :returns: configuration of telemetry for this entity - :rtype: TelemetryConfiguration""" - return self._telemetry_cfg diff --git a/smartsim/log.py b/smartsim/log.py index a84f29c56..c9e0e9399 100644 --- a/smartsim/log.py +++ b/smartsim/log.py @@ -75,9 +75,7 @@ def _translate_log_level(user_log_level: str = "info") -> str: extremely verbose logging. :param user_log_level: log level specified by user, defaults to info - :type user_log_level: str :returns: Log level for coloredlogs - :rtype: str """ user_log_level = user_log_level.lower() if user_log_level in ["info", "debug", "warning"]: @@ -95,7 +93,6 @@ def get_exp_log_paths() -> t.Tuple[t.Optional[pathlib.Path], t.Optional[pathlib. Returns None for both paths if experiment context is unavailable. 
:returns: 2-tuple of paths to experiment logs in form (output_path, error_path) - :rtype: Tuple[pathlib.Path | None, pathlib.Path | None] """ default_paths = None, None @@ -124,9 +121,7 @@ def filter(self, record: logging.LogRecord) -> bool: """Enrich log records with active experiment context :param record: the record to evaluate for filtering - :type record: logging.LogRecord :returns: always True - :rtype: bool """ record.exp_path = ctx_exp_path.get() return True @@ -213,13 +208,9 @@ def get_logger( logger.warning("This is a warning message") :param name: the name of the desired logger - :type name: str :param log_level: what level to set the logger to - :type log_level: str :param fmt: the format of the log messages - :type fmt: str :returns: logger instance - :rtype: logging.Logger """ # if name is None, then logger is the root logger # if not root logger, get the name of file without prefix. @@ -244,7 +235,6 @@ def __init__(self, maximum_level: str = "INFO"): """Create a low-pass log filter allowing messages below a specific log level :param maximum_level: The maximum log level to be passed by the filter - :type maximum_level: str """ super().__init__() self.max = maximum_level @@ -253,9 +243,7 @@ def filter(self, record: logging.LogRecord) -> bool: """Filter log records; pass those less than or equal to the maximum level :param record: the record to evaluate for filtering - :type record: logging.LogRecord :returns: True if record level passes filter, False otherwise - :rtype: bool """ # If a string representation of the level is passed in, # the corresponding numeric value is returned. @@ -268,12 +256,9 @@ def log_to_file(filename: str, log_level: str = "debug") -> None: allowing subsequent logging calls to be sent to filename. :param filename: the name of the desired log file. - :type filename: str - :param log_level: as defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. 
- :type log_level: str """ logger = logging.getLogger("SmartSim") stream = open( # pylint: disable=consider-using-with @@ -293,19 +278,13 @@ def log_to_exp_file( allowing subsequent logging calls to be sent to filename. :param filename: the name of the desired log file. - :type filename: str :param log_level: as defined in get_logger. Can be specified to allow the file to store more or less verbose logging information. - :type log_level: int | str :param logger: an existing logger to add the handler to - :type logger: (optional) logging.Logger :param fmt: a log format for the handler (otherwise, EXPERIMENT_LOG_FORMAT) - :type fmt: (optional) str :param log_filter: log filter to attach to handler - :type log_filter: (optional) logging.Filter :return: logging.Handler - :rtype: logging.Handler """ # ensure logs are written even if specified dir doesn't exist log_path = pathlib.Path(filename) @@ -341,9 +320,8 @@ def method_contextualizer( must accept an instance of matching type. :param ctx_var: The ContextVar that will be modified - :type ctx_var: ContextVar :param ctx_map: A function that returns the value to be set to ctx_var - :type ctx_map: t.Callable[[_T], _ContextT]""" + """ def _contextualize( fn: "t.Callable[Concatenate[_T, _PR], _RT]", / diff --git a/smartsim/ml/data.py b/smartsim/ml/data.py index 4cdc27c06..f2c37fdc4 100644 --- a/smartsim/ml/data.py +++ b/smartsim/ml/data.py @@ -57,13 +57,9 @@ class DataInfo: can be accessed in ``DataInfo.sample_name`` and ``DataInfo.target_name``. :param list_name: Name of the aggregation list used for sample datasets - :type list_name: str :param sample_name: Name of tensor holding training samples in stored datasets. - :type sample_name: str :param target_name: Name of tensor holding targets or labels in stored datasets. - :type target_name: str :num_classes: Number of classes (for categorical data). 
- :type num_classes: int | None """ def __init__( @@ -86,7 +82,6 @@ def publish(self, client: Client) -> None: stored as metastrings and integers stored as metascalars. :param client: Client to connect to Database - :type client: SmartRedis.Client """ info_ds = Dataset(self._ds_name) info_ds.add_meta_string("sample_name", self.sample_name) @@ -104,7 +99,6 @@ def download(self, client: Client) -> None: on the DB, the object members are not modified. :param client: Client to connect to Database - :type client: SmartRedis.Client """ try: info_ds = client.get_dataset(self._ds_name) @@ -148,21 +142,13 @@ class TrainingDataUploader: by the attributes of this class. :param list_name: Name of the dataset as stored on the Orchestrator - :type list_name: str :param sample_name: Name of samples tensor in uploaded Datasets - :type sample_name: str :param target_name: Name of targets tensor (if needed) in uploaded Datasets - :type target_name: str :param num_classes: Number of classes of targets, if categorical - :type num_classes: int :param cluster: Whether the SmartSim Orchestrator is being run as a cluster - :type cluster: bool :param address: Address of Redis DB as : - :type address: str :param rank: Rank of DataUploader in multi-process application (e.g. MPI rank). - :type rank: int :param verbose: If output should be logged to screen. - :type verbose: bool """ @@ -266,35 +252,23 @@ class DataDownloader: - shuffle the dataset if `shuffle` is set to ``True``. :param batch_size: Size of batches obtained with __iter__ - :type batch_size: int :param dynamic: Whether new batches should be donwnloaded when ``update_data`` is called. 
- :type dtnamic: bool :param shuffle: whether order of samples has to be shuffled when calling `update_data` - :type shuffle: bool :param data_info_or_list_name: DataInfo object with details about dataset to download, if a string is passed, it is used to download DataInfo data from DB, assuming it was stored with ``list_name=data_info_or_list_name`` - :type data_info_or_list_name: DataInfo | str :param list_name: Name of aggregation list used to upload data - :type list_name: str :param cluster: Whether the Orchestrator will be run as a cluster - :type cluster: bool :param address: Address of Redis client as : - :type address: str :param replica_rank: When StaticDataDownloader is used distributedly, indicates the rank of this object - :type replica_rank: int :param num_replicas: When BatchDownlaoder is used distributedly, indicates the total number of ranks - :type num_replicas: int :param verbose: Whether log messages should be printed - :type verbose: bool :param init_samples: whether samples should be initialized in the constructor - :type init_samples: bool :param max_fetch_trials: maximum number of attempts to initialize data - :type max_fetch_trials: int """ def __init__( @@ -378,7 +352,6 @@ def need_targets(self) -> bool: """Compute if targets have to be downloaded. :return: Whether targets (or labels) should be downloaded - :rtype: bool """ return bool(self.target_name) and not self.autoencoding @@ -409,8 +382,8 @@ def init_samples(self, init_trials: int = -1) -> None: A new attempt to download samples will be made every ten seconds, for ``init_trials`` times. 
+ :param init_trials: maximum number of attempts to fetch data - :type init_trials: int """ self._client = Client(self.cluster, self.address) diff --git a/smartsim/ml/tf/utils.py b/smartsim/ml/tf/utils.py index 69c8e2580..cf69b65e5 100644 --- a/smartsim/ml/tf/utils.py +++ b/smartsim/ml/tf/utils.py @@ -47,13 +47,9 @@ def freeze_model( a trained model and put it inside an ``orchestrator`` instance :param model: TensorFlow or Keras model - :type model: tf.Module :param output_dir: output dir to save model file to - :type output_dir: str :param file_name: name of model file to create - :type file_name: str :return: path to model file, model input layer names, model output layer names - :rtype: str, list[str], list[str] """ # TODO figure out why layer names don't match up to # specified name in Model init. @@ -93,9 +89,7 @@ def serialize_model(model: keras.Model) -> t.Tuple[str, t.List[str], t.List[str] a trained model and put it inside an ``orchestrator`` instance. :param model: TensorFlow or Keras model - :type model: tf.Module :return: serialized model, model input layer names, model output layer names - :rtype: str, list[str], list[str] """ full_model = tf.function(model) diff --git a/smartsim/settings/alpsSettings.py b/smartsim/settings/alpsSettings.py index 5357312a5..54b9c7525 100644 --- a/smartsim/settings/alpsSettings.py +++ b/smartsim/settings/alpsSettings.py @@ -46,13 +46,9 @@ def __init__( ``AprunSettings`` can be used for the `pbs` launcher. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__( exe, @@ -71,7 +67,6 @@ def make_mpmd(self, settings: RunSettings) -> None: into a single MPMD command joined with ':' :param settings: ``AprunSettings`` instance - :type settings: AprunSettings """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -89,7 +84,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This sets ``--cpus-per-pe`` :param cpus_per_task: number of cpus to use per task - :type cpus_per_task: int """ self.run_args["cpus-per-pe"] = int(cpus_per_task) @@ -99,7 +93,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``--pes`` :param tasks: number of tasks - :type tasks: int """ self.run_args["pes"] = int(tasks) @@ -109,7 +102,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: This sets ``--pes-per-node`` :param tasks_per_node: number of tasks per node - :type tasks_per_node: int """ self.run_args["pes-per-node"] = int(tasks_per_node) @@ -117,7 +109,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -134,7 +125,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: This sets ``--node-list-file`` :param file_path: Path to the hostlist file - :type file_path: str """ self.run_args["node-list-file"] = file_path @@ -142,7 +132,6 @@ def 
set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -159,7 +148,6 @@ def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: This sets ``--cpu-binding`` :param bindings: List of cpu numbers - :type bindings: list[int] | int """ if isinstance(bindings, int): bindings = [bindings] @@ -171,7 +159,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: This sets ``--memory-per-pe`` in megabytes :param memory_per_node: Per PE memory limit in megabytes - :type memory_per_node: int """ self.run_args["memory-per-pe"] = int(memory_per_node) @@ -181,7 +168,6 @@ def set_verbose_launch(self, verbose: bool) -> None: This sets ``--debug`` arg to the highest level :param verbose: Whether the job should be run verbosely - :type verbose: bool """ if verbose: self.run_args["debug"] = 7 @@ -194,7 +180,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ if quiet: self.run_args["quiet"] = None @@ -205,7 +190,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of ALPS formatted run arguments :return: list of ALPS arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -228,7 +212,6 @@ def format_env_vars(self) -> t.List[str]: """Format the environment variables for aprun :return: list of env vars - :rtype: list[str] """ formatted = [] if self.env_vars: @@ -242,6 +225,5 @@ def set_walltime(self, walltime: str) -> None: Walltime is given in total number of seconds :param walltime: wall time - :type walltime: str """ self.run_args["cpu-time-limit"] = str(walltime) diff --git a/smartsim/settings/base.py b/smartsim/settings/base.py index 4e5b5cf4e..6373b52fd 100644 --- 
a/smartsim/settings/base.py +++ b/smartsim/settings/base.py @@ -75,19 +75,11 @@ def __init__( rs = RunSettings("echo", "hello", "mpirun", run_args={"-np": "2"}) :param exe: executable to run - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_command: launch binary (e.g. "srun"), defaults to empty str - :type run_command: str, optional - :param run_args: arguments for run command (e.g. `-np` for `mpiexec`), - defaults to None - :type run_args: dict[str, str], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional - :param container: container type for workload (e.g. "singularity"), - defaults to None - :type container: Container, optional + :param exe_args: executable arguments + :param run_command: launch binary (e.g. "srun") + :param run_args: arguments for run command (e.g. `-np` for `mpiexec`) + :param env_vars: environment vars to launch job with + :param container: container type for workload (e.g. "singularity") """ # Do not expand executable if running within a container self.exe = [exe] if container else [expand_exe_path(exe)] @@ -117,26 +109,50 @@ def __init__( @property def exe_args(self) -> t.Union[str, t.List[str]]: + """Return an immutable list of attached executable arguments. + + :returns: attached executable arguments + """ return self._exe_args @exe_args.setter def exe_args(self, value: t.Union[str, t.List[str], None]) -> None: + """Set the executable arguments. + + :param value: executable arguments + """ self._exe_args = self._build_exe_args(value) @property def run_args(self) -> t.Dict[str, t.Union[int, str, float, None]]: + """Return an immutable list of attached run arguments. + + :returns: attached run arguments + """ return self._run_args @run_args.setter def run_args(self, value: t.Dict[str, t.Union[int, str, float, None]]) -> None: + """Set the run arguments. 
+ + :param value: run arguments + """ self._run_args = copy.deepcopy(value) @property def env_vars(self) -> t.Dict[str, t.Optional[str]]: + """Return an immutable list of attached environment variables. + + :returns: attached environment variables + """ return self._env_vars @env_vars.setter def env_vars(self, value: t.Dict[str, t.Optional[str]]) -> None: + """Set the environment variables. + + :param value: environment variables + """ self._env_vars = copy.deepcopy(value) # To be overwritten by subclasses. Set of reserved args a user cannot change @@ -146,7 +162,6 @@ def set_nodes(self, nodes: int) -> None: """Set the number of nodes :param nodes: number of nodes to run with - :type nodes: int """ logger.warning( ( @@ -159,7 +174,6 @@ def set_tasks(self, tasks: int) -> None: """Set the number of tasks to launch :param tasks: number of tasks to launch - :type tasks: int """ logger.warning( ( @@ -172,7 +186,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks per node :param tasks_per_node: number of tasks to launch per node - :type tasks_per_node: int """ logger.warning( ( @@ -185,7 +198,6 @@ def set_task_map(self, task_mapping: str) -> None: """Set a task mapping :param task_mapping: task mapping - :type task_mapping: str """ logger.warning( ( @@ -198,7 +210,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: """Set the number of cpus per task :param cpus_per_task: number of cpus per task - :type cpus_per_task: int """ logger.warning( ( @@ -211,7 +222,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] """ logger.warning( ( @@ -224,7 +234,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: """Use the contents of a file to specify the hostlist for this job :param file_path: Path to the hostlist file - :type file_path: str """ logger.warning( ( @@ -237,7 +246,6 @@ def 
set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude - :type host_list: str | list[str] """ logger.warning( ( @@ -250,7 +258,6 @@ def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: """Set the cores to which MPI processes are bound :param bindings: List specifing the cores to which MPI processes are bound - :type bindings: list[int] | int """ logger.warning( ( @@ -263,7 +270,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: """Set the amount of memory required per node in megabytes :param memory_per_node: Number of megabytes per node - :type memory_per_node: int """ logger.warning( ( @@ -276,7 +282,6 @@ def set_verbose_launch(self, verbose: bool) -> None: """Set the job to run in verbose mode :param verbose: Whether the job should be run verbosely - :type verbose: bool """ logger.warning( ( @@ -289,7 +294,6 @@ def set_quiet_launch(self, quiet: bool) -> None: """Set the job to run in quiet mode :param quiet: Whether the job should be run quietly - :type quiet: bool """ logger.warning( ( @@ -302,7 +306,6 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: """Copy executable file to allocated compute nodes :param dest_path: Path to copy an executable file - :type dest_path: str | None """ logger.warning( ( @@ -315,11 +318,8 @@ def set_time(self, hours: int = 0, minutes: int = 0, seconds: int = 0) -> None: """Automatically format and set wall time :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int """ return self.set_walltime( self._fmt_walltime(int(hours), int(minutes), int(seconds)) @@ -329,7 +329,6 @@ def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: """Specify the node feature for this job :param feature_list: node feature to launch 
on - :type feature_list: str | list[str] """ logger.warning( ( @@ -345,13 +344,9 @@ def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: By defualt the formatted wall time is the total number of seconds. :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int :returns: Formatted walltime - :rtype: str """ time_ = hours * 3600 time_ += minutes * 60 @@ -362,7 +357,6 @@ def set_walltime(self, walltime: str) -> None: """Set the formatted walltime :param walltime: Time in format required by launcher`` - :type walltime: str """ logger.warning( ( @@ -375,7 +369,6 @@ def set_binding(self, binding: str) -> None: """Set binding :param binding: Binding - :type binding: str """ logger.warning( ( @@ -388,7 +381,6 @@ def set_mpmd_preamble(self, preamble_lines: t.List[str]) -> None: """Set preamble to a file to make a job MPMD :param preamble_lines: lines to put at the beginning of a file. - :type preamble_lines: list[str] """ logger.warning( ( @@ -401,7 +393,6 @@ def make_mpmd(self, settings: RunSettings) -> None: """Make job an MPMD job :param settings: ``RunSettings`` instance - :type settings: RunSettings """ logger.warning( ( @@ -417,7 +408,6 @@ def run_command(self) -> t.Optional[str]: Attempt to expand the path to the executable if possible :returns: launch binary e.g. 
mpiexec - :type: str | None """ cmd = self._run_command @@ -441,7 +431,6 @@ def update_env(self, env_vars: t.Dict[str, t.Union[str, int, float, bool]]) -> N :param env_vars: environment variables to update or add - :type env_vars: dict[str, Union[str, int, float, bool]] :raises TypeError: if env_vars values cannot be coerced to strings """ val_types = (str, int, float, bool) @@ -458,7 +447,6 @@ def add_exe_args(self, args: t.Union[str, t.List[str]]) -> None: """Add executable arguments to executable :param args: executable arguments - :type args: str | list[str] """ args = self._build_exe_args(args) self._exe_args.extend(args) @@ -509,11 +497,8 @@ def set( # otherwise returns ["exclusive", "None"] :param arg: name of the argument - :type arg: str :param value: value of the argument - :type value: str | None :param conditon: set the argument if condition evaluates to True - :type condition: bool """ if not isinstance(arg, str): raise TypeError("Argument name should be of type str") @@ -567,7 +552,6 @@ def format_run_args(self) -> t.List[str]: literally with no formatting. :return: list run arguments for these settings - :rtype: list[str] """ formatted = [] for arg, value in self.run_args.items(): @@ -579,7 +563,6 @@ def format_env_vars(self) -> t.List[str]: """Build environment variable string :returns: formatted list of strings to export variables - :rtype: list[str] """ formatted = [] for key, val in self.env_vars.items(): @@ -625,7 +608,6 @@ def batch_cmd(self) -> str: command. If we cannot, returns the batch command as is. 
:returns: batch command - :type: str """ if is_valid_cmd(self._batch_cmd): return expand_exe_path(self._batch_cmd) @@ -634,10 +616,18 @@ def batch_cmd(self) -> str: @property def batch_args(self) -> t.Dict[str, t.Optional[str]]: + """Retrieve attached batch arguments + + :returns: attached batch arguments + """ return self._batch_args @batch_args.setter def batch_args(self, value: t.Dict[str, t.Optional[str]]) -> None: + """Attach batch arguments + + :param value: dictionary of batch arguments + """ self._batch_args = copy.deepcopy(value) if value else {} def set_nodes(self, num_nodes: int) -> None: @@ -662,7 +652,6 @@ def set_batch_command(self, command: str) -> None: """Set the command used to launch the batch e.g. ``sbatch`` :param command: batch command - :type command: str """ self._batch_cmd = command @@ -673,7 +662,6 @@ def add_preamble(self, lines: t.List[str]) -> None: start virtual environments before running the executables. :param line: lines to add to preamble. - :type line: str or list[str] """ if isinstance(lines, str): self._preamble += [lines] @@ -684,7 +672,10 @@ def add_preamble(self, lines: t.List[str]) -> None: @property def preamble(self) -> t.Iterable[str]: - """Return an iterable of preamble clauses to be prepended to the batch file""" + """Return an iterable of preamble clauses to be prepended to the batch file + + :return: attached preamble clauses + """ return (clause for clause in self._preamble) def __str__(self) -> str: # pragma: no-cover diff --git a/smartsim/settings/containers.py b/smartsim/settings/containers.py index bdba1ce88..d2fd4fca2 100644 --- a/smartsim/settings/containers.py +++ b/smartsim/settings/containers.py @@ -39,13 +39,9 @@ class Container: launch a workload within a container into a single object. :param image: local or remote path to container image - :type image: str :param args: arguments to container command - :type args: str | list[str], optional :param mount: paths to mount (bind) from host machine into image. 
- :type mount: str | list[str] | dict[str, str], optional :param working_directory: path of the working directory within the container - :type working_directory: str """ def __init__( @@ -70,7 +66,6 @@ def _containerized_run_command(self, run_command: str) -> str: """Return modified run_command with container commands prepended. :param run_command: run command from a RunSettings class - :type run_command: str """ raise NotImplementedError( "Containerized run command specification not implemented for this " @@ -99,11 +94,8 @@ class Singularity(Container): :param image: local or remote path to container image, e.g. ``docker://sylabsio/lolcow`` - :type image: str :param args: arguments to 'singularity exec' command - :type args: str | list[str], optional :param mount: paths to mount (bind) from host machine into image. - :type mount: str | list[str] | dict[str, str], optional """ def __init__(self, *args: t.Any, **kwargs: t.Any) -> None: diff --git a/smartsim/settings/lsfSettings.py b/smartsim/settings/lsfSettings.py index 32902c8c6..bce0581c5 100644 --- a/smartsim/settings/lsfSettings.py +++ b/smartsim/settings/lsfSettings.py @@ -51,13 +51,9 @@ def __init__( ``JsrunSettings`` should only be used on LSF-based systems. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__( exe, @@ -81,7 +77,6 @@ def set_num_rs(self, num_rs: t.Union[str, int]) -> None: This sets ``--nrs``. 
:param num_rs: Number of resource sets or `ALL_HOSTS` - :type num_rs: int or str """ if isinstance(num_rs, str): self.run_args["nrs"] = num_rs @@ -94,7 +89,6 @@ def set_cpus_per_rs(self, cpus_per_rs: int) -> None: This sets ``--cpu_per_rs`` :param cpus_per_rs: number of cpus to use per resource set or ALL_CPUS - :type cpus_per_rs: int or str """ if self.colocated_db_settings: db_cpus = int(t.cast(int, self.colocated_db_settings.get("db_cpus", 0))) @@ -117,7 +111,6 @@ def set_gpus_per_rs(self, gpus_per_rs: int) -> None: This sets ``--gpu_per_rs`` :param gpus_per_rs: number of gpus to use per resource set or ALL_GPUS - :type gpus_per_rs: int or str """ if isinstance(gpus_per_rs, str): self.run_args["gpu_per_rs"] = gpus_per_rs @@ -130,7 +123,6 @@ def set_rs_per_host(self, rs_per_host: int) -> None: This sets ``--rs_per_host`` :param rs_per_host: number of resource sets to use per host - :type rs_per_host: int """ self.run_args["rs_per_host"] = int(rs_per_host) @@ -140,7 +132,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``--np`` :param tasks: number of tasks - :type tasks: int """ self.run_args["np"] = int(tasks) @@ -150,7 +141,6 @@ def set_tasks_per_rs(self, tasks_per_rs: int) -> None: This sets ``--tasks_per_rs`` :param tasks_per_rs: number of tasks per resource set - :type tasks_per_rs: int """ self.run_args["tasks_per_rs"] = int(tasks_per_rs) @@ -160,7 +150,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: This function is an alias for `set_tasks_per_rs`. :param tasks_per_node: number of tasks per resource set - :type tasks_per_node: int """ self.set_tasks_per_rs(int(tasks_per_node)) @@ -170,7 +159,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This function is an alias for `set_cpus_per_rs`. 
:param cpus_per_task: number of cpus per resource set - :type cpus_per_task: int """ self.set_cpus_per_rs(int(cpus_per_task)) @@ -180,7 +168,6 @@ def set_memory_per_rs(self, memory_per_rs: int) -> None: This sets ``--memory_per_rs`` :param memory_per_rs: Number of megabytes per rs - :type memory_per_rs: int """ self.run_args["memory_per_rs"] = int(memory_per_rs) @@ -190,7 +177,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: Alias for `set_memory_per_rs`. :param memory_per_node: Number of megabytes per rs - :type memory_per_node: int """ self.set_memory_per_rs(int(memory_per_node)) @@ -200,7 +186,6 @@ def set_binding(self, binding: str) -> None: This sets ``--bind`` :param binding: Binding, e.g. `packed:21` - :type binding: str """ self.run_args["bind"] = binding @@ -213,7 +198,6 @@ def make_mpmd(self, settings: RunSettings) -> None: the list of settings to be launched in the same ERF file. :param settings: ``JsrunSettings`` instance - :type settings: JsrunSettings, optional """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -231,7 +215,6 @@ def set_mpmd_preamble(self, preamble_lines: t.List[str]) -> None: :param preamble_lines: lines to put at the beginning of the ERF file. - :type preamble_lines: list[str] """ self.mpmd_preamble_lines = preamble_lines @@ -249,7 +232,6 @@ def set_erf_sets(self, erf_sets: t.Dict[str, str]) -> None: only `rank` is used. :param hosts: dictionary of resources - :type hosts: dict[str,str] """ self.erf_sets = copy.deepcopy(erf_sets) @@ -259,7 +241,6 @@ def format_env_vars(self) -> t.List[str]: its value is propagated from the current environment. :returns: formatted list of strings to export variables - :rtype: list[str] """ format_str = [] for k, v in self.env_vars.items(): @@ -279,8 +260,6 @@ def set_individual_output(self, suffix: t.Optional[str] = None) -> None: :param suffix: Optional suffix to add to output file names, it can contain `%j`, `%h`, `%p`, or `%t`, as specified by `jsrun` options. 
- :type suffix: str, optional - """ self.run_args["stdio_mode"] = "individual" if suffix: @@ -290,7 +269,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of LSF formatted run arguments :return: list of LSF arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -403,16 +381,11 @@ def __init__( ) -> None: """Specify ``bsub`` batch parameters for a job - :param nodes: number of nodes for batch, defaults to None - :type nodes: int, optional - :param time: walltime for batch job in format hh:mm, defaults to None - :type time: str, optional - :param project: project for batch launch, defaults to None - :type project: str, optional - :param batch_args: overrides for LSF batch arguments, defaults to None - :type batch_args: dict[str, str], optional - :param smts: SMTs, defaults to 0 - :type smts: int, optional + :param nodes: number of nodes for batch + :param time: walltime for batch job in format hh:mm + :param project: project for batch launch + :param batch_args: overrides for LSF batch arguments + :param smts: SMTs """ self.project: t.Optional[str] = None @@ -445,7 +418,6 @@ def set_walltime(self, walltime: str) -> None: :param walltime: Time in hh:mm format, e.g. "10:00" for 10 hours, if time is supplied in hh:mm:ss format, seconds will be ignored and walltime will be set as ``hh:mm`` - :type walltime: str """ # For compatibility with other launchers, as explained in docstring if walltime: @@ -461,7 +433,6 @@ def set_smts(self, smts: int) -> None: takes precedence. :param smts: SMT (e.g on Summit: 1, 2, or 4) - :type smts: int """ self.smts = smts @@ -471,7 +442,6 @@ def set_project(self, project: str) -> None: This sets ``-P``. :param time: project name - :type time: str """ if project: self.project = project @@ -482,7 +452,6 @@ def set_account(self, account: str) -> None: this function is an alias for `set_project`. 
:param account: project name - :type account: str """ self.set_project(account) @@ -492,7 +461,6 @@ def set_nodes(self, num_nodes: int) -> None: This sets ``-nnodes``. :param nodes: number of nodes - :type nodes: int """ if num_nodes: self.batch_args["nnodes"] = str(int(num_nodes)) @@ -503,6 +471,9 @@ def set_expert_mode_req(self, res_req: str, slots: int) -> None: disregard all other allocation options. This sets ``-csm -n slots -R res_req`` + + :param res_req: specific resource requirements + :param slots: number of resources to allocate """ self.expert_mode = True self.batch_args["csm"] = "y" @@ -513,7 +484,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -530,7 +500,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``-n`` :param tasks: number of tasks - :type tasks: int """ self.batch_args["n"] = str(int(tasks)) @@ -538,7 +507,6 @@ def set_queue(self, queue: str) -> None: """Set the queue for this job :param queue: The queue to submit the job on - :type queue: str """ if queue: self.batch_args["q"] = queue @@ -573,7 +541,6 @@ def format_batch_args(self) -> t.List[str]: """Get the formatted batch arguments for a preview :return: list of batch arguments for Qsub - :rtype: list[str] """ opts = [] diff --git a/smartsim/settings/mpiSettings.py b/smartsim/settings/mpiSettings.py index ce132bcc5..c64c66cbf 100644 --- a/smartsim/settings/mpiSettings.py +++ b/smartsim/settings/mpiSettings.py @@ -61,16 +61,11 @@ def __init__( None can be provided for arguments that do not have values. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, str], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with :param fail_if_missing_exec: Throw an exception of the MPI command is missing. Otherwise, throw a warning - :type fail_if_missing_exec: bool, optional """ super().__init__( exe, @@ -101,7 +96,6 @@ def make_mpmd(self, settings: RunSettings) -> None: Model instance :param settings: MpirunSettings instance - :type settings: MpirunSettings """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -117,7 +111,6 @@ def set_task_map(self, task_mapping: str) -> None: For examples, see the man page for ``mpirun`` :param task_mapping: task mapping - :type task_mapping: str """ self.run_args["map-by"] = task_mapping @@ -130,7 +123,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: and will soon be replaced. 
:param cpus_per_task: number of tasks - :type cpus_per_task: int """ self.run_args["cpus-per-proc"] = int(cpus_per_task) @@ -140,7 +132,6 @@ def set_cpu_binding_type(self, bind_type: str) -> None: This sets ``--bind-to`` for MPI compliant implementations :param bind_type: binding type - :type bind_type: str """ self.run_args["bind-to"] = bind_type @@ -148,7 +139,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks per node :param tasks_per_node: number of tasks to launch per node - :type tasks_per_node: int """ self.run_args["npernode"] = int(tasks_per_node) @@ -158,7 +148,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``-n`` for MPI compliant implementations :param tasks: number of tasks - :type tasks: int """ self.run_args["n"] = int(tasks) @@ -168,7 +157,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: This sets ``--host`` :param host_list: list of host names - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -185,7 +173,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: This sets ``--hostfile`` :param file_path: Path to the hostlist file - :type file_path: str """ self.run_args["hostfile"] = file_path @@ -195,7 +182,6 @@ def set_verbose_launch(self, verbose: bool) -> None: This sets ``--verbose`` :param verbose: Whether the job should be run verbosely - :type verbose: bool """ if verbose: self.run_args["verbose"] = None @@ -208,7 +194,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ if quiet: self.run_args["quiet"] = None @@ -221,7 +206,6 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: This sets ``--preload-binary`` :param dest_path: Destination path (Ignored) - :type dest_path: str | None """ if dest_path is not None and isinstance(dest_path, str): logger.warning( @@ -238,7 +222,6 @@ def 
set_walltime(self, walltime: str) -> None: This sets ``--timeout`` :param walltime: number like string of seconds that a job will run in secs - :type walltime: str """ self.run_args["timeout"] = walltime @@ -246,7 +229,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of MPI-standard formatted run arguments :return: list of MPI-standard arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -265,7 +247,6 @@ def format_env_vars(self) -> t.List[str]: """Format the environment variables for mpirun :return: list of env vars - :rtype: list[str] """ formatted = [] env_string = "-x" @@ -299,13 +280,9 @@ def __init__( None can be provided for arguments that do not have values. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__(exe, exe_args, "mpirun", run_args, env_vars, **kwargs) @@ -330,13 +307,9 @@ def __init__( None can be provided for arguments that do not have values. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__(exe, exe_args, "mpiexec", run_args, env_vars, **kwargs) @@ -370,12 +343,8 @@ def __init__( None can be provided for arguments that do not have values. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ super().__init__(exe, exe_args, "orterun", run_args, env_vars, **kwargs) diff --git a/smartsim/settings/palsSettings.py b/smartsim/settings/palsSettings.py index e43cd9466..4100e8efe 100644 --- a/smartsim/settings/palsSettings.py +++ b/smartsim/settings/palsSettings.py @@ -45,13 +45,9 @@ class PalsMpiexecSettings(_BaseMPISettings): None can be provided for arguments that do not have values. 
:param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, str], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with """ def __init__( @@ -74,16 +70,11 @@ def __init__( None can be provided for arguments that do not have values. :param exe: executable - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: str | list[str], optional - :param run_args: arguments for run command, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment vars to launch job with, defaults to None - :type env_vars: dict[str, str], optional + :param exe_args: executable arguments + :param run_args: arguments for run command + :param env_vars: environment vars to launch job with :param fail_if_missing_exec: Throw an exception of the MPI command is missing. Otherwise, throw a warning - :type fail_if_missing_exec: bool, optional """ super().__init__( exe, @@ -103,7 +94,6 @@ def set_task_map(self, task_mapping: str) -> None: For examples, see the man page for ``mpirun`` :param task_mapping: task mapping - :type task_mapping: str """ logger.warning("set_task_map not supported under PALS") @@ -116,7 +106,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: and will soon be replaced. 
:param cpus_per_task: number of tasks - :type cpus_per_task: int """ logger.warning("set_cpus_per_task not supported under PALS") @@ -126,7 +115,6 @@ def set_cpu_binding_type(self, bind_type: str) -> None: This sets ``--bind-to`` for MPI compliant implementations :param bind_type: binding type - :type bind_type: str """ self.run_args["cpu-bind"] = bind_type @@ -134,7 +122,6 @@ def set_tasks(self, tasks: int) -> None: """Set the number of tasks :param tasks: number of total tasks to launch - :type tasks: int """ self.run_args["np"] = int(tasks) @@ -142,7 +129,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: """Set the number of tasks per node :param tasks_per_node: number of tasks to launch per node - :type tasks_per_node: int """ self.run_args["ppn"] = int(tasks_per_node) @@ -152,7 +138,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ logger.warning("set_quiet_launch not supported under PALS") @@ -163,7 +148,6 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: This sets ``--preload-binary`` :param dest_path: Destination path (Ignored) - :type dest_path: str | None """ if dest_path is not None and isinstance(dest_path, str): logger.warning( @@ -178,7 +162,6 @@ def set_walltime(self, walltime: str) -> None: """Set the maximum number of seconds that a job will run :param walltime: number like string of seconds that a job will run in secs - :type walltime: str """ logger.warning("set_walltime not supported under PALS") @@ -186,7 +169,6 @@ def set_gpu_affinity_script(self, affinity: str, *args: t.Any) -> None: """Set the GPU affinity through a bash script :param affinity: path to the affinity script - :type affinity: str """ self.affinity_script.append(str(affinity)) for arg in args: @@ -196,7 +178,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of MPI-standard formatted run arguments :return: list of MPI-standard 
arguments for these settings - :rtype: list[str] """ # args launcher uses args = [] @@ -219,7 +200,6 @@ def format_env_vars(self) -> t.List[str]: """Format the environment variables for mpirun :return: list of env vars - :rtype: list[str] """ formatted = [] @@ -242,7 +222,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: This sets ``--hosts`` :param host_list: list of host names - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): diff --git a/smartsim/settings/pbsSettings.py b/smartsim/settings/pbsSettings.py index 19a58b11c..09d48181a 100644 --- a/smartsim/settings/pbsSettings.py +++ b/smartsim/settings/pbsSettings.py @@ -53,20 +53,13 @@ def __init__( the value for select statement supplied in ``resources`` will override. - :param nodes: number of nodes for batch, defaults to None - :type nodes: int, optional - :param ncpus: number of cpus per node, defaults to None - :type ncpus: int, optional - :param time: walltime for batch job, defaults to None - :type time: str, optional - :param queue: queue to run batch in, defaults to None - :type queue: str, optional - :param account: account for batch launch, defaults to None - :type account: str, optional - :param resources: overrides for resource arguments, defaults to None - :type resources: dict[str, str], optional - :param batch_args: overrides for PBS batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param nodes: number of nodes for batch + :param ncpus: number of cpus per node + :param time: walltime for batch job + :param queue: queue to run batch in + :param account: account for batch launch + :param resources: overrides for resource arguments + :param batch_args: overrides for PBS batch arguments """ self._ncpus = ncpus @@ -112,7 +105,6 @@ def set_nodes(self, num_nodes: int) -> None: nodes here is sets the 'nodes' resource. 
:param num_nodes: number of nodes - :type num_nodes: int """ if num_nodes: @@ -122,7 +114,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -143,7 +134,6 @@ def set_walltime(self, walltime: str) -> None: this value will be overridden :param walltime: wall time - :type walltime: str """ if walltime: self.set_resource("walltime", walltime) @@ -152,7 +142,6 @@ def set_queue(self, queue: str) -> None: """Set the queue for the batch job :param queue: queue name - :type queue: str """ if queue: self.batch_args["q"] = str(queue) @@ -165,7 +154,6 @@ def set_ncpus(self, num_cpus: t.Union[int, str]) -> None: this value will be overridden :param num_cpus: number of cpus per node in select - :type num_cpus: int """ self._ncpus = int(num_cpus) @@ -173,7 +161,6 @@ def set_account(self, account: str) -> None: """Set the account for this batch job :param acct: account id - :type acct: str """ if account: self.batch_args["A"] = str(account) @@ -185,9 +172,7 @@ def set_resource(self, resource_name: str, value: t.Union[str, int]) -> None: arguments will be overridden. Likewise for Walltime :param resource_name: name of resource, e.g. walltime - :type resource_name: str :param value: value - :type value: str """ # TODO add error checking here # TODO include option to overwrite place (warning for orchestrator?) 
@@ -200,7 +185,6 @@ def format_batch_args(self) -> t.List[str]: """Get the formatted batch arguments for a preview :return: batch arguments for Qsub - :rtype: list[str] :raises ValueError: if options are supplied without values """ opts = self._create_resource_list() diff --git a/smartsim/settings/settings.py b/smartsim/settings/settings.py index 6e6172507..7bc2f7b86 100644 --- a/smartsim/settings/settings.py +++ b/smartsim/settings/settings.py @@ -63,19 +63,12 @@ def create_batch_settings( :param launcher: launcher for this experiment, if set to 'auto', an attempt will be made to find an available launcher on the system - :type launcher: str - :param nodes: number of nodes for batch job, defaults to 1 - :type nodes: int, optional - :param time: length of batch job, defaults to "" - :type time: str, optional - :param queue: queue or partition (if slurm), defaults to "" - :type queue: str, optional - :param account: user account name for batch system, defaults to "" - :type account: str, optional - :param batch_args: additional batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param nodes: number of nodes for batch job + :param time: length of batch job + :param queue: queue or partition (if slurm) + :param account: user account name for batch system + :param batch_args: additional batch arguments :return: a newly created BatchSettings instance - :rtype: BatchSettings :raises SmartSimError: if batch creation fails """ # all supported batch class implementations @@ -127,21 +120,13 @@ def create_run_settings( :param launcher: launcher to create settings for, if set to 'auto', an attempt will be made to find an available launcher on the system - :type launcher: str :param run_command: command to run the executable - :type run_command: str :param exe: executable to run - :type exe: str :param exe_args: arguments to pass to the executable - :type exe_args: list[str], optional :param run_args: arguments to pass to the ``run_command`` - 
:type run_args: list[str], optional :param env_vars: environment variables to pass to the executable - :type env_vars: dict[str, str], optional - :param container: container type for workload (e.g. "singularity"), defaults to None - :type container: Container, optional + :param container: container type for workload (e.g. "singularity") :return: the created ``RunSettings`` - :rtype: RunSettings :raises SmartSimError: if run_command=="auto" and detection fails """ # all supported RunSettings child classes diff --git a/smartsim/settings/slurmSettings.py b/smartsim/settings/slurmSettings.py index 61a3e9841..6cb13c54a 100644 --- a/smartsim/settings/slurmSettings.py +++ b/smartsim/settings/slurmSettings.py @@ -55,15 +55,10 @@ def __init__( parameters will launch on that allocation. :param exe: executable to run - :type exe: str - :param exe_args: executable arguments, defaults to None - :type exe_args: list[str] | str, optional - :param run_args: srun arguments without dashes, defaults to None - :type run_args: dict[str, t.Union[int, str, float, None]], optional - :param env_vars: environment variables for job, defaults to None - :type env_vars: dict[str, str], optional - :param alloc: allocation ID if running on existing alloc, defaults to None - :type alloc: str, optional + :param exe_args: executable arguments + :param run_args: srun arguments without dashes + :param env_vars: environment variables for job + :param alloc: allocation ID if running on existing alloc """ super().__init__( exe, @@ -84,7 +79,6 @@ def set_nodes(self, nodes: int) -> None: Effectively this is setting: ``srun --nodes `` :param nodes: number of nodes to run with - :type nodes: int """ self.run_args["nodes"] = int(nodes) @@ -95,7 +89,6 @@ def make_mpmd(self, settings: RunSettings) -> None: Model instance :param settings: SrunSettings instance - :type settings: SrunSettings """ if self.colocated_db_settings: raise SSUnsupportedError( @@ -117,7 +110,6 @@ def set_hostlist(self, host_list: 
t.Union[str, t.List[str]]) -> None: This sets ``--nodelist`` :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -134,7 +126,6 @@ def set_hostlist_from_file(self, file_path: str) -> None: This sets ``--nodefile`` :param file_path: Path to the hostlist file - :type file_path: str """ self.run_args["nodefile"] = file_path @@ -142,7 +133,6 @@ def set_excluded_hosts(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify a list of hosts to exclude for launching this job :param host_list: hosts to exclude - :type host_list: list[str] :raises TypeError: """ if isinstance(host_list, str): @@ -159,7 +149,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This sets ``--cpus-per-task`` :param num_cpus: number of cpus to use per task - :type num_cpus: int """ self.run_args["cpus-per-task"] = int(cpus_per_task) @@ -169,7 +158,6 @@ def set_tasks(self, tasks: int) -> None: This sets ``--ntasks`` :param tasks: number of tasks - :type tasks: int """ self.run_args["ntasks"] = int(tasks) @@ -179,7 +167,6 @@ def set_tasks_per_node(self, tasks_per_node: int) -> None: This sets ``--ntasks-per-node`` :param tasks_per_node: number of tasks per node - :type tasks_per_node: int """ self.run_args["ntasks-per-node"] = int(tasks_per_node) @@ -189,7 +176,6 @@ def set_cpu_bindings(self, bindings: t.Union[int, t.List[int]]) -> None: This sets ``--cpu-bind`` using the ``map_cpu:`` option :param bindings: List specifing the cores to which MPI processes are bound - :type bindings: list[int] | int """ if isinstance(bindings, int): bindings = [bindings] @@ -203,7 +189,6 @@ def set_memory_per_node(self, memory_per_node: int) -> None: This sets ``--mem`` in megabytes :param memory_per_node: Amount of memory per node in megabytes - :type memory_per_node: int """ self.run_args["mem"] = f"{int(memory_per_node)}M" @@ -213,7 +198,6 @@ def set_verbose_launch(self, verbose: bool) -> None: 
This sets ``--verbose`` :param verbose: Whether the job should be run verbosely - :type verbose: bool """ if verbose: self.run_args["verbose"] = None @@ -226,7 +210,6 @@ def set_quiet_launch(self, quiet: bool) -> None: This sets ``--quiet`` :param quiet: Whether the job should be run quietly - :type quiet: bool """ if quiet: self.run_args["quiet"] = None @@ -239,7 +222,6 @@ def set_broadcast(self, dest_path: t.Optional[str] = None) -> None: This sets ``--bcast`` :param dest_path: Path to copy an executable file - :type dest_path: str | None """ self.run_args["bcast"] = dest_path @@ -249,7 +231,6 @@ def set_node_feature(self, feature_list: t.Union[str, t.List[str]]) -> None: This sets ``-C`` :param feature_list: node feature to launch on - :type feature_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(feature_list, str): @@ -265,13 +246,9 @@ def _fmt_walltime(hours: int, minutes: int, seconds: int) -> str: Converts time to format HH:MM:SS :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int :returns: Formatted walltime - :rtype: str """ return fmt_walltime(hours, minutes, seconds) @@ -281,7 +258,6 @@ def set_walltime(self, walltime: str) -> None: format = "HH:MM:SS" :param walltime: wall time - :type walltime: str """ self.run_args["time"] = str(walltime) @@ -291,7 +267,6 @@ def set_het_group(self, het_group: t.Iterable[int]) -> None: this sets `--het-group` :param het_group: list of heterogeneous groups - :type het_group: int or iterable of ints """ het_size_env = os.getenv("SLURM_HET_SIZE") if het_size_env is None: @@ -320,7 +295,6 @@ def format_run_args(self) -> t.List[str]: """Return a list of slurm formatted run arguments :return: list of slurm arguments for these settings - :rtype: list[str] """ # add additional slurm arguments based on key length opts = [] @@ -361,7 +335,6 @@ def 
format_env_vars(self) -> t.List[str]: """Build bash compatible environment variable string for Slurm :returns: the formatted string of environment variables - :rtype: list[str] """ self.check_env_vars() return [f"{k}={v}" for k, v in self.env_vars.items() if "," not in str(v)] @@ -374,7 +347,6 @@ def format_comma_sep_env_vars(self) -> t.Tuple[str, t.List[str]]: for more information on this, see the slurm documentation for srun :returns: the formatted string of environment variables - :rtype: tuple[str, list[str]] """ self.check_env_vars() exportable_env, compound_env, key_only = [], [], [] @@ -407,13 +379,9 @@ def fmt_walltime(hours: int, minutes: int, seconds: int) -> str: Converts time to format HH:MM:SS :param hours: number of hours to run job - :type hours: int :param minutes: number of minutes to run job - :type minutes: int :param seconds: number of seconds to run job - :type seconds: int :returns: Formatted walltime - :rtype: str """ delta = datetime.timedelta(hours=hours, minutes=minutes, seconds=seconds) fmt_str = str(delta) @@ -442,14 +410,10 @@ def __init__( Initialization values provided (nodes, time, account) will overwrite the same arguments in ``batch_args`` if present - :param nodes: number of nodes, defaults to None - :type nodes: int, optional + :param nodes: number of nodes :param time: walltime for job, e.g. 
"10:00:00" for 10 hours - :type time: str, optional - :param account: account for job, defaults to None - :type account: str, optional - :param batch_args: extra batch arguments, defaults to None - :type batch_args: dict[str, str], optional + :param account: account for job + :param batch_args: extra batch arguments """ super().__init__( "sbatch", @@ -466,7 +430,6 @@ def set_walltime(self, walltime: str) -> None: format = "HH:MM:SS" :param walltime: wall time - :type walltime: str """ # TODO check for formatting here if walltime: @@ -476,7 +439,6 @@ def set_nodes(self, num_nodes: int) -> None: """Set the number of nodes for this batch job :param num_nodes: number of nodes - :type num_nodes: int """ if num_nodes: self.batch_args["nodes"] = str(int(num_nodes)) @@ -485,7 +447,6 @@ def set_account(self, account: str) -> None: """Set the account for this batch job :param account: account id - :type account: str """ if account: self.batch_args["account"] = account @@ -494,7 +455,6 @@ def set_partition(self, partition: str) -> None: """Set the partition for the batch job :param partition: partition name - :type partition: str """ self.batch_args["partition"] = str(partition) @@ -504,7 +464,6 @@ def set_queue(self, queue: str) -> None: Sets the partition for the slurm batch job :param queue: the partition to run the batch job on - :type queue: str """ if queue: self.set_partition(queue) @@ -515,7 +474,6 @@ def set_cpus_per_task(self, cpus_per_task: int) -> None: This sets ``--cpus-per-task`` :param num_cpus: number of cpus to use per task - :type num_cpus: int """ self.batch_args["cpus-per-task"] = str(int(cpus_per_task)) @@ -523,7 +481,6 @@ def set_hostlist(self, host_list: t.Union[str, t.List[str]]) -> None: """Specify the hostlist for this job :param host_list: hosts to launch on - :type host_list: str | list[str] :raises TypeError: if not str or list of str """ if isinstance(host_list, str): @@ -538,7 +495,6 @@ def format_batch_args(self) -> t.List[str]: """Get the 
formatted batch arguments for a preview :return: batch arguments for Sbatch - :rtype: list[str] """ opts = [] # TODO add restricted here diff --git a/smartsim/wlm/__init__.py b/smartsim/wlm/__init__.py index 3a82a81e5..a5d20d0c9 100644 --- a/smartsim/wlm/__init__.py +++ b/smartsim/wlm/__init__.py @@ -75,9 +75,7 @@ def get_hosts(launcher: t.Optional[str] = None) -> t.List[str]: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. - :type launcher: str | None :returns: Names of the hosts - :rtype: list[str] :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: @@ -94,9 +92,7 @@ def get_queue(launcher: t.Optional[str] = None) -> str: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. - :type launcher: str | None :returns: Name of the queue - :rtype: str :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: @@ -113,9 +109,7 @@ def get_tasks(launcher: t.Optional[str] = None) -> int: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. - :type launcher: str | None :returns: Number of tasks - :rtype: int :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: @@ -132,9 +126,7 @@ def get_tasks_per_node(launcher: t.Optional[str] = None) -> t.Dict[str, int]: :param launcher: Name of the WLM to use to collect allocation info. If no launcher is provided ``detect_launcher`` is used to select a launcher. 
- :type launcher: str | None :returns: Map of nodes to number of processes on that node - :rtype: dict[str, int] :raises SSUnsupportedError: User attempted to use an unsupported WLM """ if launcher is None: diff --git a/smartsim/wlm/pbs.py b/smartsim/wlm/pbs.py index eda5baf24..5b559c1e6 100644 --- a/smartsim/wlm/pbs.py +++ b/smartsim/wlm/pbs.py @@ -38,7 +38,6 @@ def get_hosts() -> t.List[str]: """Get the name of the hosts used in a PBS allocation. :returns: Names of the host nodes - :rtype: list[str] :raises SmartSimError: ``PBS_NODEFILE`` is not set """ hosts = [] @@ -59,7 +58,6 @@ def get_queue() -> str: """Get the name of queue in a PBS allocation. :returns: The name of the queue - :rtype: str :raises SmartSimError: ``PBS_QUEUE`` is not set """ if "PBS_QUEUE" in os.environ: @@ -76,7 +74,6 @@ def get_tasks() -> int: node from which it is run. :returns: Then number of tasks in the allocation - :rtype: int :raises LauncherError: Could not access ``qstat`` :raises SmartSimError: ``PBS_JOBID`` is not set """ @@ -103,8 +100,7 @@ def get_tasks_per_node() -> t.Dict[str, int]: This method requires ``qstat`` be installed on the node from which it is run. 
- :returns: Map of chunks to number of processes on that chunck - :rtype: dict[str, int] + :returns: Map of chunks to number of processes on that chunk :raises LauncherError: Could not access ``qstat`` :raises SmartSimError: ``PBS_JOBID`` is not set """ diff --git a/smartsim/wlm/slurm.py b/smartsim/wlm/slurm.py index d80b217ea..ae7299f28 100644 --- a/smartsim/wlm/slurm.py +++ b/smartsim/wlm/slurm.py @@ -66,17 +66,12 @@ def get_allocation( - exclusive=None - :param nodes: number of nodes for the allocation, defaults to 1 - :type nodes: int, optional - :param time: wall time of the allocation, HH:MM:SS format, defaults to None - :type time: str, optional - :param account: account id for allocation, defaults to None - :type account: str, optional - :param options: additional options for the slurm wlm, defaults to None - :type options: dict[str, str], optional + :param nodes: number of nodes for the allocation + :param time: wall time of the allocation, HH:MM:SS format + :param account: account id for allocation + :param options: additional options for the slurm wlm :raises LauncherError: if the allocation is not successful :return: the id of the allocation - :rtype: str """ if not which("salloc"): raise LauncherError( @@ -107,7 +102,6 @@ def release_allocation(alloc_id: str) -> None: """Free an allocation's resources :param alloc_id: allocation id - :type alloc_id: str :raises LauncherError: if allocation could not be freed """ if not which("scancel"): @@ -136,15 +130,11 @@ def validate(nodes: int = 1, ppn: int = 1, partition: t.Optional[str] = None) -> if no partition is provided, the default partition is found and used. 
- :param nodes: Override the default node count to validate, defaults to 1 - :type nodes: int, optional - :param ppn: Override the default processes per node to validate, defaults to 1 - :type ppn: int, optional - :param partition: partition to validate, defaults to None - :type partition: str, optional + :param nodes: Override the default node count to validate + :param ppn: Override the default processes per node to validate + :param partition: partition to validate :raises: LauncherError :returns: True if resources are available, False otherwise - :rtype: bool """ sys_partitions = _get_system_partition_info() @@ -188,7 +178,6 @@ def get_default_partition() -> str: a star following its partition name in sinfo output :returns: the name of the default partition - :rtype: str """ sinfo_output, _ = sinfo(["--noheader", "--format", "%P"]) @@ -205,7 +194,6 @@ def get_default_partition() -> str: def _get_system_partition_info() -> t.Dict[str, Partition]: """Build a dictionary of slurm partitions :returns: dict of Partition objects - :rtype: dict """ sinfo_output, _ = sinfo(["--noheader", "--format", "%R %n %c"]) @@ -279,9 +267,7 @@ def _validate_time_format(time: str) -> str: By defualt the formatted wall time is the total number of seconds. :param time: number of hours to run job - :type time: str :returns: Formatted walltime - :rtype: str """ try: hours, minutes, seconds = map(int, time.split(":")) @@ -301,7 +287,6 @@ def get_hosts() -> t.List[str]: on which it is run :returns: Names of the host nodes - :rtype: list[str] :raises LauncherError: Could not access ``scontrol`` :raises SmartSimError: ``SLURM_JOB_NODELIST`` is not set """ @@ -324,7 +309,6 @@ def get_queue() -> str: """Get the name of queue in a slurm allocation. 
:returns: The name of the queue - :rtype: str :raises SmartSimError: ``SLURM_JOB_PARTITION`` is not set """ if job_partition := os.environ.get("SLURM_JOB_PARTITION", None): @@ -336,7 +320,6 @@ def get_tasks() -> int: """Get the number of tasks in a slurm allocation. :returns: Then number of tasks in the allocation - :rtype: int :raises SmartSimError: ``SLURM_NTASKS`` is not set """ if ntasks_str := os.environ.get("SLURM_NTASKS", 0): @@ -353,7 +336,6 @@ def get_tasks_per_node() -> t.Dict[str, int]: on which it is run :returns: Map of nodes to number of tasks on that node - :rtype: dict[str, int] :raises SmartSimError: ``SLURM_TASKS_PER_NODE`` is not set """ if "SLURM_TASKS_PER_NODE" in os.environ: diff --git a/tests/test_config.py b/tests/test_config.py index 0716ac0d5..5cd13f2c5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -61,9 +61,7 @@ def get_redisai_env( """Convenience method to create a set of environment variables that include RedisAI-specific variables :param rai_path: The path to the RedisAI library - :type: str (optional) :param lib_path: The path to the SMARTSIM_DEP_INSTALL_PATH - :type: str (optional) :return: A dictionary containing an updated set of environment variables """ env = os.environ.copy() From 7db84905f4d154dba832dfdf1da195d55416f756 Mon Sep 17 00:00:00 2001 From: amandarichardsonn <30413257+amandarichardsonn@users.noreply.github.com> Date: Fri, 26 Apr 2024 09:06:10 -0700 Subject: [PATCH 12/13] Set GH_TOKEN environment variable to use Github CLI in workflow (#570) This PR updates the authentication used in the release workflow from a developer created token to the GH_TOKEN environment variable. 
[ reviewed by @MattToast ] [ committed by @amandarichardsonn ] --- .github/workflows/release.yml | 3 ++- doc/changelog.rst | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4366caf28..6c1361b46 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -125,6 +125,7 @@ jobs: password: ${{ secrets.PYPI }} #repository_url: https://test.pypi.org/legacy/ + createPullRequest: runs-on: ubuntu-latest steps: @@ -135,4 +136,4 @@ jobs: run: | gh pr create -B develop -H master --title 'Merge master into develop' --body 'This PR brings develop up to date with master for release.' env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GH_TOKEN: ${{ github.token }} diff --git a/doc/changelog.rst b/doc/changelog.rst index 210646b13..aed0e675c 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -18,6 +18,7 @@ To be released at some future point in time Description +- Update authentication in release workflow - Auto-generate type-hints into documentation - Auto-post release PR to develop - Bump manifest.json to version 0.0.4 @@ -49,6 +50,8 @@ Description Detailed Notes +- Replace the developer created token with the GH_TOKEN environment variable. + (SmartSim-PR570_) - Add extension to auto-generate function type-hints into documentation. (SmartSim-PR561_) - Add to github release workflow to auto generate a pull request from master @@ -132,6 +135,7 @@ Detailed Notes handler. SmartSim will now attempt to kill any launched jobs before calling the previously registered signal handler. (SmartSim-PR535_) +.. _SmartSim-PR570: https://github.com/CrayLabs/SmartSim/pull/570 .. _SmartSim-PR561: https://github.com/CrayLabs/SmartSim/pull/561 .. _SmartSim-PR566: https://github.com/CrayLabs/SmartSim/pull/566 .. 
_SmartSim-PR563: https://github.com/CrayLabs/SmartSim/pull/563 From 674b421c8c1831fcb066ca237d9a8f960045e415 Mon Sep 17 00:00:00 2001 From: amandarichardsonn <30413257+amandarichardsonn@users.noreply.github.com> Date: Tue, 30 Apr 2024 15:14:31 -0700 Subject: [PATCH 13/13] Automate release notes (#568) This PR adds a `release.yml` file to the root of the `.github` folder. Within the file we configure the release notes generated through PR tags. This PR also converts the changelog format from rst to md to match release notes format. [ reviewed by @AlyssaCote ] [ committed by @amandarichardsonn ] --- .github/release.yml | 56 ++ .github/workflows/changelog.yml | 4 +- doc/changelog.md | 927 ++++++++++++++++++++++++++++++++ doc/changelog.rst | 856 ----------------------------- doc/conf.py | 11 +- doc/requirements-doc.txt | 1 + 6 files changed, 996 insertions(+), 859 deletions(-) create mode 100644 .github/release.yml create mode 100644 doc/changelog.md delete mode 100644 doc/changelog.rst diff --git a/.github/release.yml b/.github/release.yml new file mode 100644 index 000000000..92304e6a9 --- /dev/null +++ b/.github/release.yml @@ -0,0 +1,56 @@ +# +# BSD 2-Clause License +# +# Copyright (c) 2021-2024, Hewlett Packard Enterprise +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +changelog: + exclude: + labels: + - ignore-for-release + categories: + - title: Features + labels: + - 'type: feature' + exclude: + labels: + - non-user-facing + - title: Bug Fixes + labels: + - 'bug: critical' + - 'bug: major' + - 'bug: minor' + exclude: + labels: + - non-user-facing + - title: API Breaks + labels: + - 'API break' + exclude: + labels: + - non-user-facing + - title: Miscellaneous Improvements + labels: + - "*" \ No newline at end of file diff --git a/.github/workflows/changelog.yml b/.github/workflows/changelog.yml index 220e9c210..3346206d1 100644 --- a/.github/workflows/changelog.yml +++ b/.github/workflows/changelog.yml @@ -45,5 +45,5 @@ jobs: - name: Changelog Enforcer uses: dangoslen/changelog-enforcer@v3.6.0 with: - changeLogPath: './doc/changelog.rst' - missingUpdateErrorMessage: 'changelog.rst has not been updated' \ No newline at end of file + changeLogPath: './doc/changelog.md' + missingUpdateErrorMessage: 'changelog.md has not been updated' \ No newline at end of file diff --git a/doc/changelog.md b/doc/changelog.md new file mode 100644 index 000000000..d95670f2c --- /dev/null +++ b/doc/changelog.md @@ -0,0 +1,927 @@ +# Changelog + +Listed here are the 
changes between each release of SmartSim, +SmartRedis and SmartDashboard. + +Jump to: +- {ref}`SmartRedis changelog` +- {ref}`SmartDashboard changelog` + +## SmartSim + +### Development branch + +To be released at some future point in time + +Description + +- Update authentication in release workflow +- Auto-generate type-hints into documentation +- Auto-post release PR to develop +- Bump manifest.json to version 0.0.4 +- Fix symlinking batch ensemble and model bug +- Fix noisy failing WLM test +- Remove defensive regexp in .gitignore +- Upgrade ubuntu to 22.04 +- Remove helper function `init_default` +- Fix telemetry monitor logging errors for task history +- Change default path for entities +- Drop Python 3.8 support +- Update watchdog dependency +- Historical output files stored under .smartsim directory +- Fixes unfalsifiable test that tests SmartSim's custom SIGINT signal + handler +- Add option to build Torch backend without the Intel Math Kernel + Library +- Fix ReadTheDocs build issue +- Disallow uninitialized variable use +- Promote device options to an Enum +- Update telemetry monitor, add telemetry collectors +- Add method to specify node features for a Slurm job +- Colo Orchestrator setup now blocks application start until setup + finished +- Refactor areas of the code where mypy flags potential errors +- Minor enhancements to test suite +- ExecArgs handling correction +- ReadTheDocs config file added and enabled on PRs +- Enforce changelog updates +- Fix Jupyter notebook math expressions +- Remove deprecated SmartSim modules +- SmartSim Documentation refactor +- Promote SmartSim statuses to a dedicated type +- Update the version of Redis from `7.0.4` to + `7.2.4` +- Increase disk space in doc builder container +- Update Experiment API typing +- Prevent duplicate entity names +- Fix publishing of development docs + +Detailed Notes + +- Replace the developer created token with the GH_TOKEN environment variable. 
+ ([SmartSim-PR570](https://github.com/CrayLabs/SmartSim/pull/570)) +- Add extension to auto-generate function type-hints into documentation. + ([SmartSim-PR561](https://github.com/CrayLabs/SmartSim/pull/561)) +- Add to github release workflow to auto generate a pull request from + master into develop for release. + ([SmartSim-PR566](https://github.com/CrayLabs/SmartSim/pull/566)) +- The manifest.json version needs to match the SmartDashboard version, + which is 0.0.4 in the upcoming release. + ([SmartSim-PR563](https://github.com/CrayLabs/SmartSim/pull/563)) +- Properly symlinks batch ensembles and batch models. + ([SmartSim-PR547](https://github.com/CrayLabs/SmartSim/pull/547)) +- Remove defensive regexp in .gitignore and ensure tests write to + test_output. + ([SmartSim-PR560](https://github.com/CrayLabs/SmartSim/pull/560)) +- After dropping support for Python 3.8, ubuntu needs to be upgraded. + ([SmartSim-PR558](https://github.com/CrayLabs/SmartSim/pull/558)) +- Remove helper function `init_default` and replace with traditional + type narrowing. + ([SmartSim-PR545](https://github.com/CrayLabs/SmartSim/pull/545)) +- Ensure the telemetry monitor does not track a task_id for a managed + task. + ([SmartSim-PR557](https://github.com/CrayLabs/SmartSim/pull/557)) +- The default path for an entity is now the path to the experiment / + the entity name. create_database and create_ensemble now have path + arguments. All path arguments are compatible with relative paths. + Relative paths are relative to the CWD. + ([SmartSim-PR533](https://github.com/CrayLabs/SmartSim/pull/533)) +- Python 3.8 is reaching its end-of-life in October, 2024, so it will + no longer continue to be supported. 
([SmartSim-PR544](https://github.com/CrayLabs/SmartSim/pull/544)) +- Update watchdog dependency from 3.x to 4.x, fix new type issues + ([SmartSim-PR540](https://github.com/CrayLabs/SmartSim/pull/540)) +- The dashboard needs to display historical logs, so log files are + written out under the .smartsim directory and files under the + experiment directory are symlinked to them. + ([SmartSim-PR532](https://github.com/CrayLabs/SmartSim/pull/532)) +- Add an option to smart build + \"\--torch_with_mkl\"/\"\--no_torch_with_mkl\" to prevent Torch from + trying to link in the Intel Math Kernel Library. This is needed + because on machines that have the Intel compilers installed, + Torch will unconditionally try to link in this library, but the + link fails because the linking flags are incorrect. + ([SmartSim-PR538](https://github.com/CrayLabs/SmartSim/pull/538)) +- Change type_extension and pydantic versions in readthedocs + environment to enable docs build. + ([SmartSim-PR537](https://github.com/CrayLabs/SmartSim/pull/537)) +- Promote devices to a dedicated Enum type throughout the SmartSim + code base. + ([SmartSim-PR527](https://github.com/CrayLabs/SmartSim/pull/527)) +- Update the telemetry monitor to enable retrieval of metrics on a + scheduled interval. Switch basic experiment tracking telemetry to + default to on. Add database metric collectors. Improve telemetry + monitor logging. Create telemetry subpackage at + `smartsim._core.utils.telemetry`. Refactor telemetry + monitor entrypoint. + ([SmartSim-PR460](https://github.com/CrayLabs/SmartSim/pull/460)) +- Users can now specify node features for a Slurm job through + `SrunSettings.set_node_feature`. The method accepts a string or list + of strings. + ([SmartSim-PR529](https://github.com/CrayLabs/SmartSim/pull/529)) +- The request to the colocated entrypoints file within the shell + script is now a blocking process. 
Once the Orchestrator is set up, it + returns, which moves the process to the background and allows the + application to start. This prevents the application from requesting + an ML model or script that has not been uploaded to the Orchestrator + yet. + ([SmartSim-PR522](https://github.com/CrayLabs/SmartSim/pull/522)) +- Add checks and tests to ensure SmartSim users cannot initialize run + settings with a list of lists as the exe_args argument. + ([SmartSim-PR517](https://github.com/CrayLabs/SmartSim/pull/517)) +- Add readthedocs configuration file and enable readthedocs builds on + pull requests. Additionally added robots.txt file generation when + a readthedocs environment is detected. + ([SmartSim-PR512](https://github.com/CrayLabs/SmartSim/pull/512)) +- Add Github Actions workflow that checks if changelog is edited on + pull requests into develop. + ([SmartSim-PR518](https://github.com/CrayLabs/SmartSim/pull/518)) +- Add path to MathJax.js file so that Sphinx will use it to render math + expressions. + ([SmartSim-PR516](https://github.com/CrayLabs/SmartSim/pull/516)) +- Removed deprecated SmartSim modules: slurm and mpirunSettings. + ([SmartSim-PR514](https://github.com/CrayLabs/SmartSim/pull/514)) +- Implemented new structure of SmartSim documentation. Added examples + images and further detail of SmartSim components. + ([SmartSim-PR463](https://github.com/CrayLabs/SmartSim/pull/463)) +- Promote SmartSim statuses to a dedicated type named SmartSimStatus. + ([SmartSim-PR509](https://github.com/CrayLabs/SmartSim/pull/509)) +- Update Redis version to `7.2.4`. This change fixes an + issue in the Redis build scripts causing failures on Apple Silicon + hosts. + ([SmartSim-PR507](https://github.com/CrayLabs/SmartSim/pull/507)) +- The container which builds the documentation for every merge to + develop was failing due to a lack of space within the container. 
+ This was fixed by including an additional Github action that removes + some unneeded software and files that come from the default Github + Ubuntu container. + ([SmartSim-PR504](https://github.com/CrayLabs/SmartSim/pull/504)) +- Update the generic [t.Any]{.title-ref} typehints in Experiment API. + ([SmartSim-PR501](https://github.com/CrayLabs/SmartSim/pull/501)) +- The CI will fail static analysis if common erroneous truthy checks + are detected. + ([SmartSim-PR524](https://github.com/CrayLabs/SmartSim/pull/524)) +- Prevent the launch of duplicate named entities. Allow completed + entities to run. + ([SmartSim-PR480](https://github.com/CrayLabs/SmartSim/pull/480)) +- The CI will fail static analysis if a local variable is used while + potentially undefined. + ([SmartSim-PR521](https://github.com/CrayLabs/SmartSim/pull/521)) +- Remove previously deprecated behavior present in test suite on + machines with Slurm and Open MPI. + ([SmartSim-PR520](https://github.com/CrayLabs/SmartSim/pull/520)) +- Experiments in the WLM tests are given explicit paths to prevent + unexpected directory creation. Ensure databases are not left open on + test suite failures. Update path to pickle file in + `tests/full_wlm/test_generic_orc_launch_batch.py::test_launch_cluster_orc_reconnect` + to conform with changes made in + ([SmartSim-PR533](https://github.com/CrayLabs/SmartSim/pull/533)). + ([SmartSim-PR559](https://github.com/CrayLabs/SmartSim/pull/559)) +- When calling `Experiment.start` SmartSim would register a signal + handler that would capture an interrupt signal (\^C) to kill any + jobs launched through its `JobManager`. This would replace the + default (or user defined) signal handler. SmartSim will now attempt + to kill any launched jobs before calling the previously registered + signal handler.
+ ([SmartSim-PR535](https://github.com/CrayLabs/SmartSim/pull/535)) + +### 0.6.2 + +Released on 16 February, 2024 + +Description + +- Patch SmartSim dependency version + +Detailed Notes + +- A critical performance concern was identified and addressed in + SmartRedis. A patch fix was deployed, and SmartSim was updated to + ensure users do not inadvertently pull the unpatched version of + SmartRedis. + ([SmartSim-PR493](https://github.com/CrayLabs/SmartSim/pull/493)) + +### 0.6.1 + +Released on 15 February, 2024 + +Description + +- Duplicate for DBModel/Script prevented +- Update license to include 2024 +- Telemetry monitor is now active by default +- Add support for Mac OSX on Apple Silicon +- Remove Torch warnings during testing +- Validate Slurm timing format +- Expose Python Typehints +- Fix test_logs to prevent generation of directory +- Fix Python Typehint for colocated database settings +- Python 3.11 Support +- Quality of life [smart validate]{.title-ref} improvements +- Remove Cobalt support +- Enrich logging through context variables +- Upgrade Machine Learning dependencies +- Override sphinx-tabs background color +- Add concurrency group to test workflow +- Fix index when installing torch through smart build + +Detailed Notes + +- Modify the [git clone]{.title-ref} for both Redis and RedisAI to set + the line endings to unix-style line endings when using MacOS on ARM. + ([SmartSim-PR482](https://github.com/CrayLabs/SmartSim/pull/482)) +- Separate install instructions are now provided for Mac OSX on x64 vs + ARM64 + ([SmartSim-PR479](https://github.com/CrayLabs/SmartSim/pull/479)) +- Prevent duplicate ML model and script names being added to an + Ensemble member if the names exists. + ([SmartSim-PR475](https://github.com/CrayLabs/SmartSim/pull/475)) +- Updates [Copyright (c) 2021-2023]{.title-ref} to [Copyright (c) + 2021-2024]{.title-ref} in all of the necessary files. 
+ ([SmartSim-PR485](https://github.com/CrayLabs/SmartSim/pull/485)) +- Bug fix which prevents the expected behavior when the + [SMARTSIM_LOG_LEVEL]{.title-ref} environment variable was set to + [developer]{.title-ref}. + ([SmartSim-PR473](https://github.com/CrayLabs/SmartSim/pull/473)) +- Sets the default value of the \"enable telemetry\" flag to on. Bumps + the output [manifest.json]{.title-ref} version number to match that + of [smartdashboard]{.title-ref} and pins a watchdog version to avoid + build errors. + ([SmartSim-PR477](https://github.com/CrayLabs/SmartSim/pull/477)) +- Refactor logic of [Manifest.has_db_objects]{.title-ref} to remove + excess branching and improve readability/maintainability. + ([SmartSim-PR476](https://github.com/CrayLabs/SmartSim/pull/476)) +- SmartSim can now be built and used on platforms using Apple Silicon + (ARM64). Currently, only the PyTorch backend is supported. Note that + libtorch will be downloaded from a CrayLabs github repo. + ([SmartSim-PR465](https://github.com/CrayLabs/SmartSim/pull/465)) +- Tests that were saving Torch models were emitting warnings. These + warnings were addressed by updating the model save test function. + ([SmartSim-PR472](https://github.com/CrayLabs/SmartSim/pull/472)) +- Validate the timing format when requesting a slurm allocation. + ([SmartSim-PR471](https://github.com/CrayLabs/SmartSim/pull/471)) +- Add and ship [py.typed]{.title-ref} marker to expose inline type + hints. Fix type errors related to SmartRedis. + ([SmartSim-PR468](https://github.com/CrayLabs/SmartSim/pull/468)) +- Fix the [test_logs.py::test_context_leak]{.title-ref} test that was + erroneously creating a directory named [some value]{.title-ref} in + SmartSim\'s root directory. + ([SmartSim-PR467](https://github.com/CrayLabs/SmartSim/pull/467)) +- Add Python type hinting to colocated settings. + ([SmartSim-PR462](https://github.com/CrayLabs/SmartSim/pull/462)) +- Add github actions for running black and isort checks. 
+ ([SmartSim-PR464](https://github.com/CrayLabs/SmartSim/pull/464)) +- Relax the required version of [typing_extensions]{.title-ref}. + ([SmartSim-PR459](https://github.com/CrayLabs/SmartSim/pull/459)) +- Addition of Python 3.11 to SmartSim. + ([SmartSim-PR461](https://github.com/CrayLabs/SmartSim/pull/461)) +- Quality of life [smart validate]{.title-ref} improvements such as + setting [CUDA_VISIBLE_DEVICES]{.title-ref} environment variable + within [smart validate]{.title-ref} prior to importing any ML deps + to prevent false negatives on multi-GPU systems. Additionally, move + SmartRedis logs from standard out to dedicated log file in the + validation temporary directory as well as suppress + [sklearn]{.title-ref} deprecation warning by pinning + [KMeans]{.title-ref} constructor argument. Lastly, move TF test to + last as TF may reserve the GPUs it uses. + ([SmartSim-PR458](https://github.com/CrayLabs/SmartSim/pull/458)) +- Some actions in the current GitHub CI/CD workflows were outdated. + They were replaced with the latest versions. + ([SmartSim-PR446](https://github.com/CrayLabs/SmartSim/pull/446)) +- As the Cobalt workload manager is not used on any system we are + aware of, its support in SmartSim was terminated and classes such as + [CobaltLauncher]{.title-ref} have been removed. + ([SmartSim-PR448](https://github.com/CrayLabs/SmartSim/pull/448)) +- Experiment logs are written to a file that can be read by the + dashboard. + ([SmartSim-PR452](https://github.com/CrayLabs/SmartSim/pull/452)) +- Updated SmartSim\'s machine learning backends to PyTorch 2.0.1, + Tensorflow 2.13.1, ONNX 1.14.1, and ONNX Runtime 1.16.1. As a result + of this change, there is now an available ONNX wheel for use with + Python 3.10, and wheels for all of SmartSim\'s machine learning + backends with Python 3.11. 
+ ([SmartSim-PR451](https://github.com/CrayLabs/SmartSim/pull/451)) + ([SmartSim-PR461](https://github.com/CrayLabs/SmartSim/pull/461)) +- The sphinx-tabs documentation extension uses a white background for + the tabs component. A custom CSS for those components to inherit the + overall theme color has been added. + ([SmartSim-PR453](https://github.com/CrayLabs/SmartSim/pull/453)) +- Add concurrency groups to GitHub\'s CI/CD workflows, preventing + multiple workflows from the same PR to be launched concurrently. + ([SmartSim-PR439](https://github.com/CrayLabs/SmartSim/pull/439)) +- Torch changed their preferred indexing when trying to install their + provided wheels. Updated the [pip install]{.title-ref} command + within [smart build]{.title-ref} to ensure that the appropriate + packages can be found. + ([SmartSim-PR449](https://github.com/CrayLabs/SmartSim/pull/449)) + +### 0.6.0 + +Released on 18 December, 2023 + +Description + +- Conflicting directives in the SmartSim packaging instructions were + fixed +- [sacct]{.title-ref} and [sstat]{.title-ref} errors are now fatal for + Slurm-based workflow executions +- Added documentation section about ML features and TorchScript +- Added TorchScript functions to Online Analysis tutorial +- Added multi-DB example to documentation +- Improved test stability on HPC systems +- Added support for producing & consuming telemetry outputs +- Split tests into groups for parallel execution in CI/CD pipeline +- Change signature of [Experiment.summary()]{.title-ref} +- Expose first_device parameter for scripts, functions, models +- Added support for MINBATCHTIMEOUT in model execution +- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit +- Add support for multiple databases + +Detailed Notes + +- Several conflicting directives between the [setup.py]{.title-ref} + and the [setup.cfg]{.title-ref} were fixed to mitigate warnings + issued when building the pip wheel. 
+ ([SmartSim-PR435](https://github.com/CrayLabs/SmartSim/pull/435)) +- When the Slurm functions [sacct]{.title-ref} and [sstat]{.title-ref} + returned an error, it would be ignored and SmartSim\'s state could + become inconsistent. To prevent this, errors raised by + [sacct]{.title-ref} or [sstat]{.title-ref} now result in an + exception. + ([SmartSim-PR392](https://github.com/CrayLabs/SmartSim/pull/392)) +- A section named *ML Features* was added to documentation. It + contains multiple examples of how ML models and functions can be + added to and executed on the DB. TorchScript-based post-processing + was added to the *Online Analysis* tutorial + ([SmartSim-PR411](https://github.com/CrayLabs/SmartSim/pull/411)) +- An example of how to use multiple Orchestrators concurrently was + added to the documentation + ([SmartSim-PR409](https://github.com/CrayLabs/SmartSim/pull/409)) +- The test infrastructure was improved. Tests on HPC system are now + stable, and issues such as non-stopped [Orchestrators]{.title-ref} + or experiments created in the wrong paths have been fixed + ([SmartSim-PR381](https://github.com/CrayLabs/SmartSim/pull/381)) +- A telemetry monitor was added to check updates and produce events + for SmartDashboard + ([SmartSim-PR426](https://github.com/CrayLabs/SmartSim/pull/426)) +- Split tests into [group_a]{.title-ref}, [group_b]{.title-ref}, + [slow_tests]{.title-ref} for parallel execution in CI/CD pipeline + ([SmartSim-PR417](https://github.com/CrayLabs/SmartSim/pull/417), + [SmartSim-PR424](https://github.com/CrayLabs/SmartSim/pull/424)) +- Change [format]{.title-ref} argument to [style]{.title-ref} in + [Experiment.summary()]{.title-ref}, this is an API break + ([SmartSim-PR391](https://github.com/CrayLabs/SmartSim/pull/391)) +- Added support for first_device parameter for scripts, functions, and + models. 
This causes them to be loaded to the first num_devices + beginning with first_device + ([SmartSim-PR394](https://github.com/CrayLabs/SmartSim/pull/394)) +- Added support for MINBATCHTIMEOUT in model execution, which caps the + delay waiting for a minimum number of model execution operations to + accumulate before executing them as a batch + ([SmartSim-PR387](https://github.com/CrayLabs/SmartSim/pull/387)) +- RedisAI 1.2.5 is not supported anymore. The only RedisAI version is + now 1.2.7. Since the officially released RedisAI 1.2.7 has a bug + which breaks the build process on Mac OSX, it was decided to use + commit + [634916c](https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2) + from RedisAI\'s GitHub repository, where this bug has been fixed. + This applies to all operating systems. + ([SmartSim-PR383](https://github.com/CrayLabs/SmartSim/pull/383)) +- Add support for creation of multiple databases with unique + identifiers. + ([SmartSim-PR342](https://github.com/CrayLabs/SmartSim/pull/342)) + +### 0.5.1 + +Released on 14 September, 2023 + +Description + +- Add typehints throughout the SmartSim codebase +- Provide support for Slurm heterogeneous jobs +- Provide better support for [PalsMpiexecSettings]{.title-ref} +- Allow for easier inspection of SmartSim entities +- Log ignored error messages from [sacct]{.title-ref} +- Fix colocated db preparation bug when using + [JsrunSettings]{.title-ref} +- Fix bug when user specifies CPU and devices greater than 1 +- Fix bug when get_allocation is called with reserved keywords +- Enabled mypy in CI for better type safety +- Mitigate additional suppressed pylint errors +- Update linting support and apply to existing errors +- Various improvements to the [smart]{.title-ref} CLI +- Various documentation improvements +- Various test suite improvements + +Detailed Notes + +- Add methods to allow users to inspect files attached to models and + ensembles.
+ ([SmartSim-PR352](https://github.com/CrayLabs/SmartSim/pull/352)) +- Add a [smart info]{.title-ref} target to provide rudimentary + information about the SmartSim installation. + ([SmartSim-PR350](https://github.com/CrayLabs/SmartSim/pull/350)) +- Remove unnecessary generation producing unexpected directories in + the test suite. + ([SmartSim-PR349](https://github.com/CrayLabs/SmartSim/pull/349)) +- Add support for heterogeneous jobs to [SrunSettings]{.title-ref} by + allowing users to set the [\--het-group]{.title-ref} parameter. + ([SmartSim-PR346](https://github.com/CrayLabs/SmartSim/pull/346)) +- Provide clearer guidelines on how to contribute to SmartSim. + ([SmartSim-PR344](https://github.com/CrayLabs/SmartSim/pull/344)) +- Integrate [PalsMpiexecSettings]{.title-ref} into the + [Experiment]{.title-ref} factory methods when using the + [\"pals\"]{.title-ref} launcher. + ([SmartSim-PR343](https://github.com/CrayLabs/SmartSim/pull/343)) +- Create public properties where appropriate to mitigate + [protected-access]{.title-ref} errors. + ([SmartSim-PR341](https://github.com/CrayLabs/SmartSim/pull/341)) +- Fix a failure to execute [\_prep_colocated_db]{.title-ref} due to + incorrect named attr check. + ([SmartSim-PR339](https://github.com/CrayLabs/SmartSim/pull/339)) +- Enabled and mitigated mypy [disallow_any_generics]{.title-ref} and + [warn_return_any]{.title-ref}. + ([SmartSim-PR338](https://github.com/CrayLabs/SmartSim/pull/338)) +- Add a [smart validate]{.title-ref} target to provide a simple smoke + test to assess a SmartSim build. + ([SmartSim-PR336](https://github.com/CrayLabs/SmartSim/pull/336), + [SmartSim-PR351](https://github.com/CrayLabs/SmartSim/pull/351)) +- Add typehints to [smartsim.\_core.launcher.step.\*]{.title-ref}. + ([SmartSim-PR334](https://github.com/CrayLabs/SmartSim/pull/334)) +- Log errors reported from slurm WLM when attempts to retrieve status + fail. 
+ ([SmartSim-PR331](https://github.com/CrayLabs/SmartSim/pull/331), + [SmartSim-PR332](https://github.com/CrayLabs/SmartSim/pull/332)) +- Fix incorrectly formatted positional arguments in log format + strings. + ([SmartSim-PR330](https://github.com/CrayLabs/SmartSim/pull/330)) +- Ensure that launchers pass environment variables to unmanaged job + steps. + ([SmartSim-PR329](https://github.com/CrayLabs/SmartSim/pull/329)) +- Add additional tests surrounding the [RAI_PATH]{.title-ref} + configuration environment variable. + ([SmartSim-PR328](https://github.com/CrayLabs/SmartSim/pull/328)) +- Remove unnecessary execution of unescaped shell commands. + ([SmartSim-PR327](https://github.com/CrayLabs/SmartSim/pull/327)) +- Add error if user calls get_allocation with reserved keywords in + slurm get_allocation. + ([SmartSim-PR325](https://github.com/CrayLabs/SmartSim/pull/325)) +- Add error when user requests CPU with devices greater than 1 within + add_ml_model and add_script. + ([SmartSim-PR324](https://github.com/CrayLabs/SmartSim/pull/324)) +- Update documentation surrounding ensemble key prefixing. + ([SmartSim-PR322](https://github.com/CrayLabs/SmartSim/pull/322)) +- Fix formatting of the Frontier site installation. + ([SmartSim-PR321](https://github.com/CrayLabs/SmartSim/pull/321)) +- Update pylint dependency, update .pylintrc, mitigate non-breaking + issues, suppress api breaks. + ([SmartSim-PR311](https://github.com/CrayLabs/SmartSim/pull/311)) +- Refactor the [smart]{.title-ref} CLI to use subparsers for better + documentation and extension. 
+ ([SmartSim-PR308](https://github.com/CrayLabs/SmartSim/pull/308)) + +### 0.5.0 + +Released on 6 July, 2023 + +Description + +A full list of changes and detailed notes can be found below: + +- Update SmartRedis dependency to v0.4.1 +- Fix tests for db models and scripts +- Fix add_ml_model() and add_script() documentation, tests, and code +- Remove [requirements.txt]{.title-ref} and other places where + dependencies were defined +- Replace [limit_app_cpus]{.title-ref} with + [limit_db_cpus]{.title-ref} for co-located orchestrators +- Remove wait time associated with Experiment launch summary +- Update and rename Redis conf file +- Migrate from redis-py-cluster to redis-py +- Update full test suite to not require a TF wheel at test time +- Update doc strings +- Remove deprecated code +- Relax the coloredlogs version +- Update Fortran tutorials for SmartRedis +- Add support for multiple network interface binding in Orchestrator + and Colocated DBs +- Add typehints and static analysis + +Detailed notes + +- Updates SmartRedis to the most current release + ([SmartSim-PR316](https://github.com/CrayLabs/SmartSim/pull/316)) +- Fixes and enhancements to documentation + ([SmartSim-PR317](https://github.com/CrayLabs/SmartSim/pull/317), + [SmartSim-PR314](https://github.com/CrayLabs/SmartSim/pull/314), + [SmartSim-PR287](https://github.com/CrayLabs/SmartSim/pull/287)) +- Various fixes and enhancements to the test suite + ([SmartSim-PR315](https://github.com/CrayLabs/SmartSim/pull/315), + [SmartSim-PR312](https://github.com/CrayLabs/SmartSim/pull/312), + [SmartSim-PR310](https://github.com/CrayLabs/SmartSim/pull/310), + [SmartSim-PR302](https://github.com/CrayLabs/SmartSim/pull/302), + [SmartSim-PR283](https://github.com/CrayLabs/SmartSim/pull/283)) +- Fix a defect in the tests related to database models and scripts + that was causing key collisions when testing on workload managers + ([SmartSim-PR313](https://github.com/CrayLabs/SmartSim/pull/313)) +- Remove
[requirements.txt]{.title-ref} and other places where + dependencies were defined. + ([SmartSim-PR307](https://github.com/CrayLabs/SmartSim/pull/307)) +- Fix defect where dictionaries used to create run settings can be + changed unexpectedly due to copy-by-ref + ([SmartSim-PR305](https://github.com/CrayLabs/SmartSim/pull/305)) +- The underlying code for Model.add_ml_model() and Model.add_script() + was fixed to correctly handle multi-GPU configurations. Tests were + updated to run on non-local launchers. Documentation was updated and + fixed. Also, the default testing interface has been changed to lo + instead of ipogif. + ([SmartSim-PR304](https://github.com/CrayLabs/SmartSim/pull/304)) +- Typehints have been added. A makefile target [make + check-mypy]{.title-ref} executes static analysis with mypy. + ([SmartSim-PR295](https://github.com/CrayLabs/SmartSim/pull/295), + [SmartSim-PR301](https://github.com/CrayLabs/SmartSim/pull/301), + [SmartSim-PR303](https://github.com/CrayLabs/SmartSim/pull/303)) +- Replace [limit_app_cpus]{.title-ref} with + [limit_db_cpus]{.title-ref} for co-located orchestrators. This + resolves some incorrect behavior/assumptions about how the + application would be pinned. 
Instead, users should directly specify + the binding options in their application using the options + appropriate for their launcher + ([SmartSim-PR306](https://github.com/CrayLabs/SmartSim/pull/306)) +- Simplify code in [random_permutations]{.title-ref} parameter + generation strategy + ([SmartSim-PR300](https://github.com/CrayLabs/SmartSim/pull/300)) +- Remove wait time associated with Experiment launch summary + ([SmartSim-PR298](https://github.com/CrayLabs/SmartSim/pull/298)) +- Update Redis conf file to conform with Redis v7.0.5 conf file + ([SmartSim-PR293](https://github.com/CrayLabs/SmartSim/pull/293)) +- Migrate from redis-py-cluster to redis-py for cluster status checks + ([SmartSim-PR292](https://github.com/CrayLabs/SmartSim/pull/292)) +- Update full test suite to no longer require a tensorflow wheel to be + available at test time. + ([SmartSim-PR291](https://github.com/CrayLabs/SmartSim/pull/291)) +- Correct spelling of colocated in doc strings + ([SmartSim-PR290](https://github.com/CrayLabs/SmartSim/pull/290)) +- Deprecated launcher-specific orchestrators, constants, and ML + utilities were removed. + ([SmartSim-PR289](https://github.com/CrayLabs/SmartSim/pull/289)) +- Relax the coloredlogs version to be greater than 10.0 + ([SmartSim-PR288](https://github.com/CrayLabs/SmartSim/pull/288)) +- Update the Github Actions runner image from + [macos-10.15]{.title-ref} to [macos-12]{.title-ref}. The former + began deprecation in May 2022 and was finally removed in May 2023. + ([SmartSim-PR285](https://github.com/CrayLabs/SmartSim/pull/285)) +- The Fortran tutorials had not been fully updated to show how to + handle return/error codes. These have now all been updated. + ([SmartSim-PR284](https://github.com/CrayLabs/SmartSim/pull/284)) +- Orchestrator and Colocated DB now accept a list of interfaces to + bind to. The argument name is still [interface]{.title-ref} for + backward compatibility reasons.
+ ([SmartSim-PR281](https://github.com/CrayLabs/SmartSim/pull/281)) +- Typehints have been added to public APIs. A makefile target to + execute static analysis with mypy is available [make + check-mypy]{.title-ref}. + ([SmartSim-PR295](https://github.com/CrayLabs/SmartSim/pull/295)) + +### 0.4.2 + +Released on April 12, 2023 + +Description + +This release of SmartSim had a focus on polishing and extending existing +features already provided by SmartSim. Most notably, this release +provides support to allow users to colocate their models with an +orchestrator using Unix domain sockets and support for launching models +as batch jobs. + +Additionally, SmartSim has updated its tool chains to provide a better +user experience. Notably, SmartSim can now be used with Python 3.10, +Redis 7.0.5, and RedisAI 1.2.7. Furthermore, SmartSim now utilizes +SmartRedis\'s aggregation lists to streamline the use and extension of +ML data loaders, making working with popular machine learning frameworks +in SmartSim a breeze. + +A full list of changes and detailed notes can be found below: + +- Add support for colocating an orchestrator over UDS +- Add support for Python 3.10, deprecate support for Python 3.7 and + RedisAI 1.2.3 +- Drop support for Ray +- Update ML data loaders to make use of SmartRedis\'s aggregation + lists +- Allow for models to be launched independently as batch jobs +- Update current version of Redis to 7.0.5 +- Add support for RedisAI 1.2.7, PyTorch 1.11.0, Tensorflow 2.8.0, + ONNXRuntime 1.11.1 +- Fix bug in colocated database entrypoint when loading PyTorch models +- Fix test suite behavior with environment variables + +Detailed Notes + +- Running some tests could result in some SmartSim-specific + environment variables being set. Such environment variables are now + reset after each test execution.
Also, a warning for environment + variable usage in Slurm was added, to make the user aware in case an + environment variable will not be assigned the desired value with + [\--export]{.title-ref}. + ([SmartSim-PR270](https://github.com/CrayLabs/SmartSim/pull/270)) +- The PyTorch and TensorFlow data loaders were updated to make use of + aggregation lists. This breaks their API, but makes them easier to + use. + ([SmartSim-PR264](https://github.com/CrayLabs/SmartSim/pull/264)) +- The support for Ray was dropped, as its most recent versions caused + problems when deployed through SmartSim. We plan to release a + separate add-on library to accomplish the same results. If you are + interested in getting the Ray launch functionality back in your + workflow, please get in touch with us! + ([SmartSim-PR263](https://github.com/CrayLabs/SmartSim/pull/263)) +- Update from Redis version 6.0.8 to 7.0.5. + ([SmartSim-PR258](https://github.com/CrayLabs/SmartSim/pull/258)) +- Adds support for Python 3.10 without the ONNX machine learning + backend. Deprecates support for Python 3.7 as it will stop receiving + security updates. Deprecates support for RedisAI 1.2.3. Update the + build process to be able to correctly fetch supported dependencies. + If a user attempts to build an unsupported dependency, an error + message is shown highlighting the discrepancy. + ([SmartSim-PR256](https://github.com/CrayLabs/SmartSim/pull/256)) +- Models were given a [batch_settings]{.title-ref} attribute. When + launching a model through [Experiment.start]{.title-ref} the + [Experiment]{.title-ref} will first check for a non-nullish value at + that attribute. If the check is satisfied, the + [Experiment]{.title-ref} will attempt to wrap the underlying run + command in a batch job using the object referenced at + [Model.batch_settings]{.title-ref} as the batch settings for the + job. If the check is not satisfied, the [Model]{.title-ref} is + launched in the traditional manner as a job step.
+ ([SmartSim-PR245](https://github.com/CrayLabs/SmartSim/pull/245)) +- Fix bug in colocated database entrypoint stemming from uninitialized + variables. This bug affects PyTorch models being loaded into the + database. + ([SmartSim-PR237](https://github.com/CrayLabs/SmartSim/pull/237)) +- The release of RedisAI 1.2.7 allows us to update support for recent + versions of PyTorch, Tensorflow, and ONNX + ([SmartSim-PR234](https://github.com/CrayLabs/SmartSim/pull/234)) +- Make installation of correct Torch backend more reliable according + to instruction from PyTorch +- In addition to TCP, add UDS support for colocating an orchestrator + with models. Methods [Model.colocate_db_tcp]{.title-ref} and + [Model.colocate_db_uds]{.title-ref} were added to expose this + functionality. The [Model.colocate_db]{.title-ref} method remains + and uses TCP for backward compatibility + ([SmartSim-PR246](https://github.com/CrayLabs/SmartSim/pull/246)) + +### 0.4.1 + +Released on June 24, 2022 + +Description: This release of SmartSim introduces a new experimental +feature to help make SmartSim workflows more portable: the ability to +run simulations models in a container via Singularity. This feature has +been tested on a small number of platforms and we encourage users to +provide feedback on its use. + +We have also made improvements in a variety of areas: new utilities to +load scripts and machine learning models into the database directly from +SmartSim driver scripts and install-time choice to use either +[KeyDB]{.title-ref} or [Redis]{.title-ref} for the Orchestrator. The +[RunSettings]{.title-ref} API is now more consistent across subclasses. +Another key focus of this release was to aid new SmartSim users by +including more extensive tutorials and improving the documentation. The +docker image containing the SmartSim tutorials now also includes a +tutorial on online training. 
+ +Launcher improvements + +- New methods for specifying [RunSettings]{.title-ref} parameters + ([SmartSim-PR166](https://github.com/CrayLabs/SmartSim/pull/166)) + ([SmartSim-PR170](https://github.com/CrayLabs/SmartSim/pull/170)) +- Better support for [mpirun]{.title-ref}, [mpiexec]{.title-ref}, + and [orterun]{.title-ref} as launchers + ([SmartSim-PR186](https://github.com/CrayLabs/SmartSim/pull/186)) +- Experimental: add support for running models via Singularity + ([SmartSim-PR204](https://github.com/CrayLabs/SmartSim/pull/204)) + +Documentation and tutorials + +- Tutorial updates + ([SmartSim-PR155](https://github.com/CrayLabs/SmartSim/pull/155)) + ([SmartSim-PR203](https://github.com/CrayLabs/SmartSim/pull/203)) + ([SmartSim-PR208](https://github.com/CrayLabs/SmartSim/pull/208)) +- Add SmartSim Zoo info to documentation + ([SmartSim-PR175](https://github.com/CrayLabs/SmartSim/pull/175)) +- New tutorial for demonstrating online training + ([SmartSim-PR176](https://github.com/CrayLabs/SmartSim/pull/176)) + ([SmartSim-PR188](https://github.com/CrayLabs/SmartSim/pull/188)) + +General improvements and bug fixes + +- Set models and scripts at the driver level + ([SmartSim-PR185](https://github.com/CrayLabs/SmartSim/pull/185)) +- Optionally use KeyDB for the orchestrator + ([SmartSim-PR180](https://github.com/CrayLabs/SmartSim/pull/180)) +- Ability to specify system-level libraries + ([SmartSim-PR154](https://github.com/CrayLabs/SmartSim/pull/154)) + ([SmartSim-PR182](https://github.com/CrayLabs/SmartSim/pull/182)) +- Fix the handling of LSF gpus_per_shard + ([SmartSim-PR164](https://github.com/CrayLabs/SmartSim/pull/164)) +- Fix error when re-running [smart build]{.title-ref} + ([SmartSim-PR165](https://github.com/CrayLabs/SmartSim/pull/165)) +- Fix generator hanging when tagged configuration variables are + missing + ([SmartSim-PR177](https://github.com/CrayLabs/SmartSim/pull/177)) + +Dependency updates + +- CMake version from 3.10 to 3.13 + 
([SmartSim-PR152](https://github.com/CrayLabs/SmartSim/pull/152)) +- Update click to 8.0.2 + ([SmartSim-PR200](https://github.com/CrayLabs/SmartSim/pull/200)) + +### 0.4.0 + +Released on Feb 11, 2022 + +Description: In this release SmartSim continues to promote ease of use. +To this end SmartSim has introduced new portability features that allow +users to abstract away their targeted hardware, while providing even +more compatibility with existing libraries. + +A new feature, Co-located orchestrator deployments, has been added which +provides scalable online inference capabilities that overcome previous +performance limitations in separated orchestrator/application +deployments. For more information on advantages of co-located +deployments, see the Orchestrator section of the SmartSim documentation. + +The SmartSim build was significantly improved to increase customization +of build toolchain and the `smart` command line interface was expanded. + +Additional tweaks and upgrades have also been made to ensure an optimal +experience. Here is a comprehensive list of changes made in SmartSim +0.4.0.
+ +Orchestrator Enhancements: + +- Add Orchestrator Co-location + ([SmartSim-PR139](https://github.com/CrayLabs/SmartSim/pull/139)) +- Add Orchestrator configuration file edit methods + ([SmartSim-PR109](https://github.com/CrayLabs/SmartSim/pull/109)) + +Emphasize Driver Script Portability: + +- Add ability to create run settings through an experiment + ([SmartSim-PR110](https://github.com/CrayLabs/SmartSim/pull/110)) +- Add ability to create batch settings through an experiment + ([SmartSim-PR112](https://github.com/CrayLabs/SmartSim/pull/112)) +- Add automatic launcher detection to experiment portability + functions + ([SmartSim-PR120](https://github.com/CrayLabs/SmartSim/pull/120)) + +Expand Machine Learning Library Support: + +- Data loaders for online training in Keras/TF and PyTorch + ([SmartSim-PR115](https://github.com/CrayLabs/SmartSim/pull/115)) + ([SmartSim-PR140](https://github.com/CrayLabs/SmartSim/pull/140)) +- ML backend versions updated with expanded support for multiple + versions + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) +- Launch Ray internally using `RunSettings` + ([SmartSim-PR118](https://github.com/CrayLabs/SmartSim/pull/118)) +- Add Ray cluster setup and deployment to SmartSim + ([SmartSim-PR50](https://github.com/CrayLabs/SmartSim/pull/50)) + +Expand Launcher Setting Options: + +- Add ability to use base `RunSettings` on Slurm or PBS launchers + ([SmartSim-PR90](https://github.com/CrayLabs/SmartSim/pull/90)) +- Add ability to use base `RunSettings` on LSF launcher + ([SmartSim-PR108](https://github.com/CrayLabs/SmartSim/pull/108)) + +Deprecations and Breaking Changes + +- Orchestrator classes combined into single implementation for + portability + ([SmartSim-PR139](https://github.com/CrayLabs/SmartSim/pull/139)) +- `smartsim.constants` changed to `smartsim.status` + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) +- `smartsim.tf` migrated to `smartsim.ml.tf` +
([SmartSim-PR115](https://github.com/CrayLabs/SmartSim/pull/115)) + ([SmartSim-PR140](https://github.com/CrayLabs/SmartSim/pull/140)) +- TOML configuration option removed in favor of environment variable + approach + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) + +General Improvements and Bug Fixes: + +- Improve and extend parameter handling + ([SmartSim-PR107](https://github.com/CrayLabs/SmartSim/pull/107)) + ([SmartSim-PR119](https://github.com/CrayLabs/SmartSim/pull/119)) +- Abstract away non-user facing implementation details + ([SmartSim-PR122](https://github.com/CrayLabs/SmartSim/pull/122)) +- Add various dimensions to the CI build matrix for SmartSim testing + ([SmartSim-PR130](https://github.com/CrayLabs/SmartSim/pull/130)) +- Add missing functions to LSFSettings API + ([SmartSim-PR113](https://github.com/CrayLabs/SmartSim/pull/113)) +- Add RedisAI checker for installed backends + ([SmartSim-PR137](https://github.com/CrayLabs/SmartSim/pull/137)) +- Remove heavy and unnecessary dependencies + ([SmartSim-PR116](https://github.com/CrayLabs/SmartSim/pull/116)) + ([SmartSim-PR132](https://github.com/CrayLabs/SmartSim/pull/132)) +- Fix LSFLauncher and LSFOrchestrator + ([SmartSim-PR86](https://github.com/CrayLabs/SmartSim/pull/86)) +- Fix over greedy Workload Manager Parsers + ([SmartSim-PR95](https://github.com/CrayLabs/SmartSim/pull/95)) +- Fix Slurm handling of comma-separated env vars + ([SmartSim-PR104](https://github.com/CrayLabs/SmartSim/pull/104)) +- Fix internal method calls + ([SmartSim-PR138](https://github.com/CrayLabs/SmartSim/pull/138)) + +Documentation Updates: + +- Updates to documentation build process + ([SmartSim-PR133](https://github.com/CrayLabs/SmartSim/pull/133)) + ([SmartSim-PR143](https://github.com/CrayLabs/SmartSim/pull/143)) +- Updates to documentation content + ([SmartSim-PR96](https://github.com/CrayLabs/SmartSim/pull/96)) + ([SmartSim-PR129](https://github.com/CrayLabs/SmartSim/pull/129)) + 
([SmartSim-PR136](https://github.com/CrayLabs/SmartSim/pull/136)) + ([SmartSim-PR141](https://github.com/CrayLabs/SmartSim/pull/141)) +- Update SmartSim Examples + ([SmartSim-PR68](https://github.com/CrayLabs/SmartSim/pull/68)) + ([SmartSim-PR100](https://github.com/CrayLabs/SmartSim/pull/100)) + +### 0.3.2 + +Released on August 10, 2021 + +Description: + +- Upgraded RedisAI backend to 1.2.3 + ([SmartSim-PR69](https://github.com/CrayLabs/SmartSim/pull/69)) +- PyTorch 1.7.1, TF 2.4.2, and ONNX 1.6-7 + ([SmartSim-PR69](https://github.com/CrayLabs/SmartSim/pull/69)) +- LSF launcher for IBM machines + ([SmartSim-PR62](https://github.com/CrayLabs/SmartSim/pull/62)) +- Improved code coverage by adding more unit tests + ([SmartSim-PR53](https://github.com/CrayLabs/SmartSim/pull/53)) +- Orchestrator methods to get address and check status + ([SmartSim-PR60](https://github.com/CrayLabs/SmartSim/pull/60)) +- Added Manifest object that tracks deployables in Experiments + ([SmartSim-PR61](https://github.com/CrayLabs/SmartSim/pull/61)) +- Bug fixes + ([SmartSim-PR52](https://github.com/CrayLabs/SmartSim/pull/52)) + ([SmartSim-PR58](https://github.com/CrayLabs/SmartSim/pull/58)) + ([SmartSim-PR67](https://github.com/CrayLabs/SmartSim/pull/67)) + ([SmartSim-PR73](https://github.com/CrayLabs/SmartSim/pull/73)) +- Updated documentation and examples + ([SmartSim-PR51](https://github.com/CrayLabs/SmartSim/pull/51)) + ([SmartSim-PR57](https://github.com/CrayLabs/SmartSim/pull/57)) + ([SmartSim-PR71](https://github.com/CrayLabs/SmartSim/pull/71)) +- Improved IP address acquisition + ([SmartSim-PR72](https://github.com/CrayLabs/SmartSim/pull/72)) +- Binding database to network interfaces + +### 0.3.1 + +Released on May 5, 2021 + +Description: This release was dedicated to making the install process +easier. SmartSim can be installed from PyPI now and the `smart` cli tool +makes installing the machine learning runtimes much easier. 
+ +- Pip install + ([SmartSim-PR42](https://github.com/CrayLabs/SmartSim/pull/42)) +- `smart` cli tool for ML backends + ([SmartSim-PR42](https://github.com/CrayLabs/SmartSim/pull/42)) +- Build Documentation for updated install + ([SmartSim-PR43](https://github.com/CrayLabs/SmartSim/pull/43)) +- Migrate from Jenkins to Github Actions CI + ([SmartSim-PR42](https://github.com/CrayLabs/SmartSim/pull/42)) +- Bug fix for setup.cfg + ([SmartSim-PR35](https://github.com/CrayLabs/SmartSim/pull/35)) + +### 0.3.0 + +Released on April 1, 2021 + +Description: + +- initial 0.3.0 (first public) release of SmartSim + +------------------------------------------------------------------------ + +(smartredis-changelog)= +## SmartRedis + +```{include} ../smartredis/doc/changelog.md +:start-line: 2 +``` + +------------------------------------------------------------------------ + +(smartdashboard-changelog)= +## SmartDashboard + +```{include} ../smartdashboard/doc/changelog.md +:start-line: 2 +``` diff --git a/doc/changelog.rst b/doc/changelog.rst deleted file mode 100644 index aed0e675c..000000000 --- a/doc/changelog.rst +++ /dev/null @@ -1,856 +0,0 @@ -********* -Changelog -********* - -Listed here are the changes between each release of SmartSim -and SmartRedis. 
- -Jump to :ref:`SmartRedis Changelog ` - - -SmartSim -======== - -Development branch ------------------- - -To be released at some future point in time - -Description - -- Update authentication in release workflow -- Auto-generate type-hints into documentation -- Auto-post release PR to develop -- Bump manifest.json to version 0.0.4 -- Fix symlinking batch ensemble and model bug -- Remove defensive regexp in .gitignore -- Upgrade ubuntu to 22.04 -- Remove helper function ``init_default`` -- Fix telemetry monitor logging errrors for task history -- Change default path for entities -- Drop Python 3.8 support -- Update watchdog dependency -- Historical output files stored under .smartsim directory -- Add option to build Torch backend without the Intel Math Kernel Library -- Fix ReadTheDocs build issue -- Promote device options to an Enum -- Update telemetry monitor, add telemetry collectors -- Add method to specify node features for a Slurm job -- Colo Orchestrator setup now blocks application start until setup finished -- ExecArgs handling correction -- ReadTheDocs config file added and enabled on PRs -- Enforce changelog updates -- Remove deprecated SmartSim modules -- SmartSim Documentation refactor -- Update the version of Redis from `7.0.4` to `7.2.4` -- Fix publishing of development docs -- Update Experiment API typing -- Minor enhancements to test suite -- Improve SmartSim experiment signal handlers - -Detailed Notes - -- Replace the developer created token with the GH_TOKEN environment variable. - (SmartSim-PR570_) -- Add extension to auto-generate function type-hints into documentation. - (SmartSim-PR561_) -- Add to github release workflow to auto generate a pull request from master - into develop for release. (SmartSim-PR566_) -- The manifest.json version needs to match the SmartDashboard version, which is - 0.0.4 in the upcoming release. (SmartSim-PR563_) -- Properly symlinks batch ensembles and batch models. 
(SmartSim-PR547_) -- Remove defensive regexp in .gitignore and ensure tests write to test_output. - (SmartSim-PR560_) -- After dropping support for Python 3.8, ubuntu needs to be upgraded. - (SmartSim-PR558_) -- Remove helper function ``init_default`` and replace with traditional type - narrowing. (SmartSim-PR545_) -- Ensure the telemetry monitor does not track a task_id - for a managed task. (SmartSim-PR557_) -- The default path for an entity is now the path to the experiment / the - entity name. create_database and create_ensemble now have path arguments. - All path arguments are compatible with relative paths. Relative paths are - relative to the CWD. (SmartSim-PR533_) -- Python 3.8 is reaching its end-of-life in October, 2024, so it will - no longer continue to be supported. (SmartSim-PR544_) -- Update watchdog dependency from 3.x to 4.x, fix new type issues (SmartSim-PR540_) -- The dashboard needs to display historical logs, so log files are written - out under the .smartsim directory and files under the experiment - directory are symlinked to them. (SmartSim-PR532_) -- Add an option to smart build "--torch_with_mkl"/"--no_torch_with_mkl" to - prevent Torch from trying to link in the Intel Math Kernel Library. This - is needed because on machines that have the Intel compilers installed, the - Torch will unconditionally try to link in this library, however fails - because the linking flags are incorrect. (SmartSim-PR538_) -- Change type_extension and pydantic versions in readthedocs environment - to enable docs build. (SmartSim-PR537_) -- Promote devices to a dedicated Enum type throughout the SmartSim code base. - (SmartSim-PR498_) -- Update the telemetry monitor to enable retrieval of metrics on a scheduled - interval. Switch basic experiment tracking telemetry to default to on. Add - database metric collectors. Improve telemetry monitor logging. Create - telemetry subpackage at `smartsim._core.utils.telemetry`. Refactor - telemetry monitor entrypoint. 
(SmartSim-PR460_) -- Users can now specify node features for a Slurm job through - ``SrunSettings.set_node_feature``. The method accepts a string - or list of strings. (SmartSim-PR529_) -- The request to the colocated entrypoints file within the shell script - is now a blocking process. Once the Orchestrator is setup, it returns - which moves the process to the background and allows the application to - start. This prevents the application from requesting a ML model or - script that has not been uploaded to the Orchestrator yet. (SmartSim-PR522_) -- Add checks and tests to ensure SmartSim users cannot initialize run settings - with a list of lists as the exe_args argument. (SmartSim-PR517_) -- Add readthedocs configuration file and enable readthedocs builds - on pull requests. Additionally added robots.txt file generation - when readthedocs environment detected. (SmartSim-PR512_) -- Add Github Actions workflow that checks if changelog is edited - on pull requests into develop. (SmartSim-PR518_) -- Removed deprecated SmartSim modules: slurm and mpirunSettings. - (SmartSim-PR514_) -- Implemented new structure of SmartSim documentation. Added examples - images and further detail of SmartSim components. (SmartSim-PR463_) -- Update Redis version to `7.2.4`. This change fixes an issue in the Redis - build scripts causing failures on Apple Silicon hosts. (SmartSim-PR507_) -- The container which builds the documentation for every merge to develop - was failing due to a lack of space within the container. This was fixed - by including an additional Github action that removes some unneeded - software and files that come from the default Github Ubuntu container. - (SmartSim-PR504_) -- Update the generic `t.Any` typehints in Experiment API. (SmartSim-PR501_) -- The CI will fail static analysis if common erroneous truthy checks are - detected. (SmartSim-PR524_) -- The CI will fail static analysis if a local variable used while potentially - undefined. 
(SmartSim-PR521_) -- Remove previously deprecated behavior present in test suite on machines with - Slurm and Open MPI. (SmartSim-PR520_) -- Experiments in the WLM tests are given explicit paths to prevent unexpected - directory creation. Ensure database are not left open on test suite failures. - Update path to pickle file in - ``tests/full_wlm/test_generic_orc_launch_batch.py::test_launch_cluster_orc_reconnect`` - to conform with changes made in SmartSim-PR533_. (SmartSim-PR559_) -- When calling ``Experiment.start`` SmartSim would register a signal handler - that would capture an interrupt signal (^C) to kill any jobs launched through - its ``JobManager``. This would replace the default (or user defined) signal - handler. SmartSim will now attempt to kill any launched jobs before calling - the previously registered signal handler. (SmartSim-PR535_) - -.. _SmartSim-PR570: https://github.com/CrayLabs/SmartSim/pull/570 -.. _SmartSim-PR561: https://github.com/CrayLabs/SmartSim/pull/561 -.. _SmartSim-PR566: https://github.com/CrayLabs/SmartSim/pull/566 -.. _SmartSim-PR563: https://github.com/CrayLabs/SmartSim/pull/563 -.. _SmartSim-PR547: https://github.com/CrayLabs/SmartSim/pull/547 -.. _SmartSim-PR560: https://github.com/CrayLabs/SmartSim/pull/560 -.. _SmartSim-PR559: https://github.com/CrayLabs/SmartSim/pull/559 -.. _SmartSim-PR558: https://github.com/CrayLabs/SmartSim/pull/558 -.. _SmartSim-PR545: https://github.com/CrayLabs/SmartSim/pull/545 -.. _SmartSim-PR557: https://github.com/CrayLabs/SmartSim/pull/557 -.. _SmartSim-PR533: https://github.com/CrayLabs/SmartSim/pull/533 -.. _SmartSim-PR544: https://github.com/CrayLabs/SmartSim/pull/544 -.. _SmartSim-PR540: https://github.com/CrayLabs/SmartSim/pull/540 -.. _SmartSim-PR532: https://github.com/CrayLabs/SmartSim/pull/532 -.. _SmartSim-PR538: https://github.com/CrayLabs/SmartSim/pull/538 -.. _SmartSim-PR537: https://github.com/CrayLabs/SmartSim/pull/537 -.. 
_SmartSim-PR498: https://github.com/CrayLabs/SmartSim/pull/498 -.. _SmartSim-PR460: https://github.com/CrayLabs/SmartSim/pull/460 -.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512 -.. _SmartSim-PR535: https://github.com/CrayLabs/SmartSim/pull/535 -.. _SmartSim-PR529: https://github.com/CrayLabs/SmartSim/pull/529 -.. _SmartSim-PR522: https://github.com/CrayLabs/SmartSim/pull/522 -.. _SmartSim-PR521: https://github.com/CrayLabs/SmartSim/pull/521 -.. _SmartSim-PR524: https://github.com/CrayLabs/SmartSim/pull/524 -.. _SmartSim-PR520: https://github.com/CrayLabs/SmartSim/pull/520 -.. _SmartSim-PR518: https://github.com/CrayLabs/SmartSim/pull/518 -.. _SmartSim-PR517: https://github.com/CrayLabs/SmartSim/pull/517 -.. _SmartSim-PR514: https://github.com/CrayLabs/SmartSim/pull/514 -.. _SmartSim-PR512: https://github.com/CrayLabs/SmartSim/pull/512 -.. _SmartSim-PR507: https://github.com/CrayLabs/SmartSim/pull/507 -.. _SmartSim-PR504: https://github.com/CrayLabs/SmartSim/pull/504 -.. _SmartSim-PR501: https://github.com/CrayLabs/SmartSim/pull/501 -.. _SmartSim-PR463: https://github.com/CrayLabs/SmartSim/pull/463 - - -0.6.2 ------ - -Released on 16 February, 2024 - -Description - -- Patch SmartSim dependency version - - -Detailed Notes - -- A critical performance concern was identified and addressed in SmartRedis. A - patch fix was deployed, and SmartSim was updated to ensure users do not - inadvertently pull the unpatched version of SmartRedis. (SmartSim-PR493_) - - -.. 
_SmartSim-PR493: https://github.com/CrayLabs/SmartSim/pull/493 - - -0.6.1 ------ - -Released on 15 February, 2024 - -Description - -- Duplicate for DBModel/Script prevented -- Update license to include 2024 -- Telemetry monitor is now active by default -- Add support for Mac OSX on Apple Silicon -- Remove Torch warnings during testing -- Validate Slurm timing format -- Expose Python Typehints -- Fix test_logs to prevent generation of directory -- Fix Python Typehint for colocated database settings -- Python 3.11 Support -- Quality of life `smart validate` improvements -- Remove Cobalt support -- Enrich logging through context variables -- Upgrade Machine Learning dependencies -- Override sphinx-tabs background color -- Add concurrency group to test workflow -- Fix index when installing torch through smart build - - -Detailed Notes - -- Modify the `git clone` for both Redis and RedisAI to set the line endings to - unix-style line endings when using MacOS on ARM. (SmartSim-PR482_) -- Separate install instructions are now provided for Mac OSX on x64 vs ARM64 (SmartSim-PR479_) -- Prevent duplicate ML model and script names being added to an - Ensemble member if the names exists. (SmartSim-PR475_) -- Updates `Copyright (c) 2021-2023` to `Copyright (c) 2021-2024` - in all of the necessary files. (SmartSim-PR485_) -- Bug fix which prevents the expected behavior when the `SMARTSIM_LOG_LEVEL` - environment variable was set to `developer`. (SmartSim-PR473_) -- Sets the default value of the "enable telemetry" flag to on. - Bumps the output `manifest.json` version number to match that of - `smartdashboard` and pins a watchdog version to avoid build errors. - (SmartSim-PR477_) -- Refactor logic of `Manifest.has_db_objects` to remove excess branching - and improve readability/maintainability. (SmartSim-PR476_) -- SmartSim can now be built and used on platforms using Apple Silicon - (ARM64). Currently, only the PyTorch backend is supported. 
Note that libtorch - will be downloaded from a CrayLabs github repo. (SmartSim-PR465_) -- Tests that were saving Torch models were emitting warnings. These warnings - were addressed by updating the model save test function. (SmartSim-PR472_) -- Validate the timing format when requesting a slurm allocation. (SmartSim-PR471_) -- Add and ship `py.typed` marker to expose inline type hints. Fix - type errors related to SmartRedis. (SmartSim-PR468_) -- Fix the `test_logs.py::test_context_leak` test that was - erroneously creating a directory named `some value` in SmartSim's root - directory. (SmartSim-PR467_) -- Add Python type hinting to colocated settings. (SmartSim-PR462_) -- Add github actions for running black and isort checks. (SmartSim-PR464_) -- Relax the required version of `typing_extensions`. (SmartSim-PR459_) -- Addition of Python 3.11 to SmartSim. (SmartSim-PR461_) -- Quality of life `smart validate` improvements such as setting `CUDA_VISIBLE_DEVICES` - environment variable within `smart validate` prior to importing any ML deps to - prevent false negatives on multi-GPU systems. Additionally, move SmartRedis logs - from standard out to dedicated log file in the validation temporary directory as well as - suppress `sklearn` deprecation warning by pinning `KMeans` constructor - argument. Lastly, move TF test to last as TF may reserve the GPUs it uses. - (SmartSim-PR458_) -- Some actions in the current GitHub CI/CD workflows were outdated. They were - replaced with the latest versions. (SmartSim-PR446_) -- As the Cobalt workload manager is not used on any system we are aware of, - its support in SmartSim was terminated and classes such as `CobaltLauncher` have - been removed. (SmartSim-PR448_) -- Experiment logs are written to a file that can be read by the dashboard. (SmartSim-PR452_) -- Updated SmartSim's machine learning backends to PyTorch 2.0.1, Tensorflow - 2.13.1, ONNX 1.14.1, and ONNX Runtime 1.16.1. 
As a result of this change, - there is now an available ONNX wheel for use with Python 3.10, and wheels for - all of SmartSim's machine learning backends with Python 3.11. - (SmartSim-PR451_) (SmartSim-PR461_) -- The sphinx-tabs documentation extension uses a white background for the tabs component. - A custom CSS for those components to inherit the overall theme color has - been added. (SmartSim-PR453_) -- Add concurrency groups to GitHub's CI/CD workflows, preventing - multiple workflows from the same PR to be launched concurrently. - (SmartSim-PR439_) -- Torch changed their preferred indexing when trying to install - their provided wheels. Updated the `pip install` command within - `smart build` to ensure that the appropriate packages can be found. - (SmartSim-PR449_) - - -.. _SmartSim-PR485: https://github.com/CrayLabs/SmartSim/pull/485 -.. _SmartSim-PR482: https://github.com/CrayLabs/SmartSim/pull/482 -.. _SmartSim-PR479: https://github.com/CrayLabs/SmartSim/pull/479 -.. _SmartSim-PR477: https://github.com/CrayLabs/SmartSim/pull/477 -.. _SmartSim-PR476: https://github.com/CrayLabs/SmartSim/pull/476 -.. _SmartSim-PR475: https://github.com/CrayLabs/SmartSim/pull/475 -.. _SmartSim-PR473: https://github.com/CrayLabs/SmartSim/pull/473 -.. _SmartSim-PR472: https://github.com/CrayLabs/SmartSim/pull/472 -.. _SmartSim-PR471: https://github.com/CrayLabs/SmartSim/pull/471 -.. _SmartSim-PR468: https://github.com/CrayLabs/SmartSim/pull/468 -.. _SmartSim-PR467: https://github.com/CrayLabs/SmartSim/pull/467 -.. _SmartSim-PR465: https://github.com/CrayLabs/SmartSim/pull/465 -.. _SmartSim-PR464: https://github.com/CrayLabs/SmartSim/pull/464 -.. _SmartSim-PR462: https://github.com/CrayLabs/SmartSim/pull/462 -.. _SmartSim-PR461: https://github.com/CrayLabs/SmartSim/pull/461 -.. _SmartSim-PR459: https://github.com/CrayLabs/SmartSim/pull/459 -.. _SmartSim-PR458: https://github.com/CrayLabs/SmartSim/pull/458 -.. _SmartSim-PR453: https://github.com/CrayLabs/SmartSim/pull/453 -.. 
_SmartSim-PR452: https://github.com/CrayLabs/SmartSim/pull/452 -.. _SmartSim-PR451: https://github.com/CrayLabs/SmartSim/pull/451 -.. _SmartSim-PR449: https://github.com/CrayLabs/SmartSim/pull/449 -.. _SmartSim-PR448: https://github.com/CrayLabs/SmartSim/pull/448 -.. _SmartSim-PR446: https://github.com/CrayLabs/SmartSim/pull/446 -.. _SmartSim-PR439: https://github.com/CrayLabs/SmartSim/pull/439 - -0.6.0 ------ - -Released on 18 December, 2023 - -Description - -- Conflicting directives in the SmartSim packaging instructions were fixed -- `sacct` and `sstat` errors are now fatal for Slurm-based workflow executions -- Added documentation section about ML features and TorchScript -- Added TorchScript functions to Online Analysis tutorial -- Added multi-DB example to documentation -- Improved test stability on HPC systems -- Added support for producing & consuming telemetry outputs -- Split tests into groups for parallel execution in CI/CD pipeline -- Change signature of `Experiment.summary()` -- Expose first_device parameter for scripts, functions, models -- Added support for MINBATCHTIMEOUT in model execution -- Remove support for RedisAI 1.2.5, use RedisAI 1.2.7 commit -- Add support for multiple databases - -Detailed Notes - -- Several conflicting directives between the `setup.py` and the `setup.cfg` were fixed - to mitigate warnings issued when building the pip wheel. (SmartSim-PR435_) -- When the Slurm functions `sacct` and `sstat` returned an error, it would be ignored - and SmartSim's state could become inconsistent. To prevent this, errors - raised by `sacct` or `sstat` now result in an exception. (SmartSim-PR392_) -- A section named *ML Features* was added to documentation. It contains multiple - examples of how ML models and functions can be added to and executed on the DB. 
- TorchScript-based post-processing was added to the *Online Analysis* tutorial (SmartSim-PR411_) -- An example of how to use multiple Orchestrators concurrently was added to the documentation (SmartSim-PR409_) -- The test infrastructure was improved. Tests on HPC system are now stable, and issues such - as non-stopped `Orchestrators` or experiments created in the wrong paths have been fixed (SmartSim-PR381_) -- A telemetry monitor was added to check updates and produce events for SmartDashboard (SmartSim-PR426_) -- Split tests into `group_a`, `group_b`, `slow_tests` for parallel execution in CI/CD pipeline (SmartSim-PR417_, SmartSim-PR424_) -- Change `format` argument to `style` in `Experiment.summary()`, this is - an API break (SmartSim-PR391_) -- Added support for first_device parameter for scripts, functions, - and models. This causes them to be loaded to the first num_devices - beginning with first_device (SmartSim-PR394_) -- Added support for MINBATCHTIMEOUT in model execution, which caps the delay - waiting for a minimium number of model execution operations to accumulate - before executing them as a batch (SmartSim-PR387_) -- RedisAI 1.2.5 is not supported anymore. The only RedisAI version - is now 1.2.7. Since the officially released RedisAI 1.2.7 has a - bug which breaks the build process on Mac OSX, it was decided to - use commit 634916c_ from RedisAI's GitHub repository, where such - bug has been fixed. This applies to all operating systems. (SmartSim-PR383_) -- Add support for creation of multiple databases with unique identifiers. (SmartSim-PR342_) - - -.. _SmartSim-PR435: https://github.com/CrayLabs/SmartSim/pull/435 -.. _SmartSim-PR392: https://github.com/CrayLabs/SmartSim/pull/392 -.. _SmartSim-PR411: https://github.com/CrayLabs/SmartSim/pull/411 -.. _SmartSim-PR409: https://github.com/CrayLabs/SmartSim/pull/409 -.. _SmartSim-PR381: https://github.com/CrayLabs/SmartSim/pull/381 -.. _SmartSim-PR426: https://github.com/CrayLabs/SmartSim/pull/426 -.. 
_SmartSim-PR424: https://github.com/CrayLabs/SmartSim/pull/424 -.. _SmartSim-PR417: https://github.com/CrayLabs/SmartSim/pull/417 -.. _SmartSim-PR391: https://github.com/CrayLabs/SmartSim/pull/391 -.. _SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 -.. _SmartSim-PR394: https://github.com/CrayLabs/SmartSim/pull/394 -.. _SmartSim-PR387: https://github.com/CrayLabs/SmartSim/pull/387 -.. _SmartSim-PR383: https://github.com/CrayLabs/SmartSim/pull/383 -.. _634916c: https://github.com/RedisAI/RedisAI/commit/634916c722e718cc6ea3fad46e63f7d798f9adc2 -.. _SmartSim-PR342: https://github.com/CrayLabs/SmartSim/pull/342 - - -0.5.1 ------ - -Released on 14 September, 2023 - -Description - -- Add typehints throughout the SmartSim codebase -- Provide support for Slurm heterogeneous jobs -- Provide better support for `PalsMpiexecSettings` -- Allow for easier inspection of SmartSim entities -- Log ignored error messages from `sacct` -- Fix colocated db preparation bug when using `JsrunSettings` -- Fix bug when user specify CPU and devices greater than 1 -- Fix bug when get_allocation called with reserved keywords -- Enabled mypy in CI for better type safety -- Mitigate additional suppressed pylint errors -- Update linting support and apply to existing errors -- Various improvements to the `smart` CLI -- Various documentation improvements -- Various test suite improvements - -Detailed Notes - -- Add methods to allow users to inspect files attached to models and ensembles. (SmartSim-PR352_) -- Add a `smart info` target to provide rudimentary information about the SmartSim installation. (SmartSim-PR350_) -- Remove unnecessary generation producing unexpected directories in the test suite. (SmartSim-PR349_) -- Add support for heterogeneous jobs to `SrunSettings` by allowing users to set the `--het-group` parameter. (SmartSim-PR346_) -- Provide clearer guidelines on how to contribute to SmartSim. 
(SmartSim-PR344_) -- Integrate `PalsMpiexecSettings` into the `Experiment` factory methods when using the `"pals"` launcher. (SmartSim-PR343_) -- Create public properties where appropriate to mitigate `protected-access` errors. (SmartSim-PR341_) -- Fix a failure to execute `_prep_colocated_db` due to incorrect named attr check. (SmartSim-PR339_) -- Enabled and mitigated mypy `disallow_any_generics` and `warn_return_any`. (SmartSim-PR338_) -- Add a `smart validate` target to provide a simple smoke test to assess a SmartSim build. (SmartSim-PR336_, SmartSim-PR351_) -- Add typehints to `smartsim._core.launcher.step.*`. (SmartSim-PR334_) -- Log errors reported from slurm WLM when attempts to retrieve status fail. (SmartSim-PR331_, SmartSim-PR332_) -- Fix incorrectly formatted positional arguments in log format strings. (SmartSim-PR330_) -- Ensure that launchers pass environment variables to unmanaged job steps. (SmartSim-PR329_) -- Add additional tests surrounding the `RAI_PATH` configuration environment variable. (SmartSim-PR328_) -- Remove unnecessary execution of unescaped shell commands. (SmartSim-PR327_) -- Add error if user calls get_allocation with reserved keywords in slurm get_allocation. (SmartSim-PR325_) -- Add error when user requests CPU with devices greater than 1 within add_ml_model and add_script. (SmartSim-PR324_) -- Update documentation surrounding ensemble key prefixing. (SmartSim-PR322_) -- Fix formatting of the Frontier site installation. (SmartSim-PR321_) -- Update pylint dependency, update .pylintrc, mitigate non-breaking issues, suppress api breaks. (SmartSim-PR311_) -- Refactor the `smart` CLI to use subparsers for better documentation and extension. (SmartSim-PR308_) - -.. _SmartSim-PR352: https://github.com/CrayLabs/SmartSim/pull/352 -.. _SmartSim-PR351: https://github.com/CrayLabs/SmartSim/pull/351 -.. _SmartSim-PR350: https://github.com/CrayLabs/SmartSim/pull/350 -.. _SmartSim-PR349: https://github.com/CrayLabs/SmartSim/pull/349 -.. 
_SmartSim-PR346: https://github.com/CrayLabs/SmartSim/pull/346 -.. _SmartSim-PR344: https://github.com/CrayLabs/SmartSim/pull/344 -.. _SmartSim-PR343: https://github.com/CrayLabs/SmartSim/pull/343 -.. _SmartSim-PR341: https://github.com/CrayLabs/SmartSim/pull/341 -.. _SmartSim-PR339: https://github.com/CrayLabs/SmartSim/pull/339 -.. _SmartSim-PR338: https://github.com/CrayLabs/SmartSim/pull/338 -.. _SmartSim-PR336: https://github.com/CrayLabs/SmartSim/pull/336 -.. _SmartSim-PR334: https://github.com/CrayLabs/SmartSim/pull/334 -.. _SmartSim-PR332: https://github.com/CrayLabs/SmartSim/pull/332 -.. _SmartSim-PR331: https://github.com/CrayLabs/SmartSim/pull/331 -.. _SmartSim-PR330: https://github.com/CrayLabs/SmartSim/pull/330 -.. _SmartSim-PR329: https://github.com/CrayLabs/SmartSim/pull/329 -.. _SmartSim-PR328: https://github.com/CrayLabs/SmartSim/pull/328 -.. _SmartSim-PR327: https://github.com/CrayLabs/SmartSim/pull/327 -.. _SmartSim-PR325: https://github.com/CrayLabs/SmartSim/pull/325 -.. _SmartSim-PR324: https://github.com/CrayLabs/SmartSim/pull/324 -.. _SmartSim-PR322: https://github.com/CrayLabs/SmartSim/pull/322 -.. _SmartSim-PR321: https://github.com/CrayLabs/SmartSim/pull/321 -.. _SmartSim-PR311: https://github.com/CrayLabs/SmartSim/pull/311 -.. 
_SmartSim-PR308: https://github.com/CrayLabs/SmartSim/pull/308 - - -0.5.0 ------ - -Released on 6 July, 2023 - -Description - -A full list of changes and detailed notes can be found below: - -- Update SmartRedis dependency to v0.4.1 -- Fix tests for db models and scripts -- Fix add_ml_model() and add_script() documentation, tests, and code -- Remove `requirements.txt` and other places where dependencies were defined -- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators -- Remove wait time associated with Experiment launch summary -- Update and rename Redis conf file -- Migrate from redis-py-cluster to redis-py -- Update full test suite to not require a TF wheel at test time -- Update doc strings -- Remove deprecated code -- Relax the coloredlogs version -- Update Fortran tutorials for SmartRedis -- Add support for multiple network interface binding in Orchestrator and Colocated DBs -- Add typehints and static analysis - -Detailed notes - -- Updates SmartRedis to the most current release (SmartSim-PR316_) -- Fixes and enhancements to documentation (SmartSim-PR317_, SmartSim-PR314_, SmartSim-PR287_) -- Various fixes and enhancements to the test suite (SmartSim-PR315_, SmartSim-PR312_, SmartSim-PR310_, SmartSim-PR302_, SmartSim-PR283_) -- Fix a defect in the tests related to database models and scripts that was - causing key collisions when testing on workload managers (SmartSim-PR313_) -- Remove `requirements.txt` and other places where dependencies were defined. (SmartSim-PR307_) -- Fix defect where dictionaries used to create run settings can be changed - unexpectedly due to copy-by-ref (SmartSim-PR305_) -- The underlying code for Model.add_ml_model() and Model.add_script() was fixed - to correctly handle multi-GPU configurations. Tests were updated to run on - non-local launchers. Documentation was updated and fixed. Also, the default - testing interface has been changed to lo instead of ipogif. 
(SmartSim-PR304_) -- Typehints have been added. A makefile target `make check-mypy` executes static - analysis with mypy. (SmartSim-PR295_, SmartSim-PR301_, SmartSim-PR303_) -- Replace `limit_app_cpus` with `limit_db_cpus` for co-located orchestrators. - This resolves some incorrect behavior/assumptions about how the application - would be pinned. Instead, users should directly specify the binding options in - their application using the options appropriate for their launcher (SmartSim-PR306_) -- Simplify code in `random_permutations` parameter generation strategy (SmartSim-PR300_) -- Remove wait time associated with Experiment launch summary (SmartSim-PR298_) -- Update Redis conf file to conform with Redis v7.0.5 conf file (SmartSim-PR293_) -- Migrate from redis-py-cluster to redis-py for cluster status checks (SmartSim-PR292_) -- Update full test suite to no longer require a tensorflow wheel to be available at test time. (SmartSim-PR291_) -- Correct spelling of colocated in doc strings (SmartSim-PR290_) -- Deprecated launcher-specific orchestrators, constants, and ML - utilities were removed. (SmartSim-PR289_) -- Relax the coloredlogs version to be greater than 10.0 (SmartSim-PR288_) -- Update the Github Actions runner image from `macos-10.15`` to `macos-12``. The - former began deprecation in May 2022 and was finally removed in May 2023. (SmartSim-PR285_) -- The Fortran tutorials had not been fully updated to show how to handle - return/error codes. These have now all been updated. (SmartSim-PR284_) -- Orchestrator and Colocated DB now accept a list of interfaces to bind to. The - argument name is still `interface` for backward compatibility reasons. (SmartSim-PR281_) -- Typehints have been added to public APIs. A makefile target to execute static - analysis with mypy is available `make check-mypy`. (SmartSim-PR295_) - -.. _SmartSim-PR317: https://github.com/CrayLabs/SmartSim/pull/317 -.. _SmartSim-PR316: https://github.com/CrayLabs/SmartSim/pull/316 -.. 
_SmartSim-PR315: https://github.com/CrayLabs/SmartSim/pull/315 -.. _SmartSim-PR314: https://github.com/CrayLabs/SmartSim/pull/314 -.. _SmartSim-PR313: https://github.com/CrayLabs/SmartSim/pull/313 -.. _SmartSim-PR312: https://github.com/CrayLabs/SmartSim/pull/312 -.. _SmartSim-PR310: https://github.com/CrayLabs/SmartSim/pull/310 -.. _SmartSim-PR307: https://github.com/CrayLabs/SmartSim/pull/307 -.. _SmartSim-PR306: https://github.com/CrayLabs/SmartSim/pull/306 -.. _SmartSim-PR305: https://github.com/CrayLabs/SmartSim/pull/305 -.. _SmartSim-PR304: https://github.com/CrayLabs/SmartSim/pull/304 -.. _SmartSim-PR303: https://github.com/CrayLabs/SmartSim/pull/303 -.. _SmartSim-PR302: https://github.com/CrayLabs/SmartSim/pull/302 -.. _SmartSim-PR301: https://github.com/CrayLabs/SmartSim/pull/301 -.. _SmartSim-PR300: https://github.com/CrayLabs/SmartSim/pull/300 -.. _SmartSim-PR298: https://github.com/CrayLabs/SmartSim/pull/298 -.. _SmartSim-PR295: https://github.com/CrayLabs/SmartSim/pull/295 -.. _SmartSim-PR293: https://github.com/CrayLabs/SmartSim/pull/293 -.. _SmartSim-PR292: https://github.com/CrayLabs/SmartSim/pull/292 -.. _SmartSim-PR291: https://github.com/CrayLabs/SmartSim/pull/291 -.. _SmartSim-PR290: https://github.com/CrayLabs/SmartSim/pull/290 -.. _SmartSim-PR289: https://github.com/CrayLabs/SmartSim/pull/289 -.. _SmartSim-PR288: https://github.com/CrayLabs/SmartSim/pull/288 -.. _SmartSim-PR287: https://github.com/CrayLabs/SmartSim/pull/287 -.. _SmartSim-PR285: https://github.com/CrayLabs/SmartSim/pull/285 -.. _SmartSim-PR284: https://github.com/CrayLabs/SmartSim/pull/284 -.. _SmartSim-PR283: https://github.com/CrayLabs/SmartSim/pull/283 -.. _SmartSim-PR281: https://github.com/CrayLabs/SmartSim/pull/281 - -0.4.2 ------ - -Released on April 12, 2023 - -Description - -This release of SmartSim had a focus on polishing and extending existing -features already provided by SmartSim.
Most notably, this release provides -support to allow users to colocate their models with an orchestrator using -Unix domain sockets and support for launching models as batch jobs. - -Additionally, SmartSim has updated its tool chains to provide a better user -experience. Notably, SmartSim can now be used with Python 3.10, Redis 7.0.5, and -RedisAI 1.2.7. Furthermore, SmartSim now utilizes SmartRedis's aggregation lists to -streamline the use and extension of ML data loaders, making working with popular -machine learning frameworks in SmartSim a breeze. - -A full list of changes and detailed notes can be found below: - -- Add support for colocating an orchestrator over UDS -- Add support for Python 3.10, deprecate support for Python 3.7 and RedisAI 1.2.3 -- Drop support for Ray -- Update ML data loaders to make use of SmartRedis's aggregation lists -- Allow for models to be launched independently as batch jobs -- Update to current version of Redis to 7.0.5 -- Add support for RedisAI 1.2.7, pyTorch 1.11.0, Tensorflow 2.8.0, ONNXRuntime 1.11.1 -- Fix bug in colocated database entrypoint when loading PyTorch models -- Fix test suite behavior with environment variables - -Detailed Notes - -- Running some tests could result in some SmartSim-specific environment variables to be set. Such environment variables are now reset - after each test execution. Also, a warning for environment variable usage in Slurm was added, to make the user aware in case an environment - variable will not be assigned the desired value with `--export`. (SmartSim-PR270_) -- The PyTorch and TensorFlow data loaders were updated to make use of aggregation lists. This breaks their API, but makes them easier to use. (SmartSim-PR264_) -- The support for Ray was dropped, as its most recent versions caused problems when deployed through SmartSim. - We plan to release a separate add-on library to accomplish the same results.
If - you are interested in getting the Ray launch functionality back in your workflow, please get in touch with us! (SmartSim-PR263_) -- Update from Redis version 6.0.8 to 7.0.5. (SmartSim-PR258_) -- Adds support for Python 3.10 without the ONNX machine learning backend. Deprecates support for - Python 3.7 as it will stop receiving security updates. Deprecates support for RedisAI 1.2.3. - Update the build process to be able to correctly fetch supported dependencies. If a user - attempts to build an unsupported dependency, an error message is shown highlighting the - discrepancy. (SmartSim-PR256_) -- Models were given a `batch_settings` attribute. When launching a model through `Experiment.start` - the `Experiment` will first check for a non-nullish value at that attribute. If the check is - satisfied, the `Experiment` will attempt to wrap the underlying run command in a batch job using - the object referenced at `Model.batch_settings` as the batch settings for the job. If the check - is not satisfied, the `Model` is launched in the traditional manner as a job step. (SmartSim-PR245_) -- Fix bug in colocated database entrypoint stemming from uninitialized variables. This bug affects PyTorch models being loaded into the database. (SmartSim-PR237_) -- The release of RedisAI 1.2.7 allows us to update support for recent versions of PyTorch, Tensorflow, and ONNX (SmartSim-PR234_) -- Make installation of correct Torch backend more reliable according to instruction from PyTorch -- In addition to TCP, add UDS support for colocating an orchestrator with models. Methods - `Model.colocate_db_tcp` and `Model.colocate_db_uds` were added to expose this functionality. - The `Model.colocate_db` method remains and uses TCP for backward compatibility (SmartSim-PR246_) - -.. _SmartSim-PR270: https://github.com/CrayLabs/SmartSim/pull/270 -.. _SmartSim-PR264: https://github.com/CrayLabs/SmartSim/pull/264 -.. _SmartSim-PR263: https://github.com/CrayLabs/SmartSim/pull/263 -.. 
_SmartSim-PR258: https://github.com/CrayLabs/SmartSim/pull/258 -.. _SmartSim-PR256: https://github.com/CrayLabs/SmartSim/pull/256 -.. _SmartSim-PR246: https://github.com/CrayLabs/SmartSim/pull/246 -.. _SmartSim-PR245: https://github.com/CrayLabs/SmartSim/pull/245 -.. _SmartSim-PR237: https://github.com/CrayLabs/SmartSim/pull/237 -.. _SmartSim-PR234: https://github.com/CrayLabs/SmartSim/pull/234 - - -0.4.1 ------ - -Released on June 24, 2022 - -Description: -This release of SmartSim introduces a new experimental feature to help make -SmartSim workflows more portable: the ability to run simulation models in a -container via Singularity. This feature has been tested on a small number of -platforms and we encourage users to provide feedback on its use. - -We have also made improvements in a variety of areas: new utilities to load -scripts and machine learning models into the database directly from SmartSim -driver scripts and install-time choice to use either `KeyDB` or `Redis` for the -Orchestrator. The `RunSettings` API is now more consistent across subclasses. Another -key focus of this release was to aid new SmartSim users by including more -extensive tutorials and improving the documentation. The docker image containing -the SmartSim tutorials now also includes a tutorial on online training.
- - -Launcher improvements - - - New methods for specifying `RunSettings` parameters (SmartSim-PR166_) (SmartSim-PR170_) - - Better support for `mpirun`, `mpiexec`, and `orterun` as launchers (SmartSim-PR186_) - - Experimental: add support for running models via Singularity (SmartSim-PR204_) - -Documentation and tutorials - - - Tutorial updates (SmartSim-PR155_) (SmartSim-PR203_) (SmartSim-PR208_) - - Add SmartSim Zoo info to documentation (SmartSim-PR175_) - - New tutorial for demonstrating online training (SmartSim-PR176_) (SmartSim-PR188_) - -General improvements and bug fixes - - - Set models and scripts at the driver level (SmartSim-PR185_) - - Optionally use KeyDB for the orchestrator (SmartSim-PR180_) - - Ability to specify system-level libraries (SmartSim-PR154_) (SmartSim-PR182_) - - Fix the handling of LSF gpus_per_shard (SmartSim-PR164_) - - Fix error when re-running `smart build` (SmartSim-PR165_) - - Fix generator hanging when tagged configuration variables are missing (SmartSim-PR177_) - -Dependency updates - - - CMake version from 3.10 to 3.13 (SmartSim-PR152_) - - Update click to 8.0.2 (SmartSim-PR200_) - -.. _SmartSim-PR152: https://github.com/CrayLabs/SmartSim/pull/152 -.. _SmartSim-PR154: https://github.com/CrayLabs/SmartSim/pull/154 -.. _SmartSim-PR155: https://github.com/CrayLabs/SmartSim/pull/155 -.. _SmartSim-PR164: https://github.com/CrayLabs/SmartSim/pull/164 -.. _SmartSim-PR165: https://github.com/CrayLabs/SmartSim/pull/165 -.. _SmartSim-PR166: https://github.com/CrayLabs/SmartSim/pull/166 -.. _SmartSim-PR170: https://github.com/CrayLabs/SmartSim/pull/170 -.. _SmartSim-PR175: https://github.com/CrayLabs/SmartSim/pull/175 -.. _SmartSim-PR176: https://github.com/CrayLabs/SmartSim/pull/176 -.. _SmartSim-PR177: https://github.com/CrayLabs/SmartSim/pull/177 -.. _SmartSim-PR180: https://github.com/CrayLabs/SmartSim/pull/180 -.. _SmartSim-PR182: https://github.com/CrayLabs/SmartSim/pull/182 -.. 
_SmartSim-PR185: https://github.com/CrayLabs/SmartSim/pull/185 -.. _SmartSim-PR186: https://github.com/CrayLabs/SmartSim/pull/186 -.. _SmartSim-PR188: https://github.com/CrayLabs/SmartSim/pull/188 -.. _SmartSim-PR200: https://github.com/CrayLabs/SmartSim/pull/200 -.. _SmartSim-PR203: https://github.com/CrayLabs/SmartSim/pull/203 -.. _SmartSim-PR204: https://github.com/CrayLabs/SmartSim/pull/204 -.. _SmartSim-PR208: https://github.com/CrayLabs/SmartSim/pull/208 - -0.4.0 ------ - -Released on Feb 11, 2022 - -Description: -In this release SmartSim continues to promote ease of use. -To this end SmartSim has introduced new portability features -that allow users to abstract away their targeted hardware, -while providing even more compatibility with existing -libraries. - -A new feature, Co-located orchestrator deployments has -been added which provides scalable online inference -capabilities that overcome previous performance limitations -in separated orchestrator/application deployments. -For more information on advantages of co-located deployments, -see the Orchestrator section of the SmartSim documentation. - -The SmartSim build was significantly improved to increase -customization of build toolchain and the ``smart`` command -line interface was expanded. - -Additional tweaks and upgrades have also been -made to ensure an optimal experience. Here is a -comprehensive list of changes made in SmartSim 0.4.0.
- - -Orchestrator Enhancements: - - - Add Orchestrator Co-location (SmartSim-PR139_) - - Add Orchestrator configuration file edit methods (SmartSim-PR109_) - -Emphasize Driver Script Portability: - - - Add ability to create run settings through an experiment (SmartSim-PR110_) - - Add ability to create batch settings through an experiment (SmartSim-PR112_) - - Add automatic launcher detection to experiment portability functions (SmartSim-PR120_) - -Expand Machine Learning Library Support: - - - Data loaders for online training in Keras/TF and Pytorch (SmartSim-PR115_) (SmartSim-PR140_) - - ML backend versions updated with expanded support for multiple versions (SmartSim-PR122_) - - Launch Ray internally using ``RunSettings`` (SmartSim-PR118_) - - Add Ray cluster setup and deployment to SmartSim (SmartSim-PR50_) - -Expand Launcher Setting Options: - - - Add ability to use base ``RunSettings`` on a Slurm, or PBS launchers (SmartSim-PR90_) - - Add ability to use base ``RunSettings`` on LSF launcher (SmartSim-PR108_) - -Deprecations and Breaking Changes - - - Orchestrator classes combined into single implementation for portability (SmartSim-PR139_) - - ``smartsim.constants`` changed to ``smartsim.status`` (SmartSim-PR122_) - - ``smartsim.tf`` migrated to ``smartsim.ml.tf`` (SmartSim-PR115_) (SmartSim-PR140_) - - TOML configuration option removed in favor of environment variable approach (SmartSim-PR122_) - -General Improvements and Bug Fixes: - - - Improve and extend parameter handling (SmartSim-PR107_) (SmartSim-PR119_) - - Abstract away non-user facing implementation details (SmartSim-PR122_) - - Add various dimensions to the CI build matrix for SmartSim testing (SmartSim-PR130_) - - Add missing functions to LSFSettings API (SmartSim-PR113_) - - Add RedisAI checker for installed backends (SmartSim-PR137_) - - Remove heavy and unnecessary dependencies (SmartSim-PR116_) (SmartSim-PR132_) - - Fix LSFLauncher and LSFOrchestrator (SmartSim-PR86_) - - Fix over greedy
Workload Manager Parsers (SmartSim-PR95_) - - Fix Slurm handling of comma-separated env vars (SmartSim-PR104_) - - Fix internal method calls (SmartSim-PR138_) - -Documentation Updates: - - - Updates to documentation build process (SmartSim-PR133_) (SmartSim-PR143_) - - Updates to documentation content (SmartSim-PR96_) (SmartSim-PR129_) (SmartSim-PR136_) (SmartSim-PR141_) - - Update SmartSim Examples (SmartSim-PR68_) (SmartSim-PR100_) - - -.. _SmartSim-PR50: https://github.com/CrayLabs/SmartSim/pull/50 -.. _SmartSim-PR68: https://github.com/CrayLabs/SmartSim/pull/68 -.. _SmartSim-PR86: https://github.com/CrayLabs/SmartSim/pull/86 -.. _SmartSim-PR90: https://github.com/CrayLabs/SmartSim/pull/90 -.. _SmartSim-PR95: https://github.com/CrayLabs/SmartSim/pull/95 -.. _SmartSim-PR96: https://github.com/CrayLabs/SmartSim/pull/96 -.. _SmartSim-PR100: https://github.com/CrayLabs/SmartSim/pull/100 -.. _SmartSim-PR104: https://github.com/CrayLabs/SmartSim/pull/104 -.. _SmartSim-PR107: https://github.com/CrayLabs/SmartSim/pull/107 -.. _SmartSim-PR108: https://github.com/CrayLabs/SmartSim/pull/108 -.. _SmartSim-PR109: https://github.com/CrayLabs/SmartSim/pull/109 -.. _SmartSim-PR110: https://github.com/CrayLabs/SmartSim/pull/110 -.. _SmartSim-PR112: https://github.com/CrayLabs/SmartSim/pull/112 -.. _SmartSim-PR113: https://github.com/CrayLabs/SmartSim/pull/113 -.. _SmartSim-PR115: https://github.com/CrayLabs/SmartSim/pull/115 -.. _SmartSim-PR116: https://github.com/CrayLabs/SmartSim/pull/116 -.. _SmartSim-PR118: https://github.com/CrayLabs/SmartSim/pull/118 -.. _SmartSim-PR119: https://github.com/CrayLabs/SmartSim/pull/119 -.. _SmartSim-PR120: https://github.com/CrayLabs/SmartSim/pull/120 -.. _SmartSim-PR122: https://github.com/CrayLabs/SmartSim/pull/122 -.. _SmartSim-PR129: https://github.com/CrayLabs/SmartSim/pull/129 -.. _SmartSim-PR130: https://github.com/CrayLabs/SmartSim/pull/130 -.. _SmartSim-PR132: https://github.com/CrayLabs/SmartSim/pull/132 -.. 
_SmartSim-PR133: https://github.com/CrayLabs/SmartSim/pull/133 -.. _SmartSim-PR136: https://github.com/CrayLabs/SmartSim/pull/136 -.. _SmartSim-PR137: https://github.com/CrayLabs/SmartSim/pull/137 -.. _SmartSim-PR138: https://github.com/CrayLabs/SmartSim/pull/138 -.. _SmartSim-PR139: https://github.com/CrayLabs/SmartSim/pull/139 -.. _SmartSim-PR140: https://github.com/CrayLabs/SmartSim/pull/140 -.. _SmartSim-PR141: https://github.com/CrayLabs/SmartSim/pull/141 -.. _SmartSim-PR143: https://github.com/CrayLabs/SmartSim/pull/143 - - -0.3.2 ------ - -Released on August 10, 2021 - -Description: - - - Upgraded RedisAI backend to 1.2.3 (SmartSim-PR69_) - - PyTorch 1.7.1, TF 2.4.2, and ONNX 1.6-7 (SmartSim-PR69_) - - LSF launcher for IBM machines (SmartSim-PR62_) - - Improved code coverage by adding more unit tests (SmartSim-PR53_) - - Orchestrator methods to get address and check status (SmartSim-PR60_) - - Added Manifest object that tracks deployables in Experiments (SmartSim-PR61_) - - Bug fixes (SmartSim-PR52_) (SmartSim-PR58_) (SmartSim-PR67_) (SmartSim-PR73_) - - Updated documentation and examples (SmartSim-PR51_) (SmartSim-PR57_) (SmartSim-PR71_) - - Improved IP address acquisition (SmartSim-PR72_) - - Binding database to network interfaces - -.. _SmartSim-PR51: https://github.com/CrayLabs/SmartSim/pull/51 -.. _SmartSim-PR52: https://github.com/CrayLabs/SmartSim/pull/52 -.. _SmartSim-PR53: https://github.com/CrayLabs/SmartSim/pull/53 -.. _SmartSim-PR57: https://github.com/CrayLabs/SmartSim/pull/57 -.. _SmartSim-PR58: https://github.com/CrayLabs/SmartSim/pull/58 -.. _SmartSim-PR60: https://github.com/CrayLabs/SmartSim/pull/60 -.. _SmartSim-PR61: https://github.com/CrayLabs/SmartSim/pull/61 -.. _SmartSim-PR62: https://github.com/CrayLabs/SmartSim/pull/62 -.. _SmartSim-PR67: https://github.com/CrayLabs/SmartSim/pull/67 -.. _SmartSim-PR69: https://github.com/CrayLabs/SmartSim/pull/69 -.. _SmartSim-PR71: https://github.com/CrayLabs/SmartSim/pull/71 -..
_SmartSim-PR72: https://github.com/CrayLabs/SmartSim/pull/72 -.. _SmartSim-PR73: https://github.com/CrayLabs/SmartSim/pull/73 - -0.3.1 ------ - -Released on May 5, 2021 - -Description: -This release was dedicated to making the install process -easier. SmartSim can be installed from PyPI now and the -``smart`` cli tool makes installing the machine learning -runtimes much easier. - - - Pip install (SmartSim-PR42_) - - ``smart`` cli tool for ML backends (SmartSim-PR42_) - - Build Documentation for updated install (SmartSim-PR43_) - - Migrate from Jenkins to Github Actions CI (SmartSim-PR42_) - - Bug fix for setup.cfg (SmartSim-PR35_) - -.. _SmartSim-PR43: https://github.com/CrayLabs/SmartSim/pull/43 -.. _SmartSim-PR42: https://github.com/CrayLabs/SmartSim/pull/42 -.. _SmartSim-PR35: https://github.com/CrayLabs/SmartSim/pull/35 - -0.3.0 ------ - -Released on April 1, 2021 - -Description: - - - initial 0.3.0 (first public) release of SmartSim - - ---------------------------------------------------------------- - -.. _sr_changelog: - -SmartRedis -========== - -.. include:: ../smartredis/doc/changelog.rst - :start-line: 3 diff --git a/doc/conf.py b/doc/conf.py index 38f419bc1..71d109b5c 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -59,6 +59,7 @@ 'sphinx_tabs.tabs', 'sphinx_design', 'sphinx.ext.mathjax', + 'myst_parser' ] # sphinx_autodoc_typehints configurations always_use_bars_union = True @@ -72,6 +73,14 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +linkcheck_ignore = [ + 'Redis::set_model_multigpu', +] # The path to the MathJax.js file that Sphinx will use to render math expressions mathjax_path = 'https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js' @@ -79,7 +88,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. 
# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', "**.ipynb_checkpoints"] +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', "**.ipynb_checkpoints", "tutorials/ml_training/surrogate/README.md", "tutorials/online_analysis/lattice/README.md"] breathe_projects = { "c_client":"../smartredis/doc/c_client/xml", diff --git a/doc/requirements-doc.txt b/doc/requirements-doc.txt index 108d4cad9..696881bef 100644 --- a/doc/requirements-doc.txt +++ b/doc/requirements-doc.txt @@ -15,3 +15,4 @@ numpy sphinx-design pypandoc sphinx-autodoc-typehints +myst_parser