From 1f6f90aef241e2bcdc6bf66973c104e1194e2d56 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 4 Mar 2024 16:02:53 +0000 Subject: [PATCH 1/6] update docs --- README.md | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 672b1880..f0fac6c6 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,34 @@ -# ⚡️ Nanotron +

⚡️ Nanotron

+ +

+ + GitHub release + + + GitHub release + + + License + +

+ +

+

+ Philosophy • + Core Features • + Installation • + Usage • + Contributions +

+

+ +

+ +

+ + + +# The objective of this library is to provide easy distributed primitives in order to train a variety of models efficiently using 3D parallelism. For more information about the internal design of the library or 3D parallelism in general, please check out [[docs.md]](./docs/docs.md) and [[3d_parallelism.md]](./docs/3d_parallelism.md). @@ -28,12 +58,10 @@ To install (in a new env): ```bash pip install torch pip install packaging; pip install "flash-attn>=2.5.0" --no-build-isolation -git clone git@github.com:huggingface/nanotron.git -cd nanotron -pip install -e . +pip install nanotron ``` -Also nice to have `transformers` `datasets` `python-etcd` `tensorboardX`: `pip install transformers datasets python-etcd tensorboardX` +Also nice to have: `pip install transformers datasets python-etcd tensorboardX` We also support a set of flavors that you can install using `pip install -e [$FLAVOR]`: - `dev`: Used is you are developping in `nanotron`. It installs in particular our linter mechanism. On top of that you have to run `pre-commit install` afterwards. @@ -68,7 +96,6 @@ pre-commit run --config .pre-commit-config.yaml --all-files Features we would like to add: - [ ] Support `torch.compile` -- [ ] Support `torch.distributed.rpc` - [ ] More optimized kernels - [ ] Support Zero3 - [ ] Other PP schedules (such as Interleaved 1f1b...) From 0446018a449f918b5ce21da4a8d1376a40883963 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 4 Mar 2024 16:08:23 +0000 Subject: [PATCH 2/6] 0.4 --- pyproject.toml | 2 +- src/nanotron/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3372750..ebb81b8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "nanotron" -version = "0.2" +version = "0.4" description = "Minimalistic Large Language Model Training and Finetuning" authors = [ {name = "Nouamane Tazi", email="nouamane@huggingface.co"}, diff --git a/src/nanotron/__init__.py b/src/nanotron/__init__.py index 09888577..896a370c 100644 --- a/src/nanotron/__init__.py +++ b/src/nanotron/__init__.py @@ -1 +1 @@ -__version__ = "0.2" +__version__ = "0.4" From 13d53a7838ec79214b876bad8dcadfee9b60a883 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 4 Mar 2024 16:24:56 +0000 Subject: [PATCH 3/6] . 
--- src/nanotron/parallel/pipeline_parallel/__init__.py | 5 +++++ src/nanotron/trainer.py | 8 +++++--- 2 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 src/nanotron/parallel/pipeline_parallel/__init__.py diff --git a/src/nanotron/parallel/pipeline_parallel/__init__.py b/src/nanotron/parallel/pipeline_parallel/__init__.py new file mode 100644 index 00000000..a4d66e50 --- /dev/null +++ b/src/nanotron/parallel/pipeline_parallel/__init__.py @@ -0,0 +1,5 @@ +from nanotron.parallel.pipeline_parallel.engine import PipelineEngine +from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer +from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of + +__all__ = ["PipelineEngine", "TensorPointer", "get_pp_rank_of"] diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 81c0732e..8ff99c38 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -55,9 +55,11 @@ from nanotron.parallel import ParallelContext from nanotron.parallel.data_parallel.utils import sync_gradients_across_dp from nanotron.parallel.parameters import NanotronParameter, sanity_check -from nanotron.parallel.pipeline_parallel.engine import PipelineEngine -from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer -from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of +from nanotron.parallel.pipeline_parallel import ( + PipelineEngine, + TensorPointer, + get_pp_rank_of, +) from nanotron.parallel.tensor_parallel.nn import ( TensorParallelLinearMode, TensorParallelRowLinear, From d9cfabd463795a9ac8276d1dccddc8961de8bca4 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 4 Mar 2024 16:26:50 +0000 Subject: [PATCH 4/6] . --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f0fac6c6..eef777a6 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@

⚡️ Nanotron

- + GitHub release GitHub release - + License

From 734fd3c4af37cc83ae8f0484bc70157d68d20519 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 4 Mar 2024 16:37:58 +0000 Subject: [PATCH 5/6] refacto set_logger_verbosity_format --- .pre-commit-config.yaml | 8 -------- run_generate.py | 28 ++++++++++++++++------------ src/nanotron/logging.py | 23 +++++++++++++++++++++-- src/nanotron/trainer.py | 11 ++--------- 4 files changed, 39 insertions(+), 31 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5141302e..3e40f956 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,14 +19,6 @@ repos: args: - --fix - --exit-non-zero-on-fix - - repo: https://github.com/PyCQA/isort - rev: 5.12.0 - hooks: - - id: isort - args: - - --profile=black - - --skip-glob=wandb/**/* - - --thirdparty=wandb - repo: https://github.com/codespell-project/codespell rev: v2.1.0 hooks: diff --git a/run_generate.py b/run_generate.py index e280a5fd..0f52b8ed 100644 --- a/run_generate.py +++ b/run_generate.py @@ -15,9 +15,19 @@ import torch from nanotron import distributed as dist from nanotron import logging -from nanotron.config import GenerationArgs, LoggingArgs, ParallelismArgs, get_config_from_file -from nanotron.generation.decode import GenerationInput, TokenizerConfig, decode_text, decode_tokenized -from nanotron.logging import log_rank, set_logger_verbosity_format +from nanotron.config import ( + GenerationArgs, + LoggingArgs, + ParallelismArgs, + get_config_from_file, +) +from nanotron.generation.decode import ( + GenerationInput, + TokenizerConfig, + decode_text, + decode_tokenized, +) +from nanotron.logging import log_rank, set_ranks_logging_level from nanotron.models import build_model from nanotron.parallel import ParallelContext from nanotron.parallel.parameters import sanity_check @@ -32,9 +42,7 @@ get_synced_random_state, set_random_seed, ) -from nanotron.serialize import ( - load_weights, -) +from nanotron.serialize import load_weights from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters try: @@ -86,12 +94,8 @@ def main(): log_level_replica="info", ) - if dist.get_rank(parallel_context.world_pg) == 0: - if logging_config.log_level is not None: - set_logger_verbosity_format(logging_config.log_level, parallel_context=parallel_context) - else: - if logging_config.log_level_replica is not None: - set_logger_verbosity_format(logging_config.log_level_replica, parallel_context=parallel_context) + # Set log levels + set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config) log_rank(f"model_config: {model_config}", logger=logger, level=logging.INFO, rank=0) log_rank(f"tokenizer_path: {tokenizer_path}", logger=logger, level=logging.INFO, rank=0) diff --git a/src/nanotron/logging.py b/src/nanotron/logging.py index 0e8441dd..72d68efc 100644 --- a/src/nanotron/logging.py +++ b/src/nanotron/logging.py @@ -18,13 +18,24 @@ import sys from dataclasses import dataclass from functools import lru_cache -from logging import CRITICAL, DEBUG, ERROR, FATAL, INFO, NOTSET, WARNING, Formatter, Logger +from logging import ( + CRITICAL, + DEBUG, + ERROR, + FATAL, + INFO, + NOTSET, + WARNING, + Formatter, + Logger, +) from typing import List, Optional, Union import torch from torch import distributed as torch_dist from nanotron import distributed as dist +from nanotron.config.config import LoggingArgs from nanotron.parallel import ParallelContext log_levels = { @@ -283,7 +294,6 @@ def set_logger_verbosity_format(logging_level: str, parallel_context: ParallelCo 
f"TP={dist.get_rank(parallel_context.tp_pg)}{expert_parallel_log}{'|' + node_name if node_name else ''}]: %(message)s", datefmt="%m/%d/%Y %H:%M:%S", ) - # TODO @thomasw21: `logging.log_levels` returns valid lg log levels log_level = log_levels[logging_level] # main root logger @@ -299,4 +309,13 @@ def set_logger_verbosity_format(logging_level: str, parallel_context: ParallelCo set_formatter(formatter=formatter) +def set_ranks_logging_level(parallel_context: ParallelContext, logging_config: LoggingArgs): + if dist.get_rank(parallel_context.world_pg) == 0: + if logging_config.log_level is not None: + set_logger_verbosity_format(logging_config.log_level, parallel_context=parallel_context) + else: + if logging_config.log_level_replica is not None: + set_logger_verbosity_format(logging_config.log_level_replica, parallel_context=parallel_context) + + _configure_library_root_logger() diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 8ff99c38..490793fa 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -45,7 +45,7 @@ human_format, log_memory, log_rank, - set_logger_verbosity_format, + set_ranks_logging_level, ) from nanotron.models import NanotronModel, build_model from nanotron.models.base import check_model_has_grad @@ -145,14 +145,7 @@ def __init__( self.pre_init() # Set log levels - if dist.get_rank(self.parallel_context.world_pg) == 0: - if self.config.logging.log_level is not None: - set_logger_verbosity_format(self.config.logging.log_level, parallel_context=self.parallel_context) - else: - if self.config.logging.log_level_replica is not None: - set_logger_verbosity_format( - self.config.logging.log_level_replica, parallel_context=self.parallel_context - ) + set_ranks_logging_level(parallel_context=self.parallel_context, logging_config=self.config.logging) # Log benchmark info if os.environ.get("NANOTRON_BENCHMARK", "0") == "1": From 1fcc6b59f28fa70bbe835ee72c21ba7afde506a7 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 4 Mar 2024 17:07:12 +0000 Subject: [PATCH 6/6] . 
--- src/nanotron/trainer.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index 663f1340..7b262fd0 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -193,8 +193,6 @@ def __init__( ) # Define iteration start state - self.start_iteration_step: int - self.consumed_train_samples: int if self.init_checkpoint_path is not None: checkpoint_metadata = load_meta( parallel_context=self.parallel_context, root_folder=self.init_checkpoint_path @@ -261,8 +259,6 @@ def train( self.save_checkpoint() if isinstance(dataloader_or_dls, tuple): - dataloader_or_dls[1] if len(dataloader_or_dls) > 1 else None - dataloader_or_dls[2] if len(dataloader_or_dls) > 2 else None dataloader = dataloader_or_dls[0] else: dataloader = dataloader_or_dls @@ -629,7 +625,7 @@ def _init_model( # Mark some parameters as tied self._mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config) - + # count number of parameters num_params = sum(p.numel() for p in model.parameters()) size_params = sum(p.numel() * p.element_size() for p in model.parameters()) @@ -764,12 +760,17 @@ def save_checkpoint(self) -> Path: return checkpoint_path - - def _mark_tied_parameters(self, model: NanotronModel, parallel_context: ParallelContext, parallel_config: Optional[ParallelismArgs] = None): + def _mark_tied_parameters( + self, + model: NanotronModel, + parallel_context: ParallelContext, + parallel_config: Optional[ParallelismArgs] = None, + ): mark_tied_parameters( model=self.model, parallel_context=self.parallel_context, parallel_config=self.config.parallelism ) - + + def mark_tied_parameters( model: NanotronModel, parallel_context: ParallelContext, parallel_config: Optional[ParallelismArgs] = None ):
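
For reference, a minimal sketch (not part of the patches above) of how the pieces introduced in this series fit together: the package-level re-exports added to `nanotron.parallel.pipeline_parallel` in PATCH 3/6 and the `set_ranks_logging_level` helper added to `nanotron.logging` in PATCH 5/6, used the same way `run_generate.py` and `trainer.py` now use them. The `configure_logging` wrapper is a hypothetical name for illustration, and the sketch assumes a `ParallelContext` has already been created elsewhere.

```python
# Minimal sketch, not part of the patches above: shows how the helpers they
# introduce are meant to be used together. Assumes a ParallelContext has
# already been created elsewhere (e.g. by a training or generation script);
# the configure_logging wrapper below is a hypothetical name for illustration.
from nanotron.config import LoggingArgs
from nanotron.logging import set_ranks_logging_level  # added in PATCH 5/6
from nanotron.parallel import ParallelContext

# Package-level re-exports added in PATCH 3/6; imported here only to show the
# shortened import path, they are not used further in this sketch.
from nanotron.parallel.pipeline_parallel import (  # noqa: F401
    PipelineEngine,
    TensorPointer,
    get_pp_rank_of,
)


def configure_logging(parallel_context: ParallelContext) -> None:
    """Replaces the old rank-0 if/else around set_logger_verbosity_format."""
    logging_config = LoggingArgs(log_level="info", log_level_replica="info")
    # Rank 0 of the world process group gets log_level, every other rank gets
    # log_level_replica -- that branching now lives inside nanotron.logging.
    set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config)
```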