Bump v0.4 + Quick refactors #96

Merged
merged 9 commits on Mar 4, 2024
Changes from all commits
39 changes: 33 additions & 6 deletions README.md
@@ -1,4 +1,34 @@
# ⚡️ Nanotron
<h1 align="center">⚡️ Nanotron</h1>

<p align="center">
<a href="https://github.com/huggingface/nanotron/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/nanotron.svg">
</a>
<a href="https://arxiv.org/abs/2210.07316">
<img alt="GitHub release" src="https://img.shields.io/badge/arXiv-2305.14251-b31b1b.svg">
</a>
<a href="https://github.com/huggingface/nanotron/blob/master/LICENSE">
<img alt="License" src="https://img.shields.io/github/license/huggingface/nanotron.svg?color=green">
</a>
</p>

<h4 align="center">
<p>
<a href="#Philosophy">Philosophy</a> •
<a href="#Core-Features">Core Features</a> •
<a href="#Installation">Installation</a> •
<a href="#Quick-examples">Usage</a> •
<a href="#Development-guidelines">Contributions</a>
<p>
</h4>

<h3 align="center">
<a href="https://huggingface.co/nanotron"><img style="float: middle; padding: 10px 10px 10px 10px;" width="60" height="55" src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo.png" /></a>
</h3>



#

The objective of this library is to provide easy distributed primitives in order to train a variety of models efficiently using 3D parallelism. For more information about the internal design of the library or 3D parallelism in general, please check out [[docs.md]](./docs/docs.md) and [[3d_parallelism.md]](./docs/3d_parallelism.md).

@@ -28,12 +58,10 @@ To install (in a new env):
```bash
pip install torch
pip install packaging; pip install "flash-attn>=2.5.0" --no-build-isolation
git clone git@github.com:huggingface/nanotron.git
cd nanotron
pip install -e .
pip install nanotron
```

Also nice to have `transformers` `datasets` `python-etcd` `tensorboardX`: `pip install transformers datasets python-etcd tensorboardX`
Also nice to have: `pip install transformers datasets python-etcd tensorboardX`

We also support a set of flavors that you can install using `pip install -e ".[$FLAVOR]"`:
- `dev`: Used if you are developing in `nanotron`. In particular, it installs our linter tooling; on top of that you have to run `pre-commit install` afterwards.
@@ -68,7 +96,6 @@ pre-commit run --config .pre-commit-config.yaml --all-files

Features we would like to add:
- [ ] Support `torch.compile`
- [ ] Support `torch.distributed.rpc`
- [ ] More optimized kernels
- [ ] Support Zero3
- [ ] Other PP schedules (such as Interleaved 1f1b...)
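For readers following the install change above (a plain `pip install nanotron` replaces the editable clone), here is a quick post-install sanity check. This sketch only assumes the published package exposes the `__version__` attribute defined in `src/nanotron/__init__.py`, which this PR bumps to `0.4`:

```python
# Minimal post-install check; __version__ lives in src/nanotron/__init__.py
# and is bumped from "0.2" to "0.4" in this PR.
import nanotron

print(nanotron.__version__)  # expected for this release: 0.4
```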
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "nanotron"
version = "0.2"
version = "0.4"
description = "Minimalistic Large Language Model Training and Finetuning"
authors = [
{name = "Nouamane Tazi", email="nouamane@huggingface.co"},
28 changes: 16 additions & 12 deletions run_generate.py
@@ -15,9 +15,19 @@
import torch
from nanotron import distributed as dist
from nanotron import logging
from nanotron.config import GenerationArgs, LoggingArgs, ParallelismArgs, get_config_from_file
from nanotron.generation.decode import GenerationInput, TokenizerConfig, decode_text, decode_tokenized
from nanotron.logging import log_rank, set_logger_verbosity_format
from nanotron.config import (
GenerationArgs,
LoggingArgs,
ParallelismArgs,
get_config_from_file,
)
from nanotron.generation.decode import (
GenerationInput,
TokenizerConfig,
decode_text,
decode_tokenized,
)
from nanotron.logging import log_rank, set_ranks_logging_level
from nanotron.models import build_model
from nanotron.parallel import ParallelContext
from nanotron.parallel.parameters import sanity_check
@@ -32,9 +42,7 @@
get_synced_random_state,
set_random_seed,
)
from nanotron.serialize import (
load_weights,
)
from nanotron.serialize import load_weights
from nanotron.trainer import CONFIG_TO_MODEL_CLASS, mark_tied_parameters

try:
@@ -86,12 +94,8 @@ def main():
log_level_replica="info",
)

if dist.get_rank(parallel_context.world_pg) == 0:
if logging_config.log_level is not None:
set_logger_verbosity_format(logging_config.log_level, parallel_context=parallel_context)
else:
if logging_config.log_level_replica is not None:
set_logger_verbosity_format(logging_config.log_level_replica, parallel_context=parallel_context)
# Set log levels
set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config)

log_rank(f"model_config: {model_config}", logger=logger, level=logging.INFO, rank=0)
log_rank(f"tokenizer_path: {tokenizer_path}", logger=logger, level=logging.INFO, rank=0)
2 changes: 1 addition & 1 deletion src/nanotron/__init__.py
@@ -1 +1 @@
__version__ = "0.2"
__version__ = "0.4"
23 changes: 21 additions & 2 deletions src/nanotron/logging.py
@@ -18,13 +18,24 @@
import sys
from dataclasses import dataclass
from functools import lru_cache
from logging import CRITICAL, DEBUG, ERROR, FATAL, INFO, NOTSET, WARNING, Formatter, Logger
from logging import (
CRITICAL,
DEBUG,
ERROR,
FATAL,
INFO,
NOTSET,
WARNING,
Formatter,
Logger,
)
from typing import List, Optional, Union

import torch
from torch import distributed as torch_dist

from nanotron import distributed as dist
from nanotron.config.config import LoggingArgs
from nanotron.parallel import ParallelContext

log_levels = {
@@ -283,7 +294,6 @@ def set_logger_verbosity_format(logging_level: str, parallel_context: ParallelCo
f"TP={dist.get_rank(parallel_context.tp_pg)}{expert_parallel_log}{'|' + node_name if node_name else ''}]: %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
)
# TODO @thomasw21: `logging.log_levels` returns valid lg log levels
log_level = log_levels[logging_level]

# main root logger
@@ -299,4 +309,13 @@ def set_logger_verbosity_format(logging_level: str, parallel_context: ParallelCo
set_formatter(formatter=formatter)


def set_ranks_logging_level(parallel_context: ParallelContext, logging_config: LoggingArgs):
if dist.get_rank(parallel_context.world_pg) == 0:
if logging_config.log_level is not None:
set_logger_verbosity_format(logging_config.log_level, parallel_context=parallel_context)
else:
if logging_config.log_level_replica is not None:
set_logger_verbosity_format(logging_config.log_level_replica, parallel_context=parallel_context)


_configure_library_root_logger()
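For orientation, here is a sketch of how the new `set_ranks_logging_level` helper is called, mirroring the call sites in `run_generate.py` above and `src/nanotron/trainer.py` below. The `ParallelContext` constructor arguments, the example log levels, and the assumption of an already-initialized distributed launch (e.g. via `torchrun`) are illustrative, not part of this PR:

```python
# Sketch only: assumes the process was started by a distributed launcher
# (e.g. torchrun) and that ParallelContext accepts these size keywords.
from nanotron.config import LoggingArgs
from nanotron.logging import set_ranks_logging_level
from nanotron.parallel import ParallelContext

parallel_context = ParallelContext(
    tensor_parallel_size=1,
    pipeline_parallel_size=1,
    data_parallel_size=1,
)
logging_config = LoggingArgs(log_level="debug", log_level_replica="info")

# Rank 0 of the world group gets log_level; every other rank gets
# log_level_replica. This replaces the per-call-site if/else shown above.
set_ranks_logging_level(parallel_context=parallel_context, logging_config=logging_config)
```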
5 changes: 5 additions & 0 deletions src/nanotron/parallel/pipeline_parallel/__init__.py
@@ -0,0 +1,5 @@
from nanotron.parallel.pipeline_parallel.engine import PipelineEngine
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of

__all__ = ["PipelineEngine", "TensorPointer", "get_pp_rank_of"]
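The new package-level `__init__.py` exists so that callers can import the pipeline-parallel symbols from the package root rather than from individual submodules, which is exactly the switch `src/nanotron/trainer.py` makes below. A small before/after sketch:

```python
# Before this PR: each symbol imported from its own submodule.
from nanotron.parallel.pipeline_parallel.engine import PipelineEngine
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of

# After this PR: the re-exports above make the package path sufficient
# (re-importing the same names here is only for side-by-side comparison).
from nanotron.parallel.pipeline_parallel import (
    PipelineEngine,
    TensorPointer,
    get_pp_rank_of,
)
```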
23 changes: 7 additions & 16 deletions src/nanotron/trainer.py
@@ -45,7 +45,7 @@
human_format,
log_memory,
log_rank,
set_logger_verbosity_format,
set_ranks_logging_level,
)
from nanotron.models import NanotronModel, build_model
from nanotron.models.base import check_model_has_grad
@@ -55,9 +55,11 @@
from nanotron.parallel import ParallelContext
from nanotron.parallel.data_parallel.utils import sync_gradients_across_dp
from nanotron.parallel.parameters import NanotronParameter, sanity_check
from nanotron.parallel.pipeline_parallel.engine import PipelineEngine
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.parallel.pipeline_parallel.utils import get_pp_rank_of
from nanotron.parallel.pipeline_parallel import (
PipelineEngine,
TensorPointer,
get_pp_rank_of,
)
from nanotron.parallel.tensor_parallel.nn import (
TensorParallelLinearMode,
TensorParallelRowLinear,
@@ -143,14 +145,7 @@ def __init__(
self.pre_init()

# Set log levels
if dist.get_rank(self.parallel_context.world_pg) == 0:
if self.config.logging.log_level is not None:
set_logger_verbosity_format(self.config.logging.log_level, parallel_context=self.parallel_context)
else:
if self.config.logging.log_level_replica is not None:
set_logger_verbosity_format(
self.config.logging.log_level_replica, parallel_context=self.parallel_context
)
set_ranks_logging_level(parallel_context=self.parallel_context, logging_config=self.config.logging)

# Log benchmark info
if os.environ.get("NANOTRON_BENCHMARK", "0") == "1":
@@ -198,8 +193,6 @@ def __init__(
)

# Define iteration start state
self.start_iteration_step: int
self.consumed_train_samples: int
if self.init_checkpoint_path is not None:
checkpoint_metadata = load_meta(
parallel_context=self.parallel_context, root_folder=self.init_checkpoint_path
@@ -266,8 +259,6 @@ def train(
self.save_checkpoint()

if isinstance(dataloader_or_dls, tuple):
dataloader_or_dls[1] if len(dataloader_or_dls) > 1 else None
dataloader_or_dls[2] if len(dataloader_or_dls) > 2 else None
dataloader = dataloader_or_dls[0]
else:
dataloader = dataloader_or_dls
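The two lines deleted from `train` were bare expressions: they indexed `dataloader_or_dls[1]` and `dataloader_or_dls[2]` without binding the results, so they had no effect and only the first tuple element was ever used. A small self-contained illustration (the tuple contents are hypothetical, not taken from the trainer):

```python
# Hypothetical stand-in for the dataloader tuple passed to train().
dataloader_or_dls = ("train_dl", "valid_dl", "test_dl")

# This mirrors the deleted lines: the expression is evaluated, then the
# result is immediately discarded, so it is a no-op.
dataloader_or_dls[1] if len(dataloader_or_dls) > 1 else None

# Only this assignment matters, which is what the cleaned-up code keeps.
dataloader = dataloader_or_dls[0] if isinstance(dataloader_or_dls, tuple) else dataloader_or_dls
print(dataloader)  # -> train_dl
```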