huggingface · NouamaneTazi · Feb 1, 2024 · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024
diff --git a/.github/workflows/3d_parallelism_unit_tests.yaml b/.github/workflows/3d_parallelism_unit_tests.yaml
@@ -10,7 +10,7 @@ on:
       - "tests/**/*.py"
 
   pull_request:
-    branches: [ main ]
+    branches: [ '**' ]
     paths:
      - "src/**/*.py"
      - "examples/**/*.py"

diff --git a/.github/workflows/code_quality.yaml b/.github/workflows/code_quality.yaml
@@ -9,7 +9,7 @@ on:
       - "src/**/*.py"
 
   pull_request:
-    branches: [ main ]
+    branches: [ '**' ]
     paths:
      - "src/**/*.py"
 

diff --git a/.github/workflows/fa2_unit_tests.yaml b/.github/workflows/fa2_unit_tests.yaml
@@ -11,7 +11,7 @@ on:
       - "tests/**/*.py"
 
   pull_request:
-    branches: [ main ]
+    branches: [ '**' ]
     paths:
      - "src/**/*.py"
      - "examples/**/*.py"

diff --git a/src/nanotron/distributed.py b/src/nanotron/distributed.py
@@ -260,6 +260,8 @@ def initialize_torch_distributed():
 
     # Call the init process.
     port = find_free_port()
-    init_method = f"tcp://localhost:{port}"
-    dist.init_process_group(init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout)
+    init_method = f"env://localhost:{port}"
+    dist.init_process_group(
+        init_method=init_method, backend=backend, world_size=world_size, rank=rank, timeout=dist.default_pg_timeout
+    )
     return True
diff --git a/src/nanotron/optim/clip_grads.py b/src/nanotron/optim/clip_grads.py
@@ -68,7 +68,7 @@ def clip_grad_norm(
                 dtype=torch.float,
             ).pow(norm_type)
         else:
-            total_norm = torch.zeros(1, dtype=torch.float, device=torch.device("cuda"))
+            total_norm = torch.zeros([], dtype=torch.float, device=torch.device("cuda"))
         dist.all_reduce(total_norm, group=mp_pg, op=dist.ReduceOp.SUM)
         total_norm.pow_(1.0 / norm_type)
 

diff --git a/tests/test_clip_grads.py b/tests/test_clip_grads.py
@@ -190,8 +190,13 @@ def _test_clip_grads_with_pp(parallel_context: ParallelContext, norm_type: float
 
 
 @pytest.mark.skipif(available_gpus() < 2, reason="test_clip_grads_with_tp requires at least 2 gpus")
-@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
-@pytest.mark.parametrize("async_communication", [False, True])
+@pytest.mark.parametrize(
+    "tp_mode,async_communication",
+    [
+        pytest.param(TensorParallelLinearMode.ALL_REDUCE, False),
+        pytest.param(TensorParallelLinearMode.REDUCE_SCATTER, True),
+    ],
+)
 @pytest.mark.parametrize("norm_type", [math.inf, 1.0, 2.0])
 def test_clip_grads_with_tp(tp_mode: TensorParallelLinearMode, async_communication: bool, norm_type: float):
     init_distributed(tp=2, dp=1, pp=1)(_test_clip_grads_with_tp)(
@@ -340,17 +345,9 @@ def test_clip_grads_tied_weights(norm_type: float):
 
 def _test_clip_grads_tied_weights(parallel_context: ParallelContext, norm_type: float):
     if dist.get_rank(parallel_context.pp_pg) == 0:
-        model = nn.ModuleDict(
-            {
-                "dense0": nn.Linear(10, 10, device="cuda"),
-            }
-        )
+        model = nn.ModuleDict({"dense0": nn.Linear(10, 10, device="cuda")})
     else:
-        model = nn.ModuleDict(
-            {
-                "dense1": nn.Linear(10, 10, device="cuda"),
-            }
-        )
+        model = nn.ModuleDict({"dense1": nn.Linear(10, 10, device="cuda")})
 
     # Tie weights/bias
     tie_parameters(
@@ -422,14 +419,17 @@ def _test_clip_grads_tied_weights(parallel_context: ParallelContext, norm_type:
         norm_type=norm_type,
     )
     ref_total_norm = torch.nn.utils.clip_grad_norm_([ref_weight, ref_bias], max_norm=1.0, norm_type=norm_type)
+    assert len(total_norm.shape) == 0, f"total_norm should be a scalar. Got {total_norm}"
 
     # Check that the gradients have changed
     assert not torch.allclose(old_grad, weight.grad), "Gradients should have changed after clipping"
 
     # Test that we get the same gradient after clipping
     torch.testing.assert_close(weight.grad, ref_weight.grad, rtol=1e-7, atol=1e-6)
     torch.testing.assert_close(bias.grad, ref_bias.grad, rtol=1e-7, atol=1e-6)
-    assert total_norm == ref_total_norm, "Total norm should be the same"
+    torch.testing.assert_close(
+        total_norm, ref_total_norm, rtol=0, atol=0, msg=lambda msg: f"{msg}\n" f"Got {total_norm} and {ref_total_norm}"
+    )
 
 
 @pytest.mark.parametrize("half_precision", [torch.float16, torch.bfloat16])

diff --git a/tests/test_tensor_parallel.py b/tests/test_tensor_parallel.py
@@ -1,5 +1,4 @@
 import os
-from contextlib import nullcontext as does_not_raise
 from typing import Any
 
 import pytest
@@ -21,6 +20,8 @@
 @pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
 @pytest.mark.parametrize("async_communication", [False, True])
 def test_column_linear(tp: int, dp: int, pp: int, tp_mode: TensorParallelLinearMode, async_communication: bool):
+    if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
+        pytest.skip("ALL_REDUCE mode does not support async communication")
     init_distributed(tp=tp, dp=dp, pp=pp)(_test_column_linear)(
         tp_mode=tp_mode, async_communication=async_communication
     )
@@ -145,25 +146,13 @@ def _test_column_linear(
 
 
 @pytest.mark.parametrize("tp,dp,pp", [pytest.param(i, 1, 1) for i in range(1, min(4, available_gpus()) + 1)])
-@pytest.mark.parametrize(
-    "tp_mode,async_communication,expectation",
-    [
-        pytest.param(TensorParallelLinearMode.ALL_REDUCE, False, does_not_raise()),
-        pytest.param(TensorParallelLinearMode.REDUCE_SCATTER, False, does_not_raise()),
-        pytest.param(TensorParallelLinearMode.REDUCE_SCATTER, True, does_not_raise()),
-        pytest.param(
-            TensorParallelLinearMode.ALL_REDUCE,
-            True,
-            pytest.raises(
-                ValueError,
-                match=r"Cf this: https://github.com/huggingface/nanotron/blob/bf82cded9eef1ba77864b48e65bffefad4076339/src/nanotron/core/parallel/tensor_parallel/nn.py#L132",
-            ),
-        ),
-    ],
-)
+@pytest.mark.parametrize("tp_mode", list(TensorParallelLinearMode))
+@pytest.mark.parametrize("async_communication", [False, True])
 def test_row_linear(
     tp: int, dp: int, pp: int, tp_mode: TensorParallelLinearMode, async_communication: bool, expectation: Any
 ):
+    if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
+        pytest.skip("ALL_REDUCE mode does not support async communication")
     init_distributed(tp=tp, dp=dp, pp=pp)(_test_row_linear)(
         tp_mode=tp_mode, async_communication=async_communication, expectation=expectation
     )

diff --git a/tests/test_zero.py b/tests/test_zero.py
@@ -201,6 +201,8 @@ def _test_zero_optimizer(parallel_context: ParallelContext):
 def test_zero_optimizer_with_tp(
     tp: int, dp: int, pp: int, tp_mode: TensorParallelLinearMode, async_communication: bool
 ):
+    if tp_mode is TensorParallelLinearMode.ALL_REDUCE and async_communication:
+        pytest.skip("ALL_REDUCE mode does not support async communication")
     init_distributed(pp=pp, dp=dp, tp=tp)(_test_zero_optimizer_with_tp)(
         tp_mode=tp_mode, async_communication=async_communication
     )