
Commit b666844

Authored by HollowMan6, hwchen2017, and inkcherry
Fix AutoTP gathering replaced layer params when bias is not None (#7257)

Some params are one-dimensional; this PR adds support for them. Resolves #7249.

```log
param.shape torch.Size([768, 1536])
param.shape torch.Size([768])
...
```

```log
  with deepspeed.module_inject.layers.GatherReplacedLayerParams([param], model, enabled=True):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "deepspeed/module_inject/layers.py", line 359, in __enter__
    self.params[0].gather_params(self.params)
  File "torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "deepspeed/module_inject/layers.py", line 473, in gather_params
    param.shape[1],
    ~~~~~~~~~~~^^^
IndexError: tuple index out of range
```

---------

Signed-off-by: Hollow Man <[email protected]>
Signed-off-by: inkcherry <[email protected]>
Co-authored-by: Hongwei Chen <[email protected]>
Co-authored-by: inkcherry <[email protected]>
1 parent d4032ec commit b666844
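
For reference, the traceback comes from building the all-gather buffer with `param.shape[1]`, which a 1-D bias tensor does not have. Below is a minimal sketch of the shape-agnostic buffer construction used by the patched `gather_params`; the `gather_buffer` helper and the shard sizes are illustrative, not part of the change:

```python
import torch

def gather_buffer(param: torch.Tensor, world_size: int) -> torch.Tensor:
    # The old pattern torch.empty(world_size * param.shape[0], param.shape[1], ...)
    # raises IndexError for a 1-D bias shard such as torch.Size([768]).
    # Expanding dim 0 and splatting any remaining dims handles both cases.
    return torch.empty((world_size * param.shape[0], *param.shape[1:]),
                       dtype=param.dtype, device=param.device)

weight_shard = torch.empty(768, 1536)   # 2-D weight shard
bias_shard = torch.empty(768)           # 1-D bias shard
assert gather_buffer(weight_shard, 2).shape == (1536, 1536)
assert gather_buffer(bias_shard, 2).shape == (1536,)
```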

File tree

2 files changed: +72 -30 lines

deepspeed/module_inject/layers.py (37 additions, 15 deletions)

```diff
@@ -49,6 +49,18 @@ def set_autotp_mode(training=False):
         DEEPSPEED_AUTOTP_MODE = AUTOTP_MODE.INFERENCE
 
 
+def add_bias(input, bias):
+    if bias is None:
+        return input
+    if is_autotp_training_mode():
+        # Training mode - avoid inplace to ensure correct autograd
+        input = input + bias
+        return input
+    else:
+        input += bias
+        return input
+
+
 class RowParallel(torch.autograd.Function):
     """
     A custom autograd function for performing row-wise parallelism.
@@ -92,7 +104,7 @@ def forward(ctx: Any, group: dist.ProcessGroup, input: torch.Tensor, weight, bias
         ctx.group = group
         output = torch.matmul(input, weight.transpose(-1, -2))
         if bias is not None:
-            output += bias
+            output = add_bias(output, bias)
 
         ctx.save_for_backward(input, weight)
 
@@ -220,6 +232,14 @@ def _tp_partition(self, params_list: List[torch.Tensor]):
         """
         pass
 
+    def config_requires_grad(self, weight):
+        if weight is not None:
+            if self.is_training_mode():
+                if weight.requires_grad is None:
+                    weight.requires_grad = True
+            else:
+                weight.requires_grad = False
+
     def config_tp_params(self, weight):
         """
         Configures the weight tensor for training with tensor parallelism. This includes enabling gradients
@@ -233,15 +253,11 @@ def config_tp_params(self, weight):
         if self.is_training_mode():
             assert self.support_training, "No implementation of backward."
         if weight is not None:
-            if self.is_training_mode():
-                if weight.requires_grad is None:
-                    weight.requires_grad = True
-            else:
-                weight.requires_grad = False
-            setattr(weight, DS_TENSOR_MODEL_PARALLEL, True)
-            setattr(weight, DS_IS_REPLACED_MODULE, True)
+            self.config_requires_grad(weight)
             weight.gather_params = self.gather_params
             weight._tp_partition = self._tp_partition
+            setattr(weight, DS_TENSOR_MODEL_PARALLEL, True)
+            setattr(weight, DS_IS_REPLACED_MODULE, True)
 
     def is_training_mode(self):
         global DEEPSPEED_AUTOTP_MODE
@@ -377,13 +393,14 @@ def __init__(self, module, mp_group, **kwargs):
         self.support_training = True
         self.config_tp_params(self.weight)
         if self.bias is not None:
-            self.config_tp_params(self.bias)
+            # bias here is not tp params
+            self.config_requires_grad(self.bias)
 
     def forward(self, input):
         output = torch.matmul(input, self.weight.transpose(-1, -2))
         output = RowParallel.apply(self.mp_group, output, not self.is_training_mode())
         if self.bias is not None:
-            output += self.bias
+            output = add_bias(output, self.bias)
         return output
 
     @torch.no_grad()
@@ -395,6 +412,7 @@ def gather_params(self, params_list):
                 return
             params_list[idx].data_partition = param.data
             param = param.transpose(0, 1).contiguous()
+
            output_param = torch.empty(self.tp_world_size * param.shape[0],
                                       param.shape[1],
                                       dtype=param.dtype,
@@ -412,9 +430,14 @@ def _tp_partition(self, params_list):
 
         else:
             for idx, param in enumerate(params_list):
-                if param is None or idx > 0:
+                if param is None:
                     # don't slipt bias
                     return
+                if idx > 0:  # move bias to device at initialization
+                    _partition = self.move(param).detach()
+                    params_list[idx].data = _partition
+                    return
+
                 _partition = torch.chunk(param, self.tp_world_size, dim=-1)[self.tp_index]
 
                 _partition = self.move(_partition).detach()
@@ -455,7 +478,7 @@ def forward(self, input):
             input = ColumnParallel.apply(self.mp_group, input)
             output = torch.matmul(input, self.weight.transpose(-1, -2))
             if self.bias is not None:
-                output += self.bias
+                output = add_bias(output, self.bias)
         else:
             output = AsyncColumnParallel.apply(self.mp_group, input, self.weight, self.bias)
 
@@ -467,8 +490,7 @@ def gather_params(self, params_list):
         for idx, param in enumerate(params_list):
 
             params_list[idx].data_partition = param.data
-            output_param = torch.empty(self.tp_world_size * param.shape[0],
-                                       param.shape[1],
+            output_param = torch.empty((self.tp_world_size * param.shape[0], *param.shape[1:]),
                                        dtype=param.dtype,
                                        device=param.device)
             dist.all_gather_into_tensor(output_param, param, group=self.mp_group)
@@ -651,7 +673,7 @@ def forward(self, input):
         if self.mp_group is not None:
             dist.inference_all_reduce(output, group=self.mp_group)
         if self.bias is not None:
-            output += self.bias
+            output = add_bias(output, self.bias)
         return output
 
 
```
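The new `add_bias` helper switches between an out-of-place add during autotp training and an in-place add otherwise. A small standalone sketch of the same pattern, with a plain `training` flag standing in for `is_autotp_training_mode()` (names here are illustrative):

```python
from typing import Optional

import torch


def add_bias(output: torch.Tensor, bias: Optional[torch.Tensor], training: bool) -> torch.Tensor:
    # Mirrors the helper added in layers.py; `training` stands in for
    # is_autotp_training_mode().
    if bias is None:
        return output
    if training:
        # Out-of-place add so autograd records a regular addition node.
        return output + bias
    # Inference path: in-place add avoids an extra allocation.
    output += bias
    return output


x = torch.randn(2, 4, requires_grad=True)
w = torch.randn(3, 4, requires_grad=True)
b = torch.randn(3, requires_grad=True)
y = add_bias(torch.matmul(x, w.t()), b, training=True)
y.sum().backward()  # gradients reach x, w and b
```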
tests/unit/model_parallelism/test_autotp_training.py (35 additions, 15 deletions)

```diff
@@ -19,6 +19,7 @@
 from deepspeed.module_inject.layers import LinearAllreduce, LinearLayer, set_autotp_mode
 from unit.checkpoint.common import compare_lr_scheduler_states, compare_optimizer_states
 import os
+from deepspeed.runtime.utils import is_model_parallel_parameter
 
 
 def skip_on_device():
@@ -30,10 +31,9 @@ class SequentialLinearModel(torch.nn.Module):
 
     def __init__(self, hidden_dim, empty_grad=False, nlayers=1):
         super(SequentialLinearModel, self).__init__()
-        self.linears = torch.nn.ModuleList(
-            [torch.nn.Linear(hidden_dim, hidden_dim, bias=None) for i in range(nlayers)])
+        self.linears = torch.nn.ModuleList([torch.nn.Linear(hidden_dim, hidden_dim) for _ in range(nlayers)])
         if empty_grad:
-            self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=None)
+            self.linear2 = torch.nn.Linear(hidden_dim, hidden_dim)
         self.cross_entropy_loss = torch.nn.CrossEntropyLoss()
         self.empty_grad = empty_grad
 
@@ -153,8 +153,7 @@ def process_linear_layer(hidden_dim, input):
     torch_linear = nn.Linear(hidden_dim,
                              hidden_dim,
                              dtype=preferred_dtype(),
-                             device=get_accelerator().current_device(),
-                             bias=None)
+                             device=get_accelerator().current_device())
     torch_out = torch_linear(input)
     torch_loss = torch_out.sum()
     torch_loss.backward()
@@ -215,6 +214,9 @@ def testRowParallel(self, tp_size: int, tp_overlap_comm: bool):
         loss.backward()
 
         torch_grad = torch.chunk(torch_linear.weight.grad, tp_size, dim=1)[groups.get_tensor_model_parallel_rank()]
+        torch_bias_grad = torch_linear.bias.grad
+        assert torch.allclose(linear.bias.grad, torch_bias_grad.to(get_accelerator().current_device()), atol=1e-3)
+        # The gradient of the weight is not the same as the torch_linear.weight.grad
         assert torch.allclose(linear.weight.grad, torch_grad.to(get_accelerator().current_device()), atol=1e-3)
         assert torch.allclose(out, torch_out.to(get_accelerator().current_device()), atol=1e-2)
 
@@ -266,6 +268,10 @@ def testColumnParallel(self, tp_size: int, tp_overlap_comm: bool):
 
         cur_device_out = torch.chunk(torch_out, tp_size, dim=-1)[groups.get_tensor_model_parallel_rank()]
         torch_grad = torch.chunk(torch_linear.weight.grad, tp_size, dim=0)[groups.get_tensor_model_parallel_rank()]
+
+        torch_bias_grad = torch.chunk(torch_linear.bias.grad, tp_size, dim=0)[groups.get_tensor_model_parallel_rank()]
+        assert torch.allclose(linear.bias.grad, torch_bias_grad.to(get_accelerator().current_device()), atol=1e-3)
+
         assert torch.allclose(linear.weight.grad, torch_grad.to(get_accelerator().current_device()), atol=1e-3)
         assert torch.allclose(cur_device_out.to(get_accelerator().current_device()).contiguous(),
                               out.contiguous(),
@@ -307,23 +313,36 @@ def test(self, layer_type):
         model = SequentialLinearModel(hidden_dim=hidden_dim)
         model, _, _, _ = deepspeed.initialize(model=model, model_parameters=model.parameters(), config=config_dict)
 
-        torch_linear = nn.Linear(hidden_dim, hidden_dim, dtype=preferred_dtype(), device="cpu", bias=None)
+        torch_linear = nn.Linear(hidden_dim, hidden_dim, dtype=preferred_dtype(), device="cpu")
         total_params = sum(p.numel() for p in torch_linear.parameters())
-
         tp_layer = None
         if layer_type == "linear":
-            tp_layer = LinearLayer(torch_linear, groups.get_tensor_model_parallel_group())
+            tp_layer = LinearLayer(deepcopy(torch_linear), groups.get_tensor_model_parallel_group())
         elif layer_type == "linearallreduce":
-            tp_layer = LinearAllreduce(torch_linear, groups.get_tensor_model_parallel_group())
+            tp_layer = LinearAllreduce(deepcopy(torch_linear), groups.get_tensor_model_parallel_group())
         else:
             raise ValueError(f"Invalid linear type: {config_dict['linear_type']}")
 
         tp_params = sum(p.numel() for p in tp_layer.parameters())
 
-        assert total_params // tp_size == tp_params
+        expected_tp_params = 0
+        # compute expected TP params:
+        # - column-parallel (LinearLayer): weight & bias both split => total // tp_size
+        # - row-parallel (LinearAllreduce): weight split, bias (1d tensors) replicated
+        if layer_type == "linearallreduce":
+            weight_params = torch_linear.weight.numel()
+            bias_params = torch_linear.bias.numel()
+            expected_tp_params = weight_params // tp_size + bias_params
+        else:
+            expected_tp_params = total_params // tp_size
+        assert expected_tp_params == tp_params, (
+            f"{layer_type}: expected {expected_tp_params} tp params, got {tp_params}")
+
         for name, param in tp_layer.named_parameters(recurse=False):
-            param.gather_params([param])
+            if is_model_parallel_parameter(param):
+                param.gather_params([param])
 
+        torch_linear = torch_linear.to(get_accelerator().current_device())
         is_same_weights = all(
             torch.equal(param1, param2) for param1, param2 in zip(tp_layer.parameters(), torch_linear.parameters()))
 
@@ -333,11 +352,12 @@ def test(self, layer_type):
         assert total_params == params1
 
         for name, param in tp_layer.named_parameters(recurse=False):
-            param._tp_partition([param])
+            if is_model_parallel_parameter(param):
+                param._tp_partition([param])
 
         tp_params2 = sum(p.numel() for p in tp_layer.parameters())
 
-        assert total_params // tp_size == tp_params2
+        assert expected_tp_params == tp_params2
 
 
 def dummy_init_engine(config):
@@ -571,7 +591,7 @@ def test(self, tp_size: int, zero_stage: int):
 
         tp_norm = tp_optimizer._global_grad_norm
 
-        assert math.isclose(base_norm, tp_norm, abs_tol=1e-3)
+        assert math.isclose(base_norm, tp_norm, abs_tol=1e-3), f"base_norm: {base_norm}, tp_norm: {tp_norm}"
         tp_params_numel = sum(p.numel() for p in tp_model.parameters())
         base_params_numel = sum(p.numel() for p in base_model.parameters())
-        assert tp_params_numel < base_params_numel
+        assert tp_params_numel < base_params_numel, f"tp_params_numel: {tp_params_numel}, base_params_numel: {base_params_numel}"
```
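
The updated expected-parameter-count logic in the test distinguishes row-parallel from column-parallel layers: `LinearAllreduce` shards only the weight and keeps the 1-D bias whole on every rank, while `LinearLayer` shards both. A worked example of that arithmetic with assumed sizes (hidden_dim=768, tp_size=2; the test itself derives these from the actual `nn.Linear`):

```python
# Assumed example sizes, for illustration only.
hidden_dim, tp_size = 768, 2

weight_params = hidden_dim * hidden_dim      # 589_824
bias_params = hidden_dim                     # 768
total_params = weight_params + bias_params   # 590_592

# Column-parallel (LinearLayer): weight and bias are both sharded.
linear_expected = total_params // tp_size                      # 295_296

# Row-parallel (LinearAllreduce): weight sharded, 1-D bias replicated per rank.
allreduce_expected = weight_params // tp_size + bias_params    # 295_680

print(linear_expected, allreduce_expected)
```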
