
Commit 2790220

[TiledMLP]: fix for bs>1 (#7412)
It turns out my TiledMLP was only working correctly for batch_size=1. This fixes it to work with any batch size. Thanks to @winglian for detecting the problem and sending me an easy repro.

Signed-off-by: Stas Bekman <[email protected]>
1 parent 15f054d commit 2790220
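
For context, here is a minimal standalone sketch (not DeepSpeed code; shapes are made up) of why the old flattened indexing only matched `torch.chunk(..., dim=1)` shards when bs == 1, and why narrowing along the sequence dimension fixes it:

# Minimal sketch, assuming a (bs, seqlen, hidden) tensor chunked along the sequence dim.
import torch

bs, seqlen, hidden = 2, 8, 4
shards = 2
grad = torch.arange(bs * seqlen * hidden, dtype=torch.float32).reshape(bs, seqlen, hidden)
seq_shards = torch.chunk(grad, chunks=shards, dim=1)

# Old indexing: flatten and step by numel(). For bs > 1 the sequence shards of
# different batch rows interleave in memory, so this picks the wrong elements.
old_step = seq_shards[0].numel()
old_shard_1 = grad.view(-1).narrow(0, 1 * old_step, old_step).view_as(seq_shards[1])

# Fixed indexing: narrow along dim=1 (the sequence dim) by shape[1] steps.
new_step = seq_shards[0].shape[1]
new_shard_1 = grad.narrow(1, 1 * new_step, new_step)

print(torch.equal(new_shard_1, seq_shards[1]))  # True
print(torch.equal(old_shard_1, seq_shards[1]))  # False for bs > 1 (True only when bs == 1)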

2 files changed: +53 -42 lines changed

deepspeed/runtime/sequence_parallel/ulysses_sp.py

Lines changed: 10 additions & 11 deletions

@@ -706,7 +706,7 @@ def backward(ctx, *grads) -> torch.Tensor:
     }

     # if seqlen is not exactly divisible by shards the last step will be shorter than shard_step
-    shard_step = kwargs_to_shard_shards[grad_requiring_tensor_key][0].numel()
+    shard_step = kwargs_to_shard_shards[grad_requiring_tensor_key][0].shape[1]
     for i in range(shards):

         # when fn involves one or more model weights deepspeed will normally push a grad to
@@ -731,8 +731,8 @@ def backward(ctx, *grads) -> torch.Tensor:
         shard_offset = i * shard_step
         # this will enable gradual population of the pre-allocated
         # `grad_requiring_tensor_shard.grad` during `torch.autograd.backward` calls
-        grad_requiring_tensor_shard.grad = (grad_requiring_tensor_grad.view(-1).narrow(
-            0, shard_offset, grad_requiring_tensor_shard.numel()).view_as(grad_requiring_tensor_shard))
+        grad_requiring_tensor_shard.grad = (grad_requiring_tensor_grad.narrow(
+            1, shard_offset, shard_step).view_as(grad_requiring_tensor_shard))

         with torch.enable_grad():
             output = fn(**kwargs_to_shard_shard, **kwargs_to_pass)
@@ -741,8 +741,8 @@ def backward(ctx, *grads) -> torch.Tensor:
             # loss use-case
             torch.autograd.backward(output, incoming_grad)
         else:
-            incoming_grad_shard = (incoming_grad.view(-1).narrow(
-                0, shard_offset, grad_requiring_tensor_shard.numel()).view_as(grad_requiring_tensor_shard))
+            incoming_grad_shard = (incoming_grad.narrow(1, shard_offset,
+                                                        shard_step).view_as(grad_requiring_tensor_shard))
             torch.autograd.backward(output, incoming_grad_shard)

     # positional args
@@ -836,7 +836,7 @@ def backward(ctx, *grads) -> torch.Tensor:
     x_grad = torch.zeros_like(x)
     x_shards = list(torch.chunk(x, chunks=shards, dim=1))

-    shard_step = x_shards[0].numel()
+    shard_step = x_shards[0].shape[1]
     for i, x_shard in enumerate(x_shards):

         # Tell deepspeed not to add a new grad to its ipg bucket until the last shard is run
@@ -852,8 +852,8 @@ def backward(ctx, *grads) -> torch.Tensor:
         x_shard.requires_grad_(x_requires_grad)

         shard_offset = i * shard_step
-        x_shard.grad = x_grad.view(-1).narrow(0, shard_offset, x_shard.numel()).view_as(x_shard)
-        incoming_grad_shard = incoming_grad.view(-1).narrow(0, shard_offset, x_shard.numel()).view_as(x_shard)
+        x_shard.grad = x_grad.narrow(1, shard_offset, shard_step).view_as(x_shard)
+        incoming_grad_shard = incoming_grad.narrow(1, shard_offset, shard_step).view_as(x_shard)
         with torch.enable_grad():
             output = fn(self, x_shard)
         torch.autograd.backward(output, incoming_grad_shard)
@@ -1010,15 +1010,14 @@ def backward(ctx, *grads) -> torch.Tensor:
     shift_labels_shards = list(torch.chunk(shift_labels, chunks=shards, dim=1))

     # if seqlen is not exactly divisible by shards the last step will be shorter than shard_step
-    shard_step = logits_shards[0].numel()
+    shard_step = logits_shards[0].shape[1]
     for i in range(shards):
         logits_shard = logits_shards.pop(0)
         shift_labels_shard = shift_labels_shards.pop(0)

         shard_offset = i * shard_step
         # this will enable gradual population of the pre-allocated `logits_shard.grad` during `torch.autograd.backward` calls
-        logits_shard.grad = (logits_grad.view(-1).narrow(0, shard_offset,
-                                                         logits_shard.numel()).view_as(logits_shard))
+        logits_shard.grad = (logits_grad.narrow(1, shard_offset, shard_step).view_as(logits_shard))

         with torch.enable_grad():
             if all((shift_labels_shard == -100).squeeze()):
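
The hunks above all follow the same pattern: pre-allocate a full-size gradient buffer, alias each shard's .grad to a narrow view of it along the sequence dimension, and let torch.autograd.backward fill the buffer shard by shard. A simplified, self-contained sketch of that pattern follows (hypothetical tiled_backward helper, plain PyTorch, none of DeepSpeed's ZeRO/ipg-bucket bookkeeping):

# Sketch only: illustrates the narrow-along-dim-1 grad aliasing used above,
# assuming fn maps (bs, shard_len, hidden) -> a tensor of the same shape.
import torch

def tiled_backward(fn, x, incoming_grad, shards=4):
    x_grad = torch.zeros_like(x)                      # full-size grad buffer
    x_shards = list(torch.chunk(x, chunks=shards, dim=1))
    shard_step = x_shards[0].shape[1]                 # sequence-dim step, not numel()
    for i, x_shard in enumerate(x_shards):
        x_shard = x_shard.detach().requires_grad_(True)
        shard_offset = i * shard_step
        shard_len = x_shard.shape[1]                  # the last shard may be shorter
        # alias this shard's .grad to a view of the pre-allocated buffer so that
        # torch.autograd.backward accumulates straight into x_grad
        x_shard.grad = x_grad.narrow(1, shard_offset, shard_len).view_as(x_shard)
        incoming_grad_shard = incoming_grad.narrow(1, shard_offset, shard_len)
        with torch.enable_grad():
            output = fn(x_shard)
        torch.autograd.backward(output, incoming_grad_shard)
    return x_grad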

tests/unit/ulysses_alst/test_tiled_compute.py

Lines changed: 43 additions & 31 deletions

@@ -43,17 +43,20 @@ def forward(self, x):

 class MyModel(Module):

-    def __init__(self, hidden_dim):
+    def __init__(self, hidden_dim, vocab_size):
         super().__init__()
+        self.vocab_size = vocab_size
         # Critical - need to use a stack of at least 2 mlps to validate that the backward of the last mlp sends the correct gradients to the previous mlp in the stack
         self.mlp1 = SimpleMLP(hidden_dim)
         self.mlp2 = SimpleMLP(hidden_dim)
+        self.lm_head = torch.nn.Linear(hidden_dim, vocab_size, bias=False)
         self.cross_entropy_loss = torch.nn.CrossEntropyLoss()

     def forward(self, x, y):
         x = self.mlp1(x)
         x = self.mlp2(x)
-        return self.cross_entropy_loss(x, y)
+        logits = self.lm_head(x)
+        return self.cross_entropy_loss(logits.view(-1, self.vocab_size), y.view(-1))


 def mlp_forward_tiled_mlp(self, x):
@@ -121,17 +124,18 @@ def test_tiled_mlp(self, zero_stage):
         # for debug
         # torch.set_printoptions(precision=8, sci_mode=True)

+        vocab_size = 10
         seed = 42
-        hidden_dim = 100
-        bs = 1
-        seqlen = hidden_dim
+        hidden_dim = 128
+        bs = 2
+        seqlen = 64
         torch.manual_seed(seed)
         x = torch.rand((bs, seqlen, hidden_dim), dtype=dtype, requires_grad=True)
-        y = torch.empty((bs, seqlen), dtype=torch.long, requires_grad=False).random_(hidden_dim)
+        y = torch.empty((bs, seqlen), dtype=torch.long, requires_grad=False).random_(vocab_size)

         # A. Baseline: model with normal MLP
         torch.manual_seed(seed)
-        model_a = MyModel(hidden_dim=hidden_dim).to(dtype)
+        model_a = MyModel(hidden_dim=hidden_dim, vocab_size=vocab_size).to(dtype)
         model_a, _, _, _ = deepspeed.initialize(config=config_dict,
                                                 model=model_a,
                                                 model_parameters=model_a.parameters())
@@ -144,15 +148,17 @@ def test_tiled_mlp(self, zero_stage):

         loss_a = model_a(x_a, y_a)
         model_a.backward(loss_a)
-        grad_a1 = get_grad(model_a.module.mlp1.up_proj.weight, zero_stage)
-        grad_a2 = get_grad(model_a.module.mlp2.up_proj.weight, zero_stage)
-        assert grad_a1 is not None
-        assert grad_a2 is not None
+        param_grad_a1 = get_grad(model_a.module.mlp1.up_proj.weight, zero_stage)
+        param_grad_a2 = get_grad(model_a.module.mlp2.up_proj.weight, zero_stage)
+        x_grad_a = x_a.grad
+        assert param_grad_a1 is not None
+        assert param_grad_a2 is not None
+        assert x_grad_a is not None

         # B. model with tiled MLP using TiledMLP
         torch.manual_seed(seed)
         SimpleMLP.forward = mlp_forward_tiled_mlp
-        model_b = MyModel(hidden_dim=hidden_dim).to(dtype)
+        model_b = MyModel(hidden_dim=hidden_dim, vocab_size=vocab_size).to(dtype)
         model_b, _, _, _ = deepspeed.initialize(config=config_dict,
                                                 model=model_b,
                                                 model_parameters=model_b.parameters())
@@ -161,31 +167,34 @@ def test_tiled_mlp(self, zero_stage):
         y_b = y.clone().detach()
         loss_b = model_b(x_b, y_b)
         model_b.backward(loss_b)
-        grad_b1 = get_grad(model_b.module.mlp1.up_proj.weight, zero_stage)
-        grad_b2 = get_grad(model_b.module.mlp2.up_proj.weight, zero_stage)
-        assert grad_b1 is not None
-        assert grad_b2 is not None
+        param_grad_b1 = get_grad(model_b.module.mlp1.up_proj.weight, zero_stage)
+        param_grad_b2 = get_grad(model_b.module.mlp2.up_proj.weight, zero_stage)
+        x_grad_b = x_b.grad
+        assert param_grad_b1 is not None
+        assert param_grad_b2 is not None
+        assert x_grad_b is not None

         # print(f"{loss_a=}")
         # print(f"{loss_b=}")
-        # print(f"{grad_a1=}")
-        # print(f"{grad_b1=}")
-        # print(f"{grad_a2=}")
-        # print(f"{grad_b2=}")
+        # print(f"{param_grad_a1=}")
+        # print(f"{param_grad_b1=}")
+        # print(f"{param_grad_a2=}")
+        # print(f"{param_grad_b2=}")
         torch_assert_equal(loss_a, loss_b)

         # Gradient will not be exactly the same, especially under half-precision. And bf16 is
         # particularly lossy so need to lower tolerance a bit more than the default. Switch to
         # dtype torch.float or even torch.double to see that the diff is tiny - so the math is
         # correct, but accumulation error adds up. Alternatively making hidden_dim bigger makes the
         # divergence much smaller as well.
-        torch_assert_close(grad_a1, grad_b1) #, rtol=1e-03, atol=1e-04)
-        torch_assert_close(grad_a2, grad_b2) #, rtol=1e-03, atol=1e-04)
+        torch_assert_close(param_grad_a1, param_grad_b1) #, rtol=1e-03, atol=1e-04)
+        torch_assert_close(param_grad_a2, param_grad_b2) #, rtol=1e-03, atol=1e-04)
+        torch_assert_close(x_grad_a, x_grad_b)

         # C. model with tiled MLP using the generic version of the same via sequence_tiled_compute + SequenceTiledCompute
         torch.manual_seed(seed)
         SimpleMLP.forward = mlp_forward_sequence_tiled_compute
-        model_c = MyModel(hidden_dim=hidden_dim).to(dtype)
+        model_c = MyModel(hidden_dim=hidden_dim, vocab_size=vocab_size).to(dtype)
         model_c, _, _, _ = deepspeed.initialize(config=config_dict,
                                                 model=model_c,
                                                 model_parameters=model_c.parameters())
@@ -194,16 +203,19 @@ def test_tiled_mlp(self, zero_stage):
         y_c = y.clone().detach()
         loss_c = model_c(x_c, y_c)
         model_c.backward(loss_c)
-        grad_c1 = get_grad(model_c.module.mlp1.up_proj.weight, zero_stage)
-        grad_c2 = get_grad(model_c.module.mlp2.up_proj.weight, zero_stage)
-        assert grad_c1 is not None
-        assert grad_c2 is not None
+        param_grad_c1 = get_grad(model_c.module.mlp1.up_proj.weight, zero_stage)
+        param_grad_c2 = get_grad(model_c.module.mlp2.up_proj.weight, zero_stage)
+        x_grad_c = x_c.grad
+        assert param_grad_c1 is not None
+        assert param_grad_c2 is not None
+        assert x_grad_c is not None

         # print(f"{loss_a=}")
         # print(f"{loss_c=}")
-        # print(f"{grad_a1=}")
-        # print(f"{grad_c1=}")
+        # print(f"{param_grad_a1=}")
+        # print(f"{param_grad_c1=}")
         # see notes for B
         torch_assert_equal(loss_a, loss_c)
-        torch_assert_close(grad_a1, grad_c1) #, rtol=1e-03, atol=1e-04)
-        torch_assert_close(grad_a2, grad_c2) #, rtol=1e-03, atol=1e-04)
+        torch_assert_close(param_grad_a1, param_grad_c1) #, rtol=1e-03, atol=1e-04)
+        torch_assert_close(param_grad_a2, param_grad_c2) #, rtol=1e-03, atol=1e-04)
+        torch_assert_close(x_grad_a, x_grad_c)
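
As the tolerance comment in the test notes, the tiled and baseline gradients only match approximately under reduced precision. A hedged sketch (using plain torch.testing.assert_close rather than the test's torch_assert_close helper) of a comparison that relaxes tolerances only for bf16:

import torch

def compare_grads(grad_ref, grad_tiled):
    # bf16 accumulates noticeably more rounding error than fp32/fp64, so relax
    # the tolerances for it; other dtypes use assert_close's defaults.
    if grad_ref.dtype == torch.bfloat16:
        torch.testing.assert_close(grad_tiled, grad_ref, rtol=1e-3, atol=1e-4)
    else:
        torch.testing.assert_close(grad_tiled, grad_ref)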
