
Commit 185330c

limjcst, Mingjie Li, and tohtana authored
Support complicated use cases with TiedLayerSpec (#7208)
I want to reuse a composed module in the pipeline. For example, the following `MyModule` has a member `linear`, which is itself a module.

```python
class MyModule(torch.nn.Module):

    def __init__(self, n_in: int, n_out: int):
        super().__init__()
        self.linear = torch.nn.Linear(n_in, n_out)
        self.layer_norm = torch.nn.LayerNorm(n_out)

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        hidden = self.linear(data)
        hidden = self.layer_norm(hidden)
        return hidden
```

`MyModule.linear.weight` should be synchronized among the related ranks. As a result, I add `linear.weight` to `TiedLayerSpec.tied_weight_attr`. By the way, I generate the whole `tied_weight_attr` list with the following instruction.

```python
tied_weight_attr = [name for name, p in layer.named_parameters() if p.numel() > 1]
```

However, the builtin `getattr` used by `PipelineModule` fails to find a nested attribute like `linear.weight`. Hence, this PR first extends the builtin `getattr` to a recursive version, `PipelineModule._recursive_getattr`, which accesses each attribute segment one by one.

Meanwhile, the order of the tied weights matters during synchronization. This PR therefore sorts `tie_keys` in `PipelineModule._index_tied_modules` to avoid hanging.

Signed-off-by: Mingjie Li <[email protected]>
Co-authored-by: Mingjie Li <[email protected]>
Co-authored-by: Masahiro Tanaka <[email protected]>
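For context, here is a minimal sketch of how the use case described above could be declared. The `"tied_module"` key, the layer sizes, and `num_stages` are illustrative assumptions and not part of this commit; the list form of `tied_weight_attr` follows the instruction quoted above.

```python
import torch
from deepspeed.pipe import LayerSpec, PipelineModule, TiedLayerSpec


class MyModule(torch.nn.Module):
    """Composed module from the description above: a Linear followed by a LayerNorm."""

    def __init__(self, n_in: int, n_out: int):
        super().__init__()
        self.linear = torch.nn.Linear(n_in, n_out)
        self.layer_norm = torch.nn.LayerNorm(n_out)

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        return self.layer_norm(self.linear(data))


# Collect every nested parameter name, as in the instruction quoted above.
layer = MyModule(512, 512)
tied_weight_attr = [name for name, p in layer.named_parameters() if p.numel() > 1]
# -> ['linear.weight', 'linear.bias', 'layer_norm.weight', 'layer_norm.bias']

# Reuse the same module (key "tied_module") at two pipeline positions.
specs = [
    TiedLayerSpec("tied_module", MyModule, 512, 512, tied_weight_attr=tied_weight_attr),
    LayerSpec(torch.nn.Linear, 512, 512),
    TiedLayerSpec("tied_module", MyModule, 512, 512, tied_weight_attr=tied_weight_attr),
]

# Requires an initialized distributed environment (e.g. the deepspeed launcher);
# num_stages is illustrative.
model = PipelineModule(layers=specs, num_stages=2)
```

With the plain builtin `getattr`, the nested names in `tied_weight_attr` raise an `AttributeError` inside `PipelineModule`; the recursive lookup added by this commit resolves them segment by segment.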
1 parent 56005d2 commit 185330c

File tree

1 file changed: +15 −4 lines changed


deepspeed/runtime/pipe/module.py

Lines changed: 15 additions & 4 deletions
```diff
@@ -443,26 +443,34 @@ def _partition_layers(self, method='uniform'):
 
         self._set_bounds(start=self.parts[stage_id], stop=self.parts[stage_id + 1])
 
+    @staticmethod
+    def _recursive_getattr(module: torch.nn.Module, attr_name: str) -> torch.Tensor:
+        '''Allow getting an attribute like "linear.weight"'''
+        weight = module
+        for item in attr_name.split("."):
+            weight = getattr(weight, item)
+        return weight
+
     def allreduce_tied_weight_gradients(self):
         '''All reduce the gradients of the tied weights between tied stages'''
         for key, comm in self.tied_comms.items():
             for attr_name in comm['weight_attr']:
-                weight = getattr(self.tied_modules[key], attr_name)
+                weight = self._recursive_getattr(self.tied_modules[key], attr_name)
                 dist.all_reduce(weight.grad, group=comm['group'])
 
     def get_tied_weights_and_groups(self):
         weight_group_list = []
         for key, comm in self.tied_comms.items():
             for attr_name in comm['weight_attr']:
-                weight = getattr(self.tied_modules[key], attr_name)
+                weight = self._recursive_getattr(self.tied_modules[key], attr_name)
                 weight_group_list.append((weight, comm['group']))
         return weight_group_list
 
     def _synchronize_tied_weights(self):
         for key, comm in self.tied_comms.items():
             for attr_name in comm['weight_attr']:
                 dist.broadcast(
-                    getattr(comm['module'], attr_name),
+                    self._recursive_getattr(comm['module'], attr_name),
                     src=min(comm['ranks']),
                     group=comm['group'],
                 )
@@ -475,7 +483,10 @@ def _index_tied_modules(self):
 
         specs = self._layer_specs
         tie_keys = set(s.key for s in specs if isinstance(s, TiedLayerSpec))
-        for key in tie_keys:
+        # Since Python 3.7, "Dictionary order is guaranteed to be insertion order."
+        # Sort tie_keys here so that orders of self.tied_comms.items() are consistent
+        # among ranks.
+        for key in sorted(tie_keys):
             # Find the layers that the tied module appears in
             tied_layers = []
             for idx, layer in enumerate(specs):
```
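As a standalone illustration (not code from this commit) of why the helper is needed: the builtin `getattr` treats `"linear.weight"` as a single attribute name, while walking the dotted path one segment at a time resolves the nested parameter.

```python
import torch


class MyModule(torch.nn.Module):

    def __init__(self, n_in: int, n_out: int):
        super().__init__()
        self.linear = torch.nn.Linear(n_in, n_out)
        self.layer_norm = torch.nn.LayerNorm(n_out)


def recursive_getattr(module: torch.nn.Module, attr_name: str):
    # Same idea as PipelineModule._recursive_getattr: resolve each
    # dot-separated segment in turn.
    obj = module
    for item in attr_name.split("."):
        obj = getattr(obj, item)
    return obj


m = MyModule(4, 8)

# getattr(m, "linear.weight") raises AttributeError: the whole string is
# treated as one attribute name.

# The recursive lookup returns the nested parameter tensor:
assert recursive_getattr(m, "linear.weight") is m.linear.weight
```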
