
Commit ad6b898

nvpohanh authored and Cyrilvallez committed
Fix broken Llama4 accuracy in MoE part (#40609)
* Fix broken Llama4 accuracy in MoE part

Llama4 accuracy was broken by a bug in #39501, which omitted the transpose of router_scores before applying it to routed_in, causing Llama4 to generate garbage output. This PR fixes the issue by adding back the transpose() and adding comments explaining why the transpose() is needed.

Signed-off-by: Po-Han Huang <[email protected]>

* remove comment

---------

Signed-off-by: Po-Han Huang <[email protected]>
Co-authored-by: Cyril Vallez <[email protected]>
1 parent e7d351c · commit ad6b898

1 file changed, 1 insertion(+), 1 deletion(-)


src/transformers/models/llama4/modeling_llama4.py

@@ -158,7 +158,7 @@ def forward(self, hidden_states):
         hidden_states = hidden_states.reshape(-1, self.hidden_dim)
         router_scores, router_logits = self.router(hidden_states)
         routed_in = hidden_states.repeat(router_scores.shape[1], 1)
-        routed_in = routed_in * router_scores.reshape(-1, 1)
+        routed_in = routed_in * router_scores.transpose(0, 1).reshape(-1, 1)
         routed_out = self.experts(routed_in)
         out = self.shared_expert(hidden_states)
         out.add_(routed_out.reshape(router_scores.shape[1], -1, routed_out.shape[-1]).sum(dim=0))
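For context, here is a minimal standalone sketch (toy shapes and variable names, not the actual Llama4 module) of why the transpose is needed: repeat() tiles the whole token block once per expert, so routed_in is laid out expert-major (all tokens for expert 0 first, then expert 1, and so on), whereas reshape(-1, 1) on the (tokens, experts) score matrix flattens row-major, i.e. token-major. Transposing to (experts, tokens) before flattening makes the score order match the activation layout.

import torch

# Toy sizes, for illustration only.
num_tokens, num_experts, hidden_dim = 4, 3, 8
hidden_states = torch.randn(num_tokens, hidden_dim)
router_scores = torch.rand(num_tokens, num_experts)   # (tokens, experts)

# repeat() tiles the whole token block once per expert, so routed_in
# is expert-major: rows [0, num_tokens) belong to expert 0, etc.
routed_in = hidden_states.repeat(num_experts, 1)      # (experts * tokens, hidden)

# Buggy order: row-major flattening of (tokens, experts) is token-major
# (token 0's scores for every expert come first), misaligned with routed_in.
buggy_scale = router_scores.reshape(-1, 1)

# Fixed order: transpose to (experts, tokens) first, then flatten,
# giving expert-major order that matches routed_in.
fixed_scale = router_scores.transpose(0, 1).reshape(-1, 1)

# Reference: scale each expert's copy of the tokens by that expert's column.
reference = torch.cat(
    [hidden_states * router_scores[:, e:e + 1] for e in range(num_experts)]
)

assert torch.allclose(routed_in * fixed_scale, reference)
assert not torch.allclose(routed_in * buggy_scale, reference)

The final routed_out.reshape(router_scores.shape[1], -1, ...).sum(dim=0) in the patched code relies on this same expert-major layout to sum each token's per-expert outputs, which is why the flattened score order must agree with it.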
