Commit ba70a5e

frank-wei authored and facebook-github-bot committed
reduce the weight loading time
Summary:
ATT. On GB200, the MoE MXFP4 weight transpose takes quite a long time. Add a cache for the weight-transpose indices so that the per-expert weight transpose time is reduced.

**20b:**

Before: model loading took 94 sec

```
(EngineCore_0 pid=3397977) INFO 09-01 19:27:08 [default_loader.py:267] Loading weights took 2.83 seconds
(EngineCore_0 pid=3397977) INFO 09-01 19:28:41 [gpu_model_runner.py:1977] Model loading took 14.1643 GiB and 94.110470 seconds
```

After: model loading took 5.9 sec

```
(EngineCore_0 pid=3005216) INFO 09-02 16:54:43 [default_loader.py:267] Loading weights took 2.54 seconds
(EngineCore_0 pid=3005216) INFO 09-02 16:54:47 [gpu_model_runner.py:1977] Model loading took 14.1693 GiB and 5.918206 seconds
```

**120b:**

**Loading time verification:**

Before (P1928776629): E2E predictor warm-up takes 17:28:53 ~ 17:39:59 = 11 min 6 sec; model loading takes 568.133048 seconds

```
(EngineCore_0 pid=344869) INFO 09-02 17:29:45 [default_loader.py:267] Loading weights took 8.25 seconds
(EngineCore_0 pid=344869) INFO 09-02 17:39:05 [gpu_model_runner.py:1977] Model loading took 68.7019 GiB and 568.133048 seconds
```

After (P1928762318): E2E predictor warm-up takes 17:26:12 ~ 17:28:15 = 2 min 3 sec; model loading takes 15.083996 seconds

```
(EngineCore_0 pid=156514) INFO 09-02 17:27:05 [default_loader.py:267] Loading weights took 9.18 seconds
(EngineCore_0 pid=156514) INFO 09-02 17:27:12 [gpu_model_runner.py:1977] Model loading took 68.7093 GiB and 15.083996 seconds
```

**Accuracy verification:**

```
aime25 medium: P1928806083
[{'eval_name': 'aime25', 'model_name': 'gpt-oss-120b-medium_temp1.0_20250902_175112', 'metric': 0.7875}]
aime25 high: P1928898566
[{'eval_name': 'aime25', 'model_name': 'gpt-oss-120b-high_temp1.0_20250902_180141', 'metric': 0.9}]
```

Test Plan:
Compared the transposed weights before and after the change; they match. P1928725920

python test_eq.py

```
import torch

# Compare the shuffled gemm1 (w13) weights, scales, and biases dumped
# before and after this change.
[g1w, g1s, g1b] = torch.load("/tmp/gemm1_wei.pt")
[g1w2, g1s2, g1b2] = torch.load("/tmp/gemm1_wei2.pt")
for i in range(len(g1w)):
    print(i)
    print(torch.equal(g1w[i], g1w2[i]))
    print(torch.equal(g1s[i], g1s2[i]))
    print(torch.equal(g1b[i], g1b2[i]))

# Compare the shuffled gemm2 (w2) weights, scales, and biases.
[g2w, g2s, g2b] = torch.load("/tmp/gemm2_wei.pt")
[g2w2, g2s2, g2b2] = torch.load("/tmp/gemm2_wei2.pt")
for i in range(len(g2w)):
    print(i)
    print(torch.equal(g2w[i], g2w2[i]))
    print(torch.equal(g2s[i], g2s2[i]))
    print(torch.equal(g2b[i], g2b2[i]))
```

Rollback Plan:

Differential Revision: D81544286
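The gist of the change, as a minimal self-contained sketch (the names `compute_row_permutation` and `shuffle_with_cache` are illustrative stand-ins, not the vLLM or flashinfer API): every expert's weight slice has the same shape, so the row-permutation indices behind the shuffle can be computed once per distinct shape, cached, and applied to each expert as a cheap gather.

```
from typing import Dict

import torch


def compute_row_permutation(w: torch.Tensor, epilogue_tile_m: int) -> torch.Tensor:
    # Stand-in for the expensive index computation (flashinfer's
    # get_shuffle_matrix_a_row_indices in the real code); the result
    # depends only on the tensor's shape and the tile size.
    return torch.arange(w.shape[0]).flip(0)


_cache: Dict[torch.Size, torch.Tensor] = {}


def shuffle_with_cache(w: torch.Tensor, epilogue_tile_m: int) -> torch.Tensor:
    # Key the cache on the tensor shape: all experts share the same
    # shape, so the permutation is computed once and reused.
    indices = _cache.get(w.shape)
    if indices is None:
        indices = compute_row_permutation(w, epilogue_tile_m).to(w.device)
        _cache[w.shape] = indices
    return w[indices].contiguous()


num_experts, rows, cols = 8, 256, 128
weights = torch.randn(num_experts, rows, cols)
shuffled = torch.stack(
    [shuffle_with_cache(weights[i], epilogue_tile_m=128) for i in range(num_experts)])
assert len(_cache) == 1  # one cache entry per distinct shape, not per expert
```

In the actual diff below, `_maybe_get_cached_permute_indices` plays the role of the cached lookup above, keyed on the tensor's `torch.Size`, and a plain index gather (plus `nvfp4_block_scale_interleave` for the scale factors) replaces the per-expert `shuffle_matrix_a` / `shuffle_matrix_sf_a` calls.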
1 parent 426cc86 commit ba70a5e

File tree

1 file changed (+91, -16 lines)
  • vllm/model_executor/layers/quantization
vllm/model_executor/layers/quantization/mxfp4.py

Lines changed: 91 additions & 16 deletions
```diff
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from typing import Callable, Optional
+from typing import Callable, Dict, Optional, Union
 
 import torch
 from torch.nn.parameter import Parameter
@@ -122,7 +122,8 @@ def __init__(self, moe: FusedMoEConfig):
                 "MXFP4 MoE is enabled on Blackwell but FlashInfer "
                 "is not available. This may result in degraded performance. "
                 "Please `pip install vllm[flashinfer]` for best results.")
-
+        self._cache_permute_indices: Dict[torch.Size, torch.Tensor] = {}
+
     def _should_use_marlin(self):
         if envs.VLLM_MXFP4_USE_MARLIN is not None:
             return envs.VLLM_MXFP4_USE_MARLIN
@@ -261,12 +262,37 @@ def create_weights(self, layer: torch.nn.Module, num_experts: int,
         )
         layer.register_parameter("w2_bias", w2_bias)
         set_weight_attrs(w2_bias, extra_weight_attrs)
+
+    def _maybe_get_cached_permute_indices(
+        self,
+        dst_w_weight: torch.Tensor,
+        epilogue_tile_m: int,
+        num_elts_per_sf: Union[None, int] = None,
+    ) -> torch.Tensor:
+        from flashinfer.utils import get_shuffle_matrix_a_row_indices, get_shuffle_matrix_sf_a_row_indices
+        key = self._cache_permute_indices.get(dst_w_weight.shape)
+        if key is None:
+            if num_elts_per_sf is None:
+                permute1 = get_shuffle_matrix_a_row_indices(
+                    dst_w_weight, epilogue_tile_m=epilogue_tile_m
+                )
+            else:
+                permute1 = get_shuffle_matrix_sf_a_row_indices(
+                    dst_w_weight,
+                    epilogue_tile_m=epilogue_tile_m,
+                    num_elts_per_sf=num_elts_per_sf,
+                )
+            self._cache_permute_indices[dst_w_weight.shape] = permute1.to(
+                dst_w_weight.device
+            )
+        permute_indices = self._cache_permute_indices[dst_w_weight.shape]
+        return permute_indices
 
     def process_weights_after_loading(self, layer):
         if self.use_marlin:
             prepare_moe_fp4_layer_for_marlin(layer)
         elif should_use_flashinfer_mxfp4():
-            from flashinfer import shuffle_matrix_a, shuffle_matrix_sf_a
+            from flashinfer.fp4_quantization import nvfp4_block_scale_interleave
             layer.gemm1_alpha = Parameter(torch.tensor(
                 [1.702] * self.num_experts, dtype=torch.float32).cuda(),
                 requires_grad=False)
@@ -343,25 +369,74 @@ def swap_every_two_rows(x, axis=-1):
             gemm2_bias_shuffled = []
             epilogue_tile_m = 128  # FIXME: this depends on the kernel internals
             for i in range(self.num_experts):
+                # w13 weight shuffling
+                permute_indices = self._maybe_get_cached_permute_indices(
+                    w13_weight[i].view(torch.uint8),
+                    epilogue_tile_m,
+                )
                 gemm1_weights_mxfp4_shuffled.append(
-                    shuffle_matrix_a(w13_weight[i].view(torch.uint8),
-                                     epilogue_tile_m))
+                    w13_weight[i]
+                    .view(torch.uint8)[permute_indices.to(w13_weight.device)]
+                    .contiguous()
+                )
+                # w13 scale shuffling
+                permute_sf_indices = self._maybe_get_cached_permute_indices(
+                    w13_weight_scale[i].view(torch.uint8),
+                    epilogue_tile_m,
+                    num_elts_per_sf=16,
+                )
                 gemm1_scales_mxfp4_shuffled.append(
-                    shuffle_matrix_sf_a(w13_weight_scale[i].view(torch.uint8),
-                                        epilogue_tile_m))
+                    nvfp4_block_scale_interleave(
+                        w13_weight_scale[i]
+                        .view(torch.uint8)[
+                            permute_sf_indices.to(w13_weight_scale.device)
+                        ]
+                        .contiguous()
+                    )
+                )
+                # w13 bias shuffling
+                permute_bias_indices = self._maybe_get_cached_permute_indices(
+                    w13_bias[i].clone().reshape(-1, 1),
+                    epilogue_tile_m,
+                )
                 gemm1_bias_shuffled.append(
-                    shuffle_matrix_a(w13_bias[i].clone().reshape(-1, 1),
-                                     epilogue_tile_m))
-
+                    w13_bias[i].clone().reshape(-1, 1)[permute_bias_indices.to(w13_bias.device)]
+                    .contiguous()
+                )
+                # w2 weight shuffling
+                permute_indices = self._maybe_get_cached_permute_indices(
+                    w2_weight[i].view(torch.uint8),
+                    epilogue_tile_m,
+                )
                 gemm2_weights_mxfp4_shuffled.append(
-                    shuffle_matrix_a(w2_weight[i].view(torch.uint8),
-                                     epilogue_tile_m))
+                    w2_weight[i]
+                    .view(torch.uint8)[permute_indices.to(w2_weight.device)]
+                    .contiguous()
+                )
+                # w2 scale shuffling
+                permute_sf_indices = self._maybe_get_cached_permute_indices(
+                    w2_weight_scale[i].view(torch.uint8),
+                    epilogue_tile_m,
+                    num_elts_per_sf=16,
+                )
                 gemm2_scales_mxfp4_shuffled.append(
-                    shuffle_matrix_sf_a(w2_weight_scale[i].view(torch.uint8),
-                                        epilogue_tile_m))
+                    nvfp4_block_scale_interleave(
+                        w2_weight_scale[i]
+                        .view(torch.uint8)[
+                            permute_sf_indices.to(w13_weight_scale.device)
+                        ]
+                        .contiguous()
+                    )
+                )
+                # w2 bias shuffling
+                permute_indices = self._maybe_get_cached_permute_indices(
+                    w2_bias[i].clone().reshape(-1, 1),
+                    epilogue_tile_m,
+                )
                 gemm2_bias_shuffled.append(
-                    shuffle_matrix_a(w2_bias[i].clone().reshape(-1, 1),
-                                     epilogue_tile_m))
+                    w2_bias[i].clone().reshape(-1, 1)[permute_indices.to(w2_bias.device)]
+                    .contiguous()
+                )
 
             w13_weight = torch.stack(gemm1_weights_mxfp4_shuffled)
             w13_weight_scale = torch.stack(
```

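For anyone reproducing the test plan's equivalence claim directly against flashinfer, a hedged spot check along these lines could be used. It is a sketch, not part of the commit: it assumes flashinfer is installed and a CUDA device is available, only exercises the plain-weight path (`shuffle_matrix_a`), and uses an illustrative uint8 shape rather than the model's real packed per-expert weight shape (the kernel may impose shape or alignment constraints).

```
import torch
from flashinfer import shuffle_matrix_a
from flashinfer.utils import get_shuffle_matrix_a_row_indices

# Illustrative shape only; in vLLM this is a per-expert MXFP4-packed
# weight viewed as uint8. epilogue_tile_m matches the constant in mxfp4.py.
epilogue_tile_m = 128
w = torch.randint(0, 256, (2560, 1024), dtype=torch.uint8, device="cuda")

# Old path: shuffle each expert's weight directly.
expected = shuffle_matrix_a(w, epilogue_tile_m)

# New path: compute the row permutation once (cacheable by shape) and
# apply it as a gather.
indices = get_shuffle_matrix_a_row_indices(w, epilogue_tile_m=epilogue_tile_m)
actual = w[indices.to(w.device)].contiguous()

print(torch.equal(expected.cpu(), actual.cpu()))  # expected to print True
```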