
Commit ef5dd7b

new change
1 parent 868343e commit ef5dd7b

2 files changed: +22, -37 lines

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 21 additions & 20 deletions
@@ -449,25 +449,11 @@ def forward_cuda(
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        skip_expert_load_scatter_add = False
         if enable_eplb:
             assert expert_load_view is not None
             assert logical_to_physical_map is not None
             assert logical_replica_count is not None
             assert isinstance(layer, FusedMoE)
-            # if `skip_expert_load_scatter_add` is True,
-            # update `expert_load_view` in modular_kernel,
-            # skipping scatter_add_ in FusedMoE.select_experts.
-            if (self.fused_experts is not None and
-                    isinstance(self.fused_experts, FusedMoEModularKernel)):
-
-                # There is no `expert_num_tokens` in
-                # `expert_tokens_meta` of DeepEPHTPrepareAndFinalize,
-                # so DeepEPHTPrepareAndFinalize is not supported for now.
-                # TODO: Maybe it is better to support DeepEPHTPrepareAndFinalize.
-                if not isinstance(self.fused_experts.prepare_finalize,
-                                  DeepEPHTPrepareAndFinalize):
-                    skip_expert_load_scatter_add = True
 
         topk_weights, topk_ids = FusedMoE.select_experts(
             hidden_states=x,
@@ -486,7 +472,7 @@ def forward_cuda(
             expert_load_view=expert_load_view,
             logical_to_physical_map=logical_to_physical_map,
             logical_replica_count=logical_replica_count,
-            skip_expert_load_scatter_add=skip_expert_load_scatter_add
+            fused_experts_method=self.fused_experts
         )
 
         if self.rocm_aiter_moe_enabled:
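
This call-site change is the heart of the refactor: instead of computing skip_expert_load_scatter_add locally, forward_cuda now forwards its fused-experts object and lets select_experts make the decision. A minimal runnable sketch of that inversion, with stand-in classes; only the names select_experts and fused_experts_method mirror the real API, and the loose Optional[Callable] hint matches the diff:

    from typing import Callable, Optional

    class FusedMoEModularKernel:  # stand-in for the real modular kernel class
        pass

    def select_experts(fused_experts_method: Optional[Callable] = None) -> bool:
        # The callee now owns the policy: skip the scatter_add_ fallback only
        # when a modular kernel will record expert load itself.
        return isinstance(fused_experts_method, FusedMoEModularKernel)

    # Before this commit every call site computed the flag; now each one just
    # passes its kernel object (or None).
    assert select_experts(fused_experts_method=FusedMoEModularKernel())
    assert not select_experts(fused_experts_method=None)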
@@ -1408,7 +1394,7 @@ def select_experts(
         expert_load_view: Optional[torch.Tensor] = None,
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
-        skip_expert_load_scatter_add: bool = False,
+        fused_experts_method: Optional[Callable] = None
     ) -> tuple[torch.Tensor, torch.Tensor]:
         """
         Route the input hidden states to the top-k experts based on the
@@ -1489,12 +1475,25 @@ def select_experts(
         topk_ids = physical_ids
 
         # 2. Record expert load metrics
-        # Note: When using FusedMoEModularKernel, expert load statistics are handled
-        # directly in the kernel using ExpertTokensMetadata.expert_num_tokens for better performance.
-        # For other implementations or when metadata is not available, we fall back to scatter_add_.
+        # When using FusedMoEModularKernel,
+        # expert load statistics are handled directly in the kernel using
+        # ExpertTokensMetadata.expert_num_tokens for better performance.
+        # For other implementations or when metadata is not available,
+        # we fall back to scatter_add_.
 
-        # Check if we're using FusedMoEModularKernel and if it has already processed the load
+        # Check if we're using FusedMoEModularKernel and
+        # if it has already processed the load.
         # If not, use the traditional scatter_add_ approach.
+
+        # There is no expert_num_tokens in
+        # expert_tokens_meta of DeepEPHTPrepareAndFinalize,
+        # so DeepEPHTPrepareAndFinalize is not supported for now.
+        # TODO: Maybe it is better to support DeepEPHTPrepareAndFinalize.
+        skip_expert_load_scatter_add = (
+            isinstance(fused_experts_method, FusedMoEModularKernel) and
+            fused_experts_method.prepare_finalize.__class__.__name__ !=
+            "DeepEPHTPrepareAndFinalize")
+
         if not skip_expert_load_scatter_add:
             logger.debug("expert_load_view update from topk_ids through scatter_add_.")
             # Fallback to scatter_add_ for non-modular kernel implementations
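
A standalone sketch of the new skip decision under stand-in classes defined here (OtherPrepareAndFinalize is hypothetical); in the commit the check lives inline in select_experts. Matching on __class__.__name__ against a string, rather than on the imported class, spares layer.py a DeepEPHTPrepareAndFinalize import:

    class FusedMoEModularKernel:  # stand-in
        def __init__(self, prepare_finalize):
            self.prepare_finalize = prepare_finalize

    class DeepEPHTPrepareAndFinalize:  # stand-in: lacks expert_num_tokens
        pass

    class OtherPrepareAndFinalize:  # stand-in: provides expert_num_tokens
        pass

    def should_skip_scatter_add(fused_experts_method) -> bool:
        # isinstance(None, ...) is False, so no separate None check is needed.
        return (isinstance(fused_experts_method, FusedMoEModularKernel) and
                fused_experts_method.prepare_finalize.__class__.__name__ !=
                "DeepEPHTPrepareAndFinalize")

    assert should_skip_scatter_add(
        FusedMoEModularKernel(OtherPrepareAndFinalize()))
    assert not should_skip_scatter_add(
        FusedMoEModularKernel(DeepEPHTPrepareAndFinalize()))
    assert not should_skip_scatter_add(None)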
@@ -1512,6 +1511,8 @@
             expert_load_view.scatter_add_(dim=0,
                                           index=index.long(),
                                           src=src.to(expert_load_view))
+        else:
+            logger.debug("expert_load_view update in modular_kernel through add_.")
 
         topk_ids = topk_ids.to(dtype=indices_type)
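
The two debug branches correspond to two ways of arriving at the same per-expert counts. A runnable sketch with illustrative shapes: the fallback counts routed tokens from topk_ids via scatter_add_, while the modular-kernel path applies already-computed per-expert counts (what ExpertTokensMetadata.expert_num_tokens provides) with a single add_:

    import torch

    num_experts = 4
    # topk_ids: (num_tokens, top_k) physical expert ids chosen by routing.
    topk_ids = torch.tensor([[0, 2], [1, 2], [3, 0]])

    # Fallback path: one scatter_add_ over the flattened routing ids.
    expert_load_view = torch.zeros(num_experts, dtype=torch.int64)
    index = topk_ids.flatten()
    src = torch.ones_like(index)
    expert_load_view.scatter_add_(dim=0, index=index.long(),
                                  src=src.to(expert_load_view))

    # Modular-kernel path: per-expert counts already exist after prepare(),
    # so a plain add_ suffices; bincount stands in for them here.
    expert_num_tokens = torch.bincount(index, minlength=num_experts)
    modular_view = torch.zeros(num_experts, dtype=torch.int64)
    modular_view.add_(expert_num_tokens)

    assert torch.equal(expert_load_view, modular_view)  # tensor([2, 1, 2, 1])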

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 1 addition & 17 deletions
@@ -926,27 +926,11 @@ def apply(
         logical_to_physical_map: Optional[torch.Tensor] = None,
         logical_replica_count: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        skip_expert_load_scatter_add = False
         if enable_eplb:
             assert expert_load_view is not None
             assert logical_to_physical_map is not None
             assert logical_replica_count is not None
             assert isinstance(layer, FusedMoE)
-            # if `skip_expert_load_scatter_add` is True,
-            # update `expert_load_view` in modular_kernel,
-            # skipping scatter_add_ in FusedMoE.select_experts.
-            if (self.fused_experts is not None and
-                    isinstance(self.fused_experts, FusedMoEModularKernel)):
-
-                # There is no `expert_num_tokens` in
-                # `expert_tokens_meta` of DeepEPHTPrepareAndFinalize,
-                # so DeepEPHTPrepareAndFinalize is not supported for now.
-                # TODO: Maybe it is better to support DeepEPHTPrepareAndFinalize.
-                from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize \
-                    import DeepEPHTPrepareAndFinalize
-                if not isinstance(self.fused_experts.prepare_finalize,
-                                  DeepEPHTPrepareAndFinalize):
-                    skip_expert_load_scatter_add = True
 
         if not self.flashinfer_moe_enabled:
             topk_weights, topk_ids = FusedMoE.select_experts(
@@ -966,7 +950,7 @@ def apply(
                 expert_load_view=expert_load_view,
                 logical_to_physical_map=logical_to_physical_map,
                 logical_replica_count=logical_replica_count,
-                skip_expert_load_scatter_add=skip_expert_load_scatter_add
+                fused_experts_method=self.fused_experts
             )
 
         if self.rocm_aiter_moe_enabled:
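
The deleted fp8.py block shows why moving the check paid off twice: apply() had to import DeepEPHTPrepareAndFinalize inside the method body, presumably to avoid an import cycle at module load time, and that deferred import goes away with the check. A tiny sketch of the deferred-import pattern, using a stdlib class purely so it runs:

    def is_ordered(obj) -> bool:
        # Deferred import: resolved on first call, not at module load time,
        # mirroring how fp8.py avoided a module-level dependency before
        # this commit.
        from collections import OrderedDict
        return isinstance(obj, OrderedDict)

    assert not is_ordered({})  # a plain dict is not an OrderedDict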
