From 68d8ad79b0f4c3dcd7451c152fa1329da9590dbb Mon Sep 17 00:00:00 2001
From: Lu Fang
Date: Sun, 19 Jan 2025 22:39:41 -0800
Subject: [PATCH] Introduce CUDA Graph support for DeepSeek v3

Signed-off-by: Lu Fang
---
 csrc/moe/moe_align_sum_kernels.cu | 27 ++++++++++---------
 csrc/moe/moe_ops.h                |  3 ++-
 csrc/moe/torch_bindings.cpp       |  3 ++-
 vllm/_custom_ops.py               |  6 +++--
 vllm/config.py                    |  2 +-
 .../layers/fused_moe/fused_moe.py |  5 +++-
 6 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 24341d63fb1f..41f34c4ff5d4 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -221,7 +221,8 @@ __global__ void moe_sum_kernel(
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad) {
+                          torch::Tensor num_tokens_post_pad,
+                          bool use_shared_memory) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
 
   // If we have very large number of experts, we can no longer use shared
@@ -229,21 +230,22 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   // TODO(simon): the right solution should be calculating the exact right
   // amount of shared memory and use that. The num_experts >= 256 is just a
   // temporary solution to unblock Deepseek V3.
-  if (num_experts >= 256) {
+  if (!use_shared_memory) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
 
-          const int32_t mem_tokens_cnts =
-              ((num_experts + 1) * num_experts) * sizeof(int32_t);
-          const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t);
-          // allocate global memory
-          int32_t* tokens_cnts;
-          int32_t* cumsum;
-          cudaMalloc(&tokens_cnts, mem_tokens_cnts);
-          cudaMalloc(&cumsum, mem_cumsum);
+          torch::Tensor token_cnts =
+              torch::empty({(num_experts + 1) * num_experts},
+                           torch::TensorOptions()
+                               .dtype(torch::kInt)
+                               .device(topk_ids.device()));
+          torch::Tensor cumsum =
+              torch::empty({num_experts + 1}, torch::TensorOptions()
+                                                  .dtype(torch::kInt)
+                                                  .device(topk_ids.device()));
 
           auto kernel =
               vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
@@ -252,9 +254,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
               num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), tokens_cnts, cumsum);
-          cudaFree(tokens_cnts);
-          cudaFree(cumsum);
+              topk_ids.numel(), token_cnts.data_ptr<int32_t>(),
+              cumsum.data_ptr<int32_t>());
         });
   } else {
     VLLM_DISPATCH_INTEGRAL_TYPES(
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 596cc0aa6c85..7454e26e4f85 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -11,4 +11,5 @@ void moe_sum(torch::Tensor& input, torch::Tensor& output);
 void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
-                          torch::Tensor num_tokens_post_pad);
+                          torch::Tensor num_tokens_post_pad,
+                          bool use_shared_memory);
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
index f3a558c14ab9..5d22e1be5c57 100644
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -19,7 +19,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
       "moe_align_block_size(Tensor topk_ids, int num_experts,"
      "                     int block_size, Tensor! sorted_token_ids,"
      "                     Tensor! experts_ids,"
-     "                     Tensor! num_tokens_post_pad) -> ()");
+     "                     Tensor! num_tokens_post_pad,"
+     "                     bool use_shared_memory) -> ()");
   m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
 
 #ifndef USE_ROCM
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index d04cbbc0a9ee..508a35c364b6 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -917,10 +917,12 @@ def moe_sum(input: torch.Tensor, output: torch.Tensor):
 def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                          block_size: int, sorted_token_ids: torch.Tensor,
                          experts_ids: torch.Tensor,
-                         num_tokens_post_pad: torch.Tensor) -> None:
+                         num_tokens_post_pad: torch.Tensor,
+                         use_shared_memory: bool = True) -> None:
     torch.ops._moe_C.moe_align_block_size(topk_ids, num_experts, block_size,
                                           sorted_token_ids, experts_ids,
-                                          num_tokens_post_pad)
+                                          num_tokens_post_pad,
+                                          use_shared_memory)
 
 
 def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
diff --git a/vllm/config.py b/vllm/config.py
index 4698a0502033..b0a92b2e2134 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -607,7 +607,7 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
         if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
                 and not self.enforce_eager):
             logger.warning(
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 308c1d6ac6db..4d658f083583 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -256,8 +256,11 @@ def moe_align_block_size(
     num_tokens_post_pad = torch.empty((1),
                                       dtype=torch.int32,
                                       device=topk_ids.device)
+    # Note: DeepSeek V3 has 256 experts.
+    use_shared_memory = num_experts < 256
     ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
-                             expert_ids, num_tokens_post_pad)
+                             expert_ids, num_tokens_post_pad,
+                             use_shared_memory)
     return sorted_ids, expert_ids, num_tokens_post_pad
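
Note (illustrative, not part of the patch): the sketch below shows one way to exercise the new use_shared_memory flag under CUDA graph capture, assuming a CUDA build of vLLM that includes this patch. Buffer shapes follow the conventions of moe_align_block_size() in fused_moe.py; the expert count, block size, and token counts are made up for the example. With num_experts >= 256 the op takes the global-memory path, which this patch makes capturable by replacing cudaMalloc/cudaFree with torch::empty scratch tensors.

    # Illustrative sketch only -- assumes a CUDA build of vLLM with this patch applied.
    import torch
    from vllm import _custom_ops as ops

    num_experts = 256  # DeepSeek V3 sized; >= 256 selects the global-memory path
    block_size = 16
    topk = 8
    topk_ids = torch.randint(0, num_experts, (4, topk),
                             dtype=torch.int32, device="cuda")

    # Output buffers sized as in fused_moe.moe_align_block_size().
    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
    sorted_ids = torch.full((max_num_tokens_padded, ), topk_ids.numel(),
                            dtype=torch.int32, device="cuda")
    expert_ids = torch.empty((max_num_tokens_padded + block_size - 1) // block_size,
                             dtype=torch.int32, device="cuda")
    num_tokens_post_pad = torch.empty((1, ), dtype=torch.int32, device="cuda")

    use_shared_memory = num_experts < 256  # mirrors the check added in fused_moe.py

    # Warm up once outside capture, then capture. Before this patch the
    # global-memory path called cudaMalloc/cudaFree, which is not allowed
    # while a CUDA graph is being captured.
    ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
                             expert_ids, num_tokens_post_pad, use_shared_memory)
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        ops.moe_align_block_size(topk_ids, num_experts, block_size, sorted_ids,
                                 expert_ids, num_tokens_post_pad,
                                 use_shared_memory)
    graph.replay()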