
Commit 9613ea7

Commit message: 0811

1 parent 9edee0c, commit 9613ea7

File tree

11 files changed: +924 −249 lines


csrc/cutlass

Submodule cutlass deleted from e94e888

csrc/cutlass

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
//home/wangzhe/deepseek/tune_block_shape/DeepGEMM/third-party/cutlass

csrc/flash_api.cpp

Lines changed: 40 additions & 9 deletions
@@ -20,6 +20,23 @@
 #define CHECK_SHAPE(x, ...) TORCH_CHECK(x.sizes() == torch::IntArrayRef({__VA_ARGS__}), #x " must have shape (" #__VA_ARGS__ ")")
 #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")

+inline float get_scalar_f32_cpu_only(c10::optional<const at::Tensor> & scale,
+                                     const char* name = "dequant scale") {
+    TORCH_CHECK(scale.has_value(),
+                name, " is None (optional has no value)");
+    const at::Tensor& t = *scale;
+    TORCH_CHECK(!t.device().is_cuda(),
+                "descale_q / descale_k must be on CPU, but got ",
+                t.device().type(), " device");
+    TORCH_CHECK(t.scalar_type() == torch::kFloat32,
+                "descale_q / descale_k must be float32, but got ",
+                t.scalar_type());
+    TORCH_CHECK(t.numel() == 1,
+                "descale_q / descale_k must be a scalar, but got ",
+                t.numel(), " elements");
+    return t.item<float>();
+}
+
 std::vector<at::Tensor>
 get_mla_metadata(
     at::Tensor &seqlens_k,
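The helper added in the hunk above only accepts a one-element float32 tensor that lives on the CPU. A minimal caller-side sketch (illustrative only: the value, the variable name, and the main() wrapper are not part of this commit, and it assumes get_scalar_f32_cpu_only from the hunk above is in scope):

```cpp
#include <torch/torch.h>

int main() {
    // A valid descale argument: exactly one float32 element, resident on the CPU.
    c10::optional<const at::Tensor> descale_q =
        torch::full({1}, 0.0123f, torch::dtype(torch::kFloat32));   // defaults to the CPU device
    float dq = get_scalar_f32_cpu_only(descale_q, "descale_q");     // returns 0.0123f
    (void)dq;

    // Each of these variants would trip one of the TORCH_CHECKs instead:
    //   torch::full({1}, 0.0123f).cuda();                  // wrong device
    //   torch::full({1}, 1, torch::dtype(torch::kInt32));  // wrong dtype
    //   torch::full({2}, 0.0123f);                         // not a scalar
    return 0;
}
```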
@@ -68,16 +85,19 @@ mha_fwd_kvcache_mla(
     const float softmax_scale,
     bool is_causal,
     const at::Tensor &tile_scheduler_metadata, // num_sm_parts x TileSchedulerMetaDataSize
-    const at::Tensor &num_splits // batch_size + 1
+    const at::Tensor &num_splits, // batch_size + 1
+    c10::optional<const at::Tensor> &descale_q, // batch_size
+    c10::optional<const at::Tensor> &descale_k // batch_size
 ) {
     // Check the architecture
     auto dprops = at::cuda::getCurrentDeviceProperties();
     bool is_sm90 = dprops->major == 9 && dprops->minor == 0;
     TORCH_CHECK(is_sm90);

     // Check data types
-    auto q_dtype = q.dtype();
-    TORCH_CHECK(q_dtype == torch::kBFloat16 || q_dtype == torch::kHalf);
+    auto q_dtype = q.scalar_type();
+    TORCH_CHECK(q_dtype == torch::kBFloat16 || q_dtype == torch::kHalf ||
+                q_dtype == torch::kFloat8_e4m3fn, "Unsupported dtype for query tensor");
     TORCH_CHECK(kcache.dtype() == q_dtype, "query and key must have the same dtype");
     TORCH_CHECK(seqlens_k.dtype() == torch::kInt32, "seqlens_k must have dtype int32");
     TORCH_CHECK(block_table.dtype() == torch::kInt32, "block_table must have dtype torch.int32");
@@ -106,7 +126,7 @@ mha_fwd_kvcache_mla(
     const int num_heads_q = sizes[2];
     const int head_size_k = sizes[3];
     TORCH_CHECK(head_size_k == 576, "Only head_size_k == 576 is supported");
-    TORCH_CHECK(head_size_v == 512, "Only head_size_v == 576 is supported");
+    TORCH_CHECK(head_size_v == 512, "Only head_size_v == 512 is supported");

     const int max_num_blocks_per_seq = block_table.size(1);
     const int num_blocks = kcache.size(0);
@@ -133,7 +153,9 @@
     at::cuda::CUDAGuard device_guard{(char)q.get_device()};

     auto opts = q.options();
-    at::Tensor out = torch::empty({batch_size, q_seq_per_hk, num_heads, head_size_v}, opts);
+    auto out_type = (q_dtype == torch::kFloat8_e4m3fn) ? torch::kBFloat16 : q_dtype;
+    at::Tensor out = torch::empty({batch_size, q_seq_per_hk, num_heads, head_size_v}, opts.dtype(out_type));
+
     at::Tensor softmax_lse = torch::empty({batch_size, num_heads, q_seq_per_hk}, opts.dtype(at::kFloat));
     CHECK_CONTIGUOUS(softmax_lse);

@@ -152,6 +174,11 @@
     params.d_v = head_size_v;
     params.scale_softmax = softmax_scale;
     params.scale_softmax_log2 = float(softmax_scale * M_LOG2E);
+    if (q_dtype == torch::kFloat8_e4m3fn) {
+        params.descale_q = get_scalar_f32_cpu_only(descale_q);
+        params.descale_k = get_scalar_f32_cpu_only(descale_k);
+    }
+
     // Set the pointers and strides.
     params.q_ptr = q.data_ptr();
     params.k_ptr = kcache.data_ptr();
@@ -188,15 +215,19 @@
     auto stream = at::cuda::getCurrentCUDAStream().stream();
     TORCH_CHECK(head_size_k == 576);
     if (q_dtype == torch::kBFloat16) {
-        run_flash_splitkv_mla_kernel<cutlass::bfloat16_t>(params, stream);
-        run_flash_mla_combine_kernel<cutlass::bfloat16_t>(params, stream);
+        TORCH_CHECK(false, "Unsupported tensor dtype for query");
+        //run_flash_splitkv_mla_kernel<cutlass::bfloat16_t>(params, stream);
+        //run_flash_mla_combine_kernel<cutlass::bfloat16_t>(params, stream);
     } else if (q_dtype == torch::kHalf) {
+        TORCH_CHECK(false, "Unsupported tensor dtype for query");
 #ifdef FLASH_MLA_DISABLE_FP16
         TORCH_CHECK(false, "FlashMLA is compiled with -DFLASH_MLA_DISABLE_FP16. Please remove this flag from your environment and re-compile FlashMLA.");
 #else
-        run_flash_splitkv_mla_kernel<cutlass::half_t>(params, stream);
-        run_flash_mla_combine_kernel<cutlass::half_t>(params, stream);
+        //run_flash_splitkv_mla_kernel<cutlass::half_t>(params, stream);
+        //run_flash_mla_combine_kernel<cutlass::half_t>(params, stream);
 #endif
+    } else if (q_dtype == torch::kFloat8_e4m3fn) {
+        run_flash_splitkv_mla_kernel<cutlass::float_e4m3_t, cutlass::bfloat16_t>(params, stream);
     } else {
         TORCH_CHECK(false, "Unsupported tensor dtype for query");
     }
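With the dispatch above, only the FP8 path is active in this commit: BF16 and FP16 queries now hit TORCH_CHECK(false), while FP8-e4m3 queries run run_flash_splitkv_mla_kernel<cutlass::float_e4m3_t, cutlass::bfloat16_t> with the output allocated in BF16 (the opts.dtype(out_type) change earlier). A standalone sketch of just that output-dtype rule (placeholder shape and main() wrapper added for illustration; not part of this commit):

```cpp
#include <torch/torch.h>

int main() {
    // FP8-e4m3 queries get a BF16 output buffer; other supported dtypes keep the query dtype.
    at::ScalarType q_dtype  = torch::kFloat8_e4m3fn;
    at::ScalarType out_type = (q_dtype == torch::kFloat8_e4m3fn) ? torch::kBFloat16 : q_dtype;
    at::Tensor out = torch::empty({1, 64, 128, 512}, torch::dtype(out_type));  // placeholder shape
    TORCH_CHECK(out.scalar_type() == torch::kBFloat16);
    return 0;
}
```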

csrc/kernels/fp8_transpose_v.h

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
/**
 * ref to Fa3's SmemTranspose64x64:
 * https://github.com/Dao-AILab/flash-attention/blob/0823cf7b5d96499c1c79a4f64b1e256a035ba4b4/hopper/mainloop_fwd_sm90_tma_gmma_ws.hpp#L26
 */

#pragma once
using namespace cute;

template <int kBlockN, int kHeadDim>
struct SmemTransposeFp8_64x64 {
  static_assert((kBlockN % 64 == 0) && (kHeadDim % 64 == 0));

  using Element = cutlass::float_e4m3_t;
  using TransposeShapeAtomV = Shape<_64, _64>;
  using SmemLayoutAtomV = decltype(tile_to_shape(GMMA::Layout_K_SW64_Atom<Element>{}, TransposeShapeAtomV{}));
  using SmemLayoutV =
      decltype(tile_to_shape(SmemLayoutAtomV{},
                             Shape<Int<kBlockN>, Int<kHeadDim>>{}));

  // for fp8 in-kernel transpose -- src layout
  using SmemLayoutDivideV = decltype(tiled_divide(SmemLayoutV{}, TransposeShapeAtomV{}));
  using SmemShapeLDSM = Shape<Shape<_8, _8>, Shape<_16, _4>>;
  using FactoringShapeV = decltype(make_shape(SmemShapeLDSM{}, shape<1>(SmemLayoutDivideV{}), shape<2>(SmemLayoutDivideV{})));
  using SmemLayoutTransposeV = decltype(composition(SmemLayoutDivideV{}, make_layout(FactoringShapeV{})));

  // For fp8, this is the memory transpose.
  using SmemLayoutAtomVt = decltype(tile_to_shape(GMMA::Layout_K_SW64_Atom<Element>{}, TransposeShapeAtomV{}));
  using SmemLayoutVt =
      decltype(tile_to_shape(SmemLayoutAtomVt{},
                             Shape<Int<kHeadDim>, Int<kBlockN>>{}));

  // for fp8 in-kernel transpose -- dst layout
  using SmemLayoutVtTrans = decltype(composition(
      SmemLayoutVt{}, make_ordered_layout(product_each(shape(SmemLayoutV{})), Step<_2, _1>{})));
  using SmemLayoutDivideVt = decltype(tiled_divide(SmemLayoutVtTrans{}, TransposeShapeAtomV{}));
  using SmemShapeSTSM = Shape<Shape<_16, _4>, Shape<_16, _4>>;
  using FactoringShapeVt = decltype(make_shape(SmemShapeSTSM{}, shape<1>(SmemLayoutDivideVt{}), shape<2>(SmemLayoutDivideVt{})));
  using SmemLayoutTransposeVt = decltype(composition(SmemLayoutDivideVt{}, make_layout(FactoringShapeVt{})));

  using ldsm_thread_shape = Shape<_4, _1, _8, _4>;
  using ldsm_value_shape = Shape<_2, _8, _2, _1>;
  using ldsm_value_stride = Stride<_2, _4, _1, _0>;
  using TiledCopyLDSM = decltype(make_tiled_copy(Copy_Atom<SM75_U16x8_LDSM_T, Element>{}, Layout<ldsm_thread_shape>{},
                                                 Layout<ldsm_value_shape, ldsm_value_stride>{}));
  TiledCopyLDSM tiled_copy_ldsm;

  using stsm_thread_shape = Shape<_4, _1, _8, _4>;
  // using stsm_thread_stride = Stride<_1, _0, _4, _32>;
  using stsm_value_shape = Shape<_4, _4, _2, _1>;
  using stsm_value_stride = Stride<_1, _8, _4, _0>;

  using TiledCopySTSM = decltype(make_tiled_copy(Copy_Atom<SM90_U32x4_STSM_N, Element>{}, Layout<stsm_thread_shape>{},
                                                 Layout<stsm_value_shape, stsm_value_stride>{}));
  TiledCopySTSM tiled_copy_stsm;

  template <class SmemTensor, class SmemTensorOut>
  CUTLASS_DEVICE void transpose(SmemTensor &&s_in, SmemTensorOut &&s_out) {
    using namespace cute;

    auto tid = threadIdx.x % cutlass::NumThreadsPerWarpGroup;
    auto thr_copy_ldsm = tiled_copy_ldsm.get_thread_slice(tid);
    auto thr_copy_stsm = tiled_copy_stsm.get_thread_slice(tid);

    auto tXsX = thr_copy_ldsm.partition_S(s_in);
    auto tXrX = make_tensor<Element>(shape(tXsX));
    auto tXsX_out = thr_copy_stsm.partition_D(s_out);

    cute::copy(tiled_copy_ldsm, tXsX, tXrX);

    auto data = tXrX.data();
    CUTLASS_PRAGMA_UNROLL
    for (int n = 0; n < size(tXrX); n += 8) {
      uint32_t *data_32bit = reinterpret_cast<uint32_t *>(&data[n]);
      auto upper = data_32bit[0];
      auto lower = data_32bit[1];
      data_32bit[0] = __byte_perm(upper, lower, 0x6420);
      data_32bit[1] = __byte_perm(upper, lower, 0x7531);
    }

    cute::copy(tiled_copy_stsm, tXrX, tXsX_out);
  }
};
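The inner loop above repacks eight FP8 values at a time with __byte_perm: selector 0x6420 gathers the even-indexed bytes of two 32-bit words and 0x7531 gathers the odd-indexed bytes, which regroups the 8-bit values to match the 16-bit granularity of the ldmatrix/stmatrix copies. A host-side sketch of just that permutation (the byte_perm stand-in below is written from CUDA's documented __byte_perm behavior and is not code from this commit):

```cpp
#include <cstdint>
#include <cstdio>

// Host-side stand-in for CUDA's __byte_perm(x, y, s): bytes 0-3 of the 8-byte pool
// come from x, bytes 4-7 from y; each selector nibble picks one source byte.
static uint32_t byte_perm(uint32_t x, uint32_t y, uint32_t s) {
    uint64_t pool = (static_cast<uint64_t>(y) << 32) | x;
    uint32_t out = 0;
    for (int i = 0; i < 4; ++i) {
        uint32_t sel = (s >> (4 * i)) & 0x7;
        out |= static_cast<uint32_t>((pool >> (8 * sel)) & 0xFFu) << (8 * i);
    }
    return out;
}

int main() {
    // Two 32-bit words, each packing four FP8 bytes: {a0,a1,a2,a3} and {b0,b1,b2,b3}.
    uint32_t upper = 0x03020100u;  // a0..a3 in little-endian byte order
    uint32_t lower = 0x13121110u;  // b0..b3
    printf("%08x\n", byte_perm(upper, lower, 0x6420));  // 12100200 -> {a0, a2, b0, b2}
    printf("%08x\n", byte_perm(upper, lower, 0x7531));  // 13110301 -> {a1, a3, b1, b3}
    return 0;
}
```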

csrc/kernels/params.h

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@ struct Flash_fwd_mla_params {
     int q_head_per_hk;  // The number of q_head(s) per KV head, = h_q / h_k
     bool is_causal;
     float scale_softmax, scale_softmax_log2;
+    float descale_q, descale_k;

     void *__restrict__ q_ptr;
     void *__restrict__ k_ptr;

0 commit comments
