We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent cd9d06f · commit fa0050d (Copy full SHA for fa0050d)
vllm/model_executor/layers/quantization/fp8.py
@@ -355,7 +355,8 @@ def apply(self,
355
input_scale=layer.input_scale,
356
bias=bias,
357
cutlass_fp8_supported=self.cutlass_fp8_supported,
358
- use_per_token_if_dynamic=False)
+ # Default to using per_token quantization if cutlass is supported
359
+ use_per_token_if_dynamic=self.cutlass_fp8_supported)
360
361
362
class Fp8MoEMethod(FusedMoEMethodBase):
0 commit comments