We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent cd9d06f · commit fa0050d (Copy full SHA for fa0050d)
vllm/model_executor/layers/quantization/fp8.py
@@ -355,7 +355,8 @@ def apply(self,
355
input_scale=layer.input_scale,
356
bias=bias,
357
cutlass_fp8_supported=self.cutlass_fp8_supported,
358
- use_per_token_if_dynamic=False)
+ # Default to using per_token quantization if cutlass is supported
359
+ use_per_token_if_dynamic=self.cutlass_fp8_supported)
360
361
362
class Fp8MoEMethod(FusedMoEMethodBase):
0 commit comments