
Commit 2f0a859

add dtype checking and fix tests

Signed-off-by: wwl2755 <[email protected]>
1 parent 90ffa1a · commit 2f0a859

12 files changed: +60 -23 lines changed

tests/kernels/attention/test_mha_attn.py

Lines changed: 25 additions & 6 deletions
@@ -33,19 +33,38 @@ def test_mha_attn_platform(device: str):
     torch.set_default_dtype(torch.float16)
 
     if device == "cpu":
-        with patch("vllm.attention.selector.current_platform", CpuPlatform()):
+        with patch("vllm.model_executor.models.vision.current_platform",
+                   CpuPlatform()):
             attn = MultiHeadAttention(16, 64, scale=1)
             assert attn.attn_backend == _Backend.TORCH_SDPA
     elif device == "hip":
-        with patch("vllm.attention.selector.current_platform", RocmPlatform()):
+        with patch("vllm.model_executor.models.vision.current_platform",
+                   RocmPlatform()):
             attn = MultiHeadAttention(16, 64, scale=1)
             assert attn.attn_backend == _Backend.TORCH_SDPA
     else:
-        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+        # Test CUDA with head_size=64 (divisible by 32)
+        # - should use vLLM FlashAttention
+        with patch("vllm.model_executor.models.vision.current_platform",
+                   CudaPlatform()):
             attn = MultiHeadAttention(16, 64, scale=1)
-            assert attn.attn_backend == _Backend.XFORMERS
-
-        with patch("vllm.attention.selector.current_platform", CudaPlatform()):
+            assert attn.attn_backend == _Backend.FLASH_ATTN
+
+        # Test CUDA with head_size=72 (not divisible by 32)
+        # - upstream FA available
+        with patch("vllm.model_executor.models.vision.current_platform",
+                   CudaPlatform()), \
+                patch("transformers.utils.is_flash_attn_2_available",
+                      return_value=True):
+            attn = MultiHeadAttention(16, 72, scale=1)
+            assert attn.attn_backend == _Backend.FLASH_ATTN
+
+        # Test CUDA with head_size=72 (not divisible by 32)
+        # - upstream FA not available
+        with patch("vllm.model_executor.models.vision.current_platform",
+                   CudaPlatform()), \
+                patch("transformers.utils.is_flash_attn_2_available",
+                      return_value=False):
             attn = MultiHeadAttention(16, 72, scale=1)
             assert attn.attn_backend == _Backend.XFORMERS
 
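Not part of the commit, but a minimal sketch of the behavior the new dtype check implies on CUDA: with a non-half default dtype, backend selection should fall back to xFormers even for a FlashAttention-friendly head size. The patch target mirrors the test above; the import locations of MultiHeadAttention, CudaPlatform, and _Backend are assumptions about the surrounding vLLM tree.

from unittest.mock import patch

import torch

from vllm.attention.layer import MultiHeadAttention
from vllm.platforms.cuda import CudaPlatform
from vllm.platforms.interface import _Backend  # assumed import location


def check_cuda_fp32_falls_back_to_xformers():
    # float32 is outside (float16, bfloat16), so per the dtype check added in
    # vllm/platforms/cuda.py below, CUDA selection should report XFORMERS
    # regardless of head size.
    torch.set_default_dtype(torch.float32)
    with patch("vllm.model_executor.models.vision.current_platform",
               CudaPlatform()):
        attn = MultiHeadAttention(16, 64, scale=1)
        assert attn.attn_backend == _Backend.XFORMERS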

vllm/attention/layer.py

Lines changed: 5 additions & 2 deletions
@@ -350,10 +350,13 @@ def __init__(
             f"divisible by num_kv_heads ({self.num_kv_heads})"
         self.num_queries_per_kv = self.num_heads // self.num_kv_heads
 
-        # dtype = torch.get_default_dtype()
+        # During model initialization, the default dtype is set as the model
+        # weight and activation dtype.
+        dtype = torch.get_default_dtype()
 
         # Determine the attention backend
-        backend, use_upstream_fa = get_vit_attn_backend(head_size=head_size)
+        backend, use_upstream_fa = get_vit_attn_backend(head_size=head_size,
+                                                        dtype=dtype)
 
         if current_platform.is_rocm():
             # currently, only torch_sdpa is supported on rocm
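This hunk relies on vLLM setting the process-wide default dtype to the model's weight and activation dtype before layers are constructed. A minimal usage sketch (not from the diff), assuming a standard vLLM install; the explicit set_default_dtype call stands in for what model initialization normally does:

import torch

from vllm.attention.layer import MultiHeadAttention

# Stand-in for vLLM model initialization, which sets the default dtype to the
# model dtype before building layers.
torch.set_default_dtype(torch.bfloat16)

# The layer now reads torch.get_default_dtype() itself and forwards it to
# get_vit_attn_backend(); callers do not pass a dtype explicitly.
attn = MultiHeadAttention(16, 72, scale=1)
print(attn.attn_backend)  # depends on platform, head size, and dtype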

vllm/model_executor/models/ernie45_vl.py

Lines changed: 4 additions & 2 deletions
@@ -173,7 +173,8 @@ def __init__(
 
         # Detect attention implementation.
         self.attn_backend, self.use_upstream_fa = get_vit_attn_backend(
-            head_size=self.hidden_size_per_attention_head)
+            head_size=self.hidden_size_per_attention_head,
+            dtype=torch.get_default_dtype())
         if self.attn_backend not in {
                 _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
                 _Backend.ROCM_AITER_FA
@@ -463,7 +464,8 @@ def __init__(
         ), "vit's config.hidden must be equal to config.embed_dim"
         self.ln = nn.LayerNorm(hidden_size, eps=1e-6)
 
-        self.attn_backend, _ = get_vit_attn_backend(head_size=head_dim)
+        self.attn_backend, _ = get_vit_attn_backend(
+            head_size=head_dim, dtype=torch.get_default_dtype())
 
     @property
     def dtype(self) -> torch.dtype:

vllm/model_executor/models/glm4_1v.py

Lines changed: 4 additions & 2 deletions
@@ -261,7 +261,8 @@ def __init__(
 
         # Detect attention implementation.
         self.attn_backend, self.use_upstream_fa = get_vit_attn_backend(
-            head_size=self.hidden_size_per_attention_head)
+            head_size=self.hidden_size_per_attention_head,
+            dtype=torch.get_default_dtype())
         if self.attn_backend not in {
                 _Backend.FLASH_ATTN,
                 _Backend.TORCH_SDPA,
@@ -732,7 +733,8 @@ def __init__(
         self.post_layernorm = RMSNorm(vision_config.hidden_size,
                                       eps=vision_config.rms_norm_eps)
 
-        self.attn_backend, _ = get_vit_attn_backend(head_size=head_dim)
+        self.attn_backend, _ = get_vit_attn_backend(
+            head_size=head_dim, dtype=torch.get_default_dtype())
 
     @property
     def dtype(self) -> torch.dtype:

vllm/model_executor/models/keye.py

Lines changed: 1 addition & 1 deletion
@@ -375,7 +375,7 @@ def __init__(
 
         # Detect attention implementation.
         self.attn_backend, self.use_upstream_fa = get_vit_attn_backend(
-            head_size=self.head_dim)
+            head_size=self.head_dim, dtype=torch.get_default_dtype())
         if self.attn_backend not in {_Backend.FLASH_ATTN, _Backend.XFORMERS}:
             raise RuntimeError(
                 f"Keye-VL does not support {self.attn_backend} backend now.")

vllm/model_executor/models/qwen2_5_vl.py

Lines changed: 4 additions & 2 deletions
@@ -300,7 +300,8 @@ def __init__(
 
         # Detect attention implementation.
         self.attn_backend, self.use_upstream_fa = get_vit_attn_backend(
-            head_size=self.hidden_size_per_attention_head)
+            head_size=self.hidden_size_per_attention_head,
+            dtype=torch.get_default_dtype())
         if self.attn_backend not in {
                 _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
                 _Backend.ROCM_AITER_FA
@@ -633,7 +634,8 @@ def __init__(
             prefix=f"{prefix}.merger",
             use_data_parallel=use_data_parallel,
         )
-        self.attn_backend, _ = get_vit_attn_backend(head_size=head_dim)
+        self.attn_backend, _ = get_vit_attn_backend(
+            head_size=head_dim, dtype=torch.get_default_dtype())
 
     @property
     def dtype(self) -> torch.dtype:

vllm/model_executor/models/qwen2_vl.py

Lines changed: 4 additions & 2 deletions
@@ -315,7 +315,8 @@ def __init__(
 
         # Detect attention implementation.
         self.attn_backend, self.use_upstream_fa = get_vit_attn_backend(
-            head_size=self.hidden_size_per_attention_head)
+            head_size=self.hidden_size_per_attention_head,
+            dtype=torch.get_default_dtype())
         if self.attn_backend not in {
                 _Backend.FLASH_ATTN, _Backend.TORCH_SDPA, _Backend.XFORMERS,
                 _Backend.ROCM_AITER_FA
@@ -632,7 +633,8 @@ def __init__(
             quant_config=quant_config,
             prefix=f"{prefix}.merger",
         )
-        self.attn_backend, _ = get_vit_attn_backend(head_size=head_dim)
+        self.attn_backend, _ = get_vit_attn_backend(
+            head_size=head_dim, dtype=torch.get_default_dtype())
 
     @property
     def dtype(self) -> torch.dtype:

vllm/model_executor/models/siglip2navit.py

Lines changed: 1 addition & 1 deletion
@@ -237,7 +237,7 @@ def __init__(
 
         # Detect attention implementation.
         self.attn_backend, self.use_upstream_fa = get_vit_attn_backend(
-            head_size=self.head_dim)
+            head_size=self.head_dim, dtype=torch.get_default_dtype())
         if self.attn_backend not in {
                 _Backend.FLASH_ATTN, _Backend.TORCH_SDPA,
                 _Backend.ROCM_AITER_FA

vllm/model_executor/models/vision.py

Lines changed: 3 additions & 2 deletions
@@ -68,7 +68,8 @@ def get_vision_encoder_info(
     raise NotImplementedError(msg)
 
 
-def get_vit_attn_backend(head_size: int) -> tuple[_Backend, bool]:
+def get_vit_attn_backend(head_size: int,
+                         dtype: torch.dtype) -> tuple[_Backend, bool]:
     """
     Get the available attention backend for Vision Transformer.
     """
@@ -79,7 +80,7 @@ def get_vit_attn_backend(head_size: int) -> tuple[_Backend, bool]:
     if selected_backend is not None:
         return selected_backend, False
 
-    return current_platform.get_vit_attn_backend(head_size)
+    return current_platform.get_vit_attn_backend(head_size, dtype)
 
 
 def resolve_visual_encoder_outputs(
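A call sketch for the updated helper; head_size=64 and torch.float16 are arbitrary example values, and the tuple layout follows the call sites above (a backend plus a use_upstream_fa flag):

import torch

from vllm.model_executor.models.vision import get_vit_attn_backend

# Callers now pass the dtype explicitly; the model files above forward
# torch.get_default_dtype(). The boolean indicates whether upstream flash-attn
# should be used instead of vllm-flash-attn.
backend, use_upstream_fa = get_vit_attn_backend(head_size=64,
                                                dtype=torch.float16)
print(backend, use_upstream_fa)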

vllm/platforms/cuda.py

Lines changed: 5 additions & 1 deletion
@@ -203,7 +203,11 @@ def get_current_memory_usage(cls,
         return torch.cuda.max_memory_allocated(device)
 
     @classmethod
-    def get_vit_attn_backend(cls, head_size: int) -> tuple[_Backend, bool]:
+    def get_vit_attn_backend(cls, head_size: int,
+                             dtype: torch.dtype) -> tuple[_Backend, bool]:
+        if dtype not in (torch.float16, torch.bfloat16):
+            return _Backend.XFORMERS, False
+
         if cls.has_device_capability(80):
             if head_size % 32 == 0:
                 # Use vllm-flash-attn
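For reference, a standalone restatement of the CUDA selection order implied by this hunk and by the test expectations above. Only the first branches are visible in this commit, so how the real method probes upstream flash-attn availability is an assumption here:

import torch


def sketch_cuda_vit_backend(head_size: int, dtype: torch.dtype,
                            has_sm80: bool,
                            upstream_fa_available: bool) -> tuple[str, bool]:
    # New in this commit: FlashAttention paths require fp16/bf16.
    if dtype not in (torch.float16, torch.bfloat16):
        return "XFORMERS", False
    if has_sm80:
        if head_size % 32 == 0:
            # vllm-flash-attn handles head sizes divisible by 32 directly.
            return "FLASH_ATTN", False
        if upstream_fa_available:
            # Otherwise fall back to upstream flash-attn (use_upstream_fa=True).
            return "FLASH_ATTN", True
    return "XFORMERS", False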
