Update and try to fix tests

DarkLight1337 · DarkLight1337 · commit 9b177b5dd6b5 · 2025-01-30T17:01:51.000Z
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
diff --git a/tests/models/decoder_only/vision_language/test_h2ovl.py b/tests/models/decoder_only/vision_language/test_h2ovl.py
diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py
@@ -0,0 +1,138 @@
+"""Tests for H2OVL's multimodal preprocessing kwargs."""
+from typing import Optional
+
+import pytest
+
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import rescale_image_size
+from vllm.multimodal.utils import cached_get_tokenizer
+
+from ....conftest import _ImageAssets
+from ...utils import build_model_context
+
+
+@pytest.mark.parametrize("model_id", [
+    "h2oai/h2ovl-mississippi-800m",
+    "h2oai/h2ovl-mississippi-2b",
+])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ],
+)
+@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
+@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
+@pytest.mark.parametrize("num_imgs", [1, 2])
+def test_processor_override(
+    model_id: str,
+    image_assets: _ImageAssets,
+    size_factors: list[int],
+    max_dynamic_patch: int,
+    dynamic_image_size: Optional[bool],
+    num_imgs: int,
+):
+    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
+                                                  get_h2ovl_target_ratios)
+
+    ctx = build_model_context(
+        model_name=model_id,
+        tokenizer_name=model_id,
+        trust_remote_code=True,
+        mm_processor_kwargs=None,
+        limit_mm_per_prompt={"image": num_imgs},
+    )
+    tokenizer = cached_get_tokenizer(
+        ctx.model_config.tokenizer,
+        trust_remote_code=ctx.model_config.trust_remote_code,
+    )
+    processor = MULTIMODAL_REGISTRY.create_processor(
+        ctx.model_config,
+        tokenizer=tokenizer,
+    )
+
+    config = processor.info.get_hf_config()
+    use_msac = config.use_msac
+
+    mm_processor_kwargs = {
+        "max_dynamic_patch": max_dynamic_patch,
+    }
+    if dynamic_image_size is not None:
+        mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
+
+    # The prompt holds one "<image>" placeholder per image passed in
+    prompt = "<image>" * num_imgs
+
+    for asset in image_assets:
+        for factor in size_factors:
+            image = rescale_image_size(asset.pil_image, factor)
+            mm_data = {"image": [image] * num_imgs}
+
+            width, height = image.size
+
+            # Work out how many patches a single image should produce
+            if use_msac:  # NOTE(review): not gated on num_imgs == 1 - confirm
+                # First pass: no prior; keep the aspect ratio it returns
+                blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        config.min_dynamic_patch,
+                        max_dynamic_patch,
+                        prior_aspect_ratio=None,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,  # Thumbnail is handled separately
+                )
+
+                # Second pass: first-pass aspect ratio is passed as prior
+                blocks2, _, _, _ = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        config.min_dynamic_patch,
+                        max_dynamic_patch,
+                        prior_aspect_ratio=aspect_ratio,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,
+                )
+
+                # Each pass gains a thumbnail block only if it has > 1 block
+                if config.use_thumbnail:
+                    blocks1 += 1 if blocks1 > 1 else 0
+                    blocks2 += 1 if blocks2 > 1 else 0
+
+                # Total is the sum of both passes minus one block treated
+                # as overlapping between them
+                total_blocks = blocks1 + blocks2 - 1
+
+                expected_num_patches = total_blocks
+            else:
+                blocks, _, _, _ = calculate_h2ovl_targets(
+                    orig_width=width,
+                    orig_height=height,
+                    target_ratios=get_h2ovl_target_ratios(
+                        config.min_dynamic_patch,
+                        max_dynamic_patch,
+                        prior_aspect_ratio=None,
+                    ),
+                    image_size=config.vision_config.image_size,
+                    use_thumbnail=False,
+                )
+                expected_num_patches = blocks
+
+                if config.use_thumbnail and expected_num_patches != 1:
+                    expected_num_patches += 1
+
+            processed_inputs = processor.apply(prompt, mm_data,
+                                               mm_processor_kwargs)
+            pixel_shape = (
+                processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
+
+            assert pixel_shape[0] == expected_num_patches * num_imgs
diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py
@@ -43,15 +43,15 @@ def test_processor_override(
     if dynamic_image_size is not None:
         mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
 
-    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
-    if dynamic_image_size is False:
-        expected_num_patches = 1
-
     # Build the image str / prompt based on the number of images we pass
     prompt = "<image>" * num_imgs
     image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
     mm_data = {"image": [image] * num_imgs}
 
+    expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
+    if dynamic_image_size is False:
+        expected_num_patches = 1
+
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
 
     # Ensure we have the right number of placeholders per num_crops size
diff --git a/vllm/model_executor/models/h2ovl.py b/vllm/model_executor/models/h2ovl.py
@@ -292,19 +292,26 @@ def get_num_image_tokens(
         *,
         image_width: int,
         image_height: int,
+        use_msac: Optional[bool] = None,
     ) -> int:
+        use_msac = (self.use_msac if use_msac is None else use_msac)
+
         target_ratios = self.resolve_target_ratios(
             use_thumbnail=False,  # Applied in calculate_targets
         )
 
+        use_thumbnail = self.use_thumbnail
         num_patches, _, _, _ = calculate_h2ovl_targets(
             orig_width=image_width,
             orig_height=image_height,
             image_size=self.image_size,
             target_ratios=target_ratios,
-            use_thumbnail=self.use_thumbnail,
+            use_thumbnail=use_thumbnail,
         )
 
+        if use_msac:
+            num_patches = (num_patches - use_thumbnail) * 2 + use_thumbnail
+
         return num_patches * self.num_image_token
 
     def _images_to_pixel_values_lst(
@@ -349,6 +356,43 @@ def get_hf_processor(
             dynamic_image_size=dynamic_image_size,
         )
 
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+        max_tokens_no_msac = self.get_max_image_tokens(use_msac=False)
+        if max_tokens_no_msac // seq_len < 2:  # TODO: confirm operand order
+            # Dummy data will have one image; in that case msac may be applied
+            max_tokens_per_image = self.get_max_image_tokens(use_msac=None)
+        else:
+            max_tokens_per_image = max_tokens_no_msac
+
+        return {"image": max_tokens_per_image}
+
+    def get_num_image_tokens(
+        self,
+        *,
+        image_width: int,
+        image_height: int,
+        processor: Optional[H2OVLProcessor],
+        use_msac: Optional[bool] = None,
+    ) -> int:
+        if processor is None:  # caller passed none; build one here
+            processor = self.get_hf_processor()
+
+        return processor.get_num_image_tokens(
+            image_width=image_width,
+            image_height=image_height,
+            use_msac=use_msac,  # forwarded as-is, including None
+        )
+
+    def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int:
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        return self.get_num_image_tokens(
+            image_width=target_width,
+            image_height=target_height,
+            processor=None,  # built inside get_num_image_tokens
+            use_msac=use_msac,
+        )
+
 
 class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
                                ):
@@ -386,14 +430,9 @@ def get_replacement_internvl(item_idx: int):
                     image_width=image_size.width,
                     image_height=image_size.height,
                     processor=hf_processor,
+                    use_msac=None if num_images == 1 else False,
                 )
 
-            if num_images > 1 and hf_processor.use_msac:
-                # Assume feature size scales linearly with number of patches
-                use_thumbnail = hf_processor.use_thumbnail
-                feature_size = ((feature_size - use_thumbnail) * 2 +
-                                use_thumbnail)
-
             num_patches = image_num_patches[item_idx]
             if num_patches is not None:
                 assert isinstance(num_patches, int)