Skip to content

Commit 9b177b5

Browse files
committed
Update and try to fix tests
Signed-off-by: DarkLight1337 <[email protected]>
1 parent ac6e6e2 commit 9b177b5

File tree

4 files changed

+188
-140
lines changed

4 files changed

+188
-140
lines changed

tests/models/decoder_only/vision_language/test_h2ovl.py

Lines changed: 0 additions & 129 deletions
This file was deleted.
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
"""Tests for H2OVL's multimodal preprocessing kwargs."""
2+
from typing import Optional
3+
4+
import pytest
5+
6+
from vllm.multimodal import MULTIMODAL_REGISTRY
7+
from vllm.multimodal.image import rescale_image_size
8+
from vllm.multimodal.utils import cached_get_tokenizer
9+
10+
from ....conftest import _ImageAssets
11+
from ...utils import build_model_context
12+
13+
14+
@pytest.mark.parametrize("model_id", [
15+
"h2oai/h2ovl-mississippi-800m",
16+
"h2oai/h2ovl-mississippi-2b",
17+
])
18+
@pytest.mark.parametrize(
19+
"size_factors",
20+
[
21+
# Single-scale
22+
[1.0],
23+
# Single-scale, batched
24+
[1.0, 1.0, 1.0],
25+
# Multi-scale
26+
[0.25, 0.5, 1.0],
27+
],
28+
)
29+
@pytest.mark.parametrize("max_dynamic_patch", [1, 2, 4, 8])
30+
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
31+
@pytest.mark.parametrize("num_imgs", [1, 2])
32+
def test_processor_override(
33+
model_id: str,
34+
image_assets: _ImageAssets,
35+
size_factors: list[int],
36+
max_dynamic_patch: int,
37+
dynamic_image_size: Optional[bool],
38+
num_imgs: int,
39+
):
40+
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
41+
get_h2ovl_target_ratios)
42+
43+
ctx = build_model_context(
44+
model_name=model_id,
45+
tokenizer_name=model_id,
46+
trust_remote_code=True,
47+
mm_processor_kwargs=None,
48+
limit_mm_per_prompt={"image": num_imgs},
49+
)
50+
tokenizer = cached_get_tokenizer(
51+
ctx.model_config.tokenizer,
52+
trust_remote_code=ctx.model_config.trust_remote_code,
53+
)
54+
processor = MULTIMODAL_REGISTRY.create_processor(
55+
ctx.model_config,
56+
tokenizer=tokenizer,
57+
)
58+
59+
config = processor.info.get_hf_config()
60+
use_msac = config.use_msac
61+
62+
mm_processor_kwargs = {
63+
"max_dynamic_patch": max_dynamic_patch,
64+
}
65+
if dynamic_image_size is not None:
66+
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
67+
68+
# Build the image str / prompt based on the number of images we pass
69+
prompt = "<image>" * num_imgs
70+
71+
for asset in image_assets:
72+
for factor in size_factors:
73+
image = rescale_image_size(asset.pil_image, factor)
74+
mm_data = {"image": [image] * num_imgs}
75+
76+
width, height = image.size
77+
78+
# Calculate the expected number of blocks
79+
if use_msac:
80+
# First pass
81+
blocks1, _, _, aspect_ratio = calculate_h2ovl_targets(
82+
orig_width=width,
83+
orig_height=height,
84+
target_ratios=get_h2ovl_target_ratios(
85+
config.min_dynamic_patch,
86+
max_dynamic_patch,
87+
prior_aspect_ratio=None,
88+
),
89+
image_size=config.vision_config.image_size,
90+
use_thumbnail=False, # Thumbnail is handled separately
91+
)
92+
93+
# Second pass
94+
blocks2, _, _, _ = calculate_h2ovl_targets(
95+
orig_width=width,
96+
orig_height=height,
97+
target_ratios=get_h2ovl_target_ratios(
98+
config.min_dynamic_patch,
99+
max_dynamic_patch,
100+
prior_aspect_ratio=aspect_ratio,
101+
),
102+
image_size=config.vision_config.image_size,
103+
use_thumbnail=False,
104+
)
105+
106+
# Add thumbnail if use_thumbnail is True and total_blocks > 1
107+
if config.use_thumbnail:
108+
blocks1 += 1 if blocks1 > 1 else 0
109+
blocks2 += 1 if blocks2 > 1 else 0
110+
111+
# Total blocks is the sum of blocks from both passes minus
112+
# overlapping
113+
total_blocks = blocks1 + blocks2 - 1
114+
115+
expected_num_patches = total_blocks
116+
else:
117+
blocks, _, _, _ = calculate_h2ovl_targets(
118+
orig_width=width,
119+
orig_height=height,
120+
target_ratios=get_h2ovl_target_ratios(
121+
config.min_dynamic_patch,
122+
max_dynamic_patch,
123+
prior_aspect_ratio=None,
124+
),
125+
image_size=config.vision_config.image_size,
126+
use_thumbnail=False,
127+
)
128+
expected_num_patches = blocks
129+
130+
if config.use_thumbnail and expected_num_patches != 1:
131+
expected_num_patches += 1
132+
133+
processed_inputs = processor.apply(prompt, mm_data,
134+
mm_processor_kwargs)
135+
pixel_shape = (
136+
processed_inputs["mm_kwargs"]["pixel_values_flat"].shape)
137+
138+
assert pixel_shape[0] == expected_num_patches * num_imgs

tests/models/multimodal/processing/test_internvl.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,15 +43,15 @@ def test_processor_override(
4343
if dynamic_image_size is not None:
4444
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
4545

46-
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
47-
if dynamic_image_size is False:
48-
expected_num_patches = 1
49-
5046
# Build the image str / prompt based on the number of images we pass
5147
prompt = "<image>" * num_imgs
5248
image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
5349
mm_data = {"image": [image] * num_imgs}
5450

51+
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
52+
if dynamic_image_size is False:
53+
expected_num_patches = 1
54+
5555
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
5656

5757
# Ensure we have the right number of placeholders per num_crops size

vllm/model_executor/models/h2ovl.py

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -292,19 +292,26 @@ def get_num_image_tokens(
292292
*,
293293
image_width: int,
294294
image_height: int,
295+
use_msac: Optional[bool] = None,
295296
) -> int:
297+
use_msac = (self.use_msac if use_msac is None else use_msac)
298+
296299
target_ratios = self.resolve_target_ratios(
297300
use_thumbnail=False, # Applied in calculate_targets
298301
)
299302

303+
use_thumbnail = self.use_thumbnail
300304
num_patches, _, _, _ = calculate_h2ovl_targets(
301305
orig_width=image_width,
302306
orig_height=image_height,
303307
image_size=self.image_size,
304308
target_ratios=target_ratios,
305-
use_thumbnail=self.use_thumbnail,
309+
use_thumbnail=use_thumbnail,
306310
)
307311

312+
if use_msac:
313+
num_patches = (num_patches - use_thumbnail) * 2 + use_thumbnail
314+
308315
return num_patches * self.num_image_token
309316

310317
def _images_to_pixel_values_lst(
@@ -349,6 +356,43 @@ def get_hf_processor(
349356
dynamic_image_size=dynamic_image_size,
350357
)
351358

359+
def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
360+
max_tokens_no_msac = self.get_max_image_tokens(use_msac=False)
361+
if max_tokens_no_msac // seq_len < 2:
362+
# Dummy data will have one image; in that case msac may be applied
363+
max_tokens_per_image = self.get_max_image_tokens(use_msac=None)
364+
else:
365+
max_tokens_per_image = max_tokens_no_msac
366+
367+
return {"image": max_tokens_per_image}
368+
369+
def get_num_image_tokens(
370+
self,
371+
*,
372+
image_width: int,
373+
image_height: int,
374+
processor: Optional[H2OVLProcessor],
375+
use_msac: Optional[bool] = None,
376+
) -> int:
377+
if processor is None:
378+
processor = self.get_hf_processor()
379+
380+
return processor.get_num_image_tokens(
381+
image_width=image_width,
382+
image_height=image_height,
383+
use_msac=use_msac,
384+
)
385+
386+
def get_max_image_tokens(self, use_msac: Optional[bool] = None) -> int:
387+
target_width, target_height = self.get_image_size_with_most_features()
388+
389+
return self.get_num_image_tokens(
390+
image_width=target_width,
391+
image_height=target_height,
392+
processor=None,
393+
use_msac=use_msac,
394+
)
395+
352396

353397
class H2OVLMultiModalProcessor(InternVLMultiModalProcessor[H2OVLProcessingInfo]
354398
):
@@ -386,14 +430,9 @@ def get_replacement_internvl(item_idx: int):
386430
image_width=image_size.width,
387431
image_height=image_size.height,
388432
processor=hf_processor,
433+
use_msac=None if num_images == 1 else False,
389434
)
390435

391-
if num_images > 1 and hf_processor.use_msac:
392-
# Assume feature size scales linearly with number of patches
393-
use_thumbnail = hf_processor.use_thumbnail
394-
feature_size = ((feature_size - use_thumbnail) * 2 +
395-
use_thumbnail)
396-
397436
num_patches = image_num_patches[item_idx]
398437
if num_patches is not None:
399438
assert isinstance(num_patches, int)

0 commit comments

Comments
 (0)