vllm-project · ywang96 · Jan 16, 2025 · Jan 16, 2025 · Jan 16, 2025 · Jan 16, 2025
diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
@@ -37,6 +37,7 @@
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
+_MAX_IMAGE_SIZE_PLACEHOLDER = 12288  # token-count floor in get_max_image_tokens
 
 
 class LlavaOnevisionVideoPixelInputs(TypedDict):
@@ -101,6 +102,17 @@ def get_hf_processor(self):
     def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
         return {"image": None, "video": None}
 
+    def get_max_image_tokens(self) -> int:
+        """Return the max image token count, floored at the placeholder."""
+        target_width, target_height = self.get_image_size_with_most_features()
+
+        # FIXME: The computed count is not accurate; the placeholder is a floor.
+        return max(
+            self.get_num_image_tokens(
+                image_width=target_width,
+                image_height=target_height,
+            ), _MAX_IMAGE_SIZE_PLACEHOLDER)
+
     def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
         return {
             "image": self.get_max_image_tokens(),