 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
+from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
+                                   VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -109,7 +109,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:

     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
     # with additional logic afterwards taken from LlavaOnevisionProcessor
-    def get_num_unpadded_features(
+    def _get_num_unpadded_features(
         self,
         *,
         original_height: int,
@@ -145,23 +145,7 @@ def get_num_unpadded_features(

         return (unpadded_features, newline_features)

-    def get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self.get_hf_config()
-        largest_feature_size, largest_feature_pinpoint = 0, None
-        for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self.get_num_image_tokens(image_width=width,
-                                                  image_height=height)
-            if feat_size > largest_feature_size:
-                largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
-
-        if largest_feature_size == 0 or largest_feature_pinpoint is None:
-            raise ValueError("Cannot have a largest feature size of 0!")
-
-        return largest_feature_pinpoint
-
-    def get_num_frame_tokens(
+    def _get_num_frame_tokens(
         self,
         *,
         image_width: int,
@@ -183,14 +167,14 @@ def get_num_video_tokens(
         image_height: int,
         num_frames: int,
     ) -> int:
-        num_frame_tokens = self.get_num_frame_tokens(
+        num_frame_tokens = self._get_num_frame_tokens(
             image_width=image_width,
             image_height=image_height,
         )

         return num_frame_tokens * num_frames + 1  # Newline token

-    def get_max_video_frames(self, max_tokens: int) -> int:
+    def _get_max_video_frames(self, max_tokens: int) -> int:
         target_width, target_height = self.get_image_size_with_most_features()

         num_frames = 0
@@ -210,14 +194,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:

         return num_frames

-    def get_max_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)

         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self.get_max_video_frames(seq_len -
-                                                     max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len -
+                                                      max_image_tokens)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                    _MAX_FRAMES_PER_VIDEO)
@@ -229,7 +213,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_max_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
         )

@@ -247,8 +231,11 @@ def get_dummy_processor_inputs(
         processor = self.info.get_hf_processor()
         image_token = processor.image_token
         video_token = processor.video_token
+
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)

         mm_data = {
             "image":
@@ -259,7 +246,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self.info.get_max_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
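
For reference, a minimal sketch (not part of the commit) of the frame-budgeting arithmetic that the renamed get_num_frames_with_most_features / _get_max_video_frames pair performs. All concrete numbers below are assumptions, and the search loop in _get_max_video_frames is approximated by a closed-form division; only the structure mirrors the diff above.

# Hypothetical values; none of these numbers come from the commit.
_MAX_FRAMES_PER_VIDEO = 32            # assumed per-video frame cap
seq_len = 8192                        # assumed sequence length budget
max_images, max_videos = 1, 1         # assumed limit_per_prompt values
max_image_tokens = 2928 * max_images  # assumed get_max_image_tokens() result
tokens_per_frame = 197                # assumed _get_num_frame_tokens() result

# Budget left for video after reserving worst-case image tokens.
video_token_budget = seq_len - max_image_tokens

# Largest frame count whose video tokens (frames * tokens_per_frame
# plus one newline token) still fit the remaining budget.
max_total_frames = (video_token_budget - 1) // tokens_per_frame

# Split the total frame budget across videos, capped per video.
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                           _MAX_FRAMES_PER_VIDEO)
print(max_frames_per_video)  # 26 with these assumed numbers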