 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.inputs import (MultiModalFieldConfig, MultiModalKwargs,
                                     NestedTensors)
-from vllm.multimodal.parse import (ImageSize, MultiModalDataItems,
-                                   VideoEmbeddingItems, VideoProcessorItems)
+from vllm.multimodal.parse import (MultiModalDataItems, VideoEmbeddingItems,
+                                   VideoProcessorItems)
 from vllm.multimodal.processing import PromptReplacement
 from vllm.multimodal.profiling import ProcessorInputs
 from vllm.sequence import IntermediateTensors
@@ -109,7 +109,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:

     # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86
     # with additional logic afterwards taken from LlavaOnevisionProcessor
-    def get_num_unpadded_features(
+    def _get_num_unpadded_features(
         self,
         *,
         original_height: int,
@@ -145,23 +145,7 @@ def get_num_unpadded_features(

         return (unpadded_features, newline_features)

-    def get_image_size_with_most_features(self) -> ImageSize:
-        hf_config = self.get_hf_config()
-        largest_feature_size, largest_feature_pinpoint = 0, None
-        for (height, width) in hf_config.image_grid_pinpoints:
-            feat_size = self.get_num_image_tokens(image_width=width,
-                                                  image_height=height)
-            if feat_size > largest_feature_size:
-                largest_feature_size = feat_size
-                largest_feature_pinpoint = ImageSize(width=width,
-                                                     height=height)
-
-        if largest_feature_size == 0 or largest_feature_pinpoint is None:
-            raise ValueError("Cannot have a largest feature size of 0!")
-
-        return largest_feature_pinpoint
-
-    def get_num_frame_tokens(
+    def _get_num_frame_tokens(
         self,
         *,
         image_width: int,
@@ -183,14 +167,14 @@ def get_num_video_tokens(
         image_height: int,
         num_frames: int,
     ) -> int:
-        num_frame_tokens = self.get_num_frame_tokens(
+        num_frame_tokens = self._get_num_frame_tokens(
             image_width=image_width,
             image_height=image_height,
         )

         return num_frame_tokens * num_frames + 1  # Newline token

-    def get_max_video_frames(self, max_tokens: int) -> int:
+    def _get_max_video_frames(self, max_tokens: int) -> int:
         target_width, target_height = self.get_image_size_with_most_features()

         num_frames = 0
@@ -210,14 +194,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:

         return num_frames

-    def get_max_num_frames(self, seq_len: int) -> int:
+    def get_num_frames_with_most_features(self, seq_len: int) -> int:
         mm_config = self.ctx.get_mm_config()
         max_images = mm_config.limit_per_prompt.get("image", 1)
         max_videos = mm_config.limit_per_prompt.get("video", 1)

         max_image_tokens = self.get_max_image_tokens() * max_images
-        max_total_frames = self.get_max_video_frames(seq_len -
-                                                     max_image_tokens)
+        max_total_frames = self._get_max_video_frames(seq_len -
+                                                      max_image_tokens)
         max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                                    _MAX_FRAMES_PER_VIDEO)
@@ -229,7 +213,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
         return self.get_num_video_tokens(
             image_width=target_width,
             image_height=target_height,
-            num_frames=self.get_max_num_frames(seq_len),
+            num_frames=self.get_num_frames_with_most_features(seq_len),
         )

@@ -247,8 +231,11 @@ def get_dummy_processor_inputs(
         processor = self.info.get_hf_processor()
         image_token = processor.image_token
         video_token = processor.video_token
+
         target_width, target_height = \
             self.info.get_image_size_with_most_features()
+        target_num_frames = \
+            self.info.get_num_frames_with_most_features(seq_len)

         mm_data = {
             "image":
@@ -259,7 +246,7 @@ def get_dummy_processor_inputs(
             self._get_dummy_videos(
                 width=target_width,
                 height=target_height,
-                num_frames=self.info.get_max_num_frames(seq_len),
+                num_frames=target_num_frames,
                 num_videos=num_videos,
             )
         }
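
For reference, a minimal sketch (not part of the commit) of the frame-budgeting arithmetic that the renamed get_num_frames_with_most_features / _get_max_video_frames pair performs. All concrete numbers below are assumptions, and the search loop in _get_max_video_frames is approximated by a closed-form division; only the structure mirrors the diff above.

# Hypothetical values; none of these numbers come from the commit.
_MAX_FRAMES_PER_VIDEO = 32            # assumed per-video frame cap
seq_len = 8192                        # assumed sequence length budget
max_images, max_videos = 1, 1         # assumed limit_per_prompt values
max_image_tokens = 2928 * max_images  # assumed get_max_image_tokens() result
tokens_per_frame = 197                # assumed _get_num_frame_tokens() result

# Budget left for video after reserving worst-case image tokens.
video_token_budget = seq_len - max_image_tokens

# Largest frame count whose video tokens (frames * tokens_per_frame
# plus one newline token) still fit the remaining budget.
max_total_frames = (video_token_budget - 1) // tokens_per_frame

# Split the total frame budget across videos, capped per video.
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
                           _MAX_FRAMES_PER_VIDEO)
print(max_frames_per_video)  # 26 with these assumed numbers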