Skip to content

Commit e7a7112

Browse files
DarkLight1337 and Isotr0py
authored and committed
[Doc] Expand Multimodal API Reference (vllm-project#11852)
Signed-off-by: DarkLight1337 <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
1 parent 8251010 commit e7a7112

File tree

9 files changed

+139
-71
lines changed

9 files changed

+139
-71
lines changed

docs/source/api/multimodal/index.md

Lines changed: 8 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -2,72 +2,27 @@
22

33
# Multi-Modality
44

5-
```{eval-rst}
6-
.. currentmodule:: vllm.multimodal
7-
```
8-
95
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
106

117
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
128
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
139

1410
Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
1511

16-
1712
## Module Contents
1813

19-
```{eval-rst}
20-
.. automodule:: vllm.multimodal
21-
```
22-
23-
### Registry
24-
2514
```{eval-rst}
2615
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
2716
```
2817

29-
```{eval-rst}
30-
.. autoclass:: vllm.multimodal.MultiModalRegistry
31-
:members:
32-
:show-inheritance:
33-
```
34-
35-
### Base Classes
36-
37-
```{eval-rst}
38-
.. automodule:: vllm.multimodal.base
39-
:members:
40-
:show-inheritance:
41-
```
18+
## Submodules
4219

43-
### Input Classes
20+
```{toctree}
21+
:maxdepth: 1
4422
45-
```{eval-rst}
46-
.. automodule:: vllm.multimodal.inputs
47-
:members:
48-
:show-inheritance:
49-
```
50-
51-
### Audio Classes
52-
53-
```{eval-rst}
54-
.. automodule:: vllm.multimodal.audio
55-
:members:
56-
:show-inheritance:
57-
```
58-
59-
### Image Classes
60-
61-
```{eval-rst}
62-
.. automodule:: vllm.multimodal.image
63-
:members:
64-
:show-inheritance:
65-
```
66-
67-
### Video Classes
68-
69-
```{eval-rst}
70-
.. automodule:: vllm.multimodal.video
71-
:members:
72-
:show-inheritance:
23+
inputs
24+
parse
25+
processing
26+
profiling
27+
registry
7328
```
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
# Input Definitions
2+
3+
## User-facing inputs
4+
5+
```{eval-rst}
6+
.. autodata:: vllm.multimodal.MultiModalDataDict
7+
```
8+
9+
## Internal data structures
10+
11+
```{eval-rst}
12+
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
13+
:members:
14+
:show-inheritance:
15+
```
16+
17+
```{eval-rst}
18+
.. autodata:: vllm.multimodal.inputs.NestedTensors
19+
```
20+
21+
```{eval-rst}
22+
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
23+
:members:
24+
:show-inheritance:
25+
```
26+
27+
```{eval-rst}
28+
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
29+
:members:
30+
:show-inheritance:
31+
```
32+
33+
```{eval-rst}
34+
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
35+
:members:
36+
:show-inheritance:
37+
```
38+
39+
```{eval-rst}
40+
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
41+
:members:
42+
:show-inheritance:
43+
```
44+
45+
```{eval-rst}
46+
.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
47+
:members:
48+
:show-inheritance:
49+
```
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Data Parsing
2+
3+
## Module Contents
4+
5+
```{eval-rst}
6+
.. automodule:: vllm.multimodal.parse
7+
:members:
8+
:member-order: bysource
9+
```
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Data Processing
2+
3+
## Module Contents
4+
5+
```{eval-rst}
6+
.. automodule:: vllm.multimodal.processing
7+
:members:
8+
:member-order: bysource
9+
```
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Memory Profiling
2+
3+
## Module Contents
4+
5+
```{eval-rst}
6+
.. automodule:: vllm.multimodal.profiling
7+
:members:
8+
:member-order: bysource
9+
```
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Registry
2+
3+
## Module Contents
4+
5+
```{eval-rst}
6+
.. automodule:: vllm.multimodal.registry
7+
:members:
8+
:member-order: bysource
9+
```

vllm/multimodal/parse.py

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,16 @@
1313

1414
from .audio import resample_audio
1515
from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
16-
ImageItem, ModalityData, MultiModalDataDict,
17-
NestedTensors, VideoItem)
16+
ImageItem, ModalityData, MultiModalDataDict, VideoItem)
1817

1918
_T = TypeVar("_T")
2019
_I = TypeVar("_I")
2120

2221

2322
class ModalityDataItems(ABC, Generic[_T, _I]):
23+
"""
24+
Represents data items for a modality in :class:`MultiModalDataItems`.
25+
"""
2426

2527
def __init__(self, data: _T, modality: str) -> None:
2628
super().__init__()
@@ -69,6 +71,7 @@ def get_passthrough_data(self) -> Mapping[str, object]:
6971

7072

7173
class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
74+
"""Base class for data items that are arranged in a list."""
7275

7376
def get_count(self) -> int:
7477
return len(self.data)
@@ -83,7 +86,12 @@ def get_passthrough_data(self) -> Mapping[str, object]:
8386
return {}
8487

8588

86-
class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
89+
class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
90+
torch.Tensor]):
91+
"""
92+
Base class for data items that are expressed as a batched embedding tensor,
93+
or a list of embedding tensors (one per item).
94+
"""
8795

8896
def get_count(self) -> int:
8997
return len(self.data)
@@ -109,7 +117,7 @@ def __init__(self, data: Sequence[HfAudioItem]) -> None:
109117

110118
class AudioEmbeddingItems(EmbeddingItems):
111119

112-
def __init__(self, data: NestedTensors) -> None:
120+
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
113121
super().__init__(data, "audio")
114122

115123

@@ -137,7 +145,7 @@ def get_image_size(self, item_idx: int) -> ImageSize:
137145

138146
class ImageEmbeddingItems(EmbeddingItems):
139147

140-
def __init__(self, data: NestedTensors) -> None:
148+
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
141149
super().__init__(data, "image")
142150

143151

@@ -163,7 +171,7 @@ def get_frame_size(self, item_idx: int) -> ImageSize:
163171

164172
class VideoEmbeddingItems(EmbeddingItems):
165173

166-
def __init__(self, data: NestedTensors) -> None:
174+
def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
167175
super().__init__(data, "video")
168176

169177

@@ -172,8 +180,8 @@ def __init__(self, data: NestedTensors) -> None:
172180

173181
class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
174182
"""
175-
As :class:`MultiModalDataDict`, but normalized such that each entry
176-
corresponds to a list.
183+
As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
184+
such that each entry corresponds to a list.
177185
"""
178186

179187
def get_count(self, modality: str, *, strict: bool = True) -> int:
@@ -226,7 +234,8 @@ def get_items(
226234

227235
class MultiModalDataParser:
228236
"""
229-
Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
237+
Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
238+
:class:`MultiModalDataItems`.
230239
231240
Args:
232241
target_sr (float, optional): Enables automatic resampling of audio
@@ -238,7 +247,9 @@ def __init__(self, *, target_sr: Optional[float] = None) -> None:
238247

239248
self.target_sr = target_sr
240249

241-
def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
250+
def _is_embeddings(
251+
self, data: object
252+
) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
242253
if isinstance(data, torch.Tensor):
243254
return data.ndim == 3
244255
if is_list_of(data, torch.Tensor):

vllm/multimodal/processing.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,20 +33,24 @@
3333

3434
@dataclass
3535
class PromptReplacement:
36+
"""
37+
Defines how to replace portions of an input prompt with placeholder tokens.
38+
"""
39+
3640
modality: str
3741
"""The modality for which the replacement is made."""
3842

3943
target: _PromptSeq
40-
"""The text or token sequence to find and replace."""
44+
"""The token sequence (or text) to find and replace."""
4145

4246
replacement: Union[Callable[[int], _PromptSeq],
4347
_PromptSeq] = field(repr=False)
4448
"""
45-
Given the index of the processed item within :attr:`modality`, output the
46-
replacement text or token sequence.
49+
Given the index of the processed item within :attr:`modality`,
50+
output the replacement token sequence (or text).
4751
48-
For convenience, you can pass in the replacement instead of a function
49-
if it does not depend on the input.
52+
For convenience, you can directly pass in the replacement token sequence
53+
(or text) instead of a function if it does not depend on the input.
5054
"""
5155

5256
def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
@@ -132,6 +136,11 @@ def token_ids(self) -> list[int]:
132136

133137
@dataclass
134138
class BoundPromptReplacement:
139+
"""
140+
A :class:`PromptReplacement` bound to a tokenizer to automatically
141+
convert :attr:`target` and the result of :meth:`get_replacement` between
142+
token sequence and text representations.
143+
"""
135144
tokenizer: AnyTokenizer = field(repr=False)
136145
modality: str
137146

@@ -144,6 +153,7 @@ def __post_init__(self) -> None:
144153

145154
@property
146155
def target(self) -> _BoundPromptSequence:
156+
"""The token sequence (or text) to find and replace."""
147157
target = self._target
148158

149159
return _BoundPromptSequence(
@@ -153,6 +163,10 @@ def target(self) -> _BoundPromptSequence:
153163
)
154164

155165
def get_replacement(self, item_idx: int) -> _BoundPromptSequence:
166+
"""
167+
Given the index of the processed item within :attr:`modality`,
168+
output the replacement token sequence (or text).
169+
"""
156170
replacement = self._replacement
157171
if callable(replacement):
158172
cache_key = item_idx
@@ -528,7 +542,7 @@ def put(
528542

529543

530544
class BaseProcessingInfo:
531-
"""Base class containing information to perform processing."""
545+
"""Base class to provide the information necessary for data processing."""
532546

533547
def __init__(self, ctx: InputProcessingContext) -> None:
534548
super().__init__()

vllm/multimodal/profiling.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919

2020
@dataclass
2121
class ProcessorInputs:
22-
"""Keyword arguments to :meth:`BaseMultiModalProcessor`."""
22+
"""
23+
Represents the keyword arguments to
24+
:meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`.
25+
"""
2326
prompt_text: str
2427
mm_data: MultiModalDataDict
2528
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
@@ -47,7 +50,7 @@ def get_dummy_processor_inputs(
4750
) -> ProcessorInputs:
4851
"""
4952
Build the input which, after processing, results in
50-
`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
53+
:code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens.
5154
"""
5255
raise NotImplementedError
5356

0 commit comments

Comments (0)