This guide covers optimization strategies and performance tuning for vLLM V1.

!!! tip
    Running out of memory? Consult [this guide](./conserving_memory.md) on how to conserve memory.

## Preemption

Due to the auto-regressive nature of the transformer architecture, there are times when KV cache space is insufficient to handle all batched requests.
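
When this happens, vLLM may preempt requests to free up KV cache space. As a sketch (the values below are illustrative, not recommendations), preemption can often be reduced by giving the KV cache more headroom, for example by raising `--gpu-memory-utilization` or capping `--max-num-seqs`:

```console
# Illustrative values only: more GPU memory for the KV cache and a smaller
# maximum batch size both reduce how often requests are preempted.
vllm serve meta-llama/Llama-3.1-8B-Instruct \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 64
```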

Data parallelism replicates the entire model across multiple GPU sets and processes different batches of requests in parallel.
Data parallelism can be combined with the other parallelism strategies and is set by `data_parallel_size=N`.
Note that MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.
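
For example, the following sketch (the model name is a placeholder) combines the two strategies; for an MoE model, the expert layers are then sharded across 2 × 2 = 4 GPUs:

```console
# Sketch: 2-way tensor parallelism combined with 2-way data parallelism.
# For an MoE model, expert layers are sharded across 2 x 2 = 4 GPUs.
vllm serve <your-moe-model> --tensor-parallel-size 2 -dp 2
```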

## Input Processing

### Parallel Processing

You can run input processing in parallel via [API server scale-out](../serving/data_parallel_deployment.md#internal-load-balancing).
This is useful when input processing (which runs inside the API server)
becomes a bottleneck compared to model execution (which runs inside the engine core)
and you have excess CPU capacity.

```console
# Run 4 API processes and 1 engine core process
vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4

# Run 4 API processes and 2 engine core processes
vllm serve Qwen/Qwen2.5-VL-3B-Instruct --api-server-count 4 -dp 2
```

!!! note
    API server scale-out is only available for online inference.

!!! note
    [Multi-modal processor cache](#processor-cache) is disabled when API server scale-out is enabled
    because it requires a one-to-one correspondence between API and engine core processes.

## Multi-Modal Caching

### Processor Cache

By default, the multi-modal processor cache is enabled to avoid repeatedly processing
the same multi-modal inputs via Hugging Face `AutoProcessor`,
which commonly occurs in multi-turn conversations.

You can adjust the size of the cache via the `VLLM_MM_INPUT_CACHE_GIB` environment variable
(default 4 GiB per API process + 4 GiB per engine core process).
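
For example, to allow a larger cache per process when serving (the value here is only illustrative):

```console
# Allow up to 8 GiB of processed multi-modal inputs to be cached per process
VLLM_MM_INPUT_CACHE_GIB=8 vllm serve Qwen/Qwen2.5-VL-3B-Instruct
```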

If you do not benefit much from the cache, you can disable it completely via `disable_mm_preprocessor_cache`:

```python
from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
          disable_mm_preprocessor_cache=True)
```
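
For online serving, the same setting can be passed on the command line (assuming the usual flag spelling derived from the engine argument name):

```console
vllm serve Qwen/Qwen2.5-VL-3B-Instruct --disable-mm-preprocessor-cache
```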