12 changes: 11 additions & 1 deletion vllm/v1/core/kv_cache_utils.py
@@ -843,6 +843,7 @@ def _get_kv_cache_config_uniform_type(vllm_config: VllmConfig,
kv_cache_tensors=kv_cache_tensors,
kv_cache_groups=create_kv_cache_group_specs(kv_cache_spec,
grouped_layer_names),
kv_bytes_per_block=len(kv_cache_tensors) * page_size,
)

num_tokens = num_blocks * vllm_config.cache_config.block_size
@@ -1003,6 +1004,7 @@ def _get_kv_cache_config_uniform_page_size(
num_blocks=num_blocks,
kv_cache_tensors=kv_cache_tensors,
kv_cache_groups=kv_cache_groups,
kv_bytes_per_block=len(kv_cache_tensors) * page_size,
)

min_block_size = min(
@@ -1021,7 +1023,10 @@


def _get_kv_cache_config_attention_free() -> KVCacheConfig:
return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
return KVCacheConfig(num_blocks=1,
kv_cache_tensors=[],
kv_cache_groups=[],
kv_bytes_per_block=0)


def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
@@ -1149,7 +1154,12 @@ def unify_kv_cache_configs(kv_cache_configs: list[KVCacheConfig]):
# first `num_blocks` blocks of the tensor.
min_num_blocks = min(kv_cache_config.num_blocks
for kv_cache_config in kv_cache_configs)
kv_bytes_per_block = sum([
kv_cache_config.kv_bytes_per_block
for kv_cache_config in kv_cache_configs
])
for kv_cache_config in kv_cache_configs:
kv_cache_config.num_blocks = min_num_blocks
kv_cache_config.kv_bytes_per_block = kv_bytes_per_block

return kv_cache_configs
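
For context, a minimal sketch of the arithmetic behind the new field, using made-up numbers (the tensor count and page size below are hypothetical, not taken from this PR):

# Illustrative only: how kv_bytes_per_block relates to the KV cache layout.
# Suppose a model's KV cache is backed by 32 per-layer tensors and each block
# occupies a 2 MiB page in every tensor (both numbers are hypothetical).
num_kv_cache_tensors = 32          # plays the role of len(kv_cache_tensors)
page_size = 2 * 1024 * 1024        # bytes one block occupies per tensor

# Same expression as in the hunks above: len(kv_cache_tensors) * page_size
kv_bytes_per_block = num_kv_cache_tensors * page_size
print(kv_bytes_per_block)          # 67108864 bytes, i.e. 64 MiB per block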
6 changes: 5 additions & 1 deletion vllm/v1/engine/core.py
@@ -207,7 +207,11 @@ def _initialize_kv_caches(
for cfg in kv_cache_configs
])
num_gpu_blocks = kv_cache_configs[0].num_blocks
num_cpu_blocks = 0
if kv_cache_configs[0].kv_bytes_per_block == 0:
num_cpu_blocks = 0
else:
num_cpu_blocks = (int(vllm_config.cache_config.swap_space_bytes) //
kv_cache_configs[0].kv_bytes_per_block)
Comment on lines +213 to +214

Collaborator:

There is also a knob called offloaded_block_size in #22595. IIUC, it also impacts the calculation of num_cpu_blocks, right? (i.e., if we have larger CPU blocks, we should have fewer CPU blocks.)

Contributor (author):

In v0, the offloading was part of the core.
My suggestion for v1 is to implement offloading as a connector.
I wanted to follow the convention for connectors, where all of their arguments are actually defined in their kv_connector_extra_config.

However, deriving num_cpu_blocks from some kind of swap_space parameter requires knowledge of kv_bytes_per_block.
So basically, I need my connector (both scheduler-side and worker-side) to be aware of kv_bytes_per_block.
This requires changing things in core, so I tried to make minimal changes and came up with the approach here:

For the scheduler-side connector, report kv_bytes_per_block by setting the existing V0 field num_cpu_blocks.
For the worker-side connector, pass on kv_cache_configs via the register_kv_caches function (in a follow-up PR).

When the offloading connector gets this num_cpu_blocks (expressed in units of GPU blocks), it can derive the actual num_cpu_blocks by dividing by block_size_factor.

To sum up, I'm trying to make minimal changes to the core.
This results in the actual offloading configuration parameters being split between vllm_config.cache_config and kv_connector_extra_config.

I'm good with taking a different approach.
Your thoughts?
Perhaps we should ask other relevant folks for their opinion here?
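
A rough sketch of the flow described above, for illustration only; the numbers are made up, and block_size_factor plus the connector-side step follow this thread's description rather than actual vLLM code:

# Core side (mirrors the core.py hunk above): derive a CPU block budget in
# units of GPU-sized blocks from --swap-space and kv_bytes_per_block.
swap_space_bytes = 4 * 1024**3            # e.g. --swap-space 4 (GiB), hypothetical
kv_bytes_per_block = 64 * 1024**2         # hypothetical value reported by the config
num_cpu_blocks = swap_space_bytes // kv_bytes_per_block       # 64 GPU-sized blocks

# Offloading-connector side (as described above): if each CPU block spans
# block_size_factor GPU blocks, convert to the connector's own block count.
block_size_factor = 4                     # hypothetical
actual_num_cpu_blocks = num_cpu_blocks // block_size_factor   # 16 CPU blocks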

Collaborator:

Yeah, this is a good point. I think at a high level there should be two parameters that users can configure: (1) total_cpu_buffer_size and (2) cpu_buffer_block_size (how many tokens are in each CPU block).

For (1), it's also worth thinking about whether it's per rank or per vLLM instance (i.e., summed across all ranks). If it's per rank, it will probably be better to pass it in the KV connector config, while a "global" cache size makes more sense when it's configured through global options like --swap-space.

For (2), I think it should definitely go into the KV connector config, since it's a CPU-offloading-connector-specific setting.

To sum up, I feel like putting all the configs into the KV connector config will probably be better and less confusing. WDYT?
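
For comparison, a sketch of the suggestion above, with both knobs living in the connector's extra config; the key names come from this comment and the numbers are hypothetical, so none of this is an existing vLLM API:

# Hypothetical kv_connector_extra_config keys, as suggested above.
extra_config = {
    "total_cpu_buffer_size": 8 * 1024**3,   # 8 GiB of CPU KV buffer (per rank or global, TBD)
    "cpu_buffer_block_size": 64,            # tokens per CPU block
}

# The connector could then size its CPU pool itself, given the per-token KV
# byte count it can compute from the registered KV caches (hypothetical here).
kv_bytes_per_token = 128 * 1024
bytes_per_cpu_block = extra_config["cpu_buffer_block_size"] * kv_bytes_per_token
num_cpu_blocks = extra_config["total_cpu_buffer_size"] // bytes_per_cpu_block   # 1024 blocks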

scheduler_kv_cache_config = kv_cache_configs[0]

# Initialize kv cache and warmup the execution
2 changes: 2 additions & 0 deletions vllm/v1/kv_cache_interface.py
@@ -264,3 +264,5 @@ class KVCacheConfig:
see `_get_kv_cache_config_uniform_page_size` for more details.
"""
kv_cache_groups: list[KVCacheGroupSpec]
"""The number of KV bytes per block"""
kv_bytes_per_block: int