From 8ee5ff386f01ce428da5c4ff16407a985fe8ee91 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Wed, 7 May 2025 07:44:01 +0000
Subject: [PATCH 1/6] update

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 11 ++++++++++-
 vllm/model_executor/models/qwen2.py | 30 ++++++++++++++++--------------
 vllm/model_executor/models/utils.py | 15 +++++++++++++++
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index aa218cc37af9..d17badac0a77 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -56,9 +56,18 @@ def __init__(
             `self.kv_cache`.
         """
         super().__init__()
+
+        # Determine the attention sliding window size:
+        # Priority: per-layer setting > model-level setting > None
         if per_layer_sliding_window is not None:
             # per-layer sliding window
-            sliding_window = per_layer_sliding_window
+
+            if per_layer_sliding_window == -1:
+                sliding_window = None
+            else:
+                assert per_layer_sliding_window > 0, \
+                    "per_layer_sliding_window must be positive or -1 (to force disable)"
+                sliding_window = per_layer_sliding_window
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index f76f31c9fc8d..6ae08484d9b4 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -53,9 +53,9 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    is_pp_missing_parameter,
+                    extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+                    maybe_prefix, resolve_sliding_window)
 
 logger = init_logger(__name__)
 
@@ -103,6 +103,7 @@ def __init__(self,
                  hidden_size: int,
                  num_heads: int,
                  num_kv_heads: int,
+                 max_window_layers: int = None,
                  max_position: int = 4096 * 32,
                  rope_theta: float = 10000,
                  cache_config: Optional[CacheConfig] = None,
@@ -117,6 +118,7 @@ def __init__(self,
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
         self.total_num_kv_heads = num_kv_heads
+
         if self.total_num_kv_heads >= tp_size:
             # Number of KV heads is greater than TP size, so we partition
             # the KV heads across multiple tensor parallel GPUs.
@@ -156,12 +158,21 @@ def __init__(self,
             base=self.rope_theta,
             rope_scaling=rope_scaling,
         )
+
+        self.layer_idx = extract_layer_index(prefix)
+        slide_window = resolve_sliding_window(
+            cache_config.sliding_window,
+            self.layer_idx,
+            max_window_layers,
+        )
+
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
+                              per_layer_sliding_window=slide_window,
                               prefix=f"{prefix}.attn",
                               attn_type=attn_type)
 
@@ -202,11 +213,14 @@
         else:
             attn_type = AttentionType.ENCODER_ONLY
 
+        max_window_layers = getattr(config, "max_window_layers", None)
+
         self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
+            max_window_layers=max_window_layers,
             rope_theta=rope_theta,
             cache_config=cache_config,
             quant_config=quant_config,
@@ -273,18 +287,6 @@ def __init__(self,
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
-        # TODO (@robertgshaw2): see if this can be moved out
-        if (cache_config.sliding_window is not None
-                and hasattr(config, "max_window_layers")):
-            raise ValueError("Sliding window for some but all layers is not "
-                             "supported. This model uses sliding window "
-                             "but `max_window_layers` = {} is less than "
-                             "`num_hidden_layers` = {}. Please open an issue "
-                             "to discuss this feature.".format(
-                                 config.max_window_layers,
-                                 config.num_hidden_layers,
-                             ))
-
         self.config = config
         self.quant_config = quant_config
         self.vocab_size = config.vocab_size
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 1be40ecd3e28..2d112435ed5f 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -712,3 +712,18 @@ def fast_topk(values, topk, dim):
     else:
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
+
+
+def resolve_sliding_window(
+    sliding_window: Optional[int],
+    layer_idx: Optional[int],
+    max_window_layers: Optional[int],
+) -> Optional[int]:
+
+    if sliding_window is None:
+        return -1
+
+    if max_window_layers is None:
+        return sliding_window
+
+    return sliding_window if layer_idx >= max_window_layers else -1

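[Review note, illustrative only; not part of the patch] The layer.py hunk above gives per_layer_sliding_window three states: -1 force-disables the window for that layer, a positive value enables it, and None falls back to the model-level cache_config value. A minimal standalone sketch of that priority order; the function name and the numbers are invented for illustration.

from typing import Optional

def resolve_attn_window(per_layer_sliding_window: Optional[int],
                        model_level_window: Optional[int]) -> Optional[int]:
    # Mirrors the priority in the patched Attention.__init__:
    # per-layer setting > model-level setting > None.
    if per_layer_sliding_window is not None:
        if per_layer_sliding_window == -1:  # force-disable for this layer
            return None
        assert per_layer_sliding_window > 0
        return per_layer_sliding_window
    return model_level_window

print(resolve_attn_window(-1, 4096))    # None: this layer opts out of the window
print(resolve_attn_window(4096, None))  # 4096: per-layer window enabled
print(resolve_attn_window(None, 4096))  # 4096: falls back to cache_config.sliding_window
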
From 57fd105ee8080683dcb9a5f83f51b67ea95e2fcc Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Wed, 7 May 2025 07:58:07 +0000
Subject: [PATCH 2/6] use constant

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 5 +++--
 vllm/model_executor/models/utils.py | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index d17badac0a77..43b42bd030e6 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -18,6 +18,7 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.models.utils import NOT_USE_SLIDING_WINDOW
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
 
@@ -62,11 +63,11 @@ def __init__(
         if per_layer_sliding_window is not None:
             # per-layer sliding window
 
-            if per_layer_sliding_window == -1:
+            if per_layer_sliding_window == NOT_USE_SLIDING_WINDOW:
                 sliding_window = None
             else:
-                assert per_layer_sliding_window > 0, \
-                    "per_layer_sliding_window must be positive or -1 (to force disable)"
+                assert per_layer_sliding_window > 0, \
+                    f"per_layer_sliding_window must be positive or {NOT_USE_SLIDING_WINDOW} (to force disable)"
                 sliding_window = per_layer_sliding_window
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 2d112435ed5f..52654c590a08 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -24,6 +24,8 @@
 WeightsMapping = Mapping[str, Optional[str]]
 """If a key maps to a value of `None`, the corresponding weight is ignored."""
 
+NOT_USE_SLIDING_WINDOW = -1
+
 
 @dataclass
 class WeightsMapper:
@@ -721,9 +723,9 @@ def resolve_sliding_window(
 ) -> Optional[int]:
 
     if sliding_window is None:
-        return -1
+        return NOT_USE_SLIDING_WINDOW
 
     if max_window_layers is None:
         return sliding_window
 
-    return sliding_window if layer_idx >= max_window_layers else -1
+    return sliding_window if layer_idx >= max_window_layers else NOT_USE_SLIDING_WINDOW

From a214e2af8921c4488312c0e07e371ee9bd867389 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Wed, 7 May 2025 09:09:34 +0000
Subject: [PATCH 3/6] format

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 9 ++++++---
 vllm/model_executor/models/utils.py | 7 +++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 43b42bd030e6..783b7f988cab 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -18,10 +18,11 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.models.utils import NOT_USE_SLIDING_WINDOW
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
 
+NOT_USE_SLIDING_WINDOW = -1
+
 
 class Attention(nn.Module):
     """Attention layer.
@@ -66,9 +67,11 @@ def __init__(
             if per_layer_sliding_window == NOT_USE_SLIDING_WINDOW:
                 sliding_window = None
             else:
-                assert per_layer_sliding_window > 0, \
-                    f"per_layer_sliding_window must be positive or {NOT_USE_SLIDING_WINDOW} (to force disable)"
+                assert per_layer_sliding_window > 0, (
+                    f"per_layer_sliding_window must be positive or "
+                    f"{NOT_USE_SLIDING_WINDOW} (to force disable)")
                 sliding_window = per_layer_sliding_window
+
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 52654c590a08..7b2f6f116dbc 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -11,6 +11,7 @@
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
+from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -24,8 +25,6 @@
 WeightsMapping = Mapping[str, Optional[str]]
 """If a key maps to a value of `None`, the corresponding weight is ignored."""
 
-NOT_USE_SLIDING_WINDOW = -1
-
 
 @dataclass
 class WeightsMapper:
@@ -725,7 +724,7 @@ def resolve_sliding_window(
     if sliding_window is None:
         return NOT_USE_SLIDING_WINDOW
 
-    if max_window_layers is None:
+    if max_window_layers is None or layer_idx >= max_window_layers:
         return sliding_window
 
-    return sliding_window if layer_idx >= max_window_layers else NOT_USE_SLIDING_WINDOW
+    return NOT_USE_SLIDING_WINDOW

From 0a0e447d2d5c412767a4bfb2a461553ee6027ce6 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Sun, 11 May 2025 09:24:25 +0000
Subject: [PATCH 4/6] update

Signed-off-by: inkcherry
---
 vllm/config.py                      |  2 +-
 vllm/model_executor/models/qwen2.py | 18 +++++++++++++++++-
 vllm/model_executor/models/utils.py | 16 ----------------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 40beace3040c..464318d1bb12 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -508,7 +508,7 @@ def __post_init__(self) -> None:
             self.model, hf_token=self.hf_token, revision=self.revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype)
 
-        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
+        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2", "qwen2"]
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
         has_interleaved_attention = (sliding_window is not None) and (
             isinstance(sliding_window, list) or
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 6ae08484d9b4..365e44b78baa 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -30,6 +30,7 @@
 from transformers import Qwen2Config
 
 from vllm.attention import Attention, AttentionType
+from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -55,11 +56,26 @@ from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
                     extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix, resolve_sliding_window)
+                    maybe_prefix)
 
 logger = init_logger(__name__)
 
 
+def resolve_sliding_window(
+    sliding_window: Optional[int],
+    layer_idx: Optional[int],
+    max_window_layers: Optional[int],
+) -> Optional[int]:
+
+    if sliding_window is None:
+        return NOT_USE_SLIDING_WINDOW
+
+    if max_window_layers is None or layer_idx >= max_window_layers:
+        return sliding_window
+
+    return NOT_USE_SLIDING_WINDOW
+
+
 class Qwen2MLP(nn.Module):
 
     def __init__(
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 7b2f6f116dbc..1be40ecd3e28 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -11,7 +11,6 @@
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
-from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -713,18 +712,3 @@ def fast_topk(values, topk, dim):
     else:
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
-
-
-def resolve_sliding_window(
-    sliding_window: Optional[int],
-    layer_idx: Optional[int],
-    max_window_layers: Optional[int],
-) -> Optional[int]:
-
-    if sliding_window is None:
-        return NOT_USE_SLIDING_WINDOW
-
-    if max_window_layers is None or layer_idx >= max_window_layers:
-        return sliding_window
-
-    return NOT_USE_SLIDING_WINDOW

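[Review note, illustrative only; not part of the patch] After this commit the helper lives in qwen2.py and "qwen2" is registered in interleaved_attn_models, so each decoder layer resolves its own window from cache_config.sliding_window and config.max_window_layers. A self-contained sketch of that per-layer resolution; the numeric values are made up and do not come from a real Qwen2 checkpoint.

from typing import Optional

NOT_USE_SLIDING_WINDOW = -1  # sentinel introduced in vllm/attention/layer.py by PATCH 3/6

def resolve_sliding_window(sliding_window: Optional[int],
                           layer_idx: Optional[int],
                           max_window_layers: Optional[int]) -> Optional[int]:
    # Same logic as the helper this commit moves into qwen2.py.
    if sliding_window is None:
        return NOT_USE_SLIDING_WINDOW
    if max_window_layers is None or layer_idx >= max_window_layers:
        return sliding_window
    return NOT_USE_SLIDING_WINDOW

sliding_window = 32768   # stands in for cache_config.sliding_window
max_window_layers = 28   # stands in for config.max_window_layers
num_hidden_layers = 32   # stands in for config.num_hidden_layers

per_layer = [resolve_sliding_window(sliding_window, i, max_window_layers)
             for i in range(num_hidden_layers)]
print(per_layer[0], per_layer[-1])  # -1 for layers 0..27, 32768 for layers 28..31
# Each value is what a layer then passes as per_layer_sliding_window to Attention().
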
From cc9b656104f562071c9911190373ad5efdaa0ac6 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Mon, 12 May 2025 02:44:16 +0000
Subject: [PATCH 5/6] revert

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 15 +--------------
 vllm/config.py                      |  2 +-
 vllm/model_executor/models/qwen2.py | 44 +++++++++++++----------------------
 3 files changed, 15 insertions(+), 46 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 783b7f988cab..aa218cc37af9 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -21,8 +21,6 @@
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
 
-NOT_USE_SLIDING_WINDOW = -1
-
 
 class Attention(nn.Module):
     """Attention layer.
@@ -58,20 +56,9 @@ def __init__(
             `self.kv_cache`.
         """
         super().__init__()
-
-        # Determine the attention sliding window size:
-        # Priority: per-layer setting > model-level setting > None
         if per_layer_sliding_window is not None:
             # per-layer sliding window
-
-            if per_layer_sliding_window == NOT_USE_SLIDING_WINDOW:
-                sliding_window = None
-            else:
-                assert per_layer_sliding_window > 0, (
-                    f"per_layer_sliding_window must be positive or "
-                    f"{NOT_USE_SLIDING_WINDOW} (to force disable)")
-                sliding_window = per_layer_sliding_window
-
+            sliding_window = per_layer_sliding_window
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/config.py b/vllm/config.py
index 464318d1bb12..40beace3040c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -508,7 +508,7 @@ def __post_init__(self) -> None:
             self.model, hf_token=self.hf_token, revision=self.revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype)
 
-        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2", "qwen2"]
+        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
         has_interleaved_attention = (sliding_window is not None) and (
             isinstance(sliding_window, list) or
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 365e44b78baa..f76f31c9fc8d 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -30,7 +30,6 @@
 from transformers import Qwen2Config
 
 from vllm.attention import Attention, AttentionType
-from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -54,28 +53,13 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    extract_layer_index, is_pp_missing_parameter,
+                    is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
 logger = init_logger(__name__)
 
 
-def resolve_sliding_window(
-    sliding_window: Optional[int],
-    layer_idx: Optional[int],
-    max_window_layers: Optional[int],
-) -> Optional[int]:
-
-    if sliding_window is None:
-        return NOT_USE_SLIDING_WINDOW
-
-    if max_window_layers is None or layer_idx >= max_window_layers:
-        return sliding_window
-
-    return NOT_USE_SLIDING_WINDOW
-
-
 class Qwen2MLP(nn.Module):
 
     def __init__(
@@ -119,7 +103,6 @@ def __init__(self,
                  hidden_size: int,
                  num_heads: int,
                  num_kv_heads: int,
-                 max_window_layers: int = None,
                  max_position: int = 4096 * 32,
                  rope_theta: float = 10000,
                  cache_config: Optional[CacheConfig] = None,
@@ -134,7 +117,6 @@ def __init__(self,
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
         self.total_num_kv_heads = num_kv_heads
-
         if self.total_num_kv_heads >= tp_size:
             # Number of KV heads is greater than TP size, so we partition
             # the KV heads across multiple tensor parallel GPUs.
@@ -174,21 +156,12 @@ def __init__(self,
             base=self.rope_theta,
             rope_scaling=rope_scaling,
         )
-
-        self.layer_idx = extract_layer_index(prefix)
-        slide_window = resolve_sliding_window(
-            cache_config.sliding_window,
-            self.layer_idx,
-            max_window_layers,
-        )
-
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              per_layer_sliding_window=slide_window,
                               prefix=f"{prefix}.attn",
                               attn_type=attn_type)
 
@@ -229,14 +202,11 @@ def __init__(
         else:
             attn_type = AttentionType.ENCODER_ONLY
 
-        max_window_layers = getattr(config, "max_window_layers", None)
-
         self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
-            max_window_layers=max_window_layers,
             rope_theta=rope_theta,
             cache_config=cache_config,
             quant_config=quant_config,
@@ -303,6 +273,18 @@ def __init__(self,
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
+        # TODO (@robertgshaw2): see if this can be moved out
+        if (cache_config.sliding_window is not None
+                and hasattr(config, "max_window_layers")):
+            raise ValueError("Sliding window for some but all layers is not "
+                             "supported. This model uses sliding window "
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
+                                 config.max_window_layers,
+                                 config.num_hidden_layers,
+                             ))
+
         self.config = config
         self.quant_config = quant_config
         self.vocab_size = config.vocab_size

From d067c0596174b8a1cb75cf6236f13c09afa8d9b8 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Mon, 12 May 2025 03:09:57 +0000
Subject: [PATCH 6/6] update

Signed-off-by: inkcherry
---
 vllm/model_executor/models/qwen2.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index f76f31c9fc8d..87f3eada43d0 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -276,14 +276,14 @@ def __init__(self,
         # TODO (@robertgshaw2): see if this can be moved out
         if (cache_config.sliding_window is not None
                 and hasattr(config, "max_window_layers")):
-            raise ValueError("Sliding window for some but all layers is not "
-                             "supported. This model uses sliding window "
-                             "but `max_window_layers` = {} is less than "
-                             "`num_hidden_layers` = {}. Please open an issue "
-                             "to discuss this feature.".format(
-                                 config.max_window_layers,
-                                 config.num_hidden_layers,
-                             ))
+            assert config.max_window_layers == config.num_hidden_layers, (
+                "Sliding window for some but all layers is not supported. "
+                "This model uses sliding window but `max_window_layers` = {} "
+                "is less than `num_hidden_layers` = {}. Please open an issue "
+                "to discuss this feature.".format(
+                    config.max_window_layers,
+                    config.num_hidden_layers,
+                ))
 
         self.config = config
         self.quant_config = quant_config

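[Review note, illustrative only; not part of the patch] The final commit keeps the original restriction but states it as an assertion: when a sliding window is configured, max_window_layers must equal num_hidden_layers, i.e. the window has to apply to every layer. A standalone sketch of the effective check; SimpleNamespace merely stands in for the HF Qwen2 config object and the numbers are made up.

from types import SimpleNamespace

def check_uniform_sliding_window(sliding_window, config) -> None:
    # Mirrors the assert introduced in PATCH 6/6.
    if sliding_window is not None and hasattr(config, "max_window_layers"):
        assert config.max_window_layers == config.num_hidden_layers, (
            "Sliding window for some but not all layers is not supported.")

uniform = SimpleNamespace(max_window_layers=28, num_hidden_layers=28)
check_uniform_sliding_window(4096, uniform)   # passes: every layer is windowed

partial = SimpleNamespace(max_window_layers=21, num_hidden_layers=28)
# check_uniform_sliding_window(4096, partial)  # would raise AssertionError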