From 8ee5ff386f01ce428da5c4ff16407a985fe8ee91 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Wed, 7 May 2025 07:44:01 +0000
Subject: [PATCH 1/6] update

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 11 ++++++++++-
 vllm/model_executor/models/qwen2.py | 30 ++++++++++++++++--------------
 vllm/model_executor/models/utils.py | 15 +++++++++++++++
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index aa218cc37af9..d17badac0a77 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -56,9 +56,18 @@ def __init__(
             `self.kv_cache`.
         """
         super().__init__()
+
+        # Determine the attention sliding window size:
+        # Priority: per-layer setting > model-level setting > None
         if per_layer_sliding_window is not None:
             # per-layer sliding window
-            sliding_window = per_layer_sliding_window
+
+            if per_layer_sliding_window == -1:
+                sliding_window = None
+            else:
+                assert per_layer_sliding_window > 0, \
+                    "per_layer_sliding_window must be positive or -1 (to force disable)"
+                sliding_window = per_layer_sliding_window
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index f76f31c9fc8d..6ae08484d9b4 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -53,9 +53,9 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    is_pp_missing_parameter,
+                    extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix)
+                    maybe_prefix, resolve_sliding_window)
 
 logger = init_logger(__name__)
 
@@ -103,6 +103,7 @@ def __init__(self,
                  hidden_size: int,
                  num_heads: int,
                  num_kv_heads: int,
+                 max_window_layers: int = None,
                  max_position: int = 4096 * 32,
                  rope_theta: float = 10000,
                  cache_config: Optional[CacheConfig] = None,
@@ -117,6 +118,7 @@ def __init__(self,
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
         self.total_num_kv_heads = num_kv_heads
+
         if self.total_num_kv_heads >= tp_size:
             # Number of KV heads is greater than TP size, so we partition
             # the KV heads across multiple tensor parallel GPUs.
@@ -156,12 +158,21 @@ def __init__(self,
             base=self.rope_theta,
             rope_scaling=rope_scaling,
         )
+
+        self.layer_idx = extract_layer_index(prefix)
+        slide_window = resolve_sliding_window(
+            cache_config.sliding_window,
+            self.layer_idx,
+            max_window_layers,
+        )
+
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
+                              per_layer_sliding_window=slide_window,
                               prefix=f"{prefix}.attn",
                               attn_type=attn_type)
 
@@ -202,11 +213,14 @@
         else:
             attn_type = AttentionType.ENCODER_ONLY
 
+        max_window_layers = getattr(config, "max_window_layers", None)
+
         self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
+            max_window_layers=max_window_layers,
             rope_theta=rope_theta,
             cache_config=cache_config,
             quant_config=quant_config,
@@ -273,18 +287,6 @@ def __init__(self,
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
-        # TODO (@robertgshaw2): see if this can be moved out
-        if (cache_config.sliding_window is not None
-                and hasattr(config, "max_window_layers")):
-            raise ValueError("Sliding window for some but all layers is not "
-                             "supported. This model uses sliding window "
-                             "but `max_window_layers` = {} is less than "
-                             "`num_hidden_layers` = {}. Please open an issue "
-                             "to discuss this feature.".format(
-                                 config.max_window_layers,
-                                 config.num_hidden_layers,
-                             ))
-
         self.config = config
         self.quant_config = quant_config
         self.vocab_size = config.vocab_size
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 1be40ecd3e28..2d112435ed5f 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -712,3 +712,18 @@ def fast_topk(values, topk, dim):
     else:
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
+
+
+def resolve_sliding_window(
+    sliding_window: Optional[int],
+    layer_idx: Optional[int],
+    max_window_layers: Optional[int],
+) -> Optional[int]:
+
+    if sliding_window is None:
+        return -1
+
+    if max_window_layers is None:
+        return sliding_window
+
+    return sliding_window if layer_idx >= max_window_layers else -1

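[Review note, illustrative only; not part of the patch] The layer.py hunk above gives per_layer_sliding_window three states: -1 force-disables the window for that layer, a positive value enables it, and None falls back to the model-level cache_config value. A minimal standalone sketch of that priority order; the function name and the numbers are invented for illustration.

from typing import Optional

def resolve_attn_window(per_layer_sliding_window: Optional[int],
                        model_level_window: Optional[int]) -> Optional[int]:
    # Mirrors the priority in the patched Attention.__init__:
    # per-layer setting > model-level setting > None.
    if per_layer_sliding_window is not None:
        if per_layer_sliding_window == -1:  # force-disable for this layer
            return None
        assert per_layer_sliding_window > 0
        return per_layer_sliding_window
    return model_level_window

print(resolve_attn_window(-1, 4096))    # None: this layer opts out of the window
print(resolve_attn_window(4096, None))  # 4096: per-layer window enabled
print(resolve_attn_window(None, 4096))  # 4096: falls back to cache_config.sliding_window
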
From 57fd105ee8080683dcb9a5f83f51b67ea95e2fcc Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Wed, 7 May 2025 07:58:07 +0000
Subject: [PATCH 2/6] use constant

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 5 +++--
 vllm/model_executor/models/utils.py | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index d17badac0a77..43b42bd030e6 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -18,6 +18,7 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
+from vllm.model_executor.models.utils import NOT_USE_SLIDING_WINDOW
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
 
@@ -62,11 +63,11 @@ def __init__(
         if per_layer_sliding_window is not None:
             # per-layer sliding window
 
-            if per_layer_sliding_window == -1:
+            if per_layer_sliding_window == NOT_USE_SLIDING_WINDOW:
                 sliding_window = None
             else:
-                assert per_layer_sliding_window > 0, \
-                    "per_layer_sliding_window must be positive or -1 (to force disable)"
+                assert per_layer_sliding_window > 0, \
+                    f"per_layer_sliding_window must be positive or {NOT_USE_SLIDING_WINDOW} (to force disable)"
                 sliding_window = per_layer_sliding_window
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 2d112435ed5f..52654c590a08 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -24,6 +24,8 @@
 WeightsMapping = Mapping[str, Optional[str]]
 """If a key maps to a value of `None`, the corresponding weight is ignored."""
 
+NOT_USE_SLIDING_WINDOW = -1
+
 
 @dataclass
 class WeightsMapper:
@@ -721,9 +723,9 @@ def resolve_sliding_window(
 ) -> Optional[int]:
 
     if sliding_window is None:
-        return -1
+        return NOT_USE_SLIDING_WINDOW
 
     if max_window_layers is None:
         return sliding_window
 
-    return sliding_window if layer_idx >= max_window_layers else -1
+    return sliding_window if layer_idx >= max_window_layers else NOT_USE_SLIDING_WINDOW

From a214e2af8921c4488312c0e07e371ee9bd867389 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Wed, 7 May 2025 09:09:34 +0000
Subject: [PATCH 3/6] format

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 9 ++++++---
 vllm/model_executor/models/utils.py | 7 +++----
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 43b42bd030e6..783b7f988cab 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -18,10 +18,11 @@
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
-from vllm.model_executor.models.utils import NOT_USE_SLIDING_WINDOW
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
 
+NOT_USE_SLIDING_WINDOW = -1
+
 
 class Attention(nn.Module):
     """Attention layer.
@@ -66,9 +67,11 @@ def __init__(
             if per_layer_sliding_window == NOT_USE_SLIDING_WINDOW:
                 sliding_window = None
             else:
-                assert per_layer_sliding_window > 0, \
-                    f"per_layer_sliding_window must be positive or {NOT_USE_SLIDING_WINDOW} (to force disable)"
+                assert per_layer_sliding_window > 0, (
+                    f"per_layer_sliding_window must be positive or "
+                    f"{NOT_USE_SLIDING_WINDOW} (to force disable)")
                 sliding_window = per_layer_sliding_window
+
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 52654c590a08..7b2f6f116dbc 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -11,6 +11,7 @@
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
+from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -24,8 +25,6 @@
 WeightsMapping = Mapping[str, Optional[str]]
 """If a key maps to a value of `None`, the corresponding weight is ignored."""
 
-NOT_USE_SLIDING_WINDOW = -1
-
 
 @dataclass
 class WeightsMapper:
@@ -725,7 +724,7 @@ def resolve_sliding_window(
     if sliding_window is None:
         return NOT_USE_SLIDING_WINDOW
 
-    if max_window_layers is None:
+    if max_window_layers is None or layer_idx >= max_window_layers:
         return sliding_window
 
-    return sliding_window if layer_idx >= max_window_layers else NOT_USE_SLIDING_WINDOW
+    return NOT_USE_SLIDING_WINDOW

From 0a0e447d2d5c412767a4bfb2a461553ee6027ce6 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Sun, 11 May 2025 09:24:25 +0000
Subject: [PATCH 4/6] update

Signed-off-by: inkcherry
---
 vllm/config.py                      |  2 +-
 vllm/model_executor/models/qwen2.py | 18 +++++++++++++++++-
 vllm/model_executor/models/utils.py | 16 ----------------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 40beace3040c..464318d1bb12 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -508,7 +508,7 @@ def __post_init__(self) -> None:
             self.model, hf_token=self.hf_token, revision=self.revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype)
 
-        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
+        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2", "qwen2"]
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
         has_interleaved_attention = (sliding_window is not None) and (
             isinstance(sliding_window, list) or
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 6ae08484d9b4..365e44b78baa 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -30,6 +30,7 @@
 from transformers import Qwen2Config
 
 from vllm.attention import Attention, AttentionType
+from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -55,11 +56,26 @@ from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
                     extract_layer_index, is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
-                    maybe_prefix, resolve_sliding_window)
+                    maybe_prefix)
 
 logger = init_logger(__name__)
 
 
+def resolve_sliding_window(
+    sliding_window: Optional[int],
+    layer_idx: Optional[int],
+    max_window_layers: Optional[int],
+) -> Optional[int]:
+
+    if sliding_window is None:
+        return NOT_USE_SLIDING_WINDOW
+
+    if max_window_layers is None or layer_idx >= max_window_layers:
+        return sliding_window
+
+    return NOT_USE_SLIDING_WINDOW
+
+
 class Qwen2MLP(nn.Module):
 
     def __init__(
diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py
index 7b2f6f116dbc..1be40ecd3e28 100644
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -11,7 +11,6 @@
 from transformers import PretrainedConfig
 
 import vllm.envs as envs
-from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
@@ -713,18 +712,3 @@ def fast_topk(values, topk, dim):
     else:
         # Use topk for efficiency with larger k values
         return torch.topk(values, topk, dim=dim)
-
-
-def resolve_sliding_window(
-    sliding_window: Optional[int],
-    layer_idx: Optional[int],
-    max_window_layers: Optional[int],
-) -> Optional[int]:
-
-    if sliding_window is None:
-        return NOT_USE_SLIDING_WINDOW
-
-    if max_window_layers is None or layer_idx >= max_window_layers:
-        return sliding_window
-
-    return NOT_USE_SLIDING_WINDOW

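[Review note, illustrative only; not part of the patch] After this commit the helper lives in qwen2.py and "qwen2" is registered in interleaved_attn_models, so each decoder layer resolves its own window from cache_config.sliding_window and config.max_window_layers. A self-contained sketch of that per-layer resolution; the numeric values are made up and do not come from a real Qwen2 checkpoint.

from typing import Optional

NOT_USE_SLIDING_WINDOW = -1  # sentinel introduced in vllm/attention/layer.py by PATCH 3/6

def resolve_sliding_window(sliding_window: Optional[int],
                           layer_idx: Optional[int],
                           max_window_layers: Optional[int]) -> Optional[int]:
    # Same logic as the helper this commit moves into qwen2.py.
    if sliding_window is None:
        return NOT_USE_SLIDING_WINDOW
    if max_window_layers is None or layer_idx >= max_window_layers:
        return sliding_window
    return NOT_USE_SLIDING_WINDOW

sliding_window = 32768   # stands in for cache_config.sliding_window
max_window_layers = 28   # stands in for config.max_window_layers
num_hidden_layers = 32   # stands in for config.num_hidden_layers

per_layer = [resolve_sliding_window(sliding_window, i, max_window_layers)
             for i in range(num_hidden_layers)]
print(per_layer[0], per_layer[-1])  # -1 for layers 0..27, 32768 for layers 28..31
# Each value is what a layer then passes as per_layer_sliding_window to Attention().
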
From cc9b656104f562071c9911190373ad5efdaa0ac6 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Mon, 12 May 2025 02:44:16 +0000
Subject: [PATCH 5/6] revert

Signed-off-by: inkcherry
---
 vllm/attention/layer.py             | 15 +--------------
 vllm/config.py                      |  2 +-
 vllm/model_executor/models/qwen2.py | 44 +++++++++++++----------------------
 3 files changed, 15 insertions(+), 46 deletions(-)

diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py
index 783b7f988cab..aa218cc37af9 100644
--- a/vllm/attention/layer.py
+++ b/vllm/attention/layer.py
@@ -21,8 +21,6 @@
 from vllm.platforms import _Backend, current_platform
 from vllm.utils import direct_register_custom_op
 
-NOT_USE_SLIDING_WINDOW = -1
-
 
 class Attention(nn.Module):
     """Attention layer.
@@ -58,20 +56,9 @@ def __init__(
             `self.kv_cache`.
         """
         super().__init__()
-
-        # Determine the attention sliding window size:
-        # Priority: per-layer setting > model-level setting > None
         if per_layer_sliding_window is not None:
             # per-layer sliding window
-
-            if per_layer_sliding_window == NOT_USE_SLIDING_WINDOW:
-                sliding_window = None
-            else:
-                assert per_layer_sliding_window > 0, (
-                    f"per_layer_sliding_window must be positive or "
-                    f"{NOT_USE_SLIDING_WINDOW} (to force disable)")
-                sliding_window = per_layer_sliding_window
-
+            sliding_window = per_layer_sliding_window
         elif cache_config is not None:
             # model-level sliding window
             sliding_window = cache_config.sliding_window
diff --git a/vllm/config.py b/vllm/config.py
index 464318d1bb12..40beace3040c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -508,7 +508,7 @@ def __post_init__(self) -> None:
             self.model, hf_token=self.hf_token, revision=self.revision)
         self.dtype = _get_and_verify_dtype(self.hf_config, self.dtype)
 
-        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2", "qwen2"]
+        interleaved_attn_models = ["gemma2", "gemma3_text", "cohere2"]
         sliding_window = getattr(self.hf_text_config, "sliding_window", None)
         has_interleaved_attention = (sliding_window is not None) and (
             isinstance(sliding_window, list) or
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 365e44b78baa..f76f31c9fc8d 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -30,7 +30,6 @@
 from transformers import Qwen2Config
 
 from vllm.attention import Attention, AttentionType
-from vllm.attention.layer import NOT_USE_SLIDING_WINDOW
 from vllm.compilation.decorators import support_torch_compile
 from vllm.config import CacheConfig, VllmConfig
 from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
@@ -54,28 +53,13 @@
 
 from .interfaces import SupportsLoRA, SupportsPP
 from .utils import (AutoWeightsLoader, PPMissingLayer, WeightsMapper,
-                    extract_layer_index, is_pp_missing_parameter,
+                    is_pp_missing_parameter,
                     make_empty_intermediate_tensors_factory, make_layers,
                     maybe_prefix)
 
 logger = init_logger(__name__)
 
 
-def resolve_sliding_window(
-    sliding_window: Optional[int],
-    layer_idx: Optional[int],
-    max_window_layers: Optional[int],
-) -> Optional[int]:
-
-    if sliding_window is None:
-        return NOT_USE_SLIDING_WINDOW
-
-    if max_window_layers is None or layer_idx >= max_window_layers:
-        return sliding_window
-
-    return NOT_USE_SLIDING_WINDOW
-
-
 class Qwen2MLP(nn.Module):
 
     def __init__(
@@ -119,7 +103,6 @@ def __init__(self,
                  hidden_size: int,
                  num_heads: int,
                  num_kv_heads: int,
-                 max_window_layers: int = None,
                  max_position: int = 4096 * 32,
                  rope_theta: float = 10000,
                  cache_config: Optional[CacheConfig] = None,
@@ -134,7 +117,6 @@ def __init__(self,
         assert self.total_num_heads % tp_size == 0
         self.num_heads = self.total_num_heads // tp_size
         self.total_num_kv_heads = num_kv_heads
-
         if self.total_num_kv_heads >= tp_size:
             # Number of KV heads is greater than TP size, so we partition
             # the KV heads across multiple tensor parallel GPUs.
@@ -174,21 +156,12 @@ def __init__(self,
             base=self.rope_theta,
             rope_scaling=rope_scaling,
         )
-
-        self.layer_idx = extract_layer_index(prefix)
-        slide_window = resolve_sliding_window(
-            cache_config.sliding_window,
-            self.layer_idx,
-            max_window_layers,
-        )
-
         self.attn = Attention(self.num_heads,
                               self.head_dim,
                               self.scaling,
                               num_kv_heads=self.num_kv_heads,
                               cache_config=cache_config,
                               quant_config=quant_config,
-                              per_layer_sliding_window=slide_window,
                               prefix=f"{prefix}.attn",
                               attn_type=attn_type)
 
@@ -229,14 +202,11 @@ def __init__(
         else:
             attn_type = AttentionType.ENCODER_ONLY
 
-        max_window_layers = getattr(config, "max_window_layers", None)
-
         self.self_attn = Qwen2Attention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             max_position=config.max_position_embeddings,
             num_kv_heads=config.num_key_value_heads,
-            max_window_layers=max_window_layers,
             rope_theta=rope_theta,
             cache_config=cache_config,
             quant_config=quant_config,
@@ -303,6 +273,18 @@ def __init__(self,
         cache_config = vllm_config.cache_config
         quant_config = vllm_config.quant_config
 
+        # TODO (@robertgshaw2): see if this can be moved out
+        if (cache_config.sliding_window is not None
+                and hasattr(config, "max_window_layers")):
+            raise ValueError("Sliding window for some but all layers is not "
+                             "supported. This model uses sliding window "
+                             "but `max_window_layers` = {} is less than "
+                             "`num_hidden_layers` = {}. Please open an issue "
+                             "to discuss this feature.".format(
+                                 config.max_window_layers,
+                                 config.num_hidden_layers,
+                             ))
+
         self.config = config
         self.quant_config = quant_config
         self.vocab_size = config.vocab_size

From d067c0596174b8a1cb75cf6236f13c09afa8d9b8 Mon Sep 17 00:00:00 2001
From: inkcherry
Date: Mon, 12 May 2025 03:09:57 +0000
Subject: [PATCH 6/6] update

Signed-off-by: inkcherry
---
 vllm/model_executor/models/qwen2.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index f76f31c9fc8d..87f3eada43d0 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -276,14 +276,14 @@ def __init__(self,
         # TODO (@robertgshaw2): see if this can be moved out
         if (cache_config.sliding_window is not None
                 and hasattr(config, "max_window_layers")):
-            raise ValueError("Sliding window for some but all layers is not "
-                             "supported. This model uses sliding window "
-                             "but `max_window_layers` = {} is less than "
-                             "`num_hidden_layers` = {}. Please open an issue "
-                             "to discuss this feature.".format(
-                                 config.max_window_layers,
-                                 config.num_hidden_layers,
-                             ))
+            assert config.max_window_layers == config.num_hidden_layers, (
+                "Sliding window for some but all layers is not supported. "
+                "This model uses sliding window but `max_window_layers` = {} "
+                "is less than `num_hidden_layers` = {}. Please open an issue "
+                "to discuss this feature.".format(
+                    config.max_window_layers,
+                    config.num_hidden_layers,
+                ))
 
         self.config = config
         self.quant_config = quant_config

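[Review note, illustrative only; not part of the patch] The final commit keeps the original restriction but states it as an assertion: when a sliding window is configured, max_window_layers must equal num_hidden_layers, i.e. the window has to apply to every layer. A standalone sketch of the effective check; SimpleNamespace merely stands in for the HF Qwen2 config object and the numbers are made up.

from types import SimpleNamespace

def check_uniform_sliding_window(sliding_window, config) -> None:
    # Mirrors the assert introduced in PATCH 6/6.
    if sliding_window is not None and hasattr(config, "max_window_layers"):
        assert config.max_window_layers == config.num_hidden_layers, (
            "Sliding window for some but not all layers is not supported.")

uniform = SimpleNamespace(max_window_layers=28, num_hidden_layers=28)
check_uniform_sliding_window(4096, uniform)   # passes: every layer is windowed

partial = SimpleNamespace(max_window_layers=21, num_hidden_layers=28)
# check_uniform_sliding_window(4096, partial)  # would raise AssertionError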