Commit e81d4e6

[Misc] Add check for dual_chunk_attention (#24070)
Signed-off-by: zjy0516 <[email protected]>
1 parent: 02d411f

1 file changed: +6 -1

vllm/config/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -49,7 +49,8 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
-from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
+from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
+                        STR_DUAL_CHUNK_FLASH_ATTN_VAL, LayerBlockType,
                         LazyLoader, common_broadcastable_dtype, random_uuid)
 
 if TYPE_CHECKING:
@@ -1304,6 +1305,10 @@ def verify_dual_chunk_attention_config(
             self.hf_config.dual_chunk_attention_config[
                 "sparse_attention_enabled"] = True
 
+        if envs.VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL:
+            raise ValueError("please set VLLM_ATTENTION_BACKEND to "
+                             f"{STR_DUAL_CHUNK_FLASH_ATTN_VAL}")
+
     def verify_async_output_proc(self, parallel_config, speculative_config,
                                  device_config) -> None:
         if not self.use_async_output_proc:
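In effect, when a model's hf_config carries a dual_chunk_attention_config, config validation now fails fast unless the VLLM_ATTENTION_BACKEND environment variable selects the dual chunk FlashAttention backend. A minimal usage sketch follows, assuming STR_DUAL_CHUNK_FLASH_ATTN_VAL resolves to the string "DUAL_CHUNK_FLASH_ATTN"; the model id below is illustrative:

    # Set the backend before building the engine; envs.VLLM_ATTENTION_BACKEND
    # reads this variable, and the new check raises ValueError otherwise.
    import os

    os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"

    from vllm import LLM

    # Illustrative model id: any checkpoint whose hf_config defines a
    # dual_chunk_attention_config (e.g. the Qwen2.5-1M family) hits this path.
    llm = LLM(model="Qwen/Qwen2.5-7B-Instruct-1M")

Raising in verify_dual_chunk_attention_config surfaces a clear message at startup instead of a backend mismatch failure deeper in model execution.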
