File tree Expand file tree Collapse file tree 1 file changed +6
-1
lines changed Expand file tree Collapse file tree 1 file changed +6
-1
lines changed Original file line number Diff line number Diff line change 49
49
try_get_tokenizer_config , uses_mrope )
50
50
from vllm .transformers_utils .s3_utils import S3Model
51
51
from vllm .transformers_utils .utils import is_s3 , maybe_model_redirect
52
- from vllm .utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS , LayerBlockType ,
52
+ from vllm .utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS ,
53
+ STR_DUAL_CHUNK_FLASH_ATTN_VAL , LayerBlockType ,
53
54
LazyLoader , common_broadcastable_dtype , random_uuid )
54
55
55
56
if TYPE_CHECKING :
@@ -1304,6 +1305,10 @@ def verify_dual_chunk_attention_config(
1304
1305
self .hf_config .dual_chunk_attention_config [
1305
1306
"sparse_attention_enabled" ] = True
1306
1307
1308
+ if envs .VLLM_ATTENTION_BACKEND != STR_DUAL_CHUNK_FLASH_ATTN_VAL :
1309
+ raise ValueError ("please set VLLM_ATTENTION_BACKEND to "
1310
+ f"{ STR_DUAL_CHUNK_FLASH_ATTN_VAL } " )
1311
+
1307
1312
def verify_async_output_proc (self , parallel_config , speculative_config ,
1308
1313
device_config ) -> None :
1309
1314
if not self .use_async_output_proc :
You can’t perform that action at this time.
0 commit comments