From ffb7cd6141fa4f8f43571527d937d5b75e525bfa Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Mon, 8 Sep 2025 15:15:31 +0200
Subject: [PATCH 1/3] re-enable prefill of entire model length

Signed-off-by: Yannick Schnider
---
 vllm/v1/worker/gpu_model_runner.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 549c5dd2bbb2..0d10d782976a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1819,10 +1819,10 @@ def _bookkeeping_sync(
 
         start_idx = self.input_batch.num_tokens_no_spec[req_idx]
         end_idx = start_idx + len(sampled_ids)
-        assert end_idx <= self.max_model_len, (
+        assert end_idx <= self.max_model_len + 1, (
             "Sampled token IDs exceed the max model length. "
-            f"Total number of tokens: {end_idx} > max_model_len: "
-            f"{self.max_model_len}")
+            f"Total number of tokens: {end_idx} > max_model_len + 1: "
+            f"{self.max_model_len + 1}")
 
         self.input_batch.token_ids_cpu[req_idx,
                                        start_idx:end_idx] = sampled_ids

From 49f33357f7b4fc04b0d115e22991aef8c1af024b Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Mon, 8 Sep 2025 15:20:29 +0200
Subject: [PATCH 2/3] adjust log message

Signed-off-by: Yannick Schnider
---
 vllm/v1/worker/gpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 0d10d782976a..ec9519e1dd5a 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1820,7 +1820,7 @@ def _bookkeeping_sync(
         start_idx = self.input_batch.num_tokens_no_spec[req_idx]
         end_idx = start_idx + len(sampled_ids)
         assert end_idx <= self.max_model_len + 1, (
-            "Sampled token IDs exceed the max model length. "
+            "Sampled token IDs exceed the max model length + 1. "
             f"Total number of tokens: {end_idx} > max_model_len + 1: "
             f"{self.max_model_len + 1}")
 

From ea31082ff44e090eadeac4cba805c7fcf6ca2fc2 Mon Sep 17 00:00:00 2001
From: Yannick Schnider
Date: Mon, 8 Sep 2025 16:55:16 +0200
Subject: [PATCH 3/3] increase buffer size

Signed-off-by: Yannick Schnider
---
 vllm/v1/worker/gpu_input_batch.py | 2 +-
 vllm/v1/worker/tpu_input_batch.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/worker/gpu_input_batch.py b/vllm/v1/worker/gpu_input_batch.py
index bf9b16575e60..37469503afe0 100644
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -101,7 +101,7 @@ def __init__(
         # This buffer is not directly transferred to the GPU, so it does not
         # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
-            (max_num_reqs, max_model_len),
+            (max_num_reqs, max_model_len + 1),
             device="cpu",
             dtype=torch.int32,
             pin_memory=False,
diff --git a/vllm/v1/worker/tpu_input_batch.py b/vllm/v1/worker/tpu_input_batch.py
index 81c798685cb3..8f5f9c3e93aa 100644
--- a/vllm/v1/worker/tpu_input_batch.py
+++ b/vllm/v1/worker/tpu_input_batch.py
@@ -44,7 +44,7 @@ def __init__(
         # This buffer is not directly transferred to the GPU, so it does not
         # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
-            (max_num_reqs, max_model_len),
+            (max_num_reqs, max_model_len + 1),
             device="cpu",
             dtype=torch.int32,
             pin_memory=False,
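
The three patches share one invariant: a prompt that already occupies all max_model_len positions still produces one sampled token, so `end_idx` in `_bookkeeping_sync` can legitimately reach `max_model_len + 1`, and the CPU-side token buffer needs one extra column to hold it. Below is a minimal standalone sketch of that arithmetic, not vLLM code; the sizes and the sampled ID are made-up illustration values, and only the buffer shape and the bound mirror the patched lines.

```python
import torch

# Hypothetical sizes for illustration; in vLLM these come from the config.
max_num_reqs = 4
max_model_len = 8

# Mirrors the patched allocation in gpu_input_batch.py / tpu_input_batch.py:
# one extra column beyond max_model_len.
token_ids_cpu = torch.zeros((max_num_reqs, max_model_len + 1),
                            dtype=torch.int32)

# A request whose prefill consumes the entire model length.
req_idx = 0
prompt_len = max_model_len

# Bookkeeping after sampling one token, loosely following _bookkeeping_sync.
sampled_ids = [42]  # stand-in token ID
start_idx = prompt_len
end_idx = start_idx + len(sampled_ids)  # == max_model_len + 1

# The old bound (end_idx <= max_model_len) rejected this request; the
# relaxed bound admits exactly one token past a full-length prefill.
assert end_idx <= max_model_len + 1, (
    "Sampled token IDs exceed the max model length + 1. "
    f"Total number of tokens: {end_idx} > max_model_len + 1: "
    f"{max_model_len + 1}")
token_ids_cpu[req_idx, start_idx:end_idx] = torch.tensor(
    sampled_ids, dtype=torch.int32)
```

Without patch 3, the slice assignment above would write past the buffer's last column even though the relaxed assertion from patches 1 and 2 accepts the request, which is why the shape change and the bound change land together.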