vllm/v1/worker/gpu_input_batch.py (2 changes: 1 addition & 1 deletion)

@@ -101,7 +101,7 @@ def __init__(
         # This buffer is not directly transferred to the GPU, so it does not
         # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
-            (max_num_reqs, max_model_len),
+            (max_num_reqs, max_model_len + 1),
             device="cpu",
             dtype=torch.int32,
             pin_memory=False,
vllm/v1/worker/gpu_model_runner.py (8 changes: 4 additions & 4 deletions)

@@ -1819,10 +1819,10 @@ def _bookkeeping_sync(

         start_idx = self.input_batch.num_tokens_no_spec[req_idx]
         end_idx = start_idx + len(sampled_ids)
-        assert end_idx <= self.max_model_len, (
-            "Sampled token IDs exceed the max model length. "
-            f"Total number of tokens: {end_idx} > max_model_len: "
-            f"{self.max_model_len}")
+        assert end_idx <= self.max_model_len + 1, (
+            "Sampled token IDs exceed the max model length + 1. "
+            f"Total number of tokens: {end_idx} > max_model_len + 1: "
+            f"{self.max_model_len + 1}")
Comment on lines +1822 to +1825

Contributor (critical):
While this change correctly relaxes the assertion to allow for prefilling up to max_model_len and sampling one more token, it seems to introduce a potential IndexError in the subsequent line.

The buffer self.input_batch.token_ids_cpu is initialized with a size of (max_num_reqs, max_model_len) in vllm/v1/worker/gpu_input_batch.py.

When end_idx is self.max_model_len + 1, the slice assignment self.input_batch.token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids will attempt to write to an out-of-bounds index (max_model_len).

To fix this, the token_ids_cpu buffer should probably be initialized with a size of (max_num_reqs, max_model_len + 1). This change would be required in vllm/v1/worker/gpu_input_batch.py. A similar change might be needed for vllm/v1/worker/tpu_input_batch.py as well.
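
To make the failure mode concrete, here is a minimal standalone sketch of the write described above, using plain NumPy rather than the vLLM classes; the names (token_ids_cpu, max_model_len, sampled_ids) mirror the diff, but the shapes are illustrative only. In plain NumPy the out-of-bounds slice surfaces as a broadcast ValueError rather than an IndexError, but either way the extra sampled token cannot be stored without the additional column:

# Illustrative sketch only; not the vLLM implementation.
import numpy as np

max_num_reqs, max_model_len = 4, 8
# Buffer shaped as it was before this PR: (max_num_reqs, max_model_len).
token_ids_cpu = np.zeros((max_num_reqs, max_model_len), dtype=np.int32)

req_idx = 0
start_idx = max_model_len               # the row is already full after prefill
sampled_ids = [42]                      # one newly sampled token
end_idx = start_idx + len(sampled_ids)  # == max_model_len + 1

# The relaxed assertion now allows end_idx == max_model_len + 1, but the
# slice [max_model_len:max_model_len + 1] is empty for a row of length
# max_model_len, so the assignment has nowhere to put the sampled token.
try:
    token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids
except ValueError as exc:
    print("out-of-bounds write:", exc)

# With one extra column, as this PR allocates, the same write succeeds.
token_ids_cpu = np.zeros((max_num_reqs, max_model_len + 1), dtype=np.int32)
token_ids_cpu[req_idx, start_idx:end_idx] = sampled_ids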

Contributor Author:
Not sure if the change in vllm/v1/worker/tpu_input_batch.py is needed.


         self.input_batch.token_ids_cpu[req_idx,
                                        start_idx:end_idx] = sampled_ids
vllm/v1/worker/tpu_input_batch.py (2 changes: 1 addition & 1 deletion)

@@ -44,7 +44,7 @@ def __init__(
         # This buffer is not directly transferred to the GPU, so it does not
         # need to be pinned.
         self.token_ids_cpu_tensor = torch.zeros(
-            (max_num_reqs, max_model_len),
+            (max_num_reqs, max_model_len + 1),
             device="cpu",
             dtype=torch.int32,
             pin_memory=False,