
Commit 18a88fc

[V1] Remove scheduling constraint on partial requests (#12674)

Signed-off-by: Woosuk Kwon <[email protected]>
1 parent d1ca7df commit 18a88fc
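
To make the change concrete before the diff: the new test below builds three identical multimodal requests (800-token prompts, each with a 600-token image placeholder starting at token 100) and checks that, within one scheduling step, more than one of them may end up partially scheduled when the encoder budget runs out. The following is a minimal sketch of that budget loop with made-up names (schedule_step, PROMPT_LEN, IMG_OFFSET, IMG_LEN); it is not vLLM's scheduler, only an illustration of the behavior the removed constraint used to forbid.

# Illustrative sketch only -- hypothetical names, not vLLM internals.
from typing import Dict, List

PROMPT_LEN = 800          # tokens per request, as in the test below
IMG_OFFSET, IMG_LEN = 100, 600  # PlaceholderRange(offset=100, length=600)


def schedule_step(req_ids: List[str], encoder_budget: int) -> Dict[str, int]:
    scheduled: Dict[str, int] = {}
    for req_id in req_ids:
        if encoder_budget >= IMG_LEN:
            # Enough encoder budget: schedule the full prompt and charge
            # the image's encoder tokens against the budget.
            encoder_budget -= IMG_LEN
            scheduled[req_id] = PROMPT_LEN
        else:
            # Partial: schedule only the text tokens that precede the
            # image placeholder. The old constraint would effectively stop
            # here; with it removed, the loop keeps admitting requests.
            scheduled[req_id] = IMG_OFFSET
    return scheduled


print(schedule_step(["0", "1", "2"], encoder_budget=1024))
# {'0': 800, '1': 100, '2': 100} -- one full and two partial requests,
# matching the first-step assertions in test_schedule_partial_requests.
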

File tree: 4 files changed, +350 −123 lines

tests/v1/core/test_scheduler.py

Lines changed: 214 additions & 0 deletions
@@ -0,0 +1,214 @@
# SPDX-License-Identifier: Apache-2.0
from typing import List, Optional

from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
from vllm.multimodal.inputs import MultiModalKwargs, PlaceholderRange
from vllm.sampling_params import SamplingParams
from vllm.v1.core.scheduler import Scheduler
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus


def create_scheduler(
    model: str = "facebook/opt-125m",
    max_num_seqs: int = 16,
    max_num_batched_tokens: int = 8192,
) -> Scheduler:
    scheduler_config = SchedulerConfig(
        max_num_seqs=max_num_seqs,
        max_num_batched_tokens=max_num_batched_tokens,
        max_model_len=max_num_batched_tokens,
    )
    model_config = ModelConfig(
        model=model,
        task="auto",
        tokenizer=model,
        tokenizer_mode="auto",
        trust_remote_code=True,
        dtype="float16",
        seed=42,
    )
    cache_config = CacheConfig(
        block_size=16,
        gpu_memory_utilization=0.9,
        swap_space=0,
        cache_dtype="auto",
    )
    cache_config.num_gpu_blocks = 10000
    return Scheduler(scheduler_config,
                     model_config,
                     cache_config,
                     lora_config=None)


def create_requests(
    num_requests: int,
    num_tokens: int = 10,
    mm_positions: Optional[List[PlaceholderRange]] = None,
):
    sampling_params = SamplingParams()
    requests = []
    for i in range(num_requests):
        if mm_positions is not None:
            mm_position = mm_positions[i]
            mm_inputs = [MultiModalKwargs({})] * len(mm_position)
        else:
            mm_position = None
            mm_inputs = None
        request = Request(
            request_id=f"{i}",
            prompt=None,
            prompt_token_ids=[i] * num_tokens,
            sampling_params=sampling_params,
            multi_modal_inputs=mm_inputs,
            multi_modal_placeholders=mm_position,
            multi_modal_hashes=None,
            eos_token_id=None,
            arrival_time=0,
        )
        requests.append(request)
    return requests


def test_add_requests():
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)

    for i, request in enumerate(requests):
        scheduler.add_request(request)
        assert request.request_id in scheduler.requests
        assert len(scheduler.waiting) == i + 1


def test_finish_request():
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)
    for request in requests:
        scheduler.add_request(request)

    for i, request in enumerate(requests):
        scheduler.finish_requests(request.request_id,
                                  RequestStatus.FINISHED_ABORTED)
        assert request.request_id not in scheduler.requests
        assert len(scheduler.waiting) == 9 - i


def test_get_num_unfinished_requests():
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)
    for request in requests:
        scheduler.add_request(request)

    for i, request in enumerate(requests):
        scheduler.finish_requests(request.request_id,
                                  RequestStatus.FINISHED_STOPPED)
        assert scheduler.get_num_unfinished_requests() == len(requests) - i - 1


def test_schedule():
    scheduler = create_scheduler()
    requests = create_requests(num_requests=10)
    for request in requests:
        scheduler.add_request(request)

    # Test initial scheduling.
    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == len(requests)
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0
    # Verify all requests are scheduled.
    for req_id, num_tokens in output.num_scheduled_tokens.items():
        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)

    # Verify requests moved from waiting to running.
    assert len(scheduler.waiting) == 0
    assert len(scheduler.running) == len(requests)
    for i, request in enumerate(requests):
        assert scheduler.running[i] == request


def test_schedule_multimodal_requests():
    scheduler = create_scheduler(model="llava-hf/llava-1.5-7b-hf")
    mm_positions = [[PlaceholderRange(offset=i, length=100)]
                    for i in range(10)]
    requests = create_requests(
        num_requests=10,
        num_tokens=200,
        mm_positions=mm_positions,
    )
    for request in requests:
        scheduler.add_request(request)

    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == len(requests)
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0
    for req_id, num_tokens in output.num_scheduled_tokens.items():
        assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
    assert len(output.scheduled_encoder_inputs) == 10
    for req_id, encoder_input in output.scheduled_encoder_inputs.items():
        assert len(encoder_input) == 1


def test_schedule_partial_requests():
    """Test scheduling behavior with partial requests.

    This test verifies that:
    1. The scheduler can handle multiple partial requests in a single step
       when constrained by encoder budget.
    2. A request in RUNNING state may be unscheduled in subsequent steps if
       there is insufficient encoder budget.
    """
    scheduler = create_scheduler(
        model="llava-hf/llava-1.5-7b-hf",
        max_num_batched_tokens=1024,
    )
    mm_positions = [[PlaceholderRange(offset=100, length=600)]
                    for _ in range(3)]
    requests = create_requests(
        num_requests=3,
        num_tokens=800,
        mm_positions=mm_positions,
    )
    for request in requests:
        scheduler.add_request(request)

    output = scheduler.schedule()
    assert len(output.scheduled_new_reqs) == 3
    assert len(output.scheduled_cached_reqs) == 0
    assert len(output.finished_req_ids) == 0

    assert scheduler.max_num_encoder_input_tokens == 1024
    # The first request is scheduled fully.
    assert output.num_scheduled_tokens[requests[0].request_id] == 800
    # The second request is scheduled partially.
    # The <img> tokens are not scheduled because of the encoder budget.
    assert output.num_scheduled_tokens[requests[1].request_id] == 100
    # The third request is also scheduled partially.
    # The <img> tokens are not scheduled because of the encoder budget.
    assert output.num_scheduled_tokens[requests[2].request_id] == 100
    req_to_index = {
        request.request_id: i
        for i, request in enumerate(requests)
    }
    model_runner_output = ModelRunnerOutput(
        req_ids=[request.request_id for request in requests],
        req_id_to_index=req_to_index,
        sampled_token_ids=[0] * len(requests),
        logprob_token_ids_cpu=None,
        logprobs_cpu=None,
    )
    scheduler.update_from_output(output, model_runner_output)

    # Schedule the next step.
    # Only the first and second requests are scheduled.
    # The third request is in the RUNNING state but not scheduled in this
    # step because of the encoder budget.
    output = scheduler.schedule()
    assert len(scheduler.running) == 3
    assert len(output.scheduled_new_reqs) == 0
    assert len(output.scheduled_cached_reqs) == 2
    assert len(output.finished_req_ids) == 0
    assert output.num_scheduled_tokens[requests[0].request_id] == 1
    assert output.num_scheduled_tokens[requests[1].request_id] == 700
    assert requests[2].request_id not in output.num_scheduled_tokens
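
A note on the final assertions: in the second step the encoder budget is refreshed, so request 1's 600 encoder tokens now fit and its remaining 700 prompt tokens are scheduled; request 0 is past prefill and gets a single decode token; request 2's next token sits inside its image placeholder, so with no encoder budget left it cannot advance at all and stays RUNNING but unscheduled. To run just this file locally, standard pytest selection should work (the invocation is illustrative, not part of the commit):

pytest tests/v1/core/test_scheduler.py -v
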
