Commit 1b41751

Optimize entrypoints API server tests
Signed-off-by: Sahithi Chigurupati <[email protected]>
1 parent d3d2aad commit 1b41751

Some content is hidden: large commits have some content hidden by default, so not every changed file is shown below.

57 files changed (+257, -427 lines)

.buildkite/test-pipeline.yaml

Lines changed: 12 additions & 2 deletions
@@ -122,12 +122,22 @@ steps:
   torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/entrypoints/openai
+  - tests/entrypoints/openai/basic_tests
+  - tests/entrypoints/openai/embedding_tests
+  - tests/entrypoints/openai/individual_tests
+  - tests/entrypoints/openai/lora_tests
+  - tests/entrypoints/openai/multimodal_tests
+  - tests/entrypoints/openai/tool_parsers
   - tests/entrypoints/test_chat_utils
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py
+  - pytest -v -s entrypoints/openai/basic_tests/
+  - pytest -v -s entrypoints/openai/embedding_tests/
+  - pytest -v -s entrypoints/openai/individual_tests/
+  - pytest -v -s entrypoints/openai/lora_tests/
+  - pytest -v -s entrypoints/openai/multimodal_tests/
+  - pytest -v -s entrypoints/openai/tool_parsers/
   - pytest -v -s entrypoints/test_chat_utils.py
 
 - label: Distributed Tests (4 GPUs) # 10min

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Basic API tests package

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+
+from ....utils import RemoteOpenAIServer
+
+BASIC_SERVER_ARGS = [
+    "--dtype", "bfloat16", "--max-model-len", "1024", "--enforce-eager",
+    "--max-num-seqs", "32", "--gpu-memory-utilization", "0.7",
+    "--disable-log-stats", "--disable-log-requests",
+    "--enable-server-load-tracking", "--chat-template",
+    "{% for message in messages %}{{message['role'] + ': ' \
+    + message['content'] + '\\n'}}{% endfor %}", "--enable-auto-tool-choice",
+    "--tool-call-parser", "hermes", "--trust-remote-code"
+]
+
+
+@pytest.fixture(scope="package")
+def server():
+    with RemoteOpenAIServer("microsoft/DialoGPT-small",
+                            BASIC_SERVER_ARGS,
+                            max_wait_seconds=120) as server:
+        yield server
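
The package-scoped fixture above is the core of the restructuring: every test module under tests/entrypoints/openai/basic_tests/ shares a single running server instead of starting its own. A minimal usage sketch, assuming only what this conftest provides (the test below is hypothetical and not part of the commit):

import requests


def test_health_example(server):
    # `server` is the shared RemoteOpenAIServer started once for the package
    response = requests.get(server.url_for("health"))
    assert response.status_code == 200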

tests/entrypoints/openai/test_basic.py renamed to tests/entrypoints/openai/basic_tests/test_basic.py

Lines changed: 27 additions & 106 deletions
@@ -11,63 +11,7 @@
 
 from vllm.version import __version__ as VLLM_VERSION
 
-from ...utils import RemoteOpenAIServer
-
-MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
-
-
-@pytest.fixture(scope='module')
-def server_args(request: pytest.FixtureRequest) -> list[str]:
-    """ Provide extra arguments to the server via indirect parametrization
-
-    Usage:
-
-    >>> @pytest.mark.parametrize(
-    >>>     "server_args",
-    >>>     [
-    >>>         ["--disable-frontend-multiprocessing"],
-    >>>         [
-    >>>             "--model=NousResearch/Hermes-3-Llama-3.1-70B",
-    >>>             "--enable-auto-tool-choice",
-    >>>         ],
-    >>>     ],
-    >>>     indirect=True,
-    >>> )
-    >>> def test_foo(server, client):
-    >>>     ...
-
-    This will run `test_foo` twice with servers with:
-    - `--disable-frontend-multiprocessing`
-    - `--model=NousResearch/Hermes-3-Llama-3.1-70B --enable-auto-tool-choice`.
-
-    """
-    if not hasattr(request, "param"):
-        return []
-
-    val = request.param
-
-    if isinstance(val, str):
-        return [val]
-
-    return request.param
-
-
-@pytest.fixture(scope="module")
-def server(server_args):
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "8192",
-        "--enforce-eager",
-        "--max-num-seqs",
-        "128",
-        *server_args,
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
+MODEL_NAME = "microsoft/DialoGPT-small"
 
 
 @pytest_asyncio.fixture
@@ -76,52 +20,23 @@ async def client(server):
         yield async_client
 
 
-@pytest.mark.parametrize(
-    "server_args",
-    [
-        pytest.param([], id="default-frontend-multiprocessing"),
-        pytest.param(["--disable-frontend-multiprocessing"],
-                     id="disable-frontend-multiprocessing")
-    ],
-    indirect=True,
-)
 @pytest.mark.asyncio
-async def test_show_version(server: RemoteOpenAIServer):
+async def test_show_version(server):
     response = requests.get(server.url_for("version"))
     response.raise_for_status()
 
     assert response.json() == {"version": VLLM_VERSION}
 
 
-@pytest.mark.parametrize(
-    "server_args",
-    [
-        pytest.param([], id="default-frontend-multiprocessing"),
-        pytest.param(["--disable-frontend-multiprocessing"],
-                     id="disable-frontend-multiprocessing")
-    ],
-    indirect=True,
-)
 @pytest.mark.asyncio
-async def test_check_health(server: RemoteOpenAIServer):
+async def test_check_health(server):
     response = requests.get(server.url_for("health"))
 
     assert response.status_code == HTTPStatus.OK
 
 
-@pytest.mark.parametrize(
-    "server_args",
-    [
-        pytest.param(["--max-model-len", "10100"],
-                     id="default-frontend-multiprocessing"),
-        pytest.param(
-            ["--disable-frontend-multiprocessing", "--max-model-len", "10100"],
-            id="disable-frontend-multiprocessing")
-    ],
-    indirect=True,
-)
 @pytest.mark.asyncio
-async def test_request_cancellation(server: RemoteOpenAIServer):
+async def test_request_cancellation(server):
     # clunky test: send an ungodly amount of load in with short timeouts
     # then ensure that it still responds quickly afterwards
 
@@ -159,7 +74,7 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
 
 
 @pytest.mark.asyncio
-async def test_request_wrong_content_type(server: RemoteOpenAIServer):
+async def test_request_wrong_content_type(server):
 
     chat_input = [{"role": "user", "content": "Write a long story"}]
     client = server.get_async_client()
@@ -174,38 +89,44 @@ async def test_request_wrong_content_type(server: RemoteOpenAIServer):
         })
 
 
-@pytest.mark.parametrize(
-    "server_args",
-    [
-        pytest.param(["--enable-server-load-tracking"],
-                     id="enable-server-load-tracking")
-    ],
-    indirect=True,
-)
 @pytest.mark.asyncio
-async def test_server_load(server: RemoteOpenAIServer):
+async def test_server_load(server):
     # Check initial server load
     response = requests.get(server.url_for("load"))
     assert response.status_code == HTTPStatus.OK
-    assert response.json().get("server_load") == 0
+    initial_load = response.json().get("server_load")
+    print(f"Initial server load: {initial_load}")
+    assert initial_load == 0, \
+        f"Expected initial server_load to be 0, but got {initial_load}"
 
     def make_long_completion_request():
         return requests.post(
-            server.url_for("v1/completions"),
+            server.url_for("v1/chat/completions"),
             headers={"Content-Type": "application/json"},
             json={
-                "prompt": "Give me a long story",
-                "max_tokens": 1000,
-                "temperature": 0,
+                "model": MODEL_NAME,
+                "messages": [{
+                    "role": "user",
+                    "content": "Give me a very long story with many details"
+                }],
+                "max_tokens": 1000,
+                "temperature": 0,
+                "stream": True,
             },
+            stream=True,
         )
 
     # Start the completion request in a background thread.
     completion_future = asyncio.create_task(
         asyncio.to_thread(make_long_completion_request))
 
-    # Give a short delay to ensure the request has started.
-    await asyncio.sleep(0.1)
+    await asyncio.sleep(0.5)
 
     # Check server load while the completion request is running.
     response = requests.get(server.url_for("load"))
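
The reworked test_server_load now generates load with a streaming request to /v1/chat/completions rather than a plain text completion. For illustration only (not part of this commit), the same request could be driven through the async OpenAI client that RemoteOpenAIServer exposes via get_async_client(); the model name and message mirror the values used in the test above:

async def stream_long_chat(server):
    # Sketch of an equivalent load-generating request via the OpenAI async client
    client = server.get_async_client()
    stream = await client.chat.completions.create(
        model="microsoft/DialoGPT-small",
        messages=[{
            "role": "user",
            "content": "Give me a very long story with many details"
        }],
        max_tokens=1000,
        temperature=0,
        stream=True,
    )
    async for _chunk in stream:
        # Consume chunks so the request stays in flight while /load is polled
        pass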

tests/entrypoints/openai/test_chat_echo.py renamed to tests/entrypoints/openai/basic_tests/test_chat_echo.py

Lines changed: 1 addition & 18 deletions
@@ -7,25 +7,8 @@
 import pytest
 import pytest_asyncio
 
-from ...utils import RemoteOpenAIServer
-
 # # any model with a chat template should work here
-MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"
-
-
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        # use half precision for speed and memory savings in CI environment
-        "--dtype",
-        "float16",
-        "--enforce-eager",
-        "--max-model-len",
-        "4080",
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
+MODEL_NAME = "microsoft/DialoGPT-small"
 
 
 @pytest_asyncio.fixture

tests/entrypoints/openai/test_chat_logit_bias_validation.py renamed to tests/entrypoints/openai/basic_tests/test_chat_logit_bias_validation.py

Lines changed: 1 addition & 17 deletions
@@ -7,9 +7,7 @@
 
 from vllm.config import ModelConfig
 
-from ...utils import RemoteOpenAIServer
-
-MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
+MODEL_NAME = "microsoft/DialoGPT-small"
 
 
 def get_vocab_size(model_name):
@@ -21,20 +19,6 @@ def get_vocab_size(model_name):
     return config.get_vocab_size()
 
 
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--dtype",
-        "bfloat16",
-        "--max-model-len",
-        "1024",
-        "--enforce-eager",
-    ]
-
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
-
-
 @pytest_asyncio.fixture
 async def client(server):
     async with server.get_async_client() as async_client:

tests/entrypoints/openai/test_chat_template.py renamed to tests/entrypoints/openai/basic_tests/test_chat_template.py

Lines changed: 2 additions & 2 deletions
@@ -9,8 +9,8 @@
 from vllm.entrypoints.openai.protocol import ChatCompletionRequest
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...models.registry import HF_EXAMPLE_MODELS
-from ...utils import VLLM_PATH
+from ....models.registry import HF_EXAMPLE_MODELS
+from ....utils import VLLM_PATH
 
 chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
 assert chatml_jinja_path.exists()

tests/entrypoints/openai/test_cli_args.py renamed to tests/entrypoints/openai/basic_tests/test_cli_args.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@
 from vllm.entrypoints.openai.serving_models import LoRAModulePath
 from vllm.utils import FlexibleArgumentParser
 
-from ...utils import VLLM_PATH
+from ....utils import VLLM_PATH
 
 LORA_MODULE = {
     "name": "module2",

tests/entrypoints/openai/test_return_token_ids.py renamed to tests/entrypoints/openai/basic_tests/test_return_token_ids.py

Lines changed: 1 addition & 19 deletions
@@ -5,25 +5,7 @@
 
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from ...utils import RemoteOpenAIServer
-
-MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
-
-
-@pytest.fixture(scope="module")
-def server():
-    args = [
-        "--max-model-len",
-        "2048",
-        "--max-num-seqs",
-        "128",
-        "--enable-auto-tool-choice",
-        "--tool-call-parser",
-        "hermes",
-        "--enforce-eager",
-    ]
-    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
-        yield remote_server
+MODEL_NAME = "microsoft/DialoGPT-small"
 
 
 @pytest.mark.asyncio

tests/entrypoints/openai/test_serving_chat.py renamed to tests/entrypoints/openai/basic_tests/test_serving_chat.py

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@
                                                     OpenAIServingModels)
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-MODEL_NAME = "openai-community/gpt2"
+MODEL_NAME = "microsoft/DialoGPT-small"
 CHAT_TEMPLATE = "Dummy chat template for testing {}"
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
