Skip to content

Commit 3d320ed

Browse files
huachenheli authored and FeiDaLI committed
[Core] Add torch profiler CPU traces for AsyncLLM. (vllm-project#21794)
Signed-off-by: Chenheli Hua <[email protected]>
1 parent fde5698 commit 3d320ed

File tree

2 files changed

+35
-4
lines changed

2 files changed

+35
-4
lines changed

vllm/envs.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -667,8 +667,10 @@ def get_vllm_port() -> Optional[int]:
667667
"VLLM_LORA_RESOLVER_CACHE_DIR":
668668
lambda: os.getenv("VLLM_LORA_RESOLVER_CACHE_DIR", None),
669669

670-
# Enables torch profiler if set. Path to the directory where torch profiler
671-
# traces are saved. Note that it must be an absolute path.
670+
# Enables torch profiler if set.
671+
# Both AsyncLLM's CPU traces as well as workers'
672+
# traces (CPU & GPU) will be saved under this directory.
673+
# Note that it must be an absolute path.
672674
"VLLM_TORCH_PROFILER_DIR":
673675
lambda: (None if os.getenv("VLLM_TORCH_PROFILER_DIR", None) is None else os
674676
.path.expanduser(os.getenv("VLLM_TORCH_PROFILER_DIR", "."))),

vllm/v1/engine/async_llm.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
# SPDX-License-Identifier: Apache-2.0
22
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
33
import asyncio
4+
import os
5+
import socket
46
import time
57
from collections.abc import AsyncGenerator, Iterable, Mapping
68
from copy import copy
79
from typing import Any, Optional, Union
810

911
import numpy as np
12+
import torch
1013

1114
import vllm.envs as envs
1215
from vllm.config import ModelConfig, VllmConfig
@@ -144,6 +147,26 @@ def __init__(
144147
except RuntimeError:
145148
pass
146149

150+
if envs.VLLM_TORCH_PROFILER_DIR:
151+
logger.info(
152+
"Torch profiler enabled. AsyncLLM CPU traces will be collected under %s", # noqa: E501
153+
envs.VLLM_TORCH_PROFILER_DIR)
154+
worker_name = f"{socket.gethostname()}_{os.getpid()}.async_llm"
155+
self.profiler = torch.profiler.profile(
156+
activities=[
157+
torch.profiler.ProfilerActivity.CPU,
158+
],
159+
with_stack=envs.VLLM_TORCH_PROFILER_WITH_STACK,
160+
on_trace_ready=torch.profiler.tensorboard_trace_handler(
161+
envs.VLLM_TORCH_PROFILER_DIR,
162+
worker_name=worker_name,
163+
use_gzip=True))
164+
else:
165+
logger.info(
166+
"Torch profiler disabled. AsyncLLM CPU traces will not be collected." # noqa: E501
167+
)
168+
self.profiler = None
169+
147170
@classmethod
148171
@deprecate_kwargs(
149172
"disable_log_requests",
@@ -562,10 +585,16 @@ async def check_health(self) -> None:
562585
raise self.dead_error
563586

564587
async def start_profile(self) -> None:
565-
await self.engine_core.profile_async(True)
588+
coros = [self.engine_core.profile_async(True)]
589+
if self.profiler is not None:
590+
coros.append(asyncio.to_thread(self.profiler.start))
591+
await asyncio.gather(*coros)
566592

567593
async def stop_profile(self) -> None:
568-
await self.engine_core.profile_async(False)
594+
coros = [self.engine_core.profile_async(False)]
595+
if self.profiler is not None:
596+
coros.append(asyncio.to_thread(self.profiler.stop))
597+
await asyncio.gather(*coros)
569598

570599
async def reset_mm_cache(self) -> None:
571600
self.processor.mm_registry.reset_processor_cache(self.model_config)

0 commit comments

Comments (0)