
Commit 01157ad

markmc authored and eicherseiji committed

[Metrics] Deprecate TPOT in favor of ITL (vllm-project#24110)

Signed-off-by: Mark McLoughlin <[email protected]>

1 parent cde793a · commit 01157ad

File tree

7 files changed: +64, -26 lines changed


examples/online_serving/prometheus_grafana/grafana.json

Lines changed: 6 additions & 6 deletions
@@ -402,7 +402,7 @@
       },
       "disableTextWrap": false,
       "editorMode": "builder",
-      "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+      "expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
       "fullMetaSearch": false,
       "includeNullMetadata": false,
       "instant": false,
@@ -418,7 +418,7 @@
       },
       "disableTextWrap": false,
       "editorMode": "builder",
-      "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+      "expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
       "fullMetaSearch": false,
       "hide": false,
       "includeNullMetadata": false,
@@ -435,7 +435,7 @@
       },
       "disableTextWrap": false,
       "editorMode": "builder",
-      "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+      "expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
       "fullMetaSearch": false,
       "hide": false,
       "includeNullMetadata": false,
@@ -452,7 +452,7 @@
       },
       "disableTextWrap": false,
       "editorMode": "builder",
-      "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
+      "expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:inter_token_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
       "fullMetaSearch": false,
       "hide": false,
       "includeNullMetadata": false,
@@ -468,15 +468,15 @@
         "uid": "${DS_PROMETHEUS}"
       },
       "editorMode": "code",
-      "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
+      "expr": "rate(vllm:inter_token_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:inter_token_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
       "hide": false,
       "instant": false,
       "legendFormat": "Mean",
       "range": true,
       "refId": "E"
     }
   ],
-  "title": "Time Per Output Token Latency",
+  "title": "Inter Token Latency",
   "type": "timeseries"
 },
 {
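
Not part of this commit, but for orientation: a minimal Python sketch that issues the same p99 query the updated dashboard panel now uses, against Prometheus's standard /api/v1/query HTTP endpoint. The Prometheus address and model_name value are placeholders, and Grafana's $__rate_interval is replaced by a fixed 5m window.

# Assumptions: a reachable Prometheus server and the `requests` package.
import requests

PROMETHEUS_URL = "http://localhost:9090"  # placeholder Prometheus address
MODEL_NAME = "my-model"                   # placeholder model_name label value

query = (
    'histogram_quantile(0.99, sum by(le) '
    '(rate(vllm:inter_token_latency_seconds_bucket'
    f'{{model_name="{MODEL_NAME}"}}[5m])))'
)

resp = requests.get(f"{PROMETHEUS_URL}/api/v1/query", params={"query": query})
resp.raise_for_status()
for sample in resp.json()["data"]["result"]:
    # Each result value is [timestamp, "p99 inter-token latency in seconds"].
    print(sample["metric"], sample["value"])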

tests/entrypoints/openai/test_metrics.py

Lines changed: 15 additions & 7 deletions
@@ -250,12 +250,15 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_params_max_tokens_sum",
     "vllm:request_params_max_tokens_bucket",
     "vllm:request_params_max_tokens_count",
-    "vllm:time_to_first_token_seconds_sum",
-    "vllm:time_to_first_token_seconds_bucket",
-    "vllm:time_to_first_token_seconds_count",
     "vllm:time_per_output_token_seconds_sum",
     "vllm:time_per_output_token_seconds_bucket",
     "vllm:time_per_output_token_seconds_count",
+    "vllm:time_to_first_token_seconds_sum",
+    "vllm:time_to_first_token_seconds_bucket",
+    "vllm:time_to_first_token_seconds_count",
+    "vllm:inter_token_latency_seconds_sum",
+    "vllm:inter_token_latency_seconds_bucket",
+    "vllm:inter_token_latency_seconds_count",
     "vllm:e2e_request_latency_seconds_sum",
     "vllm:e2e_request_latency_seconds_bucket",
     "vllm:e2e_request_latency_seconds_count",
@@ -273,7 +276,11 @@ async def test_metrics_counts(server: RemoteOpenAIServer,
     "vllm:request_decode_time_seconds_count",
 ]

-HIDDEN_DEPRECATED_METRICS: list[str] = []
+HIDDEN_DEPRECATED_METRICS: list[str] = [
+    "vllm:time_per_output_token_seconds_sum",
+    "vllm:time_per_output_token_seconds_bucket",
+    "vllm:time_per_output_token_seconds_count",
+]


 @pytest.mark.asyncio
@@ -289,9 +296,10 @@ async def test_metrics_exist(server: RemoteOpenAIServer,
     assert response.status_code == HTTPStatus.OK

     for metric in (EXPECTED_METRICS_V1 if use_v1 else EXPECTED_METRICS):
-        if (not server.show_hidden_metrics
-                and metric not in HIDDEN_DEPRECATED_METRICS):
-            assert metric in response.text
+        if (metric in HIDDEN_DEPRECATED_METRICS
+                and not server.show_hidden_metrics):
+            continue
+        assert metric in response.text


 @pytest.mark.asyncio
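
The test change means the deprecated TPOT histogram names are only asserted when the server exposes hidden metrics. As a rough standalone check (assumptions: a vLLM server on localhost:8000 plus the requests and prometheus_client packages), one could scrape /metrics and see which of the two histogram families is actually exported:

import requests
from prometheus_client.parser import text_string_to_metric_families

METRICS_URL = "http://localhost:8000/metrics"  # placeholder server address

text = requests.get(METRICS_URL).text
# prometheus_client reports each histogram family under its base name,
# without the _bucket/_sum/_count suffixes.
families = {family.name for family in text_string_to_metric_families(text)}

print("new ITL histogram exported:",
      "vllm:inter_token_latency_seconds" in families)
print("deprecated TPOT histogram exported:",
      "vllm:time_per_output_token_seconds" in families)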

vllm/engine/llm_engine.py

Lines changed: 4 additions & 4 deletions
@@ -1414,7 +1414,7 @@ def _get_stats(self,
         num_generation_tokens_iter = 0
         num_tokens_iter = 0
         time_to_first_tokens_iter: List[float] = []
-        time_per_output_tokens_iter: List[float] = []
+        inter_token_latencies_iter: List[float] = []
         num_preemption_iter = (0 if scheduler_outputs is None else
                                scheduler_outputs.preempted)

@@ -1498,9 +1498,9 @@ def _get_stats(self,
                     num_generation_tokens_from_prefill_groups += (
                         seq_group.num_seqs())
                 else:
-                    # TPOTs.
+                    # ITLs
                     latency = seq_group.get_last_token_latency()
-                    time_per_output_tokens_iter.append(latency)
+                    inter_token_latencies_iter.append(latency)
                     if seq_group.state.current_step == 0:
                         # For async_output_proc, the do_log_stats()
                         # is called following init_multi_step(), which
@@ -1582,7 +1582,7 @@ def _get_stats(self,
             num_generation_tokens_iter=num_generation_tokens_iter,
             num_tokens_iter=num_tokens_iter,
             time_to_first_tokens_iter=time_to_first_tokens_iter,
-            time_per_output_tokens_iter=time_per_output_tokens_iter,
+            inter_token_latencies_iter=inter_token_latencies_iter,
             num_preemption_iter=num_preemption_iter,

             # Request stats

vllm/engine/metrics.py

Lines changed: 16 additions & 2 deletions
@@ -113,9 +113,21 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
                 0.75, 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0, 160.0, 640.0,
                 2560.0
             ])
+        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
+        # TODO: in 0.12, only enable if show_hidden_metrics=True
         self.histogram_time_per_output_token = self._histogram_cls(
             name="vllm:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
+            documentation=(
+                "Histogram of time per output token in seconds."
+                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
+            labelnames=labelnames,
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ])
+        self.histogram_inter_token_latency = self._histogram_cls(
+            name="vllm:inter_token_latency_seconds",
+            documentation="Histogram of inter token latency in seconds.",
             labelnames=labelnames,
             buckets=[
                 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
@@ -491,7 +503,9 @@ def _log_prometheus(self, stats: Stats) -> None:
         self._log_histogram(self.metrics.histogram_time_to_first_token,
                             stats.time_to_first_tokens_iter)
         self._log_histogram(self.metrics.histogram_time_per_output_token,
-                            stats.time_per_output_tokens_iter)
+                            stats.inter_token_latencies_iter)
+        self._log_histogram(self.metrics.histogram_inter_token_latency,
+                            stats.inter_token_latencies_iter)

         # Request level data
         # Latency
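
During the deprecation window, the deprecated TPOT histogram and the new ITL histogram are fed the same per-iteration latency samples. A minimal sketch of that dual-emission pattern, written directly against prometheus_client rather than vLLM's _histogram_cls/_log_histogram wrappers; the metric names and buckets below are illustrative placeholders, not the project's exact values:

from prometheus_client import Histogram

ITL_BUCKETS = [0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0]

histogram_inter_token_latency = Histogram(
    "demo_inter_token_latency_seconds",    # placeholder metric name
    "Histogram of inter token latency in seconds.",
    buckets=ITL_BUCKETS)
histogram_time_per_output_token = Histogram(
    "demo_time_per_output_token_seconds",  # placeholder deprecated alias
    "DEPRECATED: use the inter token latency histogram instead.",
    buckets=ITL_BUCKETS)

def log_inter_token_latencies(latencies: list[float]) -> None:
    """Record each sample in both the new and the deprecated histogram."""
    for latency in latencies:
        histogram_inter_token_latency.observe(latency)
        histogram_time_per_output_token.observe(latency)

log_inter_token_latencies([0.031, 0.029, 0.042])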

vllm/engine/metrics_types.py

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ class Stats:
     num_generation_tokens_iter: int
     num_tokens_iter: int
     time_to_first_tokens_iter: List[float]
-    time_per_output_tokens_iter: List[float]
+    inter_token_latencies_iter: List[float]
     num_preemption_iter: int

     # Request stats (should have _requests suffix)

vllm/v1/metrics/loggers.py

Lines changed: 19 additions & 3 deletions
@@ -377,9 +377,13 @@ def __init__(self,
         self.histogram_time_to_first_token = make_per_engine(
             histogram_time_to_first_token, engine_indexes, model_name)

+        # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
+        # TODO: in 0.12, only enable if show_hidden_metrics=True
         histogram_time_per_output_token = self._histogram_cls(
             name="vllm:time_per_output_token_seconds",
-            documentation="Histogram of time per output token in seconds.",
+            documentation=(
+                "Histogram of time per output token in seconds."
+                "DEPRECATED: Use vllm:inter_token_latency_seconds instead."),
             buckets=[
                 0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
                 1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
@@ -388,6 +392,17 @@ def __init__(self,
         self.histogram_time_per_output_token = make_per_engine(
             histogram_time_per_output_token, engine_indexes, model_name)

+        histogram_inter_token_latency = self._histogram_cls(
+            name="vllm:inter_token_latency_seconds",
+            documentation="Histogram of inter-token latency in seconds.",
+            buckets=[
+                0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 0.75,
+                1.0, 2.5, 5.0, 7.5, 10.0, 20.0, 40.0, 80.0
+            ],
+            labelnames=labelnames)
+        self.histogram_inter_token_latency = make_per_engine(
+            histogram_inter_token_latency, engine_indexes, model_name)
+
         request_latency_buckets = [
             0.3, 0.5, 0.8, 1.0, 1.5, 2.0, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0,
             40.0, 50.0, 60.0, 120.0, 240.0, 480.0, 960.0, 1920.0, 7680.0
@@ -537,8 +552,9 @@ def record(self,
             self.histogram_n_request[engine_idx].observe(n_param)
         for ttft in iteration_stats.time_to_first_tokens_iter:
             self.histogram_time_to_first_token[engine_idx].observe(ttft)
-        for tpot in iteration_stats.time_per_output_tokens_iter:
-            self.histogram_time_per_output_token[engine_idx].observe(tpot)
+        for itl in iteration_stats.inter_token_latencies_iter:
+            self.histogram_inter_token_latency[engine_idx].observe(itl)
+            self.histogram_time_per_output_token[engine_idx].observe(itl)

         for finished_request in iteration_stats.finished_requests:
             self.counter_request_success[

vllm/v1/metrics/stats.py

Lines changed: 3 additions & 3 deletions
@@ -96,7 +96,7 @@ def __init__(self):
         self.max_num_generation_tokens_iter: list[int] = []
         self.n_params_iter: list[int] = []
         self.time_to_first_tokens_iter: list[float] = []
-        self.time_per_output_tokens_iter: list[float] = []
+        self.inter_token_latencies_iter: list[float] = []
         self.waiting_lora_adapters: dict[str, int] = {}
         self.running_lora_adapters: dict[str, int] = {}

@@ -128,8 +128,8 @@ def update_from_output(self, output: "EngineCoreOutput",
         if is_prefilling:
             req_stats.first_token_ts = engine_core_timestamp
         else:
-            tpot = engine_core_timestamp - req_stats.last_token_ts
-            self.time_per_output_tokens_iter.append(tpot)
+            itl = engine_core_timestamp - req_stats.last_token_ts
+            self.inter_token_latencies_iter.append(itl)

         req_stats.last_token_ts = engine_core_timestamp
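
In the V1 stats path, each ITL sample is simply the gap between consecutive token timestamps for a request, with last_token_ts advanced on every output token. A self-contained sketch of that bookkeeping (names simplified and hypothetical; the real logic lives in IterationStats.update_from_output and uses engine-core timestamps):

import time
from dataclasses import dataclass, field

@dataclass
class RequestTimes:
    first_token_ts: float = 0.0
    last_token_ts: float = 0.0

@dataclass
class IterationStatsSketch:
    inter_token_latencies_iter: list[float] = field(default_factory=list)

    def on_token(self, req: RequestTimes, now: float,
                 is_prefilling: bool) -> None:
        if is_prefilling:
            # First token of the request: establishes the TTFT reference
            # point and contributes no inter-token latency sample.
            req.first_token_ts = now
        else:
            # Every later token contributes one inter-token latency sample.
            self.inter_token_latencies_iter.append(now - req.last_token_ts)
        req.last_token_ts = now

# Example: one prefill step followed by three decode steps ~30 ms apart.
stats, req = IterationStatsSketch(), RequestTimes()
base = time.monotonic()
for step, prefill in enumerate([True, False, False, False]):
    stats.on_token(req, now=base + 0.03 * step, is_prefilling=prefill)
print(stats.inter_token_latencies_iter)  # roughly [0.03, 0.03, 0.03]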
