@@ -377,9 +377,13 @@ def __init__(self,
377
377
self .histogram_time_to_first_token = make_per_engine (
378
378
histogram_time_to_first_token , engine_indexes , model_name )
379
379
380
+ # Deprecated in 0.11 - Renamed as vllm:inter_token_latency_seconds
381
+ # TODO: in 0.12, only enable if show_hidden_metrics=True
380
382
histogram_time_per_output_token = self ._histogram_cls (
381
383
name = "vllm:time_per_output_token_seconds" ,
382
- documentation = "Histogram of time per output token in seconds." ,
384
+ documentation = (
385
+ "Histogram of time per output token in seconds. "
386
+ "DEPRECATED: Use vllm:inter_token_latency_seconds instead." ),
383
387
buckets = [
384
388
0.01 , 0.025 , 0.05 , 0.075 , 0.1 , 0.15 , 0.2 , 0.3 , 0.4 , 0.5 , 0.75 ,
385
389
1.0 , 2.5 , 5.0 , 7.5 , 10.0 , 20.0 , 40.0 , 80.0
@@ -388,6 +392,17 @@ def __init__(self,
388
392
self .histogram_time_per_output_token = make_per_engine (
389
393
histogram_time_per_output_token , engine_indexes , model_name )
390
394
395
+ histogram_inter_token_latency = self ._histogram_cls (
396
+ name = "vllm:inter_token_latency_seconds" ,
397
+ documentation = "Histogram of inter-token latency in seconds." ,
398
+ buckets = [
399
+ 0.01 , 0.025 , 0.05 , 0.075 , 0.1 , 0.15 , 0.2 , 0.3 , 0.4 , 0.5 , 0.75 ,
400
+ 1.0 , 2.5 , 5.0 , 7.5 , 10.0 , 20.0 , 40.0 , 80.0
401
+ ],
402
+ labelnames = labelnames )
403
+ self .histogram_inter_token_latency = make_per_engine (
404
+ histogram_inter_token_latency , engine_indexes , model_name )
405
+
391
406
request_latency_buckets = [
392
407
0.3 , 0.5 , 0.8 , 1.0 , 1.5 , 2.0 , 2.5 , 5.0 , 10.0 , 15.0 , 20.0 , 30.0 ,
393
408
40.0 , 50.0 , 60.0 , 120.0 , 240.0 , 480.0 , 960.0 , 1920.0 , 7680.0
@@ -537,8 +552,9 @@ def record(self,
537
552
self .histogram_n_request [engine_idx ].observe (n_param )
538
553
for ttft in iteration_stats .time_to_first_tokens_iter :
539
554
self .histogram_time_to_first_token [engine_idx ].observe (ttft )
540
- for tpot in iteration_stats .time_per_output_tokens_iter :
541
- self .histogram_time_per_output_token [engine_idx ].observe (tpot )
555
+ for itl in iteration_stats .inter_token_latencies_iter :
556
+ self .histogram_inter_token_latency [engine_idx ].observe (itl )
557
+ self .histogram_time_per_output_token [engine_idx ].observe (itl )
542
558
543
559
for finished_request in iteration_stats .finished_requests :
544
560
self .counter_request_success [
0 commit comments