14
14
from vllm .v1 .sample .metadata import SamplingMetadata
15
15
from vllm .v1 .worker .block_table import BlockTable
16
16
17
# Threshold for treating a request's min_p as effectively disabled:
# only requests with min_p strictly above this epsilon are tracked in
# min_p_reqs (guards against float noise around min_p == 0.0).
_SAMPLING_EPS = 1e-5
18
+
17
19
if TYPE_CHECKING :
18
20
from vllm .multimodal .inputs import PlaceholderRange
19
21
@@ -120,6 +122,16 @@ def __init__(
120
122
self .top_k_cpu = self .top_k_cpu_tensor .numpy ()
121
123
self .top_k_reqs : Set [str ] = set ()
122
124
125
+ self .min_p = torch .empty ((max_num_reqs , ),
126
+ dtype = torch .float32 ,
127
+ device = device )
128
+ self .min_p_cpu_tensor = torch .empty ((max_num_reqs , ),
129
+ dtype = torch .float32 ,
130
+ device = "cpu" ,
131
+ pin_memory = pin_memory )
132
+ self .min_p_cpu = self .min_p_cpu_tensor .numpy ()
133
+ self .min_p_reqs : Set [str ] = set ()
134
+
123
135
# Frequency penalty related data structures
124
136
self .frequency_penalties = torch .empty ((max_num_reqs , ),
125
137
dtype = torch .float ,
@@ -223,8 +235,11 @@ def add_request(
223
235
self .top_k_cpu [req_index ] = sampling_params .top_k
224
236
if sampling_params .top_k > 0 :
225
237
self .top_k_reqs .add (req_id )
238
+ self .min_p_cpu [req_index ] = sampling_params .min_p
226
239
self .frequency_penalties_cpu [
227
240
req_index ] = sampling_params .frequency_penalty
241
+ if sampling_params .min_p > _SAMPLING_EPS :
242
+ self .min_p_reqs .add (req_id )
228
243
if sampling_params .frequency_penalty != 0.0 :
229
244
self .frequency_penalties_reqs .add (req_id )
230
245
self .presence_penalties_cpu [
@@ -273,6 +288,7 @@ def remove_request(self, req_id: str) -> Optional[int]:
273
288
self .random_reqs .discard (req_id )
274
289
self .top_p_reqs .discard (req_id )
275
290
self .top_k_reqs .discard (req_id )
291
+ self .min_p_reqs .discard (req_id )
276
292
self .frequency_penalties_reqs .discard (req_id )
277
293
self .presence_penalties_reqs .discard (req_id )
278
294
self .repetition_penalties_reqs .discard (req_id )
@@ -299,6 +315,7 @@ def clear(self) -> None:
299
315
self .random_reqs .clear ()
300
316
self .top_p_reqs .clear ()
301
317
self .top_k_reqs .clear ()
318
+ self .min_p_reqs .clear ()
302
319
self .frequency_penalties_reqs .clear ()
303
320
self .presence_penalties_reqs .clear ()
304
321
self .repetition_penalties_reqs .clear ()
@@ -354,6 +371,7 @@ def condense(self, empty_req_indices: List[int]) -> None:
354
371
empty_index ] = self .presence_penalties_cpu [last_req_index ]
355
372
self .repetition_penalties_cpu [
356
373
empty_index ] = self .repetition_penalties_cpu [last_req_index ]
374
+ self .min_p_cpu [empty_index ] = self .min_p_cpu [last_req_index ]
357
375
self .min_tokens [empty_index ] = self .min_tokens [last_req_index ]
358
376
self .stop_token_ids [empty_index ] = self .stop_token_ids [
359
377
last_req_index ]
@@ -381,6 +399,8 @@ def make_sampling_metadata(
381
399
self .top_p_cpu_tensor [:self .num_reqs ], non_blocking = True )
382
400
self .top_k [:self .num_reqs ].copy_ (
383
401
self .top_k_cpu_tensor [:self .num_reqs ], non_blocking = True )
402
+ self .min_p [:self .num_reqs ].copy_ (
403
+ self .min_p_cpu_tensor [:self .num_reqs ], non_blocking = True )
384
404
if not self .no_penalties :
385
405
# Since syncing these tensors is expensive only copy them
386
406
# if necessary i.e. if there are requests which require
@@ -421,6 +441,8 @@ def make_sampling_metadata(
421
441
all_random = self .all_random ,
422
442
top_p = self .top_p [:self .num_reqs ],
423
443
top_k = self .top_k [:self .num_reqs ],
444
+ min_p = self .min_p [:self .num_reqs ],
445
+ no_min_p = self .no_min_p ,
424
446
no_top_p = self .no_top_p ,
425
447
no_top_k = self .no_top_k ,
426
448
generators = self .generators ,
@@ -497,6 +519,10 @@ def no_top_p(self) -> bool:
497
519
def no_top_k (self ) -> bool :
498
520
return len (self .top_k_reqs ) == 0
499
521
522
@property
def no_min_p(self) -> bool:
    """True when no request in the batch has min-p sampling enabled."""
    # An empty tracking set means min-p filtering can be skipped entirely.
    return not self.min_p_reqs
525
+
500
526
@property
501
527
def no_penalties (self ) -> bool :
502
528
return (len (self .presence_penalties_reqs ) == 0
0 commit comments