
Commit 544a5f7

Zerohertz authored and hmellor committed
[Docs] Fix warnings in mkdocs build (vllm-project#23649)
Signed-off-by: Zerohertz <[email protected]>
Signed-off-by: Hyogeun Oh (오효근) <[email protected]>
Signed-off-by: Harry Mellor <[email protected]>
Co-authored-by: Harry Mellor <[email protected]>
1 parent 400300c commit 544a5f7

File tree

14 files changed: +66 −58 lines changed


vllm/attention/backends/differential_flash_attn.py

Lines changed: 9 additions & 5 deletions
@@ -805,14 +805,18 @@ def forward(
         """Forward pass with FlashAttention.
 
         Args:
-            query: shape = [num_tokens, num_heads, head_size]
-            key: shape = [num_tokens, num_kv_heads, head_size]
-            value: shape = [num_tokens, num_kv_heads, head_size]
-            output: shape = [num_tokens, num_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            layer: Attention layer instance.
+            q: Query tensor with shape = [num_tokens, num_heads, head_size]
+            k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
+            v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size, num_kv_heads, head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
+            output: Output tensor with shape [num_tokens, num_heads, head_size]
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
                 NOTE: It in-place updates the output tensor.
                 NOTE: FP8 quantization, flash-attn expect the size of
                 {q,k,v}_descale to be (num_sequences, num_kv_heads).
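Most of these warnings are griffe/mkdocstrings complaints about Google-style `Args:` entries that do not match the actual signature (`query` documented while the parameter is `q`, `layer` left undocumented, `kv_cache =` instead of `kv_cache:`). A minimal sketch of the pattern the fix moves toward, using a hypothetical function rather than the real vLLM backend:

```python
import torch


def forward(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
            output: torch.Tensor) -> torch.Tensor:
    """Toy forward pass; every Args entry names a real parameter.

    Args:
        q: Query tensor with shape = [num_tokens, num_heads, head_size]
        k: Key tensor with shape = [num_tokens, num_kv_heads, head_size]
        v: Value tensor with shape = [num_tokens, num_kv_heads, head_size]
        output: Output tensor with shape [num_tokens, num_heads, head_size].
            NOTE: It is updated in place.
    """
    # Placeholder body standing in for the attention kernel.
    output.copy_(q)
    return output
```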

vllm/attention/backends/flash_attn.py

Lines changed: 3 additions & 2 deletions
@@ -605,7 +605,8 @@ def forward(
             key: shape = [num_tokens, num_kv_heads, head_size]
             value: shape = [num_tokens, num_kv_heads, head_size]
             output: shape = [num_tokens, num_heads, head_size]
-            kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size, num_kv_heads, head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
@@ -850,7 +851,7 @@ def forward(
 
 
 def _get_query_key_seq_metadata(
-        attn_metadata,
+        attn_metadata: FlashAttentionMetadata,
         is_prompt: bool,
         attn_type: str,
 ) -> tuple:

vllm/attention/backends/rocm_flash_attn.py

Lines changed: 6 additions & 5 deletions
@@ -584,17 +584,18 @@ def forward(
         use prefill sequence attributes
 
         Args:
+            layer: Attention layer instance.
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size * num_kv_heads * head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
-            attn_type: Select attention type, between encoder attention,
-                decoder self-attention, or encoder/decoder cross-
-                attention. Defaults to decoder self-attention,
-                which is the vLLM default generally
+            output: Optional output tensor.
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """

vllm/attention/backends/utils.py

Lines changed: 1 addition & 1 deletion
@@ -561,7 +561,7 @@ def get_num_prefill_decode_query_kv_tokens(
 
     Raises:
         AssertionError: If the number of encoder tokens in `attn_metadata`
-        is `None` when required for the calculations.
+            is `None` when required for the calculations.
     """
     num_prefill_query_tokens = 0
     num_decode_query_tokens = 0
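The change in this hunk appears to be indentation only: the continuation of the `AssertionError` entry is pushed one level deeper so that Google-style docstring parsers treat it as part of the same `Raises:` item rather than a stray line. A small illustrative sketch, with a hypothetical helper name and attribute:

```python
def count_encoder_tokens(attn_metadata) -> int:
    """Hypothetical helper showing continuation-line indentation.

    Raises:
        AssertionError: If the number of encoder tokens in `attn_metadata`
            is `None` when required for the calculations.
    """
    assert attn_metadata.num_encoder_tokens is not None
    return attn_metadata.num_encoder_tokens
```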

vllm/attention/backends/xformers.py

Lines changed: 6 additions & 6 deletions
@@ -471,17 +471,18 @@ def forward(
                                    max_encoder_seq_len)
 
         Args:
+            layer: Attention layer instance.
             query: shape = [num_tokens, num_heads * head_size]
             key: shape = [num_tokens, num_kv_heads * head_size]
             value: shape = [num_tokens, num_kv_heads * head_size]
-            kv_cache = [2, num_blocks, block_size * num_kv_heads * head_size]
+            kv_cache: KV cache tensor with shape
+                [2, num_blocks, block_size * num_kv_heads * head_size].
                 NOTE: kv_cache will be an empty tensor with shape [0]
                 for profiling run.
             attn_metadata: Metadata for attention.
-            attn_type: Select attention type, between encoder attention,
-                decoder self-attention, or encoder/decoder cross-
-                attention. Defaults to decoder self-attention,
-                which is the vLLM default generally
+            output: Optional output tensor.
+            output_scale: Optional output scale tensor.
+            output_block_scale: Optional output block scale tensor.
         Returns:
             shape = [num_tokens, num_heads * head_size]
         """
@@ -644,7 +645,6 @@ def _run_memory_efficient_xformers_forward(
         for API spec.
 
         Args:
-            output: shape = [num_prefill_tokens, num_heads, head_size]
             query: shape = [num_prefill_tokens, num_heads, head_size]
             key: shape = [num_prefill_tokens, num_kv_heads, head_size]
             value: shape = [num_prefill_tokens, num_kv_heads, head_size]

vllm/core/block_manager.py

Lines changed: 3 additions & 5 deletions
@@ -352,7 +352,7 @@ def can_swap_in(self, seq_group: SequenceGroup,
         with num_lookahead_slots.
 
         Args:
-            sequence_group (SequenceGroup): The sequence group to swap in.
+            seq_group (SequenceGroup): The sequence group to swap in.
             num_lookahead_slots (int): Number of lookahead slots used in
                 speculative decoding, default to 0.
 
@@ -405,8 +405,6 @@ def can_swap_out(self, seq_group: SequenceGroup) -> bool:
 
         Args:
             seq_group (SequenceGroup): The sequence group to swap out.
-            num_lookahead_slots (int): Number of lookahead slots used in
-                speculative decoding, default to 0.
 
         Returns:
             bool: Whether it's possible to swap out current sequence group.
@@ -420,7 +418,7 @@ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         swapping out the given sequence_group with num_lookahead_slots.
 
         Args:
-            sequence_group (SequenceGroup): The sequence group to swap out.
+            seq_group (SequenceGroup): The sequence group to swap out.
 
         Returns:
             List[Tuple[int, int]]: The mapping of swapping block from
@@ -473,7 +471,7 @@ def _can_swap(self,
         on to the 'device'.
 
         Args:
-            sequence_group (SequenceGroup): The sequence group to swap in/out.
+            seq_group (SequenceGroup): The sequence group to swap in/out.
             device (Device): device to swap the 'seq_group' on.
             status (SequenceStatus): The status of sequence which is needed
                 for action. RUNNING for swap out and SWAPPED for swap in

vllm/engine/async_llm_engine.py

Lines changed: 2 additions & 2 deletions
@@ -486,10 +486,10 @@ class AsyncLLMEngine(EngineClient):
     _engine_class: Type[_AsyncLLMEngine] = _AsyncLLMEngine
 
     def __init__(self,
-                 *args,
+                 *args: Any,
                  log_requests: bool = True,
                  start_engine_loop: bool = True,
-                 **kwargs) -> None:
+                 **kwargs: Any) -> None:
         if envs.VLLM_USE_V1:
             raise ValueError(
                 "Using V0 AsyncLLMEngine, but envs.VLLM_USE_V1=True. "

vllm/engine/llm_engine.py

Lines changed: 4 additions & 4 deletions
@@ -644,10 +644,10 @@ def add_request(
         Details:
             - Set arrival_time to the current time if it is None.
             - Set prompt_token_ids to the encoded prompt if it is None.
-            - Create `n` number of [Sequence][vllm.Sequence] objects.
-            - Create a [SequenceGroup][vllm.SequenceGroup] object
-              from the list of [Sequence][vllm.Sequence].
-            - Add the [SequenceGroup][vllm.SequenceGroup] object to the
+            - Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
+            - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
+              from the list of [Sequence][vllm.sequence.Sequence].
+            - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to the
               scheduler.
 
         Example:
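The `[text][identifier]` form is the mkdocs-autorefs cross-reference syntax that mkdocstrings resolves against documented object paths; an identifier it cannot resolve (here the shorthand `vllm.Sequence` instead of the module path `vllm.sequence.Sequence`) is what shows up as an unresolved-reference warning in the build log. The same rule explains the `vllm/entrypoints/llm.py` hunks below, where the bare `[generate][]` and `[sleep][]` references are rewritten with the fully qualified targets `vllm.LLM.generate` and `vllm.LLM.sleep`. A hedged sketch of the corrected style, using a stand-in class:

```python
class _Engine:
    """Hypothetical engine used only to illustrate cross-reference syntax."""

    def add_request(self, prompt: str) -> None:
        """Add a request to the engine.

        Details:
            - Create `n` number of [Sequence][vllm.sequence.Sequence] objects.
            - Create a [SequenceGroup][vllm.sequence.SequenceGroup] object
              from the list of [Sequence][vllm.sequence.Sequence].
            - Add the [SequenceGroup][vllm.sequence.SequenceGroup] object to
              the scheduler.
        """
```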

vllm/entrypoints/llm.py

Lines changed: 5 additions & 5 deletions
@@ -186,7 +186,7 @@ def __init__(
                                               CompilationConfig]] = None,
         logits_processors: Optional[list[Union[str,
                                                type[LogitsProcessor]]]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         """LLM constructor."""
 
@@ -697,8 +697,8 @@ def chat(
         Generate responses for a chat conversation.
 
         The chat conversation is converted into a text prompt using the
-        tokenizer and calls the [generate][] method to generate the
-        responses.
+        tokenizer and calls the [generate][vllm.LLM.generate] method to generate
+        the responses.
 
         Multi-modal inputs can be passed in the same way you would pass them
         to the OpenAI API.
@@ -1334,8 +1334,8 @@ def sleep(self, level: int = 1):
 
     def wake_up(self, tags: Optional[list[str]] = None):
         """
-        Wake up the engine from sleep mode. See the [sleep][] method
-        for more details.
+        Wake up the engine from sleep mode. See the [sleep][vllm.LLM.sleep]
+        method for more details.
 
         Args:
             tags: An optional list of tags to reallocate the engine memory

vllm/entrypoints/openai/tool_parsers/minimax_tool_parser.py

Lines changed: 2 additions & 1 deletion
@@ -461,7 +461,8 @@ def _find_tool_boundaries(self, text: str) -> list[tuple[int, int]]:
             i += 1
         return boundaries
 
-    def _extract_tool_args(self, tool_content: str, args_match) -> str:
+    def _extract_tool_args(self, tool_content: str,
+                           args_match: re.Match[str]) -> str:
         """
         Extract tool arguments from tool content.
 
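`re.Match` is subscriptable as `re.Match[str]` on Python 3.9+, so the previously untyped `args_match` parameter can be annotated directly. A small, self-contained sketch; the regex and helper are illustrative, not the parser's actual pattern:

```python
import re


def extract_tool_args(tool_content: str, args_match: re.Match[str]) -> str:
    """Hypothetical helper: return the captured arguments from a regex match."""
    # Group 1 is assumed to capture the JSON arguments blob.
    return args_match.group(1).strip()


text = '{"name": "get_weather", "arguments": {"city": "Seoul"}}'
match = re.search(r'"arguments":\s*(\{.*?\})', text)
if match is not None:
    print(extract_tool_args(text, match))  # {"city": "Seoul"}
```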
