2 changes: 1 addition & 1 deletion tests/core/test_scheduler.py
@@ -641,7 +641,7 @@ def test_schedule_decode_blocks_to_copy_update():
 # Nothing is preempted.
 assert output.blocks_to_swap_out == []
 # Since append_slot returns the source -> dist mapping, it should
-# applied.
+# be applied.
 assert output.blocks_to_copy == [(2, 3)]

@@ -32,7 +32,7 @@ def to_bytes(y, sr):

 async def transcribe_audio(client, tokenizer, y, sr):
 # Send loaded audio directly instead of loading from disk,
-# dont account for that time though
+# don't account for that time though
 with to_bytes(y, sr) as f:
 start_time = time.perf_counter()
 transcription = await client.audio.transcriptions.create(

2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_return_token_ids.py
@@ -224,7 +224,7 @@ async def test_comparison_with_prompt_logprobs_and_logprobs(server):
 logprobs_token_ids.append(token_id)

 # When echo=True, the logprobs include both prompt and response tokens
-# The token_ids field should match the the suffix of response portion
+# The token_ids field should match the suffix of response portion
 # The prompt_token_ids should match the prompt portion
 assert len(completion.choices[0].token_ids) < len(logprobs_token_ids)
 response_token_ids_length = len(completion.choices[0].token_ids)

2 changes: 1 addition & 1 deletion tests/entrypoints/openai/test_serving_chat.py
@@ -313,7 +313,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
 }],
 )

-# By default cache_salt in the engine prompt is not set
+# By default, cache_salt in the engine prompt is not set
 with suppress(Exception):
 await serving_chat.create_chat_completion(req)
 assert "cache_salt" not in mock_engine.generate.call_args.args[0]

2 changes: 1 addition & 1 deletion tests/kernels/utils.py
@@ -1236,7 +1236,7 @@ def baseline_scaled_mm(a: torch.Tensor,
 bias: Optional[torch.Tensor] = None) -> torch.Tensor:

 # We treat N-dimensional group scaling as extended numpy-style broadcasting
-# in numpy simply stretches dimensions with an extent of 1 to match the
+# in numpy simply stretches dimensions with an extent of 1 to match
 # the target shape by repeating the data along that dimension (broadcasting)
 # , we extend these semantics to say if the extent of a dimension in the
 # source shape is not 1 and does not match the target shape we repeat each

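Note on this hunk: the comment describes an extension of numpy broadcasting for group scales, where a dimension whose extent is neither 1 nor the target extent is expanded by repeating each element. A minimal sketch of that rule, assuming group sizes that evenly divide the target shape (the helper name and example shapes are illustrative, not vLLM's actual code):

import torch

def expand_group_scale(scale: torch.Tensor, target_shape: tuple) -> torch.Tensor:
    # Plain broadcasting handles extent-1 dims; for a dim whose extent is
    # neither 1 nor the target extent, repeat each element contiguously
    # (group semantics) until the extents match.
    for dim, (src, tgt) in enumerate(zip(scale.shape, target_shape)):
        if src != 1 and src != tgt:
            assert tgt % src == 0, "group size must divide the target extent"
            scale = scale.repeat_interleave(tgt // src, dim=dim)
    return scale.expand(target_shape)

# A (2, 1) scale against a (4, 8) target: rows are repeated in groups of
# two, and the extent-1 column is broadcast as usual.
scales = torch.tensor([[0.5], [2.0]])
print(expand_group_scale(scales, (4, 8)).shape)  # torch.Size([4, 8])
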
4 changes: 2 additions & 2 deletions tests/multimodal/test_utils.py
@@ -458,7 +458,7 @@ def run_dp_sharded_vision_model_vs_direct(local_rank: int, world_size: int,
 with torch.inference_mode():
 sharded_output = run_dp_sharded_vision_model(image_input, vision_model)

-# Check that the world size is setup correctly
+# Check that the world size is set up correctly
 assert get_tensor_model_parallel_world_size() == world_size

 # Check that the outputs have the same shape
@@ -642,7 +642,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct(local_rank: int,
 rope_type="rope_3d")
 sharded_output = torch.cat(sharded_output, dim=0)

-# Check that the world size is setup correctly
+# Check that the world size is set up correctly
 assert get_tensor_model_parallel_world_size() == world_size

 # Compare outputs (only on rank 0)

2 changes: 1 addition & 1 deletion tests/v1/e2e/test_spec_decode.py
@@ -83,7 +83,7 @@ def test_ngram_correctness(
 model_name: str,
 ):
 '''
-Compare the outputs of a original LLM and a speculative LLM
+Compare the outputs of an original LLM and a speculative LLM
 should be the same when using ngram speculative decoding.
 '''
 with monkeypatch.context() as m:

4 changes: 2 additions & 2 deletions tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -42,7 +42,7 @@ def test_basic_lifecycle():
 engine_core_outputs = scheduler.update_from_output(scheduler_output,
 model_runner_output)

-# Ensure the request is finished after 1 tokens.
+# Ensure the request is finished after 1 token.
 assert request.is_finished()
 assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
 output = engine_core_outputs[0].outputs[0]
@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():


 def test_prefix_cache_lifecycle():
-"""Test that remote decode params still works with a prefix cache hit."""
+"""Test that remote decode params still work with a prefix cache hit."""

 vllm_config = create_vllm_config()
 scheduler = create_scheduler(vllm_config)

4 changes: 2 additions & 2 deletions tests/v1/spec_decode/test_tree_attention.py
@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
 dtype=torch.bfloat16,
 )

-# Setup the block table and KV cache for paged KV.
+# Set up the block table and KV cache for paged KV.
 assert max_sequence_length % block_size == 0
 max_blocks_per_batch = max_sequence_length // block_size
 kv_cache = torch.randn(
@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
 num_alloc_blocks_per_batch] = block_ids.view(
 -1, num_alloc_blocks_per_batch)

-# Setup the slot mapping for the input KVs.
+# Set up the slot mapping for the input KVs.
 tree_positions = sequence_position + torch.arange(
 0,
 tree_size_q,

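Note on the second hunk: setting up a slot mapping means translating token positions into physical cache slots through the block table. A minimal sketch of that translation, with illustrative names and shapes rather than the test's own:

import torch

def positions_to_slots(positions: torch.Tensor, block_table: torch.Tensor,
                       block_size: int) -> torch.Tensor:
    # A position lives in logical block pos // block_size at offset
    # pos % block_size; the block table maps logical to physical blocks.
    physical_blocks = block_table[positions // block_size]
    return physical_blocks * block_size + positions % block_size

block_table = torch.tensor([7, 3, 9])  # logical block i -> physical block
positions = torch.arange(30, 34)       # token positions 30..33
print(positions_to_slots(positions, block_table, block_size=16))
# tensor([ 62,  63, 144, 145])
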
2 changes: 1 addition & 1 deletion vllm/lora/utils.py
@@ -239,7 +239,7 @@ def get_adapter_absolute_path(lora_path: str) -> str:
 except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
 HFValidationError):
 # Handle errors that may occur during the download
-# Return original path instead instead of throwing error here
+# Return original path instead of throwing error here
 logger.exception("Error downloading the HuggingFace model")
 return lora_path

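Note on this hunk: the corrected comment documents a fall-back rather than a failure. A minimal standalone sketch of the same download-or-return-original pattern (this is an illustration, not vllm/lora/utils.py itself):

import os
from huggingface_hub import snapshot_download
from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
                                   HFValidationError, RepositoryNotFoundError)

def resolve_adapter_path(lora_path: str) -> str:
    # Local directories are returned as-is; anything else is treated as a
    # Hugging Face repo id and downloaded.
    if os.path.isdir(lora_path):
        return lora_path
    try:
        return snapshot_download(repo_id=lora_path)
    except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
            HFValidationError):
        # Return the original path instead of throwing an error here; the
        # caller decides how to handle an unresolved adapter.
        return lora_path
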
@@ -94,7 +94,7 @@ def find_matched_target(
 config that a layer corresponds to.

 Recall that a compressed-tensors configs has a concept of
-config_groups, where each layer can be quantized with with a different
+config_groups, where each layer can be quantized with a different
 scheme.

 targets in each config_group will be a list of either layer names

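Note on this hunk: config_groups let different layers pick up different quantization schemes, with each group listing the targets it applies to. A minimal sketch of matching a layer against such targets, using a simplified config layout (the "re:" regex convention mirrors compressed-tensors, but the structure here is an assumption for the illustration):

import re
from typing import Optional

config_groups = {
    "group_0": {"targets": ["re:.*q_proj$", "re:.*k_proj$"],
                "weights": {"num_bits": 4}},
    "group_1": {"targets": ["Linear"], "weights": {"num_bits": 8}},
}

def find_group_for_layer(layer_name: str, layer_class: str) -> Optional[str]:
    for group_name, group in config_groups.items():
        for target in group["targets"]:
            # "re:" targets match the layer name by regex; bare targets
            # name a module class.
            if target.startswith("re:"):
                if re.match(target[3:], layer_name):
                    return group_name
            elif target == layer_class:
                return group_name
    return None

print(find_group_for_layer("model.layers.0.self_attn.q_proj", "Linear"))
# group_0
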
10 changes: 5 additions & 5 deletions vllm/multimodal/utils.py
@@ -213,7 +213,7 @@ def fetch_image(
 image_mode: str = "RGB",
 ) -> Image.Image:
 """
-Load a PIL image from a HTTP or base64 data URL.
+Load a PIL image from an HTTP or base64 data URL.

 By default, the image is converted into RGB format.
 """
@@ -237,7 +237,7 @@ async def fetch_image_async(
 image_mode: str = "RGB",
 ) -> Image.Image:
 """
-Asynchronously load a PIL image from a HTTP or base64 data URL.
+Asynchronously load a PIL image from an HTTP or base64 data URL.

 By default, the image is converted into RGB format.
 """
@@ -261,7 +261,7 @@ def fetch_video(
 image_mode: str = "RGB",
 ) -> tuple[npt.NDArray, dict[str, Any]]:
 """
-Load video from a HTTP or base64 data URL.
+Load video from an HTTP or base64 data URL.
 """
 image_io = ImageMediaIO(image_mode=image_mode,
 **self.media_io_kwargs.get("image", {}))
@@ -281,7 +281,7 @@ async def fetch_video_async(
 image_mode: str = "RGB",
 ) -> tuple[npt.NDArray, dict[str, Any]]:
 """
-Asynchronously load video from a HTTP or base64 data URL.
+Asynchronously load video from an HTTP or base64 data URL.

 By default, the image is converted into RGB format.
 """
@@ -370,7 +370,7 @@ def group_mm_inputs_by_modality(

 def modality_group_func(
 mm_input: MultiModalKwargsItems) -> Union[str, int]:
-# If the input has multiple modalities, return a id as the unique key
+# If the input has multiple modalities, return an id as the unique key
 # for the mm_input input.
 if len(mm_input) > 1:
 return id(mm_input)

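Note on the last hunk: the grouping key is the modality itself for single-modality inputs, and id(mm_input) for inputs that carry several modalities, so each mixed input becomes its own group. A minimal sketch of that rule, with plain dicts standing in for MultiModalKwargsItems:

from itertools import groupby

def modality_key(mm_input: dict):
    # Multi-modality inputs get a unique id so they are never batched with
    # anything else; single-modality inputs share their modality name.
    return id(mm_input) if len(mm_input) > 1 else next(iter(mm_input))

inputs = [
    {"image": "img_a"},
    {"image": "img_b"},
    {"image": "img_c", "audio": "clip_a"},  # mixed: forms its own group
    {"audio": "clip_b"},
]
groups = [list(g) for _, g in groupby(inputs, key=modality_key)]
print([len(g) for g in groups])  # [2, 1, 1]
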
2 changes: 1 addition & 1 deletion vllm/v1/attention/backends/utils.py
@@ -709,7 +709,7 @@ def reorder_batch_to_split_decodes_and_prefills(

 for i, req_id in enumerate(input_batch.req_ids):
 num_tokens = scheduler_output.num_scheduled_tokens[req_id]
-# for now treat 1 scheduled token as "decode" even if its not,
+# for now treat 1 scheduled token as "decode" even if it's not,
 # we should update this to something like < 8 in the future but
 # currently the TritonMLA._forward_decode only supports
 # num_tokens = 1

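Note on this hunk: the comment encodes the current rule for splitting a batch, where exactly one scheduled token counts as a decode and everything else as a prefill. A minimal sketch of that split, with illustrative data structures rather than the real input batch:

def split_decodes_and_prefills(req_ids, num_scheduled_tokens):
    decodes, prefills = [], []
    for req_id in req_ids:
        # For now, 1 scheduled token == "decode" even if it is really the
        # tail of a chunked prefill; a threshold such as < 8 could replace
        # this once the decode kernel supports more than one token.
        if num_scheduled_tokens[req_id] == 1:
            decodes.append(req_id)
        else:
            prefills.append(req_id)
    # Decodes come first so the attention kernel sees a contiguous prefix.
    return decodes + prefills

print(split_decodes_and_prefills(["a", "b", "c"], {"a": 1, "b": 37, "c": 1}))
# ['a', 'c', 'b']
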
4 changes: 2 additions & 2 deletions vllm/v1/structured_output/utils.py
@@ -65,9 +65,9 @@ def get_outlines_cache_path() -> str:
 elif xdg_cache_home:
 return os.path.join(xdg_cache_home, ".cache", "outlines")
 # If homedir is "/", we may be inside a container, and thus writing to
-# root would be problematic, so we fallback to using a tempfile.
+# root would be problematic, so we fall back to using a tempfile.
 # Also validate the path exists, since os.path.expanduser does
-# not garuntee existence.
+# not guarantee existence.
 elif os.path.isdir(home_dir) and home_dir != "/":
 # Default Unix fallback: ~/.cache/outlines
 return os.path.join(home_dir, ".cache", "outlines")

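Note on this hunk: the function walks a chain of fallbacks for the outlines cache directory, ending at a temporary directory when the home directory is "/" or missing. A minimal sketch of that chain (the environment variable names are assumptions for the illustration, not necessarily the ones vLLM reads):

import os
import tempfile

def pick_outlines_cache_dir() -> str:
    explicit = os.environ.get("OUTLINES_CACHE_DIR")
    if explicit:
        return explicit
    xdg_cache_home = os.environ.get("XDG_CACHE_HOME")
    if xdg_cache_home:
        return os.path.join(xdg_cache_home, "outlines")
    home_dir = os.path.expanduser("~")
    # expanduser does not guarantee the path exists, so validate it and
    # refuse "/", which usually means we are inside a container.
    if os.path.isdir(home_dir) and home_dir != "/":
        return os.path.join(home_dir, ".cache", "outlines")
    return os.path.join(tempfile.gettempdir(), "outlines")
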
2 changes: 1 addition & 1 deletion vllm/v1/worker/tpu_worker.py
@@ -250,7 +250,7 @@ def execute_model(
 scheduler_output: "SchedulerOutput",
 ) -> Optional[ModelRunnerOutput]:
 output = self.model_runner.execute_model(scheduler_output)
-# every worker's output is needed when kv_transfer_group is setup
+# every worker's output is needed when kv_transfer_group is set up
 return output if self.is_driver_worker or has_kv_transfer_group(
 ) else None