Merged
Changes from 4 commits
6 changes: 3 additions & 3 deletions vllm/entrypoints/chat_utils.py
@@ -306,7 +306,7 @@ def _detect_content_format(
         return "openai"


-def _resolve_hf_chat_template(
+def resolve_hf_chat_template(
     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     chat_template: Optional[str],
     tools: Optional[list[dict[str, Any]]],
@@ -352,7 +352,7 @@ def _resolve_chat_template_content_format(
     trust_remote_code: bool,
 ) -> _ChatTemplateContentFormat:
     if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
-        hf_chat_template = _resolve_hf_chat_template(
+        hf_chat_template = resolve_hf_chat_template(
             tokenizer,
             chat_template=chat_template,
             trust_remote_code=trust_remote_code,
@@ -1140,7 +1140,7 @@ def apply_hf_chat_template(
     tokenize: bool = False,  # Different from HF's default
     **kwargs: Any,
 ) -> str:
-    hf_chat_template = _resolve_hf_chat_template(
+    hf_chat_template = resolve_hf_chat_template(
         tokenizer,
         chat_template=chat_template,
         tools=tools,
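The change above is a pure rename: dropping the leading underscore makes the helper importable from outside chat_utils, which the api_server.py change below relies on. A minimal sketch of calling the now-public helper directly, mirroring how api_server.py uses it; the model name is a placeholder and not part of this PR:

from transformers import AutoTokenizer

from vllm.entrypoints.chat_utils import resolve_hf_chat_template

# Placeholder model; any HF model whose tokenizer ships a chat template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

official_template = resolve_hf_chat_template(
    tokenizer,
    chat_template=None,  # None -> resolve the tokenizer's own (official) template
    tools=None,
    trust_remote_code=False)
print(official_template)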
25 changes: 22 additions & 3 deletions vllm/entrypoints/openai/api_server.py
@@ -35,7 +35,8 @@
 from vllm.engine.multiprocessing.client import MQLLMEngineClient
 from vllm.engine.multiprocessing.engine import run_mp_engine
 from vllm.engine.protocol import EngineClient
-from vllm.entrypoints.chat_utils import load_chat_template
+from vllm.entrypoints.chat_utils import (load_chat_template,
+                                         resolve_hf_chat_template)
 from vllm.entrypoints.launcher import serve_http
 from vllm.entrypoints.logger import RequestLogger
 from vllm.entrypoints.openai.cli_args import (make_arg_parser,
@@ -84,6 +85,7 @@
 from vllm.logger import init_logger
 from vllm.transformers_utils.config import (
     maybe_register_config_serialize_by_value)
+from vllm.transformers_utils.tokenizer import MistralTokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import (Device, FlexibleArgumentParser, get_open_zmq_ipc_path,
                         is_valid_ipv6_address, set_ulimit)
@@ -883,8 +885,25 @@ async def init_app_state(

     resolved_chat_template = load_chat_template(args.chat_template)
     if resolved_chat_template is not None:
-        logger.info("Using supplied chat template:\n%s",
-                    resolved_chat_template)
+        # Get the tokenizer to check official template
+        tokenizer = await engine_client.get_tokenizer()
+
+        # For HF tokenizer, check if the chat template matches.
+        # We don't need to check Mistral tokenizer
+        # because it doesn't support chat template
+        if not isinstance(tokenizer, MistralTokenizer):
+            hf_chat_template = resolve_hf_chat_template(
+                tokenizer,
+                chat_template=None,
+                tools=None,
+                trust_remote_code=model_config.trust_remote_code)
+
+            if hf_chat_template != resolved_chat_template:
+                logger.warning(
+                    "Using supplied chat template: %s\n"
+                    "It is different from official chat template '%s'. "
+                    "This discrepancy may lead to performance degradation.",
+                    resolved_chat_template, args.model)

     state.openai_serving_models = OpenAIServingModels(
         engine_client=engine_client,
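With the helper now public, the same mismatch check the server performs at startup can also be run offline, e.g. to verify a custom --chat-template file against a model's built-in template before launching. A rough sketch under the same assumptions; the model name and template path are placeholders:

from transformers import AutoTokenizer

from vllm.entrypoints.chat_utils import (load_chat_template,
                                         resolve_hf_chat_template)

model = "Qwen/Qwen2.5-0.5B-Instruct"             # placeholder model name
supplied = load_chat_template("template.jinja")  # placeholder template path

tokenizer = AutoTokenizer.from_pretrained(model)
official = resolve_hf_chat_template(
    tokenizer,
    chat_template=None,
    tools=None,
    trust_remote_code=False)

if supplied is not None and official != supplied:
    # Same condition the server now warns about during init_app_state.
    print(f"Supplied chat template differs from the official template of {model}.")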