Commit 18bdcf4

feat - add a new endpoint get_tokenizer_info to provide tokenizer/chat-template information (#20575)
Signed-off-by: m-misiura <mmisiura@redhat.com>
1 parent 1c3198b · commit 18bdcf4

File tree: 5 files changed, +182 -3 lines changed


tests/entrypoints/openai/test_tokenization.py

Lines changed: 104 additions & 0 deletions
@@ -32,6 +32,7 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
         f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
+        "--enable-tokenizer-info-endpoint",
     ]

     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -283,3 +284,106 @@ async def test_detokenize(
     response.raise_for_status()

     assert response.json() == {"prompt": prompt}
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenizer_info_basic(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
+    """Test basic tokenizer info endpoint functionality."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    assert "tokenizer_class" in result
+    assert isinstance(result["tokenizer_class"], str)
+    assert result["tokenizer_class"]
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
+    """Test that the response matches expected schema types."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    field_types = {
+        "add_bos_token": bool,
+        "add_prefix_space": bool,
+        "clean_up_tokenization_spaces": bool,
+        "split_special_tokens": bool,
+        "bos_token": str,
+        "eos_token": str,
+        "pad_token": str,
+        "unk_token": str,
+        "chat_template": str,
+        "errors": str,
+        "model_max_length": int,
+        "additional_special_tokens": list,
+        "added_tokens_decoder": dict,
+    }
+    for field, expected_type in field_types.items():
+        if field in result and result[field] is not None:
+            assert isinstance(
+                result[field],
+                expected_type), (f"{field} should be {expected_type.__name__}")
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_added_tokens_structure(
+        server: RemoteOpenAIServer, ):
+    """Test added_tokens_decoder structure if present."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    added_tokens = result.get("added_tokens_decoder")
+    if added_tokens:
+        for token_id, token_info in added_tokens.items():
+            assert isinstance(token_id, str), "Token IDs should be strings"
+            assert isinstance(token_info, dict), "Token info should be a dict"
+            assert "content" in token_info, "Token info should have content"
+            assert "special" in token_info, (
+                "Token info should have special flag")
+            assert isinstance(token_info["special"],
                              bool), ("Special flag should be boolean")
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_consistency_with_tokenize(
+        server: RemoteOpenAIServer, ):
+    """Test that tokenizer info is consistent with tokenization endpoint."""
+    info_response = requests.get(server.url_for("tokenizer_info"))
+    info_response.raise_for_status()
+    info = info_response.json()
+    tokenize_response = requests.post(
+        server.url_for("tokenize"),
+        json={
+            "model": MODEL_NAME,
+            "prompt": "Hello world!"
+        },
+    )
+    tokenize_response.raise_for_status()
+    tokenize_result = tokenize_response.json()
+    info_max_len = info.get("model_max_length")
+    tokenize_max_len = tokenize_result.get("max_model_len")
+    if info_max_len and tokenize_max_len:
+        assert info_max_len >= tokenize_max_len, (
+            "Info max length should be >= tokenize max length")
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
+    """Test chat template is properly included."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    chat_template = result.get("chat_template")
+    if chat_template:
+        assert isinstance(chat_template,
+                          str), ("Chat template should be a string")
+        assert chat_template.strip(), "Chat template should not be empty"

vllm/entrypoints/openai/api_server.py

Lines changed: 14 additions & 0 deletions
@@ -522,6 +522,19 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
     assert_never(generator)


+def maybe_register_tokenizer_info_endpoint(args):
+    """Conditionally register the tokenizer info endpoint if enabled."""
+    if getattr(args, 'enable_tokenizer_info_endpoint', False):
+
+        @router.get("/tokenizer_info")
+        async def get_tokenizer_info(raw_request: Request):
+            """Get comprehensive tokenizer information."""
+            result = await tokenization(raw_request).get_tokenizer_info()
+            return JSONResponse(content=result.model_dump(),
+                                status_code=result.code if isinstance(
+                                    result, ErrorResponse) else 200)
+
+
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
     handler = models(raw_request)
@@ -1692,6 +1705,7 @@ async def run_server_worker(listen_address,
         uvicorn_kwargs['log_config'] = log_config

     async with build_async_engine_client(args, client_config) as engine_client:
+        maybe_register_tokenizer_info_endpoint(args)
         app = build_app(args)

         vllm_config = await engine_client.get_vllm_config()

vllm/entrypoints/openai/cli_args.py

Lines changed: 3 additions & 0 deletions
@@ -182,6 +182,9 @@ class FrontendArgs:
     """If set to True, enable tracking server_load_metrics in the app state."""
     enable_force_include_usage: bool = False
     """If set to True, including usage on every request."""
+    enable_tokenizer_info_endpoint: bool = False
+    """Enable the /get_tokenizer_info endpoint. May expose chat
+    templates and other tokenizer configuration."""

     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
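
Taken together with the api_server.py change above, the endpoint is opt-in: the /tokenizer_info route is only registered when the server is started with --enable-tokenizer-info-endpoint. A minimal client-side sketch of exercising it, assuming a server is already running locally (the host, port, and model are placeholders, not part of this commit):

# Sketch: query the opt-in tokenizer info endpoint.
# Assumes the server was launched with the new flag, e.g.
#   vllm serve <model> --enable-tokenizer-info-endpoint
# Host and port below are placeholders.
import requests

BASE_URL = "http://localhost:8000"

resp = requests.get(f"{BASE_URL}/tokenizer_info")
resp.raise_for_status()  # without the flag the route is never registered, so this request would 404
info = resp.json()

print(info["tokenizer_class"])           # the tokenizer's Python class name
print(info.get("model_max_length"))      # may be absent depending on the tokenizer config
print(bool(info.get("chat_template")))   # True if a chat template was resolved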

vllm/entrypoints/openai/protocol.py

Lines changed: 10 additions & 0 deletions
@@ -1953,6 +1953,16 @@ class DetokenizeResponse(OpenAIBaseModel):
     prompt: str


+class TokenizerInfoResponse(OpenAIBaseModel):
+    """
+    Response containing tokenizer configuration
+    equivalent to tokenizer_config.json
+    """
+
+    model_config = ConfigDict(extra="allow")
+    tokenizer_class: str
+
+
 class LoadLoRAAdapterRequest(BaseModel):
     lora_name: str
     lora_path: str
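
Because TokenizerInfoResponse declares model_config = ConfigDict(extra="allow"), only tokenizer_class is a required field; every other key from the tokenizer configuration is passed through as-is. An illustrative payload shaped like the fields the new tests check (the values here are hypothetical, not taken from any real model):

# Hypothetical example of a /tokenizer_info response body; values are illustrative only.
example_tokenizer_info = {
    "tokenizer_class": "LlamaTokenizerFast",  # the one required field
    "bos_token": "<s>",
    "eos_token": "</s>",
    "add_bos_token": True,
    "model_max_length": 4096,
    "chat_template": "{% for message in messages %}...{% endfor %}",
    "added_tokens_decoder": {
        "0": {"content": "<unk>", "special": True},
    },
    # ...any other tokenizer_config.json keys are allowed via extra="allow"
}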

vllm/entrypoints/openai/serving_tokenization.py

Lines changed: 51 additions & 3 deletions
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Final, Optional, Union
+from dataclasses import dataclass
+from typing import Any, Final, Optional, Union

 import jinja2
 from fastapi import Request
@@ -17,11 +17,13 @@
                                               ErrorResponse,
                                               TokenizeChatRequest,
                                               TokenizeRequest,
-                                              TokenizeResponse)
+                                              TokenizeResponse,
+                                              TokenizerInfoResponse)
 # yapf: enable
 from vllm.entrypoints.openai.serving_engine import OpenAIServing
 from vllm.entrypoints.openai.serving_models import OpenAIServingModels
 from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizer import AnyTokenizer

 logger = init_logger(__name__)

@@ -155,3 +157,49 @@ async def create_detokenize(
         input_text = prompt_input["prompt"]

         return DetokenizeResponse(prompt=input_text)
+
+    async def get_tokenizer_info(
+            self, ) -> Union[TokenizerInfoResponse, ErrorResponse]:
+        """Get comprehensive tokenizer information."""
+        try:
+            tokenizer = await self.engine_client.get_tokenizer()
+            info = TokenizerInfo(tokenizer, self.chat_template).to_dict()
+            return TokenizerInfoResponse(**info)
+        except Exception as e:
+            return self.create_error_response(
+                f"Failed to get tokenizer info: {str(e)}")
+
+
+@dataclass
+class TokenizerInfo:
+    tokenizer: AnyTokenizer
+    chat_template: Optional[str]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the tokenizer configuration."""
+        return self._get_tokenizer_config()
+
+    def _get_tokenizer_config(self) -> dict[str, Any]:
+        """Get tokenizer configuration directly from the tokenizer object."""
+        config = dict(getattr(self.tokenizer, "init_kwargs", None) or {})
+
+        # Remove file path fields
+        config.pop("vocab_file", None)
+        config.pop("merges_file", None)
+
+        config = self._make_json_serializable(config)
+        config["tokenizer_class"] = type(self.tokenizer).__name__
+        if self.chat_template:
+            config["chat_template"] = self.chat_template
+        return config
+
+    def _make_json_serializable(self, obj):
+        """Convert any non-JSON-serializable objects to serializable format."""
+        if hasattr(obj, "content"):
+            return obj.content
+        elif isinstance(obj, dict):
+            return {k: self._make_json_serializable(v) for k, v in obj.items()}
+        elif isinstance(obj, list):
+            return [self._make_json_serializable(item) for item in obj]
+        else:
+            return obj
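
TokenizerInfo reads whatever the tokenizer stored in init_kwargs, drops file-path entries, recursively unwraps objects exposing a .content attribute (such as Hugging Face AddedToken), and stamps in the tokenizer class name and chat template. A standalone sketch of that flow with a dummy tokenizer object; the Dummy* classes and values are illustrative stand-ins, not vLLM code:

# Standalone sketch of the TokenizerInfo.to_dict() flow; DummyToken/DummyTokenizer are made up.
from dataclasses import dataclass
from typing import Any, Optional


@dataclass
class DummyToken:
    content: str        # mirrors the .content attribute the serializer unwraps
    special: bool = True


class DummyTokenizer:
    def __init__(self):
        self.init_kwargs = {
            "bos_token": DummyToken("<s>"),
            "vocab_file": "/tmp/vocab.json",  # dropped as a file-path field
            "model_max_length": 4096,
        }


def tokenizer_info_dict(tokenizer, chat_template: Optional[str]) -> dict[str, Any]:
    """Simplified re-implementation of TokenizerInfo._get_tokenizer_config()."""

    def serialize(obj):
        # Unwrap token-like objects, then recurse into dicts and lists.
        if hasattr(obj, "content"):
            return obj.content
        if isinstance(obj, dict):
            return {k: serialize(v) for k, v in obj.items()}
        if isinstance(obj, list):
            return [serialize(v) for v in obj]
        return obj

    config = dict(getattr(tokenizer, "init_kwargs", None) or {})
    config.pop("vocab_file", None)
    config.pop("merges_file", None)
    config = serialize(config)
    config["tokenizer_class"] = type(tokenizer).__name__
    if chat_template:
        config["chat_template"] = chat_template
    return config


print(tokenizer_info_dict(DummyTokenizer(), chat_template="{{ messages }}"))
# -> {'bos_token': '<s>', 'model_max_length': 4096,
#     'tokenizer_class': 'DummyTokenizer', 'chat_template': '{{ messages }}'}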
