Commit 14093cc

Merge branch 'vllm-project:main' into large-block-size-fix
2 parents: 98bc88b + a931b4c · commit 14093cc

File tree

21 files changed: +799, -419 lines

csrc/attention/mla/sm100_cutlass_mla_kernel.cu

Lines changed: 10 additions & 0 deletions
@@ -18,6 +18,7 @@ limitations under the License.
  * Taken from SGLANG PR https://github.com/sgl-project/sglang/pull/6929
  * by Alcanderian JieXin Liang
  */
+#include "core/registration.h"
 
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
@@ -270,4 +271,13 @@ int64_t sm100_cutlass_mla_get_workspace_size(int64_t max_seq_len, int64_t num_ba
 }
 
 #endif
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
+  m.impl("sm100_cutlass_mla_decode", &sm100_cutlass_mla_decode);
+}
+
+TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CatchAll, m) {
+  m.impl("sm100_cutlass_mla_get_workspace_size", &sm100_cutlass_mla_get_workspace_size);
+}
+
 // clang-format on

csrc/ops.h

Lines changed: 0 additions & 13 deletions
@@ -167,19 +167,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
                         torch::Tensor const& seq_lens,
                         torch::Tensor const& page_table, double scale);
 
-void sm100_cutlass_mla_decode(
-    torch::Tensor const& out, torch::Tensor const& q_nope,
-    torch::Tensor const& q_pe, torch::Tensor const& kv_c_and_k_pe_cache,
-    torch::Tensor const& seq_lens, torch::Tensor const& page_table,
-    torch::Tensor const& workspace, double sm_scale,
-    int64_t num_kv_splits =
-        1 /* Set to 1 to avoid cuda_graph issue by default. */);
-
-int64_t sm100_cutlass_mla_get_workspace_size(
-    int64_t max_seq_len, int64_t num_batches, int64_t sm_count = 0,
-    int64_t num_kv_splits =
-        1 /* Set to 1 to avoid cuda_graph issue by default. */);
-
 torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
 
 #ifndef USE_ROCM

csrc/torch_bindings.cpp

Lines changed: 2 additions & 3 deletions
@@ -521,15 +521,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " Tensor page_table, Tensor workspace, float "
       "scale,"
      " int num_kv_splits) -> ()");
-  ops.impl("sm100_cutlass_mla_decode", torch::kCUDA, &sm100_cutlass_mla_decode);
+  // conditionally compiled so impl in source file
 
   // SM100 CUTLASS MLA workspace
   ops.def(
       "sm100_cutlass_mla_get_workspace_size(int max_seq_len, int num_batches,"
       " int sm_count, int num_kv_splits) "
       "-> int");
-  ops.impl("sm100_cutlass_mla_get_workspace_size",
-           &sm100_cutlass_mla_get_workspace_size);
+  // conditionally compiled so impl in source file
 
   // Compute NVFP4 block quantized tensor.
   ops.def(
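The two `// conditionally compiled so impl in source file` replacements mean the op schema (`ops.def`) stays in `torch_bindings.cpp`, while the implementation is only registered when the SM100 sources are actually built (via the `TORCH_LIBRARY_IMPL_EXPAND` blocks added above). A minimal Python-side sketch of what that implies for callers, assuming vLLM's usual `torch.ops._C` extension namespace; the namespace and helper name are assumptions for illustration, not something this diff states:

```python
import torch

def mla_workspace_size_or_none(max_seq_len: int, num_batches: int,
                               sm_count: int = 0, num_kv_splits: int = 1):
    """Return the SM100 MLA workspace size, or None if the op was compiled out."""
    # Assumed namespace: vLLM registers its custom ops under torch.ops._C.
    op = getattr(torch.ops._C, "sm100_cutlass_mla_get_workspace_size", None)
    if op is None:
        # The impl is only registered when the SM100 kernel sources were built in.
        return None
    return op(max_seq_len, num_batches, sm_count, num_kv_splits)

print(mla_workspace_size_or_none(max_seq_len=4096, num_batches=8))
```

Because only the registration moves, callers see an op that is either present or absent at runtime instead of a link error at build time.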

docs/assets/deployment/open_webui.png

-10.4 KB

Open WebUI deployment guide (Markdown)

Lines changed: 33 additions & 17 deletions
@@ -1,26 +1,42 @@
 # Open WebUI
 
-1. Install the [Docker](https://docs.docker.com/engine/install/)
+[Open WebUI](https://github.com/open-webui/open-webui) is an extensible, feature-rich,
+and user-friendly self-hosted AI platform designed to operate entirely offline.
+It supports various LLM runners like Ollama and OpenAI-compatible APIs,
+with built-in RAG capabilities, making it a powerful AI deployment solution.
 
-2. Start the vLLM server with the supported chat completion model, e.g.
+To get started with Open WebUI using vLLM, follow these steps:
 
-    ```bash
-    vllm serve qwen/Qwen1.5-0.5B-Chat
-    ```
+1. Install the [Docker](https://docs.docker.com/engine/install/).
 
-1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
+2. Start the vLLM server with a supported chat completion model:
 
-    ```bash
-    docker run -d -p 3000:8080 \
-    --name open-webui \
-    -v open-webui:/app/backend/data \
-    -e OPENAI_API_BASE_URL=http://<vllm serve host>:<vllm serve port>/v1 \
-    --restart always \
-    ghcr.io/open-webui/open-webui:main
-    ```
+    ```console
+    vllm serve Qwen/Qwen3-0.6B-Chat
+    ```
 
-1. Open it in the browser: <http://open-webui-host:3000/>
+    !!! note
+        When starting the vLLM server, be sure to specify the host and port using the `--host` and `--port` flags.
+        For example:
 
-    On the top of the web page, you can see the model `qwen/Qwen1.5-0.5B-Chat`.
+        ```console
+        python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000
+        ```
 
-![](../../assets/deployment/open_webui.png)
+3. Start the Open WebUI Docker container:
+
+    ```console
+    docker run -d \
+    --name open-webui \
+    -p 3000:8080 \
+    -v open-webui:/app/backend/data \
+    -e OPENAI_API_BASE_URL=http://0.0.0.0:8000/v1 \
+    --restart always \
+    ghcr.io/open-webui/open-webui:main
+    ```
+
+4. Open it in the browser: <http://open-webui-host:3000/>
+
+    At the top of the page, you should see the model `Qwen/Qwen3-0.6B-Chat`.
+
+    ![Web portal of model Qwen/Qwen3-0.6B-Chat](../../assets/deployment/open_webui.png)
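A hedged sanity check to go with the rewritten guide above: before wiring Open WebUI to `OPENAI_API_BASE_URL`, you can confirm the vLLM server is reachable and lists the expected model. The host, port, and model name below are assumptions matching the example commands, not requirements.

```python
import requests

BASE_URL = "http://localhost:8000/v1"  # assumed address of `vllm serve`

# /v1/models is part of vLLM's OpenAI-compatible API and lists served models.
resp = requests.get(f"{BASE_URL}/models", timeout=5)
resp.raise_for_status()
served = [m["id"] for m in resp.json().get("data", [])]
print("Models served:", served)  # e.g. ['Qwen/Qwen3-0.6B-Chat']
```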

tests/distributed/test_pipeline_parallel.py

Lines changed: 17 additions & 7 deletions
@@ -14,8 +14,9 @@
 
 import pytest
 
-from vllm.config import TaskOption
+from vllm.config import _FLOAT16_NOT_SUPPORTED_MODELS, TaskOption
 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
 
 from ..models.registry import HF_EXAMPLE_MODELS
 from ..utils import compare_two_settings, create_new_process_for_each_test
@@ -158,7 +159,7 @@ def iter_params(self, model_id: str):
     "databricks/dbrx-instruct": PPTestSettings.fast(load_format="dummy"),
     "Deci/DeciLM-7B-instruct": PPTestSettings.fast(),
     "deepseek-ai/deepseek-llm-7b-chat": PPTestSettings.fast(),
-    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(),
+    "deepseek-ai/DeepSeek-V2-Lite-Chat": PPTestSettings.fast(tp_base=2),
     "LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct": PPTestSettings.fast(),
     "tiiuae/falcon-7b": PPTestSettings.fast(),
     "google/gemma-1.1-2b-it": PPTestSettings.fast(),
@@ -210,9 +211,11 @@ def iter_params(self, model_id: str):
 
 EMBEDDING_MODELS = {  # type: ignore[var-annotated]
     # [Text-only]
-    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(),
-    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(),
-    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(load_format="dummy"),
+    "intfloat/e5-mistral-7b-instruct": PPTestSettings.fast(task="embed"),
+    "BAAI/bge-multilingual-gemma2": PPTestSettings.fast(task="embed"),
+    "Qwen/Qwen2.5-Math-RM-72B": PPTestSettings.fast(
+        load_format="dummy", task="embed"
+    ),
 }
 
 MULTIMODAL_MODELS = {
@@ -248,6 +251,7 @@ def iter_params(self, model_id: str):
     "meta-llama/Llama-3.2-1B-Instruct",
     "ArthurZ/Ilama-3.2-1B",
     "ibm/PowerLM-3b",
+    "deepseek-ai/DeepSeek-V2-Lite-Chat",
     # [LANGUAGE EMBEDDING]
     "intfloat/e5-mistral-7b-instruct",
     "BAAI/bge-multilingual-gemma2",
@@ -287,6 +291,11 @@ def _compare_tp(
     trust_remote_code = model_info.trust_remote_code
     tokenizer_mode = model_info.tokenizer_mode
     hf_overrides = model_info.hf_overrides
+    hf_config = get_config(model_id, trust_remote_code)
+
+    dtype = "float16"
+    if hf_config.model_type in _FLOAT16_NOT_SUPPORTED_MODELS:
+        dtype = "bfloat16"
 
     if load_format == "dummy":
         # Avoid OOM
@@ -316,7 +325,7 @@
     common_args = [
         # use half precision for speed and memory savings in CI environment
         "--dtype",
-        "float16",
+        dtype,
        "--max-model-len",
         "2048",
         "--max-num-seqs",
@@ -338,6 +347,7 @@
         common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])
 
     specific_case = tp_size == 2 and pp_size == 2 and chunked_prefill
+    testing_ray_compiled_graph = False
     if distributed_backend == "ray" and (vllm_major_version == "1"
                                          or specific_case):
         # For V1, test Ray Compiled Graph for all the tests
@@ -351,6 +361,7 @@
         # Temporary. Currently when zeromq + SPMD is used, it does not properly
         # terminate because of a Ray Compiled Graph issue.
         common_args.append("--disable-frontend-multiprocessing")
+        testing_ray_compiled_graph = True
     elif distributed_backend == "mp":
         # Both V0/V1 of multiprocessing executor support PP
         pp_env = {
@@ -394,7 +405,6 @@
                              tp_env,
                              method=method)
     except Exception:
-        testing_ray_compiled_graph = pp_env is not None
         if testing_ray_compiled_graph and vllm_major_version == "0":
             # Ray Compiled Graph tests are flaky for V0,
             # so we don't want to fail the test
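The core of the `_compare_tp` change is a dtype fallback: the HF config is fetched once and float16 is swapped for bfloat16 whenever the architecture cannot run in half precision. A standalone sketch of that decision, with a hypothetical stand-in for `_FLOAT16_NOT_SUPPORTED_MODELS` (the real set lives in `vllm.config` and its contents are not shown in this diff):

```python
# Hypothetical stand-in for vllm.config._FLOAT16_NOT_SUPPORTED_MODELS;
# the entries here are illustrative, not the library's actual list.
FLOAT16_NOT_SUPPORTED_MODEL_TYPES = {"gemma2", "plamo2"}

def pick_ci_dtype(model_type: str) -> str:
    """Prefer float16 for CI speed, falling back to bfloat16 when unsupported."""
    if model_type in FLOAT16_NOT_SUPPORTED_MODEL_TYPES:
        return "bfloat16"
    return "float16"

common_args = ["--dtype", pick_ci_dtype("gemma2"), "--max-model-len", "2048"]
print(common_args)  # ['--dtype', 'bfloat16', '--max-model-len', '2048']
```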

tests/entrypoints/openai/test_tokenization.py

Lines changed: 104 additions & 0 deletions
@@ -32,6 +32,7 @@ def server(zephyr_lora_added_tokens_files: str):  # noqa: F811
         f"zephyr-lora2={zephyr_lora_added_tokens_files}",
         "--max-lora-rank",
         "64",
+        "--enable-tokenizer-info-endpoint",
     ]
 
     with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -283,3 +284,106 @@ async def test_detokenize(
     response.raise_for_status()
 
     assert response.json() == {"prompt": prompt}
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name,tokenizer_name",
+    [(MODEL_NAME, MODEL_NAME), ("zephyr-lora2", "zephyr-lora2")],
+    indirect=["tokenizer_name"],
+)
+async def test_tokenizer_info_basic(
+    server: RemoteOpenAIServer,
+    model_name: str,
+    tokenizer_name: str,
+):
+    """Test basic tokenizer info endpoint functionality."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    assert "tokenizer_class" in result
+    assert isinstance(result["tokenizer_class"], str)
+    assert result["tokenizer_class"]
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_schema(server: RemoteOpenAIServer):
+    """Test that the response matches expected schema types."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    field_types = {
+        "add_bos_token": bool,
+        "add_prefix_space": bool,
+        "clean_up_tokenization_spaces": bool,
+        "split_special_tokens": bool,
+        "bos_token": str,
+        "eos_token": str,
+        "pad_token": str,
+        "unk_token": str,
+        "chat_template": str,
+        "errors": str,
+        "model_max_length": int,
+        "additional_special_tokens": list,
+        "added_tokens_decoder": dict,
+    }
+    for field, expected_type in field_types.items():
+        if field in result and result[field] is not None:
+            assert isinstance(
+                result[field],
+                expected_type), (f"{field} should be {expected_type.__name__}")
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_added_tokens_structure(
+        server: RemoteOpenAIServer, ):
+    """Test added_tokens_decoder structure if present."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    added_tokens = result.get("added_tokens_decoder")
+    if added_tokens:
+        for token_id, token_info in added_tokens.items():
+            assert isinstance(token_id, str), "Token IDs should be strings"
+            assert isinstance(token_info, dict), "Token info should be a dict"
+            assert "content" in token_info, "Token info should have content"
+            assert "special" in token_info, (
+                "Token info should have special flag")
+            assert isinstance(token_info["special"],
+                              bool), ("Special flag should be boolean")
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_consistency_with_tokenize(
+        server: RemoteOpenAIServer, ):
+    """Test that tokenizer info is consistent with tokenization endpoint."""
+    info_response = requests.get(server.url_for("tokenizer_info"))
+    info_response.raise_for_status()
+    info = info_response.json()
+    tokenize_response = requests.post(
+        server.url_for("tokenize"),
+        json={
+            "model": MODEL_NAME,
+            "prompt": "Hello world!"
+        },
+    )
+    tokenize_response.raise_for_status()
+    tokenize_result = tokenize_response.json()
+    info_max_len = info.get("model_max_length")
+    tokenize_max_len = tokenize_result.get("max_model_len")
+    if info_max_len and tokenize_max_len:
+        assert info_max_len >= tokenize_max_len, (
+            "Info max length should be >= tokenize max length")
+
+
+@pytest.mark.asyncio
+async def test_tokenizer_info_chat_template(server: RemoteOpenAIServer):
+    """Test chat template is properly included."""
+    response = requests.get(server.url_for("tokenizer_info"))
+    response.raise_for_status()
+    result = response.json()
+    chat_template = result.get("chat_template")
+    if chat_template:
+        assert isinstance(chat_template,
+                          str), ("Chat template should be a string")
+        assert chat_template.strip(), "Chat template should not be empty"
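For context on what these tests exercise: the route only exists when the server is launched with `--enable-tokenizer-info-endpoint`, which the fixture args above now pass. A minimal client-side sketch, assuming a server already running on `localhost:8000` (the address is an assumption; the fields printed mirror what the tests assert):

```python
import requests

BASE_URL = "http://localhost:8000"  # assumed; whatever host/port the server binds to

resp = requests.get(f"{BASE_URL}/tokenizer_info", timeout=5)
resp.raise_for_status()
info = resp.json()

print(info["tokenizer_class"])          # always present per the tests above
print(info.get("model_max_length"))     # optional tokenizer_config.json field
print(bool(info.get("chat_template")))  # True if the tokenizer ships a chat template
```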

vllm/entrypoints/openai/api_server.py

Lines changed: 14 additions & 0 deletions
@@ -522,6 +522,19 @@ async def detokenize(request: DetokenizeRequest, raw_request: Request):
     assert_never(generator)
 
 
+def maybe_register_tokenizer_info_endpoint(args):
+    """Conditionally register the tokenizer info endpoint if enabled."""
+    if getattr(args, 'enable_tokenizer_info_endpoint', False):
+
+        @router.get("/tokenizer_info")
+        async def get_tokenizer_info(raw_request: Request):
+            """Get comprehensive tokenizer information."""
+            result = await tokenization(raw_request).get_tokenizer_info()
+            return JSONResponse(content=result.model_dump(),
+                                status_code=result.code if isinstance(
+                                    result, ErrorResponse) else 200)
+
+
 @router.get("/v1/models")
 async def show_available_models(raw_request: Request):
     handler = models(raw_request)
@@ -1692,6 +1705,7 @@ async def run_server_worker(listen_address,
         uvicorn_kwargs['log_config'] = log_config
 
     async with build_async_engine_client(args, client_config) as engine_client:
+        maybe_register_tokenizer_info_endpoint(args)
         app = build_app(args)
 
         vllm_config = await engine_client.get_vllm_config()
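The ordering here matters: the optional route is attached to the module-level router before `build_app(args)` includes it, because FastAPI only copies routes that already exist on a router at `include_router` time. A toy reproduction of that pattern under simplified assumptions (hypothetical names, no vLLM internals):

```python
from argparse import Namespace

from fastapi import APIRouter, FastAPI

router = APIRouter()

def maybe_register_demo_endpoint(args: Namespace) -> None:
    """Attach the optional route only when the flag is set."""
    if getattr(args, "enable_tokenizer_info_endpoint", False):

        @router.get("/tokenizer_info")
        async def tokenizer_info() -> dict:
            return {"tokenizer_class": "PreTrainedTokenizerFast"}

def build_app(args: Namespace) -> FastAPI:
    app = FastAPI()
    # Only routes already present on the router are copied into the app here.
    app.include_router(router)
    return app

args = Namespace(enable_tokenizer_info_endpoint=True)
maybe_register_demo_endpoint(args)  # must run before build_app, as in the diff
app = build_app(args)
print([route.path for route in app.routes if route.path == "/tokenizer_info"])
```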

vllm/entrypoints/openai/cli_args.py

Lines changed: 3 additions & 0 deletions
@@ -182,6 +182,9 @@ class FrontendArgs:
     """If set to True, enable tracking server_load_metrics in the app state."""
     enable_force_include_usage: bool = False
     """If set to True, including usage on every request."""
+    enable_tokenizer_info_endpoint: bool = False
+    """Enable the /get_tokenizer_info endpoint. May expose chat
+    templates and other tokenizer configuration."""
 
     @staticmethod
     def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:

vllm/entrypoints/openai/protocol.py

Lines changed: 10 additions & 0 deletions
@@ -1953,6 +1953,16 @@ class DetokenizeResponse(OpenAIBaseModel):
     prompt: str
 
 
+class TokenizerInfoResponse(OpenAIBaseModel):
+    """
+    Response containing tokenizer configuration
+    equivalent to tokenizer_config.json
+    """
+
+    model_config = ConfigDict(extra="allow")
+    tokenizer_class: str
+
+
 class LoadLoRAAdapterRequest(BaseModel):
     lora_name: str
     lora_path: str
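Why `extra="allow"` on this model: the endpoint returns whatever the tokenizer config carries beyond the one required field, so the response model must not drop unknown keys. A standalone sketch of that behavior (illustrative class and values, not vLLM's actual `TokenizerInfoResponse`):

```python
from pydantic import BaseModel, ConfigDict

class TokenizerInfoSketch(BaseModel):
    """Illustrative stand-in for a response model that passes extras through."""
    model_config = ConfigDict(extra="allow")
    tokenizer_class: str

info = TokenizerInfoSketch(tokenizer_class="LlamaTokenizerFast",
                           model_max_length=4096,
                           chat_template="{{ messages }}")
# With extra="allow", the unknown tokenizer_config.json fields survive the dump.
print(info.model_dump())
```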
