[V1] Logits processors extensibility #19912
Open: afeldman-nm wants to merge 340 commits into vllm-project:main from neuralmagic:lp_ext
+965 −298
Commits: changes shown from 250 of 340 commits (all authored by afeldman-nm unless noted)
849d829 bugfix - redundant batch update
03a836b min tokens test bugfix
5ab8af1 Merge branch 'main' into logitsprocs_merge
d92a3f3 remove prints
1f87ec8 Merge branch 'main' into logitsprocs_merge
ef2294d rejection sampling test bugfix
198db48 sampler test bugfix
2f2550b removed logitsprocs where not needed in test
1b1f8ca Merge branch 'main' into logitsprocs_merge
5fc130b refactor
7b8f299 sampling_params min-p check
4d5ea01 Merge branch 'main' into logitsprocs_merge
d8a6761 Merge branch 'logitsprocs' into logitsprocs_valid
0515848 small test optimization
17e7f62 Merge branch 'main' into logitsprocs_merge
c392898 refactor
dc4b6b8 wip tests
fd26581 Merge branch 'main' into logitsprocs_merge
5fb16a6 refactor
b0658c2 passing mixed batch test for min_p and none
ac608f1 Merge branch 'main' into logitsprocs_merge
7f44262 Merge branch 'logitsprocs' into logitsprocs_reorder
0a20965 mix batch test passes without reorder
bdea83c refactor
7f4d72e Merge branch 'main' into logitsprocs_merge
ec25ab5 Merge branch 'main' into logitsprocs_merge
5a5e38f move-only
19d3882 Merge branch 'main' into logitsprocs_merge
588b845 Merge branch 'main' into logitsprocs_merge
38746ae Merge branch 'main' into logitsprocs_merge
9703c4a fake reordering logic
6078602 fake logitsproc invocation against fake batch
ae5b600 almost passing
d5679bb Merge branch 'main' into logitsprocs_merge
84cad20 Merge branch 'logitsprocs' into logitsprocs_reorder
89ea6dd wip refactor
76438fb test fix
e03c561 Merge branch 'main' into logitsprocs_merge
045bc01 Merge branch 'main' into logitsprocs_merge
9ac6190 latest
c1b8e69 Merge branch 'main' into logitsprocs_merge
e83f90b Merge branch 'main' into logitsprocs_merge
360f2c4 removed tpu hack
eac0c82 Merge branch 'main' into logitsprocs_merge
395d472 wip tpu backward compat
5c53a8c typing
e08e4f4 Merge branch 'logitsprocs' into logitsprocs_tpu
f7969c5 wip
1117f51 first pass at tpu/gpu separation
e0fd74b Merge branch 'main' into logitsprocs_merge
2a4e09c first pass at new TPU approach
1f4cad3 docstrings
a6be23c Merge branch 'main' into tpu-isolate
b28588c merged in GPU/TPU decoupling PR
ca87319 bugfix
32e4275 type checking
9aeb49d Merge branch 'main' into tpu-isolate
0383e73 InputBatch fix
9564879 Merge branch 'tpu-isolate' into logitsprocs_merge
c02ef1b merge
b804423 vllm_xargs/kv_transfer_params compatibility
17f02ee fix
061ac67 remove unnecessary unit test
421c278 precedence
f315e0e pre-commit fix
3d92a07 Merge branch 'main' into extra_args_merge
873b89f merge
f8609ff Merge branch 'main' into extra_args_merge
9c5f407 Documentation changes
0857dc4 refactor
f9c4e19 typing
03c6010 typing
95e1b0d typing
9daeaed Update vllm/entrypoints/openai/protocol.py
baf90c9 feedback
4f04198 remove swap type
da23801 refactor
34c9866 move/swap refactoring
e1f0455 refactoring
fe088ea Merge branch 'main' into logitsprocs_merge
f506dd7 small fixes
3257deb Merge branch 'main' into logitsprocs_merge
c0b2068 Merge branch 'main' into extra_args_merge
5894110 Merge branch 'extra_args' into lp_ext
3885bc5 merge
e06f9e9 Merge branch 'main' into logitsprocs_merge
7d89720 batch update builder
33e0f14 comments
26c18d6 Merge branch 'logitsprocs' into lp_ext
2e56aec add custom logitsprocs arg
2213b44 Merge branch 'main' into logitsprocs_merge
36d6f69 logitsprocs+pooling bugfix
28b6606 Merge branch 'logitsprocs' into lp_ext_merge
e422caa Merge branch 'lp_ext' into lp_ext_py
3cca78f small tweaks
4177594 refactor
40407b7 Merge branch 'main' into logitsprocs_merge
5209ffe Fixed min tokens bug
301db58 Merge branch 'main' into logitsprocs_merge
6f41503 fixed logit bias bug
36f161d Merge branch 'main' into logitsprocs_merge
a14d3a4 Merge branch 'logitsprocs' into lp_ext_merge
1716f07 Merge branch 'lp_ext' into lp_ext_py
b429c10 Merge branch 'main' into logitsprocs_merge
fbdb595 comment Re: output tokens list ref
e3dc71e Merge branch 'logitsprocs' into logitsprocs_merge
aa4c519 Merge branch 'main' into logitsprocs_merge
3ae8a6b Merge branch 'logitsprocs' into lp_ext_merge
d58bf24 Merge branch 'lp_ext' into lp_ext_py
77bba48 refactor
890a9cd refactor
6b3ea9f Update vllm/v1/sample/logits_processor.py
8a8f9c2 wip
070d71d Merge branch 'main' into logitsprocs_merge
5384732 feedback
9aebc9f Update vllm/v1/sample/sampler.py
8bb6bf0 revert some changes
0a88e16 refactor
18721da Merge branch 'logitsprocs' of https://github.com/neuralmagic/vllm int…
dc0b23a refactor
21ad212 Merge branch 'main' into logitsprocs_merge
2f0de77 argmax_invariant
8d97a7c batch update builder impl
2abd24d Merge branch 'main' into logitsprocs_merge
d1c6607 refactor
9fe0bc3 wip dict removal
aa18e8f Merge branch 'main' into logitsprocs_merge
f7a162c Merge branch 'main' into logitsprocs_merge
de81e42 updated unit tests
20928f0 refactor
a0e5398 iterators
d4704d7 refactor
729729d reorg
9948fd3 Merge branch 'main' into logitsprocs_merge
bc48f38 Merge branch 'main' into logitsprocs_merge
9eeea03 feedback
1078a24 Merge branch 'main' into logitsprocs_merge
cd766a4 feedback
2628f98 Merge branch 'main' into logitsprocs_merge
2ecb37d Merge branch 'main' into logitsprocs_merge
64ac2cf input batch tests
4da82cc Merge branch 'main' into logitsprocs_merge
bd62df4 refactor
8455bb6 Merge branch 'main' into logitsprocs_merge
a6dc218 attempted fmt fix
a870259 wip
072ee00 wip
55fd6e7 fixed cancellation bug
6d4e073 Merge branch 'logitsprocs' into lp_ext_merge
ab3a985 Merge branch 'lp_ext' into lp_ext_py
b55f88e Merge branch 'main' into logitsprocs_merge
348a100 Merge branch 'logitsprocs' into lp_ext
c397e24 Merge branch 'lp_ext' into lp_ext_py
1217b74 wip
402d012 Update vllm/v1/worker/gpu_model_runner.py
7c15b43 CLI
06fc926 pr feedback
8d229ed Merge branch 'main' into logitsprocs_merge
4d0b612 Merge branch 'logitsprocs' into lp_ext
4b1884b Merge branch 'lp_ext' into lp_ext_py
99c0c18 skeleton of example
aabd1dd fixes
63b640c wip
45dade4 mem util
d377a6b Merge branch 'main' into logitsprocs_merge
6ae7574 memory util
5203324 Merge branch 'main' into logitsprocs_merge
68aab25 Merge branch 'main' into logitsprocs_merge
066736d merge'
31597e9 Merge branch 'logitsprocs' into lp_ext
957bd86 Merge branch 'lp_ext' into lp_ext_py
3a5564d refactor
663bff1 refactor
69c2a0d merge
538c378 Merge branch 'main' into lp_ext
270b184 wip
195f651 Merge branch 'main' into lp_ext_merge
f9df850 Merge branch 'lp_ext' into lp_ext_py
fc9c308 py llm plumbing
3aa383e wip lp example
b420aac wip
a475fe9 Merge branch 'main' into lp_ext_merge
01d640c Merge branch 'lp_ext' into lp_ext_py
138dc07 Merge branch 'main' into lp_ext_merge
699768a Merge branch 'lp_ext' into lp_ext_py
ee88fdf Merge branch 'main' into lp_ext_py
6a405ab first pass at lp loading system
0de1e73 wip
ef51732 Merge branch 'main' into lp_ext_py
c8e8671 loading logitsprocs
52146dc refactor
e79f9ad lp tests passing
4c16135 Merge branch 'main' into lp_ext_py
2e330e1 refactor
7a60363 logitsprocs
18129b4 example w/ dummy logitproc
e73c00c refactor
f612fcf Merge branch 'main' into lp_ext
4af5159 entrypoint example
be7177a cli arg
0ad8b1c removed regex
c21a2ec fqn/entrypoint examples
4730d7a cli tests
1617747 Merge branch 'main' into lp_ext
1784079 Merge branch 'main' into lp_ext_merge
f078ce7 tail end of merge
129479a Merge branch 'main' into lp_ext_merge
ac1509f refactor
5b85255 wip
d7499db Merge branch 'main' into lp_ext_merge
ee74904 all lp plugins are loaded; can pass lp types to LLM; refactor
be9e750 Merge branch 'main' into lp_ext_merge
4f80bee unit test fix
683b99f typo
f7ee5ee refactor
ad08c45 refactor
bbabe50 abstract __init__ (abf149)
9e88f37 fqn (abf149)
ae55b2e merge (abf149)
d08a89d type checking (abf149)
e7cb8e1 merge
aae02a0 small fix
2d807af fixes
c0cdd27 merge
2a79d4a merge
7c49fe1 Merge branch 'main' into lp_ext_merge
84112c0 Merge branch 'lp_ext_merge' into lp_ext
bd243c9 refactor
07d6056 fix
83eca33 fix test bug
6a4597b cli test works again
9d2156c Merge branch 'main' into lp_ext_merge
8c2d16c LLM entrypoint testing
35999c8 Merge branch 'main' into lp_ext_merge
e6123e7 cli entrypoints test
da8aa76 fixed example
12d48a7 adding prompt tokens to added requests
c4a76be Merge branch 'main' into lp_ext_merge
4a57631 initial feedback
6697a30 Merge branch 'main' into lp_ext_merge
5c350a3 wip
4aa5c86 merge load.py into __init__.py
d3099b4 refactor
f6cbcad resetting test.txt
ffbc6f2 logitsprocs in input batch
ea3c970 Merge branch 'main' into lp_ext_merge
90472d8 merge
961c863 wip feedback
c50b51a Merge branch 'main' into lp_ext_merge
89618f1 prompt token ids ordering
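The three new files below (an offline-inference example, an OpenAI-client example, and an entrypoints test) all exercise the same extension point. For orientation, here is a minimal sketch of what a custom V1 logits processor could look like under this PR. The import path follows the touched file vllm/v1/sample/logits_processor.py, but the class name, constructor signature, and hook names are assumptions inferred from the commit messages ("argmax_invariant", "batch update builder", "abstract __init__"), not a verbatim copy of the merged interface.

import torch
from typing import Optional

# Assumed import path, based on the file this PR modifies
from vllm.v1.sample.logits_processor import BatchUpdate, LogitsProcessor


class TargetTokenLogitsProcessor(LogitsProcessor):  # hypothetical example class
    """Mask out all tokens except a per-request `target_token`."""

    def __init__(self, vllm_config, device, is_pin_memory):  # assumed signature
        # Maps batch row index -> target token id for active requests
        self.req_info: dict[int, int] = {}

    def is_argmax_invariant(self) -> bool:
        # Masking changes which token has the highest logit, so this
        # processor must also run under greedy (argmax) sampling
        return False

    def update_state(self, batch_update: Optional[BatchUpdate]) -> None:
        # Keep per-request state aligned with batch rows as requests are
        # added/removed/moved (see the "batch update builder" commits);
        # elided here, sketched after the offline example below
        ...

    def apply(self, logits: torch.Tensor) -> torch.Tensor:
        # In-place masking: keep only the target token's logit in each
        # active row, set everything else to -inf
        for row, target in self.req_info.items():
            kept = logits[row, target].item()
            logits[row] = float("-inf")
            logits[row, target] = kept
        return logits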
@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""This example demonstrates instantiating vLLM with a custom logits processor
class object.

For testing purposes, a dummy logits processor is employed which, if
`target_token` is passed as a keyword argument to `SamplingParams.extra_args`,
will mask out all tokens except `target_token`.

A batch is constructed with `temperature=0.0` and 50% of requests specifying
`target_token`, and for these requests - and *only* these requests - we
expect the `target_token` to be decoded in each step, yielding an output
similar to that shown below:

Generated Outputs:
------------------------------------------------------------
Prompt: 'Hello, my name is'
Output: " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
------------------------------------------------------------
Prompt: 'The president of the United States is'
Output: " not a racist. He is a racist.\nHe's a racist because he"
------------------------------------------------------------
Prompt: 'The capital of France is'
Output: ' also also also also also also also also also also also also also
also also also'
------------------------------------------------------------
Prompt: 'The future of AI is'
Output: ' in the hands of the people.\n\nThe future of AI is in the'
------------------------------------------------------------
"""

from vllm import LLM, SamplingParams
from vllm.test_utils import DummyLogitsProcessor

# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a mixture of requests which do and don't utilize the dummy logitproc
sampling_params_list = [
    SamplingParams(temperature=0.0, extra_args={"target_token": 128}),
    SamplingParams(temperature=0.0),
    SamplingParams(temperature=0.0, extra_args={"target_token": 67}),
    SamplingParams(temperature=0.0),
]


def main():
    # Create an LLM.
    llm = LLM(
        model="facebook/opt-125m",
        logits_processors=[DummyLogitsProcessor],
    )
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params_list)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
        print("-" * 60)


if __name__ == "__main__":
    main()
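This example relies on `DummyLogitsProcessor` reading `target_token` from each request's `SamplingParams.extra_args`. As a hedged continuation of the sketch above, this is roughly how the elided `update_state` hook might consume per-request arguments; the tuple layout of `batch_update.added` is an assumption (the commit "adding prompt tokens to added requests" suggests added entries carry a row index, the request's sampling params, and token-id lists).

    # Inside the sketched processor class above (hypothetical, not the
    # merged implementation):
    def update_state(self, batch_update):
        if batch_update is None:
            return  # batch composition unchanged this step
        for added in batch_update.added:
            index, params = added[0], added[1]  # assumed tuple layout
            target = (params.extra_args or {}).get("target_token")
            if target is not None:
                self.req_info[index] = target
        for index in batch_update.removed:
            # Request finished or was cancelled; drop its state
            self.req_info.pop(index, None)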
examples/online_serving/openai_completion_client_logits_processor.py (53 additions, 0 deletions)
@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def parse_args():
    parser = argparse.ArgumentParser(description="Client for vLLM API server")
    parser.add_argument(
        "--stream", action="store_true", help="Enable streaming response"
    )
    return parser.parse_args()


def main(args):
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Completion API
    completion = client.completions.create(
        model=model,
        prompt="A robot may not injure a human being",
        echo=False,
        n=2,
        stream=args.stream,
        logprobs=3,
    )

    print("-" * 50)
    print("Completion results:")
    if args.stream:
        for c in completion:
            print(c)
    else:
        print(completion)
    print("-" * 50)


if __name__ == "__main__":
    args = parse_args()
    main(args)
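As written, this client exercises the Completions API but never activates the custom logits processor. Wiring it up from the client side goes through `extra_body["vllm_xargs"]`, as the entrypoints test below does. A minimal variation of the request above, reusing the `client` and `model` variables from this example (`target_token` mirrors the offline example's argument name):

    completion = client.completions.create(
        model=model,
        prompt="A robot may not injure a human being",
        # vllm_xargs forwards per-request extra args into
        # SamplingParams.extra_args on the server, activating the
        # dummy logitproc with the given target token id
        extra_body={"vllm_xargs": {"target_token": 128}},
    )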
@@ -0,0 +1,118 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import random

import openai  # use the official client for correctness check
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
from tests.v1.sample.logits_processors.utils import (DUMMY_LOGITPROC_ARG,
                                                     MAX_TOKENS, MODEL_NAME,
                                                     TEMP_GREEDY, prompts)
from vllm.test_utils import DUMMY_LOGITPROC_FQCN


@pytest.fixture(scope="module")
def default_server_args():
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager",
    ]


@pytest.fixture(scope="function",
                params=[[], ["--logits-processors", DUMMY_LOGITPROC_FQCN]])
def server(default_server_args, request, monkeypatch):
    """Server CLI arg list is parameterized by logitproc source: either a
    fully-qualified class name (FQCN) specified via `--logits-processors`, or
    an entrypoint.

    The entrypoint requires no CLI argument, but for testing purposes an
    environment variable must be set to mock a dummy logits processor
    entrypoint.
    """
    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")
    if request.param:
        # Append FQCN argument
        default_server_args = default_server_args + request.param
    else:
        # Enable mock logit processor entrypoint
        monkeypatch.setenv("VLLM_MOCK_LP_ENTRYPOINT", "1")

    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


api_kwargs = {
    "temperature": TEMP_GREEDY,
    "max_tokens": MAX_TOKENS,
    "logprobs": 0,
}

extra_body_kwargs = {"vllm_xargs": {DUMMY_LOGITPROC_ARG: 128}}


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_custom_logitsprocs_cli(client: openai.AsyncOpenAI,
                                      model_name: str):
    """Test the CLI interface for passing custom logitsprocs.

    Launch a vLLM OpenAI-compatible server with a CLI argument that loads a
    custom logitproc with well-defined behavior (mask out all tokens except
    one `target_token`). The logitproc is specified by fully-qualified class
    name (FQCN) or by entrypoint, per the `server` fixture above.

    Pass in requests, 50% of which pass a `target_token` value through
    `extra_body["vllm_xargs"]` and 50% of which do not.

    Validate that requests which activate the custom logitproc only output
    `target_token`.
    """
    use_dummy_logitproc = True
    for prompt in prompts:
        # Send vLLM API request; for some requests, activate dummy logitproc
        kwargs = {
            **api_kwargs,
        }
        if use_dummy_logitproc:
            # For requests which activate the dummy logitproc, choose one of
            # two `target_token` values which are known not to be EOS tokens
            target_token = random.choice([128, 67])
            kwargs["extra_body"] = {
                "vllm_xargs": {
                    DUMMY_LOGITPROC_ARG: target_token
                }
            }
        batch = await client.completions.create(
            model=model_name,
            prompt=prompt,
            **kwargs,
        )

        if use_dummy_logitproc:
            # Only for requests which activate dummy logitproc - validate that
            # only `target_token` is generated
            choices: list[openai.types.CompletionChoice] = batch.choices
            toks = choices[0].logprobs.tokens
            if not all(x == toks[0] for x in toks):
                raise AssertionError(
                    f"Generated {toks} should all be {toks[0]}")

        # Alternate whether to activate dummy logitproc for each request
        use_dummy_logitproc = not use_dummy_logitproc
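For reference, outside the test harness the FQCN variant of this configuration plausibly corresponds to a server launch like the following. The dotted path is an assumption consistent with `DummyLogitsProcessor` living in `vllm.test_utils`; the exact FQCN spelling accepted by the flag may differ.

vllm serve facebook/opt-125m --logits-processors vllm.test_utils.DummyLogitsProcessor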