[V1] Logits processors extensibility #19912

Open

Wants to merge 336 commits into base: main (this view shows changes from 250 of the 336 commits).

Commits
bd1ffa3
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 6, 2025
279679b
Merge branch 'logitsprocs_reorg' into logitsprocs_reorg_bugfix
afeldman-nm Jun 6, 2025
8cf4817
merge
afeldman-nm Jun 7, 2025
17c10ca
refactor
afeldman-nm Jun 7, 2025
849d829
bugfix - redundant batch update
afeldman-nm Jun 7, 2025
03a836b
min tokens test bugfix
afeldman-nm Jun 7, 2025
5ab8af1
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 7, 2025
d92a3f3
remove prints
afeldman-nm Jun 7, 2025
1f87ec8
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 8, 2025
ef2294d
rejection sampling test bugfix
afeldman-nm Jun 8, 2025
198db48
sampler test bugfix
afeldman-nm Jun 8, 2025
2f2550b
removed logitsprocs where not needed in test
afeldman-nm Jun 8, 2025
1b1f8ca
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 8, 2025
5fc130b
refactor
afeldman-nm Jun 9, 2025
7b8f299
sampling_params min-p check
afeldman-nm Jun 9, 2025
4d5ea01
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 9, 2025
d8a6761
Merge branch 'logitsprocs' into logitsprocs_valid
afeldman-nm Jun 9, 2025
0515848
small test optimization
afeldman-nm Jun 9, 2025
17e7f62
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 9, 2025
c392898
refactor
afeldman-nm Jun 10, 2025
dc4b6b8
wip tests
afeldman-nm Jun 11, 2025
fd26581
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 11, 2025
5fb16a6
refactor
afeldman-nm Jun 11, 2025
b0658c2
passing mixed batch test for min_p and none
afeldman-nm Jun 11, 2025
ac608f1
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 11, 2025
7f44262
Merge branch 'logitsprocs' into logitsprocs_reorder
afeldman-nm Jun 11, 2025
0a20965
mix batch test passes without reorder
afeldman-nm Jun 12, 2025
bdea83c
refactor
afeldman-nm Jun 12, 2025
7f4d72e
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 12, 2025
ec25ab5
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 12, 2025
5a5e38f
move-only
afeldman-nm Jun 12, 2025
19d3882
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 12, 2025
588b845
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 13, 2025
38746ae
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 13, 2025
9703c4a
fake reordering logic
afeldman-nm Jun 13, 2025
6078602
fake logitsproc invocation against fake batch
afeldman-nm Jun 13, 2025
ae5b600
almost passing
afeldman-nm Jun 13, 2025
d5679bb
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 13, 2025
84cad20
Merge branch 'logitsprocs' into logitsprocs_reorder
afeldman-nm Jun 13, 2025
89ea6dd
wip refactor
afeldman-nm Jun 13, 2025
76438fb
test fix
afeldman-nm Jun 13, 2025
e03c561
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 13, 2025
045bc01
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 16, 2025
9ac6190
latest
afeldman-nm Jun 16, 2025
c1b8e69
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 16, 2025
e83f90b
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 17, 2025
360f2c4
removed tpu hack
afeldman-nm Jun 17, 2025
eac0c82
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 17, 2025
395d472
wip tpu backward compat
afeldman-nm Jun 17, 2025
5c53a8c
typing
afeldman-nm Jun 17, 2025
e08e4f4
Merge branch 'logitsprocs' into logitsprocs_tpu
afeldman-nm Jun 17, 2025
f7969c5
wip
afeldman-nm Jun 17, 2025
1117f51
first pass at tpu/gpu separation
afeldman-nm Jun 17, 2025
e0fd74b
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 17, 2025
2a4e09c
first pass at new TPU approach
afeldman-nm Jun 17, 2025
1f4cad3
docstrings
afeldman-nm Jun 17, 2025
a6be23c
Merge branch 'main' into tpu-isolate
afeldman-nm Jun 17, 2025
b28588c
merged in GPU/TPU decoupling PR
afeldman-nm Jun 17, 2025
ca87319
bugfix
afeldman-nm Jun 17, 2025
32e4275
type checking
afeldman-nm Jun 17, 2025
9aeb49d
Merge branch 'main' into tpu-isolate
afeldman-nm Jun 18, 2025
0383e73
InputBatch fix
afeldman-nm Jun 18, 2025
9564879
Merge branch 'tpu-isolate' into logitsprocs_merge
afeldman-nm Jun 18, 2025
c02ef1b
merge
afeldman-nm Jun 18, 2025
b804423
vllm_xargs/kv_transfer_params compatibility
afeldman-nm Jun 18, 2025
17f02ee
fix
afeldman-nm Jun 18, 2025
061ac67
remove unnecessary unit test
afeldman-nm Jun 18, 2025
421c278
precedence
afeldman-nm Jun 18, 2025
f315e0e
pre-commit fix
afeldman-nm Jun 18, 2025
3d92a07
Merge branch 'main' into extra_args_merge
afeldman-nm Jun 18, 2025
873b89f
merge
afeldman-nm Jun 18, 2025
f8609ff
Merge branch 'main' into extra_args_merge
afeldman-nm Jun 18, 2025
9c5f407
Documentation changes
afeldman-nm Jun 18, 2025
0857dc4
refactor
afeldman-nm Jun 18, 2025
f9c4e19
typing
afeldman-nm Jun 18, 2025
03c6010
typing
afeldman-nm Jun 18, 2025
95e1b0d
typing
afeldman-nm Jun 18, 2025
9daeaed
Update vllm/entrypoints/openai/protocol.py
afeldman-nm Jun 18, 2025
baf90c9
feedback
afeldman-nm Jun 18, 2025
4f04198
remove swap type
afeldman-nm Jun 18, 2025
da23801
refactor
afeldman-nm Jun 18, 2025
34c9866
move/swap refactoring
afeldman-nm Jun 18, 2025
e1f0455
refactoring
afeldman-nm Jun 18, 2025
fe088ea
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 18, 2025
f506dd7
small fixes
afeldman-nm Jun 18, 2025
3257deb
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 18, 2025
c0b2068
Merge branch 'main' into extra_args_merge
afeldman-nm Jun 18, 2025
5894110
Merge branch 'extra_args' into lp_ext
afeldman-nm Jun 18, 2025
3885bc5
merge
afeldman-nm Jun 20, 2025
e06f9e9
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 24, 2025
7d89720
batch update builder
afeldman-nm Jun 24, 2025
33e0f14
comments
afeldman-nm Jun 24, 2025
26c18d6
Merge branch 'logitsprocs' into lp_ext
afeldman-nm Jun 24, 2025
2e56aec
add custom logitsprocs arg
afeldman-nm Jun 24, 2025
2213b44
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 24, 2025
36d6f69
logitsprocs+pooling bugfix
afeldman-nm Jun 24, 2025
28b6606
Merge branch 'logitsprocs' into lp_ext_merge
afeldman-nm Jun 24, 2025
e422caa
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jun 24, 2025
3cca78f
small tweaks
afeldman-nm Jun 24, 2025
4177594
refactor
afeldman-nm Jun 24, 2025
40407b7
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 24, 2025
5209ffe
Fixed min tokens bug
afeldman-nm Jun 25, 2025
301db58
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
6f41503
fixed logit bias bug
afeldman-nm Jun 25, 2025
36f161d
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
a14d3a4
Merge branch 'logitsprocs' into lp_ext_merge
afeldman-nm Jun 25, 2025
1716f07
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jun 25, 2025
b429c10
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
fbdb595
comment Re: output tokens list ref
afeldman-nm Jun 25, 2025
e3dc71e
Merge branch 'logitsprocs' into logitsprocs_merge
afeldman-nm Jun 25, 2025
aa4c519
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
3ae8a6b
Merge branch 'logitsprocs' into lp_ext_merge
afeldman-nm Jun 25, 2025
d58bf24
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jun 25, 2025
77bba48
refactor
afeldman-nm Jun 25, 2025
890a9cd
refactor
afeldman-nm Jun 25, 2025
6b3ea9f
Update vllm/v1/sample/logits_processor.py
afeldman-nm Jun 25, 2025
8a8f9c2
wip
afeldman-nm Jun 25, 2025
070d71d
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
5384732
feedback
afeldman-nm Jun 25, 2025
9aebc9f
Update vllm/v1/sample/sampler.py
afeldman-nm Jun 25, 2025
8bb6bf0
revert some changes
afeldman-nm Jun 25, 2025
0a88e16
refactor
afeldman-nm Jun 25, 2025
18721da
Merge branch 'logitsprocs' of https://github.com/neuralmagic/vllm int…
afeldman-nm Jun 25, 2025
dc0b23a
refactor
afeldman-nm Jun 25, 2025
21ad212
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
2f0de77
argmax_invariant
afeldman-nm Jun 25, 2025
8d97a7c
batch update builder impl
afeldman-nm Jun 25, 2025
2abd24d
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
d1c6607
refactor
afeldman-nm Jun 25, 2025
9fe0bc3
wip dict removal
afeldman-nm Jun 25, 2025
aa18e8f
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 25, 2025
f7a162c
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 26, 2025
de81e42
updated unit tests
afeldman-nm Jun 26, 2025
20928f0
refactor
afeldman-nm Jun 26, 2025
a0e5398
iterators
afeldman-nm Jun 26, 2025
d4704d7
refactor
afeldman-nm Jun 26, 2025
729729d
reorg
afeldman-nm Jun 27, 2025
9948fd3
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 27, 2025
bc48f38
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 27, 2025
9eeea03
feedback
afeldman-nm Jun 28, 2025
1078a24
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 28, 2025
cd766a4
feedback
afeldman-nm Jun 28, 2025
2628f98
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 30, 2025
2ecb37d
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jun 30, 2025
64ac2cf
input batch tests
afeldman-nm Jul 1, 2025
4da82cc
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jul 1, 2025
bd62df4
refactor
afeldman-nm Jul 1, 2025
8455bb6
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jul 1, 2025
a6dc218
attempted fmt fix
afeldman-nm Jul 1, 2025
a870259
wip
afeldman-nm Jul 1, 2025
072ee00
wip
afeldman-nm Jul 1, 2025
55fd6e7
fixed cancellation bug
afeldman-nm Jul 1, 2025
6d4e073
Merge branch 'logitsprocs' into lp_ext_merge
afeldman-nm Jul 1, 2025
ab3a985
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jul 1, 2025
b55f88e
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jul 1, 2025
348a100
Merge branch 'logitsprocs' into lp_ext
afeldman-nm Jul 1, 2025
c397e24
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jul 1, 2025
1217b74
wip
afeldman-nm Jul 1, 2025
402d012
Update vllm/v1/worker/gpu_model_runner.py
afeldman-nm Jul 1, 2025
7c15b43
CLI
afeldman-nm Jul 1, 2025
06fc926
pr feedback
afeldman-nm Jul 1, 2025
8d229ed
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jul 1, 2025
4d0b612
Merge branch 'logitsprocs' into lp_ext
afeldman-nm Jul 1, 2025
4b1884b
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jul 1, 2025
99c0c18
skeleton of example
afeldman-nm Jul 1, 2025
aabd1dd
fixes
afeldman-nm Jul 1, 2025
63b640c
wip
afeldman-nm Jul 1, 2025
45dade4
mem util
afeldman-nm Jul 1, 2025
d377a6b
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jul 1, 2025
6ae7574
memory util
afeldman-nm Jul 1, 2025
5203324
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jul 1, 2025
68aab25
Merge branch 'main' into logitsprocs_merge
afeldman-nm Jul 1, 2025
066736d
merge
afeldman-nm Jul 2, 2025
31597e9
Merge branch 'logitsprocs' into lp_ext
afeldman-nm Jul 2, 2025
957bd86
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jul 2, 2025
3a5564d
refactor
afeldman-nm Jul 2, 2025
663bff1
refactor
afeldman-nm Jul 3, 2025
69c2a0d
merge
afeldman-nm Jul 3, 2025
538c378
Merge branch 'main' into lp_ext
afeldman-nm Jul 3, 2025
270b184
wip
afeldman-nm Jul 3, 2025
195f651
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 3, 2025
f9df850
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jul 3, 2025
fc9c308
py llm plumbing
afeldman-nm Jul 3, 2025
3aa383e
wip lp example
afeldman-nm Jul 3, 2025
b420aac
wip
afeldman-nm Jul 7, 2025
a475fe9
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 7, 2025
01d640c
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jul 7, 2025
138dc07
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 7, 2025
699768a
Merge branch 'lp_ext' into lp_ext_py
afeldman-nm Jul 7, 2025
ee88fdf
Merge branch 'main' into lp_ext_py
afeldman-nm Jul 7, 2025
6a405ab
first pass at lp loading system
afeldman-nm Jul 7, 2025
0de1e73
wip
afeldman-nm Jul 8, 2025
ef51732
Merge branch 'main' into lp_ext_py
afeldman-nm Jul 8, 2025
c8e8671
loading logitsprocs
afeldman-nm Jul 8, 2025
52146dc
refactor
afeldman-nm Jul 8, 2025
e79f9ad
lp tests passing
afeldman-nm Jul 8, 2025
4c16135
Merge branch 'main' into lp_ext_py
afeldman-nm Jul 8, 2025
2e330e1
refactor
afeldman-nm Jul 8, 2025
7a60363
logitsprocs
afeldman-nm Jul 8, 2025
18129b4
example w/ dummy logitproc
afeldman-nm Jul 8, 2025
e73c00c
refactor
afeldman-nm Jul 8, 2025
f612fcf
Merge branch 'main' into lp_ext
afeldman-nm Jul 8, 2025
4af5159
entrypoint example
afeldman-nm Jul 8, 2025
be7177a
cli arg
afeldman-nm Jul 8, 2025
0ad8b1c
removed regex
afeldman-nm Jul 8, 2025
c21a2ec
fqn/entrypoint examples
afeldman-nm Jul 8, 2025
4730d7a
cli tests
afeldman-nm Jul 8, 2025
1617747
Merge branch 'main' into lp_ext
afeldman-nm Jul 8, 2025
1784079
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 10, 2025
f078ce7
tail end of merge
afeldman-nm Jul 10, 2025
129479a
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 10, 2025
ac1509f
refactor
afeldman-nm Jul 10, 2025
5b85255
wip
afeldman-nm Jul 10, 2025
d7499db
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 10, 2025
ee74904
all lp plugins are loaded; can pass lp types to LLM; refactor
afeldman-nm Jul 11, 2025
be9e750
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 11, 2025
4f80bee
unit test fix
afeldman-nm Jul 11, 2025
683b99f
typo
afeldman-nm Jul 11, 2025
f7ee5ee
refactor
afeldman-nm Jul 11, 2025
ad08c45
refactor
afeldman-nm Jul 11, 2025
bbabe50
abstract __init__
abf149 Jul 14, 2025
9e88f37
fqn
abf149 Jul 14, 2025
ae55b2e
merge
abf149 Jul 14, 2025
d08a89d
type checking
abf149 Jul 14, 2025
e7cb8e1
merge
afeldman-nm Jul 16, 2025
aae02a0
small fix
afeldman-nm Jul 16, 2025
2d807af
fixes
afeldman-nm Jul 16, 2025
c0cdd27
merge
afeldman-nm Jul 16, 2025
2a79d4a
merge
afeldman-nm Jul 16, 2025
7c49fe1
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 16, 2025
84112c0
Merge branch 'lp_ext_merge' into lp_ext
afeldman-nm Jul 16, 2025
bd243c9
refactor
afeldman-nm Jul 16, 2025
07d6056
fix
afeldman-nm Jul 16, 2025
83eca33
fix test bug
afeldman-nm Jul 16, 2025
6a4597b
cli test works again
afeldman-nm Jul 16, 2025
9d2156c
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 17, 2025
8c2d16c
LLM entrypoint testing
afeldman-nm Jul 17, 2025
35999c8
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 17, 2025
e6123e7
cli entrypoints test
afeldman-nm Jul 17, 2025
da8aa76
fixed example
afeldman-nm Jul 17, 2025
12d48a7
adding prompt tokens to added requests
afeldman-nm Jul 17, 2025
c4a76be
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 17, 2025
4a57631
initial feedback
afeldman-nm Jul 17, 2025
6697a30
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 17, 2025
5c350a3
wip
afeldman-nm Jul 17, 2025
4aa5c86
merge load.py into __init__.py
afeldman-nm Jul 17, 2025
d3099b4
refactor
afeldman-nm Jul 17, 2025
f6cbcad
resetting test.txt
afeldman-nm Jul 17, 2025
ffbc6f2
logitsprocs in input batch
afeldman-nm Jul 17, 2025
ea3c970
Merge branch 'main' into lp_ext_merge
afeldman-nm Jul 17, 2025
104 changes: 104 additions & 0 deletions examples/offline_inference/logits_processor.py
@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from typing import Optional

import torch

from vllm import LLM, SamplingParams
from vllm.v1.sample.logits_processor import (
    BatchUpdate,
    LogitsProcessor,
    MoveDirectionality,
)


def make_dummy_logitproc_type():
    class DummyLogitsProcessor(LogitsProcessor):
        """Fake logit processor to support unit testing and examples"""

        def __init__(self, _):
            super().__init__()
            self.req_info = {}

        def is_argmax_invariant(self) -> bool:
            """Not argmax-invariant: masking out tokens can change the
            greedy sampling outcome"""
            return False

        def update_state(self, batch_update: Optional[BatchUpdate]):
            if not batch_update:
                return

            # Process added requests.
            for index, params, _ in batch_update.added:
                if isinstance(params, SamplingParams) and params.extra_args:
                    target_token = params.extra_args.get("target_token", None)
                else:
                    target_token = None
                self.req_info[index] = target_token

            if self.req_info:
                # Process removed requests.
                for index in batch_update.removed:
                    self.req_info.pop(index, None)

                # Process moved requests, unidirectional (a->b) and
                # swap (a<->b)
                for adx, bdx, direct in batch_update.moved:
                    if direct == MoveDirectionality.SWAP:
                        (self.req_info[adx], self.req_info[bdx]) = (
                            self.req_info[bdx],
                            self.req_info[adx],
                        )
                    else:
                        self.req_info[bdx] = self.req_info[adx]

        def apply(self, logits: torch.Tensor) -> torch.Tensor:
            for bdx in range(logits.shape[0]):
                if (target_token := self.req_info[bdx]) is not None:
                    mask = torch.ones_like(logits[bdx, :], dtype=torch.bool)
                    mask[target_token] = False
                    logits[bdx, mask] = float("-inf")

            return logits

    return DummyLogitsProcessor


# Sample prompts.
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
# Create a mixture of requests which do and don't utilize the dummy logitproc
sampling_params_list = [
    SamplingParams(temperature=0.0, extra_args={"target_token": 128}),
    SamplingParams(temperature=0.0),
    SamplingParams(temperature=0.0, extra_args={"target_token": 67}),
    SamplingParams(temperature=0.0),
]


def main():
    # Create an LLM.
    llm = LLM(
        model="facebook/opt-125m",
        logits_processors=[make_dummy_logitproc_type()],
    )
    # Generate texts from the prompts.
    # The output is a list of RequestOutput objects
    # that contain the prompt, generated text, and other information.
    outputs = llm.generate(prompts, sampling_params_list)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}")
        print(f"Output: {generated_text!r}")
    print("-" * 60)


if __name__ == "__main__":
    main()
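
The heart of apply() above is a one-token mask: for each row that registered a `target_token`, every other vocab entry is driven to -inf. Here is a self-contained sketch of that masking logic in plain PyTorch (no vLLM imports; the batch and vocab sizes are made up for illustration):

import torch

# Toy stand-in for DummyLogitsProcessor.apply(): row 0 targets token 5,
# row 1 opted out (None), so only row 0 is masked.
logits = torch.randn(2, 8)   # (batch=2, vocab=8), made-up sizes
req_info = {0: 5, 1: None}

for bdx in range(logits.shape[0]):
    if (target_token := req_info[bdx]) is not None:
        mask = torch.ones_like(logits[bdx, :], dtype=torch.bool)
        mask[target_token] = False      # keep only the target token's logit
        logits[bdx, mask] = float("-inf")

assert logits[0].argmax().item() == 5  # row 0 can now only emit token 5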
@@ -0,0 +1,53 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

from openai import OpenAI

# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"


def parse_args():
    parser = argparse.ArgumentParser(description="Client for vLLM API server")
    parser.add_argument(
        "--stream", action="store_true", help="Enable streaming response"
    )
    return parser.parse_args()


def main(args):
    client = OpenAI(
        # defaults to os.environ.get("OPENAI_API_KEY")
        api_key=openai_api_key,
        base_url=openai_api_base,
    )

    models = client.models.list()
    model = models.data[0].id

    # Completion API
    completion = client.completions.create(
        model=model,
        prompt="A robot may not injure a human being",
        echo=False,
        n=2,
        stream=args.stream,
        logprobs=3,
    )

    print("-" * 50)
    print("Completion results:")
    if args.stream:
        for c in completion:
            print(c)
    else:
        print(completion)
    print("-" * 50)


if __name__ == "__main__":
    args = parse_args()
    main(args)
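
This client does not itself activate the custom logitproc; as the CLI test later in this PR shows, a request opts in by passing `target_token` through `vllm_xargs` in `extra_body`. A minimal sketch of that request shape, assuming the server was launched with the dummy logitproc loaded (the `target_token` key name matches the example above):

# Sketch: per-request activation of the dummy logitproc, assuming the
# server has the custom logits processor loaded.
completion = client.completions.create(
    model=model,
    prompt="A robot may not injure a human being",
    temperature=0.0,
    extra_body={"vllm_xargs": {"target_token": 128}},  # opt in per request
)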
2 changes: 1 addition & 1 deletion tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -2,11 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import asyncio
 import os
-import re

 import openai  # use the official client for correctness check
 import pytest
 import pytest_asyncio
+import regex as re
 import requests

 from tests.utils import RemoteOpenAIServer
@@ -14,6 +14,7 @@
     create_prompt_tokens_tensor,
     fake_apply_logitsprocs,
     fake_update_logitsprocs_state)
+from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
 from vllm.utils import is_pin_memory_available
@@ -23,9 +24,9 @@
                                              LogitsProcessor,
                                              MinPLogitsProcessor,
                                              MinTokensLogitsProcessor,
-                                             MoveDirectionality,
-                                             init_builtin_logitsprocs)
+                                             MoveDirectionality)
 # yapf: enable
+from vllm.v1.sample.logits_processor.load import build_logitsprocs
 from vllm.v1.sample.metadata import SamplingMetadata

 PIN_MEMORY_AVAILABLE = is_pin_memory_available()
@@ -70,7 +71,6 @@ def __str__(self):
         summ = ', '.join(f'{k}={v}' for k, v in vars(self).items())
         return f"MyClass({summ})"

-
 def _generate_fake_sampling_metadata(
     num_output_tokens: int,
     batch_size: int,
@@ -88,11 +88,11 @@ def _generate_fake_sampling_metadata(
             vocab_size,
             size=np.random.randint(
                 1, MAX_NUM_PROMPT_TOKENS)).tolist())
-    logitsprocs = init_builtin_logitsprocs(
-        pin_memory_available=PIN_MEMORY_AVAILABLE,
-        max_num_reqs=MAX_NUM_REQS + 1,
-        device=device)
-
+    logitsprocs = build_logitsprocs(
+        vllm_config=VllmConfig(),
+        device=device,
+        is_pin_memory=PIN_MEMORY_AVAILABLE,
+    )
     fake_sampling_metadata = SamplingMetadata(
         temperature=torch.full((batch_size, ), 0.0),
         all_greedy=True,
113 changes: 113 additions & 0 deletions tests/v1/sample/logits_processors/test_custom_cli.py
@@ -0,0 +1,113 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import random

import openai  # use the official client for correctness check
import pytest
import pytest_asyncio

from tests.utils import RemoteOpenAIServer
from tests.v1.sample.logits_processors.utils import (
    DUMMY_LOGITPROC_ARG, DUMMY_LOGITPROC_ENTRYPOINT, DUMMY_LOGITPROC_FQN,
    MAX_TOKENS, MODEL_NAME, TEMP_GREEDY, prompts)


@pytest.fixture(scope="module")
def default_server_args():
    return [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "128",
        "--enforce-eager"
    ]


@pytest.fixture(
    scope="module",
    params=[[
        "--logits-processors-entrypoints",
        DUMMY_LOGITPROC_ENTRYPOINT + "," + DUMMY_LOGITPROC_ENTRYPOINT
    ],
            [
                "--logits-processors-fqns",
                DUMMY_LOGITPROC_FQN + "," + DUMMY_LOGITPROC_FQN
            ]])
def server(default_server_args, request):
    if request.param:
        default_server_args = default_server_args + request.param
    with RemoteOpenAIServer(MODEL_NAME, default_server_args) as remote_server:
        yield remote_server


@pytest_asyncio.fixture
async def client(server):
    async with server.get_async_client() as async_client:
        yield async_client


api_kwargs = {
    "temperature": TEMP_GREEDY,
    "max_tokens": MAX_TOKENS,
    "logprobs": 0,
}

extra_body_kwargs = {"vllm_xargs": {DUMMY_LOGITPROC_ARG: 128}}


@pytest.mark.asyncio
@pytest.mark.parametrize(
    "model_name",
    [MODEL_NAME],
)
async def test_custom_logitsprocs_cli(client: openai.AsyncOpenAI,
                                      model_name: str):
    """Test CLI interface for passing custom logitsprocs.

    Launch a vLLM OpenAI-compatible server with a CLI argument that loads a
    custom logitproc with well-defined behavior (mask out all tokens except
    one `target_token`). The test is implicitly parameterized by the
    logitproc source (fully-qualified class name or entrypoint).

    Pass in requests, 50% of which pass a `target_token` value through
    `extra_body["vllm_xargs"]` and 50% of which do not.

    Validate that requests which activate the custom logitproc only output
    `target_token`.
    """
    use_dummy_logitproc = True
    for prompt in prompts:
        # Send vLLM API request; for some requests, activate dummy logitproc
        kwargs = {
            **api_kwargs,
        }
        if use_dummy_logitproc:
            target_token = random.choice([128, 67])
            # For requests which activate the dummy logitproc, choose one of
            # two `target_token` values which are known not to be EOS tokens
            kwargs["extra_body"] = {
                "vllm_xargs": {
                    DUMMY_LOGITPROC_ARG: target_token
                }
            }
        batch = await client.completions.create(
            model=model_name,
            prompt=prompt,
            **kwargs,
        )

        if use_dummy_logitproc:
            # Only for requests which activate dummy logitproc - validate that
            # only `target_token` is generated
            choices: list[openai.types.CompletionChoice] = batch.choices
            toks = choices[0].logprobs.tokens
            if not all([x == toks[0] for x in toks]):
                raise AssertionError(
                    f"Generated {toks} should all be {toks[0]}")

        # Alternate whether to activate dummy logitproc for each request
        use_dummy_logitproc = not use_dummy_logitproc

Check failure on line 92 in tests/v1/sample/logits_processors/test_custom_cli.py (GitHub Actions / pre-commit): Incompatible types in assignment (expression has type "dict[str, dict[str, int]]", target has type "float") [assignment]
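
The same server-side wiring can be exercised outside pytest. A minimal sketch, reusing the `RemoteOpenAIServer` helper and the `--logits-processors-fqns` flag from the test above; `my_pkg.logitsprocs:DummyLogitsProcessor` is a hypothetical import path, and the exact FQN format is an assumption:

from tests.utils import RemoteOpenAIServer

# Hypothetical FQN; substitute the real module path of your LogitsProcessor
# subclass. The flag name is taken from the test fixture above.
LOGITPROC_FQN = "my_pkg.logitsprocs:DummyLogitsProcessor"

server_args = [
    "--dtype", "bfloat16",
    "--enforce-eager",
    "--logits-processors-fqns", LOGITPROC_FQN,
]

with RemoteOpenAIServer("facebook/opt-125m", server_args) as server:
    client = server.get_client()  # synchronous OpenAI client
    out = client.completions.create(
        model="facebook/opt-125m",
        prompt="Hello, my name is",
        temperature=0.0,
        max_tokens=8,
        extra_body={"vllm_xargs": {"target_token": 128}},
    )
    print(out.choices[0].text)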