
Commit a49f180

chore: address Nick's comments and add tests for v1
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
1 parent 45f68a8 commit a49f180

File tree

3 files changed: +69 -13 lines changed


tests/v1/engine/test_llm_engine.py

Lines changed: 65 additions & 6 deletions
@@ -1,19 +1,30 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from __future__ import annotations
 
 import random
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
 import pytest
 
-from vllm import LLM, SamplingParams
+from vllm import LLM
+from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from vllm.v1.metrics.reader import Counter, Gauge, Histogram, Metric, Vector
 
+if TYPE_CHECKING:
+    from tests.conftest import VllmRunner
+
 MODEL = "facebook/opt-125m"
 DTYPE = "half"
 
 
-def _vllm_model(apc: bool, vllm_runner, monkeypatch):
+def _vllm_model(
+    apc: bool,
+    vllm_runner: VllmRunner,
+    monkeypatch: pytest.MonkeyPatch,
+    *,
+    skip_tokenizer_init: bool = False,
+):
     """Set up VllmRunner instance."""
     monkeypatch.setenv("VLLM_USE_V1", "1")
     return vllm_runner(
@@ -23,6 +34,7 @@ def _vllm_model(apc: bool, vllm_runner, monkeypatch):
         enforce_eager=True,
         enable_prefix_caching=apc,
         gpu_memory_utilization=0.5,
+        skip_tokenizer_init=skip_tokenizer_init,
     )
 
 
@@ -45,9 +57,25 @@ def vllm_model_apc(vllm_runner, monkeypatch):
         yield vllm_model
 
 
+@pytest.fixture(
+    # Function scope decouples tests & allows
+    # env var adjustment via monkeypatch
+    scope="function",
+    # Prefix caching
+    params=[False, True])
+def vllm_model_skip_tokenizer_init(vllm_runner, request, monkeypatch):
+    """VllmRunner test fixture with APC."""
+    with _vllm_model(request.param,
+                     vllm_runner,
+                     monkeypatch,
+                     skip_tokenizer_init=True) as vllm_model:
+        yield vllm_model
+
+
 def _get_test_sampling_params(
     prompt_list: list[str],
     seed: Optional[int] = 42,
+    structured_outputs: bool = False,
 ) -> tuple[list[SamplingParams], list[int]]:
     """Generate random sampling params for a batch."""
 
@@ -62,14 +90,45 @@ def get_mostly_n_gt1() -> int:
     n_list = [get_mostly_n_gt1() for _ in range(len(prompt_list))]
     # High temperature to maximize the chance of unique completions
     return [
-        SamplingParams(temperature=0.95, top_p=0.95, n=n, seed=seed)
-        for n in n_list
+        SamplingParams(
+            temperature=0.95,
+            top_p=0.95,
+            n=n,
+            seed=seed,
+            guided_decoding=GuidedDecodingParams(
+                regex="[0-9]+") if structured_outputs else None,
+        ) for n in n_list
     ], n_list
 
 
+def test_compatibility_with_skip_tokenizer_init(
+    vllm_model_skip_tokenizer_init: VllmRunner,
+    example_prompts: list[str],
+):
+    # Case 1: Structured output request should raise an error.
+    sampling_params_list, _ = _get_test_sampling_params(
+        example_prompts,
+        structured_outputs=True,
+    )
+    model: LLM = vllm_model_skip_tokenizer_init.model
+    with pytest.raises(ValueError):
+        _ = model.generate(example_prompts, sampling_params_list)
+
+    # Case 2: Standard generation without structured outputs should succeed.
+    sampling_params_list, n_list = _get_test_sampling_params(
+        example_prompts,
+        structured_outputs=False,
+    )
+    outputs = model.generate(example_prompts, sampling_params_list)
+
+    # Basic sanity checks similar to parallel sampling test
+    for out, n in zip(outputs, n_list):
+        assert len(out.outputs) == n
+
+
 def test_parallel_sampling(vllm_model, example_prompts) -> None:
     """Test passes if parallel sampling `n>1` yields `n` unique completions.
-
+
     Args:
         vllm_model: VllmRunner instance under test.
         example_prompt: test fixture providing prompts for testing.
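
If needed, the new test can be run on its own with pytest's -k filter. The invocation below assumes a normal vLLM checkout with the test dependencies installed; it runs twice because the fixture is parametrized over prefix caching:

pytest tests/v1/engine/test_llm_engine.py -k test_compatibility_with_skip_tokenizer_init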

vllm/v1/engine/processor.py

Lines changed: 1 addition & 1 deletion
@@ -152,7 +152,7 @@ def _validate_structured_output(self, params: SamplingParams) -> None:
         if not params.guided_decoding or not self.decoding_config:
             return
 
-        if self.model_config.skip_tokenizer_init and self.decoding_config:
+        if self.model_config.skip_tokenizer_init and params.guided_decoding:
             raise ValueError(
                 "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'"  # noqa: E501
             )
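
The corrected guard above is what the new test's first case exercises. A condensed sketch of the behavior (the prompt and engine arguments are illustrative; the test itself drives this through the VllmRunner fixture rather than constructing LLM directly):

import pytest
from vllm import LLM
from vllm.sampling_params import GuidedDecodingParams, SamplingParams

llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True,
          enforce_eager=True, gpu_memory_utilization=0.5)

# Guided decoding needs a tokenizer, so the processor raises ValueError
# before any generation starts.
guided = SamplingParams(
    guided_decoding=GuidedDecodingParams(regex="[0-9]+"))
with pytest.raises(ValueError):
    llm.generate(["Hello, my name is"], guided)

# Ordinary sampling without guided decoding is still expected to succeed.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.95, top_p=0.95))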

vllm/v1/structured_output/__init__.py

Lines changed: 3 additions & 6 deletions
@@ -61,8 +61,7 @@ def __init__(self, vllm_config: VllmConfig):
            self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
 
     def grammar_init(self, request: Request) -> None:
-        if request.structured_output_request is None or \
-                self.vllm_config.model_config.skip_tokenizer_init:
+        if request.structured_output_request is None:
             return
 
         if TYPE_CHECKING:
@@ -119,8 +118,7 @@ def grammar_bitmask(
         scheduled_spec_decode_tokens: dict[str, list[int]],
     ) -> Optional[npt.NDArray[np.int32]]:
         # Prepare the structured output bitmask for this batch.
-        if not structured_output_request_ids \
-                or self.vllm_config.model_config.skip_tokenizer_init:
+        if not structured_output_request_ids:
             return None
 
         max_num_spec_tokens = 0
@@ -198,8 +196,7 @@ def grammar_bitmask(
         return bitmask_tensor.numpy()
 
     def should_advance(self, request: Request) -> bool:
-        if not request.use_structured_output \
-                or self.vllm_config.model_config.skip_tokenizer_init:
+        if not request.use_structured_output:
             return False
 
         # To determine whether we can advance the FSM.
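
Read together with the processor change above, dropping these skip_tokenizer_init checks looks consistent: once _validate_structured_output rejects any guided-decoding request when the tokenizer is skipped, grammar_init, grammar_bitmask, and should_advance should no longer see such requests, so the per-method guards reduce to their original conditions.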
