Commit 30ed44e

Handle structured text with skip_tokenizer_init=True
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent d32de8a commit 30ed44e
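
For context: with skip_tokenizer_init=True, vLLM loads no tokenizer, so prompts must arrive as pre-computed token IDs and structured (guided) output cannot work, since grammar compilation needs the tokenizer's vocabulary. A minimal sketch of that mode follows; the model name and token IDs are placeholders, not taken from this commit.

# Sketch only: placeholder model name and token IDs.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", skip_tokenizer_init=True)

# No tokenizer is loaded, so prompts are passed as token IDs and
# outputs are left undetokenized.
outputs = llm.generate(
    {"prompt_token_ids": [1, 2, 3, 4]},
    SamplingParams(max_tokens=8, detokenize=False),
)

Before this commit, EngineCore always constructed a StructuredOutputManager (with a None tokenizer in this mode); the commit instead skips constructing the manager entirely and guards every use of it.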

3 files changed (+23 −15 lines)

vllm/v1/core/sched/scheduler.py

Lines changed: 6 additions & 4 deletions
@@ -544,7 +544,7 @@ def schedule(self) -> SchedulerOutput:
             self.requests,
             structured_output_request_ids,
             scheduled_spec_decode_tokens,
-        )
+        ) if structured_output_request_ids else None
         # Construct the scheduler output.
         new_reqs_data = [
             NewRequestData.from_request(req,
@@ -826,8 +826,9 @@ def update_from_output(
             # the outer lists can be of length > 1.
             new_logprobs = logprobs.slice(req_index, req_index + 1)
 
-            if new_token_ids and self.structured_output_manager.should_advance(
-                    request):
+            if new_token_ids and self.structured_output_manager \
+                    and self.structured_output_manager.should_advance(
+                        request):
                 # NOTE: structured_output_request
                 # should not be None if use_structured_output, we have
                 # check above, so safe to ignore type warning
@@ -840,7 +841,8 @@ def update_from_output(
 
             # Add newly generated spec token ids to the request.
             if spec_token_ids is not None:
-                if self.structured_output_manager.should_advance(request):
+                if self.structured_output_manager \
+                        and self.structured_output_manager.should_advance(request):
                     metadata = request.structured_output_request
                     # Needs to happen after new_token_ids are accepted.
                     request.spec_token_ids = metadata.grammar.validate_tokens(  # type: ignore[union-attr]
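
Two things happen in this file. The schedule() hunk computes the grammar bitmask only when structured-output requests were actually scheduled (pairing with the early return removed from grammar_bitmask below), and the update_from_output() hunks rely on Python's short-circuiting `and` so that should_advance() is never called on a None manager. A toy sketch of that guard, with simplified names (not vLLM code):

# Toy sketch of the short-circuit guard used above.
class Scheduler:
    def __init__(self, structured_output_manager=None):
        # May be None when the engine runs with skip_tokenizer_init=True.
        self.structured_output_manager = structured_output_manager

    def update_from_output(self, request, new_token_ids):
        # `and` evaluates left to right: should_advance() is only reached
        # when new tokens exist AND a manager was constructed.
        if new_token_ids and self.structured_output_manager \
                and self.structured_output_manager.should_advance(request):
            print("advancing grammar for", request)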

vllm/v1/engine/core.py

Lines changed: 12 additions & 3 deletions
@@ -86,7 +86,11 @@ def __init__(self,
         self.collective_rpc("initialize_cache",
                             args=(num_gpu_blocks, num_cpu_blocks))
 
-        self.structured_output_manager = StructuredOutputManager(vllm_config)
+        if vllm_config.model_config.skip_tokenizer_init:
+            # Structured output generation requires a tokenizer
+            self.structured_output_manager = None
+        else:
+            self.structured_output_manager = StructuredOutputManager(vllm_config)
 
         # Setup scheduler.
         if isinstance(vllm_config.scheduler_config.scheduler_cls, str):
@@ -198,7 +202,11 @@ def add_request(self, request: EngineCoreRequest):
                                          request.mm_inputs, request.mm_hashes)
 
         req = Request.from_engine_core_request(request)
-        if req.use_structured_output:
+        if req.use_structured_output and self.structured_output_manager:
+            # We check for `structured_output_manager` because
+            # a StructuredOutputManager is not instantiated if a tokenizer
+            # is not initialized for the model.
+
             # Start grammar compilation asynchronously
             self.structured_output_manager.grammar_init(req)
 
@@ -299,7 +307,8 @@ def step_with_batch_queue(
         return engine_core_outputs, scheduled_batch
 
     def shutdown(self):
-        self.structured_output_manager.clear_backend()
+        if self.structured_output_manager:
+            self.structured_output_manager.clear_backend()
         if self.model_executor:
             self.model_executor.shutdown()
         if self.scheduler:
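
Taken together, these hunks make structured_output_manager an optional attribute that is guarded at construction, request admission, and shutdown. A self-contained toy sketch of that lifecycle (toy classes, not the vLLM implementation):

# Toy sketch of the Optional-manager lifecycle this commit introduces.
from typing import Optional


class StructuredOutputManager:
    def grammar_init(self, req: str) -> None:
        print(f"compiling grammar for {req}")

    def clear_backend(self) -> None:
        print("structured output backend cleared")


class EngineCore:
    def __init__(self, skip_tokenizer_init: bool) -> None:
        # No tokenizer means structured output is unsupported.
        self.structured_output_manager: Optional[StructuredOutputManager] = (
            None if skip_tokenizer_init else StructuredOutputManager())

    def add_request(self, req: str, use_structured_output: bool) -> None:
        if use_structured_output and self.structured_output_manager:
            self.structured_output_manager.grammar_init(req)

    def shutdown(self) -> None:
        if self.structured_output_manager:
            self.structured_output_manager.clear_backend()


EngineCore(skip_tokenizer_init=True).add_request("r1", True)   # silently skipped
EngineCore(skip_tokenizer_init=False).add_request("r2", True)  # compiles grammar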

vllm/v1/structured_output/__init__.py

Lines changed: 5 additions & 8 deletions
@@ -46,12 +46,11 @@ def __init__(self, vllm_config: VllmConfig):
         # compilation, so we set it to half the number of CPUs.
         max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
         self.executor = ThreadPoolExecutor(max_workers=max_workers)
-        self.tokenizer = None if vllm_config.model_config.skip_tokenizer_init \
-            else init_tokenizer_from_configs(
-                model_config=self.vllm_config.model_config,
-                scheduler_config=self.vllm_config.scheduler_config,
-                lora_config=self.vllm_config.lora_config,
-            ).get_lora_tokenizer(None)
+        self.tokenizer = init_tokenizer_from_configs(
+            model_config=self.vllm_config.model_config,
+            scheduler_config=self.vllm_config.scheduler_config,
+            lora_config=self.vllm_config.lora_config,
+        ).get_lora_tokenizer(None)
         reasoning_backend = vllm_config.decoding_config.reasoning_backend
         if reasoning_backend:
             reasoner_cls = ReasoningParserManager.get_reasoning_parser(
@@ -116,8 +115,6 @@ def grammar_bitmask(
         scheduled_spec_decode_tokens: dict[str, list[int]],
     ) -> Optional[npt.NDArray[np.int32]]:
         # Prepare the structured output bitmask for this batch.
-        if not structured_output_request_ids:
-            return None
 
         max_num_spec_tokens = 0
         if self.vllm_config.speculative_config is not None:
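
With the skip_tokenizer_init branch gone, StructuredOutputManager.__init__ can assume a tokenizer is always available, and grammar_bitmask can assume a non-empty batch because its caller in the scheduler (the schedule() hunk above) now skips the call otherwise. A toy sketch of this hoist-the-check-to-the-caller refactor, with simplified signatures:

# Toy sketch: the empty-batch check moves from the callee to the caller.
def grammar_bitmask(request_ids: dict[str, int]) -> list[int]:
    # After the refactor, the callee may assume a non-empty batch.
    assert request_ids, "caller guarantees at least one structured request"
    return [0] * len(request_ids)  # stand-in for the real numpy bitmask

def schedule(request_ids: dict[str, int]):
    # The caller decides whether a bitmask is needed at all.
    return grammar_bitmask(request_ids) if request_ids else None

print(schedule({}))            # None; grammar_bitmask is never called
print(schedule({"req-1": 0}))  # [0]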
