Commit 808cc3c
Better support for skip_tokenizer_init=True
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
1 parent 198ff71 commit 808cc3c
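For context, here is a minimal usage sketch of the path these changes target. The entrypoints below (LLM, TokensPrompt, and SamplingParams with detokenize=False) come from vLLM's public API rather than from this commit, and the model name is a placeholder:

from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

# With skip_tokenizer_init=True the engine never loads a tokenizer, so the
# prompt must already be token ids and the output stays as token ids.
llm = LLM(model="my-org/my-raw-input-model",  # placeholder model name
          skip_tokenizer_init=True)

outputs = llm.generate(
    TokensPrompt(prompt_token_ids=[1, 2, 3, 4]),     # pre-tokenized prompt
    SamplingParams(max_tokens=8, detokenize=False),  # nothing to detokenize with
)
print(outputs[0].outputs[0].token_ids)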

File tree

  vllm/config.py
  vllm/multimodal/registry.py
  vllm/v1/engine/llm_engine.py
  vllm/v1/engine/output_processor.py
  vllm/v1/engine/processor.py
  vllm/v1/structured_output/__init__.py

6 files changed: +34 −18 lines

vllm/config.py

Lines changed: 6 additions & 2 deletions
@@ -614,6 +614,7 @@ def __post_init__(self) -> None:
         self.served_model_name = get_served_model_name(self.model,
                                                        self.served_model_name)
         self.multimodal_config = self._init_multimodal_config()
+        self.model_supports_multimodal_raw_input = self._init_model_supports_multimodal_raw_input()
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()

@@ -706,6 +707,9 @@ def _init_multimodal_config(self) -> Optional["MultiModalConfig"]:

         return None

+    def _init_model_supports_multimodal_raw_input(self):
+        return self.registry.supports_multimodal_raw_input(self.architectures)
+
     def _get_encoder_config(self):
         return get_sentence_transformer_tokenizer_config(
             self.model, self.revision)

@@ -1100,10 +1104,10 @@ def get_sliding_window(self) -> Optional[Union[int, list[Optional[int]]]]:
             return self.get_hf_config_sliding_window()

     def get_vocab_size(self) -> int:
-        return self.hf_text_config.vocab_size
+        return getattr(self.hf_text_config, "vocab_size", 0)

     def get_hidden_size(self) -> int:
-        return self.hf_text_config.hidden_size
+        return getattr(self.hf_text_config, "hidden_size", 0)

     @property
     def is_deepseek_mla(self) -> bool:
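The getattr fallbacks in get_vocab_size / get_hidden_size matter because a model that consumes only raw (non-text) multimodal input may ship an hf_text_config without those attributes. A self-contained sketch of the idiom, using an illustrative stand-in config rather than a real HF config:

from types import SimpleNamespace

hf_text_config = SimpleNamespace()  # config with no text-model fields defined

# Plain attribute access would raise AttributeError here; getattr with a
# default lets callers that only need a placeholder value keep working.
vocab_size = getattr(hf_text_config, "vocab_size", 0)
hidden_size = getattr(hf_text_config, "hidden_size", 0)
print(vocab_size, hidden_size)  # -> 0 0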

vllm/multimodal/registry.py

Lines changed: 1 addition & 1 deletion
@@ -266,7 +266,7 @@ def create_processor(
         if not model_config.is_multimodal_model:
             raise ValueError(f"{model_config.model} is not a multimodal model")

-        if tokenizer is None:
+        if tokenizer is None and not model_config.skip_tokenizer_init:
             tokenizer = cached_tokenizer_from_config(model_config)
         if disable_cache is None:
             mm_config = model_config.get_multimodal_config()

vllm/v1/engine/llm_engine.py

Lines changed: 9 additions & 5 deletions
@@ -82,11 +82,15 @@ def __init__(
         self.dp_group = None
         self.should_execute_dummy_batch = False

-        # Tokenizer (+ ensure liveness if running in another process).
-        self.tokenizer = init_tokenizer_from_configs(
-            model_config=vllm_config.model_config,
-            scheduler_config=vllm_config.scheduler_config,
-            lora_config=vllm_config.lora_config)
+
+        if not self.vllm_config.model_config.skip_tokenizer_init:
+            # Tokenizer (+ ensure liveness if running in another process).
+            self.tokenizer = init_tokenizer_from_configs(
+                model_config=vllm_config.model_config,
+                scheduler_config=vllm_config.scheduler_config,
+                lora_config=vllm_config.lora_config)
+        else:
+            self.tokenizer = None

         # Processor (convert Inputs --> EngineCoreRequests)
         self.processor = Processor(vllm_config=vllm_config,
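With this change self.tokenizer can legitimately be None, so every component that consumes it has to tolerate the missing-tokenizer case. A sketch of that defensive pattern (the Detokenizer class below is illustrative, not vLLM's):

from typing import Optional

class Detokenizer:
    def __init__(self, tokenizer: Optional[object]):
        self.tokenizer = tokenizer

    def decode(self, token_ids: list[int]) -> Optional[str]:
        # Under skip_tokenizer_init=True there is no tokenizer, so callers
        # receive None and must fall back to working with raw token ids.
        if self.tokenizer is None:
            return None
        return self.tokenizer.decode(token_ids)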

vllm/v1/engine/output_processor.py

Lines changed: 4 additions & 1 deletion
@@ -327,8 +327,11 @@ def add_request(
         if request_id in self.request_states:
             raise ValueError(f"Request id {request_id} already running.")

+        tokenizer = None if not self.tokenizer else \
+            self.tokenizer.get_lora_tokenizer(request.lora_request)
+
         req_state = RequestState.from_new_request(
-            tokenizer=self.tokenizer.get_lora_tokenizer(request.lora_request),
+            tokenizer=tokenizer,
             request=request,
             prompt=prompt,
             parent_req=parent_req,

vllm/v1/engine/processor.py

Lines changed: 8 additions & 4 deletions
@@ -364,7 +364,10 @@ def _validate_model_input(
         prompt_type: Literal["encoder", "decoder"],
     ):
         model_config = self.model_config
-        tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)
+        if model_config.skip_tokenizer_init:
+            tokenizer = None
+        else:
+            tokenizer = self.tokenizer.get_lora_tokenizer(lora_request)

         prompt_ids = prompt_inputs["prompt_token_ids"]
         if not prompt_ids:

@@ -373,9 +376,10 @@ def _validate_model_input(
         else:
             raise ValueError(f"The {prompt_type} prompt cannot be empty")

-        max_input_id = max(prompt_ids, default=0)
-        if max_input_id > tokenizer.max_token_id:
-            raise ValueError(f"Token id {max_input_id} is out of vocabulary")
+        if tokenizer:
+            max_input_id = max(prompt_ids, default=0)
+            if max_input_id > tokenizer.max_token_id:
+                raise ValueError(f"Token id {max_input_id} is out of vocabulary")

         max_prompt_len = self.model_config.max_model_len
         if len(prompt_ids) > max_prompt_len:
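Input validation now degrades gracefully: without a tokenizer there is no max_token_id to compare against, so the out-of-vocabulary check is skipped while the emptiness and length checks remain. A stand-alone sketch of the resulting behavior (illustrative names, not vLLM's function):

def validate_prompt_ids(prompt_ids, tokenizer=None, max_model_len=4096):
    if not prompt_ids:
        raise ValueError("The prompt cannot be empty")

    # Only possible when a tokenizer (and thus a known vocabulary) exists.
    if tokenizer is not None:
        max_input_id = max(prompt_ids, default=0)
        if max_input_id > tokenizer.max_token_id:
            raise ValueError(f"Token id {max_input_id} is out of vocabulary")

    if len(prompt_ids) > max_model_len:
        raise ValueError("Prompt is longer than the maximum model length")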

vllm/v1/structured_output/__init__.py

Lines changed: 6 additions & 5 deletions
@@ -46,11 +46,12 @@ def __init__(self, vllm_config: VllmConfig):
         # compilation, so we set it to half the number of CPUs.
         max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
         self.executor = ThreadPoolExecutor(max_workers=max_workers)
-        self.tokenizer = init_tokenizer_from_configs(
-            model_config=self.vllm_config.model_config,
-            scheduler_config=self.vllm_config.scheduler_config,
-            lora_config=self.vllm_config.lora_config,
-        ).get_lora_tokenizer(None)
+        self.tokenizer = None if vllm_config.model_config.skip_tokenizer_init else \
+            init_tokenizer_from_configs(
+                model_config=self.vllm_config.model_config,
+                scheduler_config=self.vllm_config.scheduler_config,
+                lora_config=self.vllm_config.lora_config,
+            ).get_lora_tokenizer(None)
         reasoning_backend = vllm_config.decoding_config.reasoning_backend
         if reasoning_backend:
             reasoner_cls = ReasoningParserManager.get_reasoning_parser(
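The structured-output manager follows the same pattern, but grammar-constrained decoding fundamentally needs a tokenizer, so an application combining skip_tokenizer_init=True with guided decoding should expect that feature to be unavailable. A hedged sketch of a pre-flight check an application could add on its own side (not vLLM code):

def check_request(skip_tokenizer_init: bool, wants_guided_decoding: bool) -> None:
    # Grammar backends map token ids to grammar terminals via the tokenizer,
    # so refuse the combination up front instead of failing inside the engine.
    if skip_tokenizer_init and wants_guided_decoding:
        raise ValueError("Guided/structured output requires a tokenizer; "
                         "do not set skip_tokenizer_init=True for such requests.")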
