From 3c4a138b9c0478dd37493783d2f56766e9f7254f Mon Sep 17 00:00:00 2001 From: Andy Xie Date: Wed, 16 Jul 2025 00:04:59 +0800 Subject: [PATCH] [Misc] unify variable for LLM instance Signed-off-by: Andy Xie --- docs/configuration/model_resolution.md | 2 +- docs/features/lora.md | 4 +- docs/features/quantization/fp8.md | 10 ++- docs/features/quantization/int4.md | 3 +- docs/features/quantization/int8.md | 3 +- docs/models/pooling_models.md | 10 +-- examples/offline_inference/basic/classify.py | 4 +- examples/offline_inference/basic/embed.py | 4 +- examples/offline_inference/basic/score.py | 4 +- .../embed_jina_embeddings_v3.py | 4 +- .../offline_inference/embed_matryoshka_fy.py | 4 +- .../offline_inference/neuron_speculation.py | 8 +- .../prithvi_geospatial_mae.py | 4 +- examples/offline_inference/qwen3_reranker.py | 6 +- .../test_basic_correctness.py | 4 +- tests/basic_correctness/test_preemption.py | 10 +-- tests/conftest.py | 32 ++++---- tests/core/test_num_computed_tokens_update.py | 2 +- tests/detokenizer/test_stop_reason.py | 2 +- tests/detokenizer/test_stop_strings.py | 42 +++++------ tests/lora/test_llama_tp.py | 20 ++--- tests/metrics/test_metrics.py | 16 ++-- .../test_model_load_with_params.py | 10 +-- .../models/language/generation/test_hybrid.py | 2 +- .../language/generation/test_mistral.py | 14 ++-- tests/models/language/pooling/mteb_utils.py | 20 ++--- tests/models/language/pooling/test_gritlm.py | 4 +- tests/models/language/pooling/test_jina.py | 4 +- .../pooling/test_nomic_max_model_len.py | 6 +- .../pooling/test_truncation_control.py | 6 +- .../multimodal/generation/test_pixtral.py | 5 +- .../multimodal/generation/test_whisper.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 2 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 2 +- tests/models/quantization/test_modelopt.py | 6 +- tests/models/quantization/test_nvfp4.py | 6 +- .../test_disable_sliding_window.py | 22 +++--- tests/prefix_caching/test_prefix_caching.py | 6 +- tests/quantization/test_gptq_dynamic.py | 2 +- tests/quantization/test_quark.py | 4 +- .../test_register_quantization_config.py | 2 +- tests/samplers/test_ignore_eos.py | 2 +- tests/samplers/test_logits_processor.py | 10 +-- tests/samplers/test_logprobs.py | 4 +- tests/samplers/test_no_bad_words.py | 12 +-- tests/samplers/test_seeded_generate.py | 2 +- tests/spec_decode/e2e/conftest.py | 3 +- tests/tokenization/test_detokenize.py | 2 +- tests/v1/core/test_scheduler_e2e.py | 12 +-- tests/v1/engine/test_llm_engine.py | 14 ++-- tests/v1/sample/test_logprobs.py | 8 +- tests/v1/sample/test_sampling_params_e2e.py | 74 +++++++++---------- tests/v1/test_oracle.py | 6 +- 54 files changed, 237 insertions(+), 237 deletions(-) diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md index d98142a835c..49576a8217d 100644 --- a/docs/configuration/model_resolution.md +++ b/docs/configuration/model_resolution.md @@ -14,7 +14,7 @@ For example: ```python from vllm import LLM -model = LLM( +llm = LLM( model="cerebras/Cerebras-GPT-1.3B", hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 ) diff --git a/docs/features/lora.md b/docs/features/lora.md index 6acfdcce445..ea1b495138c 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au return tokenizer.apply_chat_template(chat, tokenize=False) - model = LLM( + llm = LLM( model=model_id, enable_lora=True, max_lora_rank=64, @@ -329,7 
+329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au } - outputs = model.generate( + outputs = llm.generate( inputs, sampling_params=SamplingParams( temperature=0.2, diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index a6c0fd78e76..0661933acd6 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -86,8 +86,9 @@ Load and run the model in `vllm`: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") -result = model.generate("Hello my name is") + +llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") +result = llm.generate("Hello my name is") print(result[0].outputs[0].text) ``` @@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei ```python from vllm import LLM -model = LLM("facebook/opt-125m", quantization="fp8") + +llm = LLM("facebook/opt-125m", quantization="fp8") # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB -result = model.generate("Hello, my name is") +result = llm.generate("Hello, my name is") print(result[0].outputs[0].text) ``` diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index f26de73c2f0..1df32a11ed9 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 7e1cb3fee94..45fae58a648 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f0de84a66f8..a91fd1dcb64 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -149,11 +149,11 @@ You can change the output dimensions of embedding models that support Matryoshka ```python from vllm import LLM, PoolingParams -model = LLM(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) -outputs = model.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) +outputs = llm.embed(["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32)) print(outputs[0].outputs) ``` diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 219064e9742..aaf0e83c9de 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -28,10 +28,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="classify" for classification models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate logits. The output is a list of ClassificationRequestOutputs. - outputs = model.classify(prompts) + outputs = llm.classify(prompts) # Print the outputs. 
print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 1114033d5ce..7ff9c7f5e0e 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -31,10 +31,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 6a08de2d2c3..d37527b0a13 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -27,10 +27,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="score" for cross-encoder models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate scores. The output is a list of ScoringRequestOutputs. - outputs = model.score(text_1, texts_2) + outputs = llm.score(text_1, texts_2) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index e68128399ba..7d78b8c63c6 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -30,11 +30,11 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. # Only text matching task is supported for now. See #16120 - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. print("\nGenerated Outputs:") diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 7f5d74d9a3a..50a645ba827 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -30,10 +30,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32)) + outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32)) # Print the outputs. 
print("\nGenerated Outputs:") diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index 2ef69f29863..aaf3006a327 100644 --- a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -25,7 +25,7 @@ def config_buckets(): os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" -def initialize_model(): +def initialize_llm(): """Create an LLM with speculative decoding.""" return LLM( model="openlm-research/open_llama_7b", @@ -43,9 +43,9 @@ def initialize_model(): ) -def process_requests(model: LLM, sampling_params: SamplingParams): +def process_requests(llm: LLM, sampling_params: SamplingParams): """Generate texts from prompts and print them.""" - outputs = model.generate(prompts, sampling_params) + outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text @@ -55,7 +55,7 @@ def process_requests(model: LLM, sampling_params: SamplingParams): def main(): """Main function that sets up the model and processes prompts.""" config_buckets() - model = initialize_model() + model = initialize_llm() # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, top_k=1) process_requests(model, sampling_params) diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 567c448a8c9..6dc03e85baa 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -140,7 +140,7 @@ class PrithviMAE: def __init__(self): print("Initializing PrithviMAE model") - self.model = LLM( + self.llm = LLM( model=os.path.join(os.path.dirname(__file__), "./model"), skip_tokenizer_init=True, dtype="float32", @@ -158,7 +158,7 @@ def run(self, input_data, location_coords): prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} - outputs = self.model.encode(prompt, use_tqdm=False) + outputs = self.llm.encode(prompt, use_tqdm=False) print("################ Inference done (it took seconds) ##############") return outputs[0].outputs.data diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/qwen3_reranker.py index fe3cebc348f..8b27c6ff524 100644 --- a/examples/offline_inference/qwen3_reranker.py +++ b/examples/offline_inference/qwen3_reranker.py @@ -17,13 +17,13 @@ # Models converted offline using this method can not only be more efficient # and support the vllm score API, but also make the init parameters more # concise, for example. -# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") +# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") # If you want to load the official original version, the init parameters are # as follows. 
-def get_model() -> LLM: +def get_llm() -> LLM: """Initializes and returns the LLM model for Qwen3-Reranker.""" return LLM( model=model_name, @@ -77,7 +77,7 @@ def main() -> None: ] documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] - model = get_model() + model = get_llm() outputs = model.score(queries, documents) print("-" * 30) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 2e103019f7a..13ddf035a55 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None: monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model: - if isinstance(vllm_model.model.llm_engine, LLMEngineV1): + if isinstance(vllm_model.llm.llm_engine, LLMEngineV1): v1_test_failed_model_execution(vllm_model) def v1_test_failed_model_execution(vllm_model): - engine = vllm_model.model.llm_engine + engine = vllm_model.llm.llm_engine mocked_execute_model = Mock( side_effect=RuntimeError("Mocked Critical Error")) engine.engine_core.engine_core.model_executor.execute_model =\ diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 341a39a42b8..db2fa2f6bef 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -81,7 +81,7 @@ def test_chunked_prefill_recompute( disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) for i in range(len(example_prompts)): @@ -118,10 +118,10 @@ def test_preemption( distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) total_preemption = ( - vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) + vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -174,12 +174,12 @@ def test_preemption_infeasible( ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) - req_outputs = vllm_model.model.generate( + req_outputs = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params, ) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) # Verify the request is ignored and not hang. 
diff --git a/tests/conftest.py b/tests/conftest.py index f3524d1fe2a..a18dbf58c80 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -784,7 +784,7 @@ def __init__( enforce_eager: Optional[bool] = False, **kwargs, ) -> None: - self.model = LLM( + self.llm = LLM( model=model_name, task=task, tokenizer=tokenizer_name, @@ -854,9 +854,9 @@ def generate( videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) outputs: list[tuple[list[list[int]], list[str]]] = [] for req_output in req_outputs: @@ -902,9 +902,9 @@ def generate_w_logprobs( videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -924,8 +924,8 @@ def generate_encoder_decoder_w_logprobs( ''' assert sampling_params.logprobs is not None - req_outputs = self.model.generate(encoder_decoder_prompts, - sampling_params=sampling_params) + req_outputs = self.llm.generate(encoder_decoder_prompts, + sampling_params=sampling_params) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params @@ -1018,7 +1018,7 @@ def generate_beam_search( videos=videos, audios=audios) - outputs = self.model.beam_search( + outputs = self.llm.beam_search( inputs, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) returned_outputs = [] @@ -1029,7 +1029,7 @@ def generate_beam_search( return returned_outputs def classify(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.classify(prompts) + req_outputs = self.llm.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] def embed(self, @@ -1044,11 +1044,11 @@ def embed(self, videos=videos, audios=audios) - req_outputs = self.model.embed(inputs, *args, **kwargs) + req_outputs = self.llm.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] def encode(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.encode(prompts) + req_outputs = self.llm.encode(prompts) return [req_output.outputs.data for req_output in req_outputs] def score( @@ -1058,18 +1058,18 @@ def score( *args, **kwargs, ) -> list[float]: - req_outputs = self.model.score(text_1, text_2, *args, **kwargs) + req_outputs = self.llm.score(text_1, text_2, *args, **kwargs) return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - executor = self.model.llm_engine.model_executor + executor = self.llm.llm_engine.model_executor return executor.apply_model(func) def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - del self.model + del self.llm cleanup_dist_env_and_memory() diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 1b958e34df8..9e1b7913dfb 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) - engine: LLMEngine = 
runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine # In multi-step + chunked-prefill there is no separate single prompt step. # What is scheduled will run for num_scheduler_steps always. diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py index 9716f7d72a5..1ff679789c9 100644 --- a/tests/detokenizer/test_stop_reason.py +++ b/tests/detokenizer/test_stop_reason.py @@ -28,7 +28,7 @@ def vllm_model(vllm_runner): def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) - llm = vllm_model.model + llm = vllm_model.llm # test stop token outputs = llm.generate(example_prompts, diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index efe938a20c4..cb87c44cc39 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -101,42 +101,42 @@ def _stop_token_id(llm): def test_stop_strings(): # If V0, must set enforce_eager=False since we use # async output processing below. - vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) + llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) if envs.VLLM_USE_V1: - _stop_basic(vllm_model) + _stop_basic(llm) else: - _set_async_mode(vllm_model, True) - _stop_basic(vllm_model) + _set_async_mode(llm, True) + _stop_basic(llm) - _set_async_mode(vllm_model, False) - _stop_basic(vllm_model) + _set_async_mode(llm, False) + _stop_basic(llm) if envs.VLLM_USE_V1: - _stop_multi_tokens(vllm_model) + _stop_multi_tokens(llm) else: - _set_async_mode(vllm_model, True) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, True) + _stop_multi_tokens(llm) - _set_async_mode(vllm_model, False) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, False) + _stop_multi_tokens(llm) if envs.VLLM_USE_V1: - _stop_partial_token(vllm_model) + _stop_partial_token(llm) else: - _set_async_mode(vllm_model, True) - _stop_partial_token(vllm_model) + _set_async_mode(llm, True) + _stop_partial_token(llm) - _set_async_mode(vllm_model, False) - _stop_partial_token(vllm_model) + _set_async_mode(llm, False) + _stop_partial_token(llm) if envs.VLLM_USE_V1: # FIXME: this does not respect include_in_output=False - # _stop_token_id(vllm_model) + # _stop_token_id(llm) pass else: - _set_async_mode(vllm_model, True) - _stop_token_id(vllm_model) + _set_async_mode(llm, True) + _stop_token_id(llm) - _set_async_mode(vllm_model, False) - _stop_token_id(vllm_model) + _set_async_mode(llm, False) + _stop_token_id(llm) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index bebf44b6dfd..b1ad1fdd060 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, model_uri = tmp_path / "vllm" / model_ref / suffix / model_name tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) - loaded_vllm_model = LLM(model=model_ref, - load_format="tensorizer", - enable_lora=True, - enforce_eager=True, - model_loader_extra_config=tensorizer_config, - max_num_seqs=13, - tensor_parallel_size=2, - max_loras=2) + loaded_llm = LLM(model=model_ref, + load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2) tc_as_dict = tensorizer_config.to_serializable() print("lora adapter created") - assert do_sample(loaded_vllm_model, + assert 
do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 1") - assert do_sample(loaded_vllm_model, + assert do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1) == EXPECTED_LORA_OUTPUT diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 7bb5d8980d6..b527b94aa58 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -44,7 +44,7 @@ def test_metric_counter_prompt_tokens( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() prompt_token_counts = [ len(tokenizer.encode(p)) for p in example_prompts ] @@ -56,7 +56,7 @@ def test_metric_counter_prompt_tokens( vllm_prompt_token_count = sum(prompt_token_counts) _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_prompt_tokens.labels( **stat_logger.labels)._value.get() @@ -80,8 +80,8 @@ def test_metric_counter_generation_tokens( disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -116,8 +116,8 @@ def test_metric_counter_generation_tokens_multi_step( disable_async_output_proc=disable_async_output_proc, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -148,7 +148,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, disable_log_stats=False, gpu_memory_utilization=0.3, served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metrics_tag_content = stat_logger.labels["model_name"] if envs.VLLM_CI_USE_S3: @@ -256,7 +256,7 @@ def test_metric_spec_decode( ) as vllm_model: # Force log interval to be 0 to catch all metrics. 
- stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] stat_logger.local_interval = 0 # Note that the purpose of this test is to verify spec decode diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 4bdb651e517..53ade6b9bda 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.llm.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name def check_model(model): diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index eba14e64553..67260a0aac6 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -275,7 +275,7 @@ def test_models_preemption_recompute( Tests that outputs are identical with and w/o preemptions (recompute). 
""" with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.model.llm_engine.scheduler[0] + scheduler = vllm_model.llm.llm_engine.scheduler[0] scheduler.ENABLE_ARTIFICIAL_PREEMPT = True preempt_vllm_outputs = vllm_model.generate_greedy( example_prompts, max_tokens) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index c70698ede37..81a88f2d485 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, load_format="mistral") as vllm_model: for prompt in SYMBOLIC_LANG_PROMPTS: msg = {"role": "user", "content": prompt} - outputs = vllm_model.model.chat([msg], - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat([msg], + sampling_params=SAMPLING_PARAMS) assert "�" not in outputs[0].outputs[0].text.strip() @@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: load_format="mistral") as vllm_model: msgs = copy.deepcopy(MSGS) - outputs = vllm_model.model.chat(msgs, - tools=TOOLS, - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat(msgs, + tools=TOOLS, + sampling_params=SAMPLING_PARAMS) - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() tool_parser = MistralToolParser(tokenizer) model_output = outputs[0].outputs[0].text.strip() @@ -308,7 +308,7 @@ def test_mistral_guided_decoding( f"Give an example JSON for an employee profile that " f"fits this schema: {SAMPLE_JSON_SCHEMA}" }] - outputs = vllm_model.model.chat(messages, sampling_params=params) + outputs = vllm_model.llm.chat(messages, sampling_params=params) generated_text = outputs[0].outputs[0].text json_response = json.loads(generated_text) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 6c4fde5fdfa..dd6769d2911 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder): def __init__(self, vllm_model): super().__init__() - self.model = vllm_model + self.llm = vllm_model self.rng = np.random.default_rng(seed=42) def encode( @@ -43,7 +43,7 @@ def encode( # issues by randomizing the order. 
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
-        outputs = self.model.embed(sentences, use_tqdm=False)
+        outputs = self.llm.embed(sentences, use_tqdm=False)
         embeds = np.array(outputs)
         embeds = embeds[np.argsort(r)]
         return embeds
@@ -61,10 +61,10 @@ def predict(
         queries = [s[0] for s in sentences]
         corpus = [s[1] for s in sentences]
 
-        outputs = self.model.score(queries,
-                                   corpus,
-                                   truncate_prompt_tokens=-1,
-                                   use_tqdm=False)
+        outputs = self.llm.score(queries,
+                                 corpus,
+                                 truncate_prompt_tokens=-1,
+                                 use_tqdm=False)
         scores = np.array(outputs)
         scores = scores[np.argsort(r)]
         return scores
@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
 
         if model_info.architecture:
             assert (model_info.architecture
-                    in vllm_model.model.llm_engine.model_config.architectures)
+                    in vllm_model.llm.llm_engine.model_config.architectures)
 
         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
-        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
+        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
 
     with hf_runner(model_info.name,
                    is_sentence_transformer=True,
@@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
                      max_num_seqs=8,
                      **vllm_extra_kwargs) as vllm_model:
 
-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config
 
         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index c2f70bb647a..5cf0d3fd4f6 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -124,7 +124,7 @@ def test_gritlm_offline_embedding(vllm_runner):
             task="embed",
             max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm
 
         d_rep = run_llm_encode(
             llm,
@@ -171,7 +171,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
             task="generate",
             max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm
 
         sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
         outputs = llm.generate(input, sampling_params=sampling_params)
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 9bfe7411e16..16c711407ae 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -87,10 +87,10 @@ def test_matryoshka(
                      task="embed",
                      dtype=dtype,
                      max_model_len=None) as vllm_model:
-        assert vllm_model.model.llm_engine.model_config.is_matryoshka
+        assert vllm_model.llm.llm_engine.model_config.is_matryoshka
 
         matryoshka_dimensions = (
-            vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
+            vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
         assert matryoshka_dimensions is not None
 
         if dimensions not in matryoshka_dimensions:
diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py
index 250b3a52835..7413ef578e3 100644
--- a/tests/models/language/pooling/test_nomic_max_model_len.py
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@
-23,7 +23,7 @@ def test_default(model_info, vllm_runner): with vllm_runner(model_info.name, task="embed", max_model_len=None) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. @@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): # set max_model_len <= 512 with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 256 # set 512 < max_model_len <= 2048 @@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): else: with vllm_runner(model_info.name, task="embed", max_model_len=1024) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 1024 diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index 33aff1c873f..c7399e01c73 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner, model_name, task="embed", max_model_len=max_model_len) as vllm_model: - llm_output = vllm_model.model.encode( + llm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) assert llm_output == f"""truncate_prompt_tokens value diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index 1def825ab08..e157d6f4a79 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -180,8 +180,7 @@ def test_chat( ) as vllm_model: outputs = [] for msg in MSGS: - output = vllm_model.model.chat(msg, - sampling_params=SAMPLING_PARAMS) + output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS) outputs.extend(output) @@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt, max_model_len=8192, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: - outputs = vllm_model.model.generate(prompt) + outputs = vllm_model.llm.generate(prompt) assert len(outputs) == 1, f"{len(outputs)=}" output: RequestOutput = outputs[0] diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 363d55153aa..4a65e8c9520 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -106,7 +106,7 @@ def run_test( 
tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm sampling_params = SamplingParams( temperature=0, diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 8c83d8f8a8a..cf8962ce497 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -85,7 +85,7 @@ def run_test( enforce_eager=enforce_eager, task=task, **vllm_runner_kwargs_) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() vllm_kwargs: dict[str, Any] = {} if get_stop_token_ids is not None: diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py index f889eea5e83..a6f5aeccf94 100644 --- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py +++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py @@ -96,7 +96,7 @@ def _run_test( dtype=dtype, enforce_eager=True, max_model_len=8192) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() texts = [ # this is necessary because vllm_model.embed will not apply any # templating to the prompt, and therefore lacks an image_pad diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py index 50c91f1f81c..712b6801de4 100644 --- a/tests/models/multimodal/pooling/test_jinavl_reranker.py +++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py @@ -56,7 +56,7 @@ def create_image_param(url: str) -> ChatCompletionContentPartImageParam: mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, ) as vllm_model: - outputs = vllm_model.model.score(query, documents) + outputs = vllm_model.llm.score(query, documents) return [output.outputs.score for output in outputs] diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py index 6ad526cc893..e23d4d9d211 100644 --- a/tests/models/quantization/test_modelopt.py +++ b/tests/models/quantization/test_modelopt.py @@ -45,7 +45,7 @@ reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. 
for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index b95dad9a4ef..b3c217e729e 100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -46,7 +46,7 @@ reason="modelopt_fp4 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index f00a8f6998c..b940ab416e6 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -25,25 +25,25 @@ @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) def test_disable_sliding_window(model_len_len, ): model, sliding_len, full_len = model_len_len - vllm_disabled_model = LLM(model, disable_sliding_window=True) - vllm_disabled_model.generate("Hi my name is") - model_config = vllm_disabled_model.llm_engine.model_config + disabled_llm = LLM(model, disable_sliding_window=True) + disabled_llm.generate("Hi my name is") + model_config = disabled_llm.llm_engine.model_config assert model_config.max_model_len == sliding_len, ( "Max len expected to equal sliding_len of %s, but got %s", sliding_len, model_config.max_model_len) - del vllm_disabled_model + del disabled_llm cleanup_dist_env_and_memory() - vllm_enabled_model = LLM(model, - enforce_eager=True, - disable_sliding_window=False, - enable_prefix_caching=False) - vllm_enabled_model.generate("Hi my name is") - model_config = vllm_enabled_model.llm_engine.model_config + enabled_llm = LLM(model, + enforce_eager=True, + disable_sliding_window=False, + enable_prefix_caching=False) + enabled_llm.generate("Hi my name is") + model_config = enabled_llm.llm_engine.model_config assert model_config.max_model_len == full_len, ( "Max len expected to equal full_len of %s, but got %s", full_len, model_config.max_model_len) - del vllm_enabled_model + del enabled_llm cleanup_dist_env_and_memory() diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index a65fc934b16..5bf6ed957c7 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -93,8 +93,8 @@ def test_mixed_requests( # Run all the promopts greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - req_outputs = vllm_model.model.generate(example_prompts, - greedy_params) + req_outputs = vllm_model.llm.generate(example_prompts, + greedy_params) # Verify number of cached tokens for i in range(len(req_outputs)): @@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model): 
max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_batched_tokens, ) - engine: LLMEngine = runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore engine.scheduler[0] = scheduler diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 23b999e7c67..aea50e99c1d 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.model.llm_engine.model_executor. + for name, submodule in (vllm_model.llm.llm_engine.model_executor. driver_worker.model_runner.model.named_modules()): if name == "lm_head": assert isinstance(submodule.quant_method, linear_method_cls) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 2db11cb997d..4a0c8ba4d8a 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -107,11 +107,11 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.model.llm_engine.model_executor. + quark_model = (quark_handle.llm.llm_engine.model_executor. driver_worker.model_runner.model) quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker. + fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. model_runner.model) fp8_state_dict = fp8_model.state_dict() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 6c541fdbeea..84705e92c85 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -111,7 +111,7 @@ def test_custom_quant(vllm_runner, model, monkeypatch): quantization="custom_quant", enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] qkv_proj = layer.self_attn.qkv_proj diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 7eb9c0b5fb8..ea4a17dd230 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -36,7 +36,7 @@ def test_ignore_eos( ignore_eos=True) for prompt in example_prompts: - ignore_eos_output = vllm_model.model.generate( + ignore_eos_output = vllm_model.llm.generate( prompt, sampling_params=sampling_params) output_length = len(ignore_eos_output[0].outputs[0].token_ids) assert output_length == max_tokens diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 901c8759126..123f9595e97 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -26,7 +26,7 @@ def test_logits_processor_force_generate( dtype: str, ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() repeat_times = 2 enforced_answers = " vLLM" vllm_token_ids = tokenizer.encode(enforced_answers, @@ -45,13 +45,13 @@ def pick_vllm(token_ids, logits): ) # 
test logits_processors when prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[0], params=params_with_logprobs, ) # test prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[1], params=SamplingParams( prompt_logprobs=3, @@ -60,11 +60,11 @@ def pick_vllm(token_ids, logits): ) # test grouped requests - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[2], params=SamplingParams(max_tokens=max_tokens), ) - outputs = vllm_model.model._run_engine(use_tqdm=False) + outputs = vllm_model.llm._run_engine(use_tqdm=False) assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 86c8a03eee1..87f40b10053 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -64,7 +64,7 @@ def test_get_prompt_logprobs( prompt_logprobs=num_top_logprobs, temperature=0.0, detokenize=detokenize) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) # Test whether logprobs are included in the results. @@ -174,7 +174,7 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, logprobs=None, temperature=0.0, detokenize=detokenize) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none) for i in range(len(results_logprobs_none)): diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 42b529ae169..11803b8d7a5 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -20,7 +20,7 @@ def v1(run_with_both_engines): def _generate( - model: LLM, + llm: LLM, prompt: str, num_prompt_tokens: int, temperature: float = 0, @@ -32,7 +32,7 @@ def _generate( ) # [([output_token_ids, ], [output_text, ]), ] - output = model.generate([prompt], sampling_params=sampling_params) + output = llm.generate([prompt], sampling_params=sampling_params) output_token_ids = output[0][0][0][num_prompt_tokens:] # [0] first (and only) request output @@ -66,10 +66,10 @@ def test_one_token_bad_word(self, vllm_runner): assert self.target_token_id not in output_token_ids def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, @@ -156,10 +156,10 @@ def test_two_token_bad_word(self, vllm_runner): or (self.neighbour_token_id2 in output_token_ids)) def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index b339b4b2ddf..5a0efd98acc 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -49,7 +49,7 @@ def test_random_sample_with_seed( sampling_params_seed_2 = copy.deepcopy(sampling_params) sampling_params_seed_2.seed = 200 - llm = vllm_model.model + llm = vllm_model.llm for prompt in example_prompts: for params in ( diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f3fe9db3f79..e4b1aa73654 100644 --- 
a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -212,8 +212,7 @@ def run_equality_correctness_test( with vllm_runner(**sd_args) as vllm_model: if ensure_all_accepted or expected_acceptance_rate is not None: # Force log interval to be 0 to catch all metrics. - stat_logger = vllm_model.model.llm_engine.stat_loggers[ - 'prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] stat_logger.local_interval = -100 sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index f8aeba8301b..ccafc884612 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -393,7 +393,7 @@ def test_decode_prompt_logprobs_chunked_prefill( logprobs=5, prompt_logprobs=5, temperature=0.0) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) for idx, result in enumerate(vllm_results): diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 85415f6ad4b..bd0320baef8 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -14,7 +14,7 @@ @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: return LLM(MODEL, enforce_eager=True, enable_prefix_caching=True, @@ -24,16 +24,16 @@ def model() -> LLM: block_size=16) -def test_concurrent_partial_prefill(model): - outputs = model.generate([PROMPT] * 3) +def test_concurrent_partial_prefill(llm): + outputs = llm.generate([PROMPT] * 3) assert len(outputs) == 3 for output in outputs: assert len(output.outputs) == 1 -def test_prefix_cache_stats_is_recorded(model): +def test_prefix_cache_stats_is_recorded(llm): # 17 tokens will make sure first 16 tokens are cached in a block input_tokens = {"prompt_token_ids": [101] * 17} - _ = model.generate([input_tokens]) - outputs = model.generate([input_tokens]) + _ = llm.generate([input_tokens]) + outputs = llm.generate([input_tokens]) assert outputs[0].num_cached_tokens == 16 diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 059106c62a2..f37686317fd 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -112,9 +112,9 @@ def test_compatibility_with_skip_tokenizer_init( example_prompts, structured_outputs=True, ) - model: LLM = vllm_model_skip_tokenizer_init.model + llm: LLM = vllm_model_skip_tokenizer_init.llm with pytest.raises(ValueError): - _ = model.generate(example_prompts, sampling_params_list) + _ = llm.generate(example_prompts, sampling_params_list) def test_parallel_sampling(vllm_model, example_prompts) -> None: @@ -125,8 +125,8 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: example_prompt: test fixture providing prompts for testing. 
""" sampling_params_list, n_list = _get_test_sampling_params(example_prompts) - model: LLM = vllm_model.model - outputs = model.generate(example_prompts, sampling_params_list) + llm: LLM = vllm_model.llm + outputs = llm.generate(example_prompts, sampling_params_list) # Validate each request response for out, n in zip(outputs, n_list): @@ -166,10 +166,10 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): speculative_config=speculative_config, disable_log_stats=False, ) as vllm_model: - model: LLM = vllm_model.model + llm: LLM = vllm_model.llm sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs = model.generate(example_prompts, sampling_params) + outputs = llm.generate(example_prompts, sampling_params) n_prompts = len(example_prompts) assert len(outputs) == n_prompts @@ -180,7 +180,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): total_tokens += len(out.outputs[0].token_ids) assert total_tokens == max_tokens * n_prompts - metrics = model.get_metrics() + metrics = llm.get_metrics() def find_metric(name) -> list[Metric]: found = [] diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 69180e6e5db..4f1f340a4cc 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -112,7 +112,7 @@ def _run_and_validate( max_tokens: int, do_apc: bool, ) -> None: - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( test_prompts, sampling_params=vllm_sampling_params) for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( @@ -288,7 +288,7 @@ def test_get_logprobs_and_prompt_logprobs( """ with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT): # Skip some test-cases to save time. @@ -378,7 +378,7 @@ def test_none_logprobs(vllm_model, example_prompts, prompt_logprobs=None, temperature=0.0, ) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none, ) @@ -408,7 +408,7 @@ def test_zero_logprobs(vllm_model, example_prompts, logprobs=0, prompt_logprobs=0, temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( + results_logprobs_zero = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_zero) for i in range(len(results_logprobs_zero)): diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index ac0f3eb5883..f53e1e1c485 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -14,30 +14,30 @@ @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: # Disable prefix caching so that we can test prompt logprobs. 
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949 # is merged return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False) -def test_n_gt_1(model): +def test_n_gt_1(llm): """ParallelSampling is supported.""" params = SamplingParams(n=3) - outputs = model.generate(PROMPT, params) + outputs = llm.generate(PROMPT, params) assert len(outputs[0].outputs) == 3 -def test_best_of(model): +def test_best_of(llm): """Raise a ValueError since best_of is deprecated.""" params = SamplingParams(n=2, best_of=3) with pytest.raises(ValueError): - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_penalties(model): +def test_penalties(llm): """Check that we do not get errors if applied.""" params = SamplingParams( @@ -49,18 +49,18 @@ def test_penalties(model): top_p=0.5, top_k=3, ) - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_stop(model): +def test_stop(llm): """Check that we respect the stop words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() STOP_IDX = 5 params = SamplingParams(temperature=0, stop=split_text[STOP_IDX]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should not contain the stop word. @@ -69,40 +69,40 @@ def test_stop(model): params = SamplingParams(temperature=0, stop=split_text[STOP_IDX], include_stop_str_in_output=True) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should contain the stop word. assert len(new_split_text) == STOP_IDX + 1 -def test_stop_token_ids(model): +def test_stop_token_ids(llm): """Check that we respect the stop token ids.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) stop_token_id_0 = output[0].outputs[0].token_ids[5] stop_token_id_1 = output[0].outputs[0].token_ids[6] stop_token_ids = [stop_token_id_1, stop_token_id_0] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 stop_token_ids = [stop_token_id_0, stop_token_id_1] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 -def test_detokenize_false(model): +def test_detokenize_false(llm): """Check that detokenize=False option works.""" - output = model.generate(PROMPT, SamplingParams(detokenize=False)) + output = llm.generate(PROMPT, SamplingParams(detokenize=False)) assert len(output[0].outputs[0].token_ids) > 0 assert len(output[0].outputs[0].text) == 0 - output = model.generate( + output = llm.generate( PROMPT, SamplingParams(detokenize=False, logprobs=3, prompt_logprobs=3)) assert len(output[0].outputs[0].token_ids) > 0 @@ -118,28 +118,28 @@ def test_detokenize_false(model): assert all(lp.decoded_token is None for lp in logprobs.values()) -def test_bad_words(model): +def test_bad_words(llm): """Check that we respect bad words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() 
bad_words_1 = " ".join(split_text[:2]) params = SamplingParams(temperature=0, bad_words=[bad_words_1]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text bad_words_2 = new_text.split()[-1] params = SamplingParams(temperature=0, bad_words=[bad_words_1, bad_words_2]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text assert bad_words_2 not in new_text -def test_logits_processor(model): +def test_logits_processor(llm): """Check that we reject logits processor.""" # This sample logits processor gives infinite score to the i-th token, @@ -150,47 +150,45 @@ def pick_ith(token_ids, logits): return logits with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(logits_processors=[pick_ith])) + _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith])) -def test_allowed_token_ids(model): +def test_allowed_token_ids(llm): """Check that we can use allowed_token_ids.""" TOKEN_ID = 10 allowed_token_ids = [TOKEN_ID] - output = model.generate( - PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids)) + output = llm.generate(PROMPT, + SamplingParams(allowed_token_ids=allowed_token_ids)) assert output[0].outputs[0].token_ids[-1] == TOKEN_ID # Reject empty allowed_token_ids. with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[])) # Reject negative token id. with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) # Reject out of vocabulary. with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(allowed_token_ids=[10000000])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) -def test_priority(model): +def test_priority(llm): """Check that we reject requests with priority.""" # Reject all allowed token ids with pytest.raises(ValueError): - _ = model.generate(PROMPT, priority=[1]) + _ = llm.generate(PROMPT, priority=[1]) -def test_seed(model): +def test_seed(llm): """Check that seed impacts randomness.""" - out_1 = model.generate(PROMPT, SamplingParams(seed=42)) - out_2 = model.generate(PROMPT, SamplingParams(seed=42)) - out_3 = model.generate(PROMPT, SamplingParams(seed=43)) + out_1 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_2 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_3 = llm.generate(PROMPT, SamplingParams(seed=43)) assert out_1[0].outputs[0].text == out_2[0].outputs[0].text assert out_1[0].outputs[0].text != out_3[0].outputs[0].text diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 7a7ba346a71..85ce9601352 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -112,9 +112,9 @@ def test_v1_llm_by_default(monkeypatch): m.delenv("VLLM_USE_V1") # Should default to V1 for supported config. - model = LLM(MODEL, enforce_eager=True, enable_lora=True) - print(model.generate("Hello my name is")) - assert hasattr(model.llm_engine, "engine_core") + llm = LLM(MODEL, enforce_eager=True, enable_lora=True) + print(llm.generate("Hello my name is")) + assert hasattr(llm.llm_engine, "engine_core") m.delenv("VLLM_USE_V1")