From 3c4a138b9c0478dd37493783d2f56766e9f7254f Mon Sep 17 00:00:00 2001 From: Andy Xie Date: Wed, 16 Jul 2025 00:04:59 +0800 Subject: [PATCH] [Misc] unify variable for LLM instance Signed-off-by: Andy Xie --- docs/configuration/model_resolution.md | 2 +- docs/features/lora.md | 4 +- docs/features/quantization/fp8.md | 10 ++- docs/features/quantization/int4.md | 3 +- docs/features/quantization/int8.md | 3 +- docs/models/pooling_models.md | 10 +-- examples/offline_inference/basic/classify.py | 4 +- examples/offline_inference/basic/embed.py | 4 +- examples/offline_inference/basic/score.py | 4 +- .../embed_jina_embeddings_v3.py | 4 +- .../offline_inference/embed_matryoshka_fy.py | 4 +- .../offline_inference/neuron_speculation.py | 8 +- .../prithvi_geospatial_mae.py | 4 +- examples/offline_inference/qwen3_reranker.py | 6 +- .../test_basic_correctness.py | 4 +- tests/basic_correctness/test_preemption.py | 10 +-- tests/conftest.py | 32 ++++---- tests/core/test_num_computed_tokens_update.py | 2 +- tests/detokenizer/test_stop_reason.py | 2 +- tests/detokenizer/test_stop_strings.py | 42 +++++------ tests/lora/test_llama_tp.py | 20 ++--- tests/metrics/test_metrics.py | 16 ++-- .../test_model_load_with_params.py | 10 +-- .../models/language/generation/test_hybrid.py | 2 +- .../language/generation/test_mistral.py | 14 ++-- tests/models/language/pooling/mteb_utils.py | 20 ++--- tests/models/language/pooling/test_gritlm.py | 4 +- tests/models/language/pooling/test_jina.py | 4 +- .../pooling/test_nomic_max_model_len.py | 6 +- .../pooling/test_truncation_control.py | 6 +- .../multimodal/generation/test_pixtral.py | 5 +- .../multimodal/generation/test_whisper.py | 2 +- .../multimodal/generation/vlm_utils/core.py | 2 +- .../multimodal/pooling/test_dse_qwen2_vl.py | 2 +- .../pooling/test_jinavl_reranker.py | 2 +- tests/models/quantization/test_modelopt.py | 6 +- tests/models/quantization/test_nvfp4.py | 6 +- .../test_disable_sliding_window.py | 22 +++--- tests/prefix_caching/test_prefix_caching.py | 6 +- tests/quantization/test_gptq_dynamic.py | 2 +- tests/quantization/test_quark.py | 4 +- .../test_register_quantization_config.py | 2 +- tests/samplers/test_ignore_eos.py | 2 +- tests/samplers/test_logits_processor.py | 10 +-- tests/samplers/test_logprobs.py | 4 +- tests/samplers/test_no_bad_words.py | 12 +-- tests/samplers/test_seeded_generate.py | 2 +- tests/spec_decode/e2e/conftest.py | 3 +- tests/tokenization/test_detokenize.py | 2 +- tests/v1/core/test_scheduler_e2e.py | 12 +-- tests/v1/engine/test_llm_engine.py | 14 ++-- tests/v1/sample/test_logprobs.py | 8 +- tests/v1/sample/test_sampling_params_e2e.py | 74 +++++++++---------- tests/v1/test_oracle.py | 6 +- 54 files changed, 237 insertions(+), 237 deletions(-) diff --git a/docs/configuration/model_resolution.md b/docs/configuration/model_resolution.md index d98142a835c..49576a8217d 100644 --- a/docs/configuration/model_resolution.md +++ b/docs/configuration/model_resolution.md @@ -14,7 +14,7 @@ For example: ```python from vllm import LLM -model = LLM( +llm = LLM( model="cerebras/Cerebras-GPT-1.3B", hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2 ) diff --git a/docs/features/lora.md b/docs/features/lora.md index 6acfdcce445..ea1b495138c 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au return tokenizer.apply_chat_template(chat, tokenize=False) - model = LLM( + llm = LLM( model=model_id, enable_lora=True, max_lora_rank=64, @@ -329,7 
+329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au } - outputs = model.generate( + outputs = llm.generate( inputs, sampling_params=SamplingParams( temperature=0.2, diff --git a/docs/features/quantization/fp8.md b/docs/features/quantization/fp8.md index a6c0fd78e76..0661933acd6 100644 --- a/docs/features/quantization/fp8.md +++ b/docs/features/quantization/fp8.md @@ -86,8 +86,9 @@ Load and run the model in `vllm`: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") -result = model.generate("Hello my name is") + +llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic") +result = llm.generate("Hello my name is") print(result[0].outputs[0].text) ``` @@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei ```python from vllm import LLM -model = LLM("facebook/opt-125m", quantization="fp8") + +llm = LLM("facebook/opt-125m", quantization="fp8") # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB -result = model.generate("Hello, my name is") +result = llm.generate("Hello, my name is") print(result[0].outputs[0].text) ``` diff --git a/docs/features/quantization/int4.md b/docs/features/quantization/int4.md index f26de73c2f0..1df32a11ed9 100644 --- a/docs/features/quantization/int4.md +++ b/docs/features/quantization/int4.md @@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/features/quantization/int8.md b/docs/features/quantization/int8.md index 7e1cb3fee94..45fae58a648 100644 --- a/docs/features/quantization/int8.md +++ b/docs/features/quantization/int8.md @@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM: ```python from vllm import LLM -model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") + +llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token") ``` To evaluate accuracy, you can use `lm_eval`: diff --git a/docs/models/pooling_models.md b/docs/models/pooling_models.md index f0de84a66f8..a91fd1dcb64 100644 --- a/docs/models/pooling_models.md +++ b/docs/models/pooling_models.md @@ -149,11 +149,11 @@ You can change the output dimensions of embedding models that support Matryoshka ```python from vllm import LLM, PoolingParams -model = LLM(model="jinaai/jina-embeddings-v3", - task="embed", - trust_remote_code=True) -outputs = model.embed(["Follow the white rabbit."], - pooling_params=PoolingParams(dimensions=32)) +llm = LLM(model="jinaai/jina-embeddings-v3", + task="embed", + trust_remote_code=True) +outputs = llm.embed(["Follow the white rabbit."], + pooling_params=PoolingParams(dimensions=32)) print(outputs[0].outputs) ``` diff --git a/examples/offline_inference/basic/classify.py b/examples/offline_inference/basic/classify.py index 219064e9742..aaf0e83c9de 100644 --- a/examples/offline_inference/basic/classify.py +++ b/examples/offline_inference/basic/classify.py @@ -28,10 +28,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="classify" for classification models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate logits. The output is a list of ClassificationRequestOutputs. - outputs = model.classify(prompts) + outputs = llm.classify(prompts) # Print the outputs. 
print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/embed.py b/examples/offline_inference/basic/embed.py index 1114033d5ce..7ff9c7f5e0e 100644 --- a/examples/offline_inference/basic/embed.py +++ b/examples/offline_inference/basic/embed.py @@ -31,10 +31,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/basic/score.py b/examples/offline_inference/basic/score.py index 6a08de2d2c3..d37527b0a13 100644 --- a/examples/offline_inference/basic/score.py +++ b/examples/offline_inference/basic/score.py @@ -27,10 +27,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="score" for cross-encoder models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate scores. The output is a list of ScoringRequestOutputs. - outputs = model.score(text_1, texts_2) + outputs = llm.score(text_1, texts_2) # Print the outputs. print("\nGenerated Outputs:\n" + "-" * 60) diff --git a/examples/offline_inference/embed_jina_embeddings_v3.py b/examples/offline_inference/embed_jina_embeddings_v3.py index e68128399ba..7d78b8c63c6 100644 --- a/examples/offline_inference/embed_jina_embeddings_v3.py +++ b/examples/offline_inference/embed_jina_embeddings_v3.py @@ -30,11 +30,11 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. # Only text matching task is supported for now. See #16120 - outputs = model.embed(prompts) + outputs = llm.embed(prompts) # Print the outputs. print("\nGenerated Outputs:") diff --git a/examples/offline_inference/embed_matryoshka_fy.py b/examples/offline_inference/embed_matryoshka_fy.py index 7f5d74d9a3a..50a645ba827 100644 --- a/examples/offline_inference/embed_matryoshka_fy.py +++ b/examples/offline_inference/embed_matryoshka_fy.py @@ -30,10 +30,10 @@ def main(args: Namespace): # Create an LLM. # You should pass task="embed" for embedding models - model = LLM(**vars(args)) + llm = LLM(**vars(args)) # Generate embedding. The output is a list of EmbeddingRequestOutputs. - outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32)) + outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32)) # Print the outputs. 
print("\nGenerated Outputs:") diff --git a/examples/offline_inference/neuron_speculation.py b/examples/offline_inference/neuron_speculation.py index 2ef69f29863..aaf3006a327 100644 --- a/examples/offline_inference/neuron_speculation.py +++ b/examples/offline_inference/neuron_speculation.py @@ -25,7 +25,7 @@ def config_buckets(): os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048" -def initialize_model(): +def initialize_llm(): """Create an LLM with speculative decoding.""" return LLM( model="openlm-research/open_llama_7b", @@ -43,9 +43,9 @@ def initialize_model(): ) -def process_requests(model: LLM, sampling_params: SamplingParams): +def process_requests(llm: LLM, sampling_params: SamplingParams): """Generate texts from prompts and print them.""" - outputs = model.generate(prompts, sampling_params) + outputs = llm.generate(prompts, sampling_params) for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text @@ -55,7 +55,7 @@ def process_requests(model: LLM, sampling_params: SamplingParams): def main(): """Main function that sets up the model and processes prompts.""" config_buckets() - model = initialize_model() + model = initialize_llm() # Create a sampling params object. sampling_params = SamplingParams(max_tokens=100, top_k=1) process_requests(model, sampling_params) diff --git a/examples/offline_inference/prithvi_geospatial_mae.py b/examples/offline_inference/prithvi_geospatial_mae.py index 567c448a8c9..6dc03e85baa 100644 --- a/examples/offline_inference/prithvi_geospatial_mae.py +++ b/examples/offline_inference/prithvi_geospatial_mae.py @@ -140,7 +140,7 @@ class PrithviMAE: def __init__(self): print("Initializing PrithviMAE model") - self.model = LLM( + self.llm = LLM( model=os.path.join(os.path.dirname(__file__), "./model"), skip_tokenizer_init=True, dtype="float32", @@ -158,7 +158,7 @@ def run(self, input_data, location_coords): prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data} - outputs = self.model.encode(prompt, use_tqdm=False) + outputs = self.llm.encode(prompt, use_tqdm=False) print("################ Inference done (it took seconds) ##############") return outputs[0].outputs.data diff --git a/examples/offline_inference/qwen3_reranker.py b/examples/offline_inference/qwen3_reranker.py index fe3cebc348f..8b27c6ff524 100644 --- a/examples/offline_inference/qwen3_reranker.py +++ b/examples/offline_inference/qwen3_reranker.py @@ -17,13 +17,13 @@ # Models converted offline using this method can not only be more efficient # and support the vllm score API, but also make the init parameters more # concise, for example. -# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") +# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score") # If you want to load the official original version, the init parameters are # as follows. 
-def get_model() -> LLM: +def get_llm() -> LLM: """Initializes and returns the LLM model for Qwen3-Reranker.""" return LLM( model=model_name, @@ -77,7 +77,7 @@ def main() -> None: ] documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] - model = get_model() + model = get_llm() outputs = model.score(queries, documents) print("-" * 30) diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py index 2e103019f7a..13ddf035a55 100644 --- a/tests/basic_correctness/test_basic_correctness.py +++ b/tests/basic_correctness/test_basic_correctness.py @@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None: monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0') with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model: - if isinstance(vllm_model.model.llm_engine, LLMEngineV1): + if isinstance(vllm_model.llm.llm_engine, LLMEngineV1): v1_test_failed_model_execution(vllm_model) def v1_test_failed_model_execution(vllm_model): - engine = vllm_model.model.llm_engine + engine = vllm_model.llm.llm_engine mocked_execute_model = Mock( side_effect=RuntimeError("Mocked Critical Error")) engine.engine_core.engine_core.model_executor.execute_model =\ diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py index 341a39a42b8..db2fa2f6bef 100644 --- a/tests/basic_correctness/test_preemption.py +++ b/tests/basic_correctness/test_preemption.py @@ -81,7 +81,7 @@ def test_chunked_prefill_recompute( disable_log_stats=False, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) for i in range(len(example_prompts)): @@ -118,10 +118,10 @@ def test_preemption( distributed_executor_backend=distributed_executor_backend, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) total_preemption = ( - vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption) + vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption) check_outputs_equal( outputs_0_lst=hf_outputs, @@ -174,12 +174,12 @@ def test_preemption_infeasible( ) as vllm_model: sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True) - req_outputs = vllm_model.model.generate( + req_outputs = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params, ) - assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt + assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt < ARTIFICIAL_PREEMPTION_MAX_CNT) # Verify the request is ignored and not hang. 
diff --git a/tests/conftest.py b/tests/conftest.py index f3524d1fe2a..a18dbf58c80 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -784,7 +784,7 @@ def __init__( enforce_eager: Optional[bool] = False, **kwargs, ) -> None: - self.model = LLM( + self.llm = LLM( model=model_name, task=task, tokenizer=tokenizer_name, @@ -854,9 +854,9 @@ def generate( videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) outputs: list[tuple[list[list[int]], list[str]]] = [] for req_output in req_outputs: @@ -902,9 +902,9 @@ def generate_w_logprobs( videos=videos, audios=audios) - req_outputs = self.model.generate(inputs, - sampling_params=sampling_params, - **kwargs) + req_outputs = self.llm.generate(inputs, + sampling_params=sampling_params, + **kwargs) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) @@ -924,8 +924,8 @@ def generate_encoder_decoder_w_logprobs( ''' assert sampling_params.logprobs is not None - req_outputs = self.model.generate(encoder_decoder_prompts, - sampling_params=sampling_params) + req_outputs = self.llm.generate(encoder_decoder_prompts, + sampling_params=sampling_params) toks_str_logsprobs_prompt_logprobs = ( self._final_steps_generate_w_logprobs(req_outputs)) # Omit prompt logprobs if not required by sampling params @@ -1018,7 +1018,7 @@ def generate_beam_search( videos=videos, audios=audios) - outputs = self.model.beam_search( + outputs = self.llm.beam_search( inputs, BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) returned_outputs = [] @@ -1029,7 +1029,7 @@ def generate_beam_search( return returned_outputs def classify(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.classify(prompts) + req_outputs = self.llm.classify(prompts) return [req_output.outputs.probs for req_output in req_outputs] def embed(self, @@ -1044,11 +1044,11 @@ def embed(self, videos=videos, audios=audios) - req_outputs = self.model.embed(inputs, *args, **kwargs) + req_outputs = self.llm.embed(inputs, *args, **kwargs) return [req_output.outputs.embedding for req_output in req_outputs] def encode(self, prompts: list[str]) -> list[list[float]]: - req_outputs = self.model.encode(prompts) + req_outputs = self.llm.encode(prompts) return [req_output.outputs.data for req_output in req_outputs] def score( @@ -1058,18 +1058,18 @@ def score( *args, **kwargs, ) -> list[float]: - req_outputs = self.model.score(text_1, text_2, *args, **kwargs) + req_outputs = self.llm.score(text_1, text_2, *args, **kwargs) return [req_output.outputs.score for req_output in req_outputs] def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]: - executor = self.model.llm_engine.model_executor + executor = self.llm.llm_engine.model_executor return executor.apply_model(func) def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): - del self.model + del self.llm cleanup_dist_env_and_memory() diff --git a/tests/core/test_num_computed_tokens_update.py b/tests/core/test_num_computed_tokens_update.py index 1b958e34df8..9e1b7913dfb 100644 --- a/tests/core/test_num_computed_tokens_update.py +++ b/tests/core/test_num_computed_tokens_update.py @@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int, num_scheduler_steps=num_scheduler_steps, enable_chunked_prefill=enable_chunked_prefill, enforce_eager=enforce_eager) - engine: LLMEngine = 
runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine # In multi-step + chunked-prefill there is no separate single prompt step. # What is scheduled will run for num_scheduler_steps always. diff --git a/tests/detokenizer/test_stop_reason.py b/tests/detokenizer/test_stop_reason.py index 9716f7d72a5..1ff679789c9 100644 --- a/tests/detokenizer/test_stop_reason.py +++ b/tests/detokenizer/test_stop_reason.py @@ -28,7 +28,7 @@ def vllm_model(vllm_runner): def test_stop_reason(vllm_model, example_prompts): tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL) stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR) - llm = vllm_model.model + llm = vllm_model.llm # test stop token outputs = llm.generate(example_prompts, diff --git a/tests/detokenizer/test_stop_strings.py b/tests/detokenizer/test_stop_strings.py index efe938a20c4..cb87c44cc39 100644 --- a/tests/detokenizer/test_stop_strings.py +++ b/tests/detokenizer/test_stop_strings.py @@ -101,42 +101,42 @@ def _stop_token_id(llm): def test_stop_strings(): # If V0, must set enforce_eager=False since we use # async output processing below. - vllm_model = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) + llm = LLM(MODEL, enforce_eager=envs.VLLM_USE_V1) if envs.VLLM_USE_V1: - _stop_basic(vllm_model) + _stop_basic(llm) else: - _set_async_mode(vllm_model, True) - _stop_basic(vllm_model) + _set_async_mode(llm, True) + _stop_basic(llm) - _set_async_mode(vllm_model, False) - _stop_basic(vllm_model) + _set_async_mode(llm, False) + _stop_basic(llm) if envs.VLLM_USE_V1: - _stop_multi_tokens(vllm_model) + _stop_multi_tokens(llm) else: - _set_async_mode(vllm_model, True) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, True) + _stop_multi_tokens(llm) - _set_async_mode(vllm_model, False) - _stop_multi_tokens(vllm_model) + _set_async_mode(llm, False) + _stop_multi_tokens(llm) if envs.VLLM_USE_V1: - _stop_partial_token(vllm_model) + _stop_partial_token(llm) else: - _set_async_mode(vllm_model, True) - _stop_partial_token(vllm_model) + _set_async_mode(llm, True) + _stop_partial_token(llm) - _set_async_mode(vllm_model, False) - _stop_partial_token(vllm_model) + _set_async_mode(llm, False) + _stop_partial_token(llm) if envs.VLLM_USE_V1: # FIXME: this does not respect include_in_output=False - # _stop_token_id(vllm_model) + # _stop_token_id(llm) pass else: - _set_async_mode(vllm_model, True) - _stop_token_id(vllm_model) + _set_async_mode(llm, True) + _stop_token_id(llm) - _set_async_mode(vllm_model, False) - _stop_token_id(vllm_model) + _set_async_mode(llm, False) + _stop_token_id(llm) diff --git a/tests/lora/test_llama_tp.py b/tests/lora/test_llama_tp.py index bebf44b6dfd..b1ad1fdd060 100644 --- a/tests/lora/test_llama_tp.py +++ b/tests/lora/test_llama_tp.py @@ -186,25 +186,25 @@ def test_tp2_serialize_and_deserialize_lora(tmp_path, sql_lora_files, model_uri = tmp_path / "vllm" / model_ref / suffix / model_name tensorizer_config = TensorizerConfig(tensorizer_uri=str(model_uri)) - loaded_vllm_model = LLM(model=model_ref, - load_format="tensorizer", - enable_lora=True, - enforce_eager=True, - model_loader_extra_config=tensorizer_config, - max_num_seqs=13, - tensor_parallel_size=2, - max_loras=2) + loaded_llm = LLM(model=model_ref, + load_format="tensorizer", + enable_lora=True, + enforce_eager=True, + model_loader_extra_config=tensorizer_config, + max_num_seqs=13, + tensor_parallel_size=2, + max_loras=2) tc_as_dict = tensorizer_config.to_serializable() print("lora adapter created") - assert do_sample(loaded_vllm_model, + assert 
do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=0) == EXPECTED_NO_LORA_OUTPUT print("lora 1") - assert do_sample(loaded_vllm_model, + assert do_sample(loaded_llm, sql_lora_files, tensorizer_config_dict=tc_as_dict, lora_id=1) == EXPECTED_LORA_OUTPUT diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py index 7bb5d8980d6..b527b94aa58 100644 --- a/tests/metrics/test_metrics.py +++ b/tests/metrics/test_metrics.py @@ -44,7 +44,7 @@ def test_metric_counter_prompt_tokens( dtype=dtype, disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() prompt_token_counts = [ len(tokenizer.encode(p)) for p in example_prompts ] @@ -56,7 +56,7 @@ def test_metric_counter_prompt_tokens( vllm_prompt_token_count = sum(prompt_token_counts) _ = vllm_model.generate_greedy(example_prompts, max_tokens) - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_prompt_tokens.labels( **stat_logger.labels)._value.get() @@ -80,8 +80,8 @@ def test_metric_counter_generation_tokens( disable_log_stats=False, gpu_memory_utilization=0.4) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -116,8 +116,8 @@ def test_metric_counter_generation_tokens_multi_step( disable_async_output_proc=disable_async_output_proc, ) as vllm_model: vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) - tokenizer = vllm_model.model.get_tokenizer() - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + tokenizer = vllm_model.llm.get_tokenizer() + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metric_count = stat_logger.metrics.counter_generation_tokens.labels( **stat_logger.labels)._value.get() vllm_generation_count = 0 @@ -148,7 +148,7 @@ def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str, disable_log_stats=False, gpu_memory_utilization=0.3, served_model_name=served_model_name) as vllm_model: - stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] metrics_tag_content = stat_logger.labels["model_name"] if envs.VLLM_CI_USE_S3: @@ -256,7 +256,7 @@ def test_metric_spec_decode( ) as vllm_model: # Force log interval to be 0 to catch all metrics. 
- stat_logger = vllm_model.model.llm_engine.stat_loggers['prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] stat_logger.local_interval = 0 # Note that the purpose of this test is to verify spec decode diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 4bdb651e517..53ade6b9bda 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -32,8 +32,8 @@ def test_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -70,8 +70,8 @@ def test_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_config = vllm_model.model.llm_engine.model_config - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_config = vllm_model.llm.llm_engine.model_config + model_tokenizer = vllm_model.llm.llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -108,7 +108,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner): output = vllm_model.embed("Write a short story about a robot that" " dreams for the first time.\n") - model_tokenizer = vllm_model.model.llm_engine.tokenizer + model_tokenizer = vllm_model.llm.llm_engine.tokenizer assert model_tokenizer.tokenizer_id == model_name def check_model(model): diff --git a/tests/models/language/generation/test_hybrid.py b/tests/models/language/generation/test_hybrid.py index eba14e64553..67260a0aac6 100644 --- a/tests/models/language/generation/test_hybrid.py +++ b/tests/models/language/generation/test_hybrid.py @@ -275,7 +275,7 @@ def test_models_preemption_recompute( Tests that outputs are identical with and w/o preemptions (recompute). 
""" with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model: - scheduler = vllm_model.model.llm_engine.scheduler[0] + scheduler = vllm_model.llm.llm_engine.scheduler[0] scheduler.ENABLE_ARTIFICIAL_PREEMPT = True preempt_vllm_outputs = vllm_model.generate_greedy( example_prompts, max_tokens) diff --git a/tests/models/language/generation/test_mistral.py b/tests/models/language/generation/test_mistral.py index c70698ede37..81a88f2d485 100644 --- a/tests/models/language/generation/test_mistral.py +++ b/tests/models/language/generation/test_mistral.py @@ -238,8 +238,8 @@ def test_mistral_symbolic_languages(vllm_runner, model: str, load_format="mistral") as vllm_model: for prompt in SYMBOLIC_LANG_PROMPTS: msg = {"role": "user", "content": prompt} - outputs = vllm_model.model.chat([msg], - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat([msg], + sampling_params=SAMPLING_PARAMS) assert "�" not in outputs[0].outputs[0].text.strip() @@ -253,11 +253,11 @@ def test_mistral_function_calling(vllm_runner, model: str, dtype: str) -> None: load_format="mistral") as vllm_model: msgs = copy.deepcopy(MSGS) - outputs = vllm_model.model.chat(msgs, - tools=TOOLS, - sampling_params=SAMPLING_PARAMS) + outputs = vllm_model.llm.chat(msgs, + tools=TOOLS, + sampling_params=SAMPLING_PARAMS) - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() tool_parser = MistralToolParser(tokenizer) model_output = outputs[0].outputs[0].text.strip() @@ -308,7 +308,7 @@ def test_mistral_guided_decoding( f"Give an example JSON for an employee profile that " f"fits this schema: {SAMPLE_JSON_SCHEMA}" }] - outputs = vllm_model.model.chat(messages, sampling_params=params) + outputs = vllm_model.llm.chat(messages, sampling_params=params) generated_text = outputs[0].outputs[0].text json_response = json.loads(generated_text) diff --git a/tests/models/language/pooling/mteb_utils.py b/tests/models/language/pooling/mteb_utils.py index 6c4fde5fdfa..dd6769d2911 100644 --- a/tests/models/language/pooling/mteb_utils.py +++ b/tests/models/language/pooling/mteb_utils.py @@ -30,7 +30,7 @@ class VllmMtebEncoder(mteb.Encoder): def __init__(self, vllm_model): super().__init__() - self.model = vllm_model + self.llm = vllm_model self.rng = np.random.default_rng(seed=42) def encode( @@ -43,7 +43,7 @@ def encode( # issues by randomizing the order. 
         r = self.rng.permutation(len(sentences))
         sentences = [sentences[i] for i in r]
-        outputs = self.model.embed(sentences, use_tqdm=False)
+        outputs = self.llm.embed(sentences, use_tqdm=False)
         embeds = np.array(outputs)
         embeds = embeds[np.argsort(r)]
         return embeds
@@ -61,10 +61,10 @@ def predict(
         queries = [s[0] for s in sentences]
         corpus = [s[1] for s in sentences]
 
-        outputs = self.model.score(queries,
-                                   corpus,
-                                   truncate_prompt_tokens=-1,
-                                   use_tqdm=False)
+        outputs = self.llm.score(queries,
+                                 corpus,
+                                 truncate_prompt_tokens=-1,
+                                 use_tqdm=False)
         scores = np.array(outputs)
         scores = scores[np.argsort(r)]
         return scores
@@ -178,11 +178,11 @@ def mteb_test_embed_models(hf_runner,
 
         if model_info.architecture:
             assert (model_info.architecture
-                    in vllm_model.model.llm_engine.model_config.architectures)
+                    in vllm_model.llm.llm_engine.model_config.architectures)
 
         vllm_main_score = run_mteb_embed_task(VllmMtebEncoder(vllm_model),
                                               MTEB_EMBED_TASKS)
-        vllm_dtype = vllm_model.model.llm_engine.model_config.dtype
+        vllm_dtype = vllm_model.llm.llm_engine.model_config.dtype
 
     with hf_runner(model_info.name,
                    is_sentence_transformer=True,
@@ -284,7 +284,7 @@ def mteb_test_rerank_models(hf_runner,
                      max_num_seqs=8,
                      **vllm_extra_kwargs) as vllm_model:
 
-        model_config = vllm_model.model.llm_engine.model_config
+        model_config = vllm_model.llm.llm_engine.model_config
 
         if model_info.architecture:
             assert (model_info.architecture in model_config.architectures)
diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py
index c2f70bb647a..5cf0d3fd4f6 100644
--- a/tests/models/language/pooling/test_gritlm.py
+++ b/tests/models/language/pooling/test_gritlm.py
@@ -124,7 +124,7 @@ def test_gritlm_offline_embedding(vllm_runner):
             task="embed",
             max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm
 
         d_rep = run_llm_encode(
             llm,
@@ -171,7 +171,7 @@ def test_gritlm_offline_generate(monkeypatch: pytest.MonkeyPatch, vllm_runner):
             task="generate",
             max_model_len=MAX_MODEL_LEN,
     ) as vllm_model:
-        llm = vllm_model.model
+        llm = vllm_model.llm
 
         sampling_params = SamplingParams(temperature=0.0, max_tokens=256)
         outputs = llm.generate(input, sampling_params=sampling_params)
diff --git a/tests/models/language/pooling/test_jina.py b/tests/models/language/pooling/test_jina.py
index 9bfe7411e16..16c711407ae 100644
--- a/tests/models/language/pooling/test_jina.py
+++ b/tests/models/language/pooling/test_jina.py
@@ -87,10 +87,10 @@ def test_matryoshka(
                      task="embed",
                      dtype=dtype,
                      max_model_len=None) as vllm_model:
-        assert vllm_model.model.llm_engine.model_config.is_matryoshka
+        assert vllm_model.llm.llm_engine.model_config.is_matryoshka
 
         matryoshka_dimensions = (
-            vllm_model.model.llm_engine.model_config.matryoshka_dimensions)
+            vllm_model.llm.llm_engine.model_config.matryoshka_dimensions)
         assert matryoshka_dimensions is not None
 
         if dimensions not in matryoshka_dimensions:
diff --git a/tests/models/language/pooling/test_nomic_max_model_len.py b/tests/models/language/pooling/test_nomic_max_model_len.py
index 250b3a52835..7413ef578e3 100644
--- a/tests/models/language/pooling/test_nomic_max_model_len.py
+++ b/tests/models/language/pooling/test_nomic_max_model_len.py
@@
-23,7 +23,7 @@ def test_default(model_info, vllm_runner): with vllm_runner(model_info.name, task="embed", max_model_len=None) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config if model_info.name == "nomic-ai/nomic-embed-text-v2-moe": # For nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. @@ -38,7 +38,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): # set max_model_len <= 512 with vllm_runner(model_info.name, task="embed", max_model_len=256) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 256 # set 512 < max_model_len <= 2048 @@ -52,7 +52,7 @@ def test_set_max_model_len_legal(model_info, vllm_runner): else: with vllm_runner(model_info.name, task="embed", max_model_len=1024) as vllm_model: - model_config = vllm_model.model.llm_engine.model_config + model_config = vllm_model.llm.llm_engine.model_config assert model_config.max_model_len == 1024 diff --git a/tests/models/language/pooling/test_truncation_control.py b/tests/models/language/pooling/test_truncation_control.py index 33aff1c873f..c7399e01c73 100644 --- a/tests/models/language/pooling/test_truncation_control.py +++ b/tests/models/language/pooling/test_truncation_control.py @@ -28,7 +28,7 @@ def test_smaller_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -43,7 +43,7 @@ def test_max_truncation_size(vllm_runner, with vllm_runner(model_name, task="embed", max_model_len=max_model_len) as vllm_model: - vllm_output = vllm_model.model.encode( + vllm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) prompt_tokens = vllm_output[0].prompt_token_ids @@ -61,7 +61,7 @@ def test_bigger_truncation_size(vllm_runner, model_name, task="embed", max_model_len=max_model_len) as vllm_model: - llm_output = vllm_model.model.encode( + llm_output = vllm_model.llm.encode( input_str, truncate_prompt_tokens=truncate_prompt_tokens) assert llm_output == f"""truncate_prompt_tokens value diff --git a/tests/models/multimodal/generation/test_pixtral.py b/tests/models/multimodal/generation/test_pixtral.py index 1def825ab08..e157d6f4a79 100644 --- a/tests/models/multimodal/generation/test_pixtral.py +++ b/tests/models/multimodal/generation/test_pixtral.py @@ -180,8 +180,7 @@ def test_chat( ) as vllm_model: outputs = [] for msg in MSGS: - output = vllm_model.model.chat(msg, - sampling_params=SAMPLING_PARAMS) + output = vllm_model.llm.chat(msg, sampling_params=SAMPLING_PARAMS) outputs.extend(output) @@ -217,7 +216,7 @@ def test_multi_modal_placeholders(vllm_runner, prompt, max_model_len=8192, limit_mm_per_prompt=LIMIT_MM_PER_PROMPT, ) as vllm_model: - outputs = vllm_model.model.generate(prompt) + outputs = vllm_model.llm.generate(prompt) assert len(outputs) == 1, f"{len(outputs)=}" output: RequestOutput = outputs[0] diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index 363d55153aa..4a65e8c9520 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -106,7 +106,7 @@ def run_test( 
tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, ) as vllm_model: - llm = vllm_model.model + llm = vllm_model.llm sampling_params = SamplingParams( temperature=0, diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index 8c83d8f8a8a..cf8962ce497 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -85,7 +85,7 @@ def run_test( enforce_eager=enforce_eager, task=task, **vllm_runner_kwargs_) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() vllm_kwargs: dict[str, Any] = {} if get_stop_token_ids is not None: diff --git a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py index f889eea5e83..a6f5aeccf94 100644 --- a/tests/models/multimodal/pooling/test_dse_qwen2_vl.py +++ b/tests/models/multimodal/pooling/test_dse_qwen2_vl.py @@ -96,7 +96,7 @@ def _run_test( dtype=dtype, enforce_eager=True, max_model_len=8192) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() texts = [ # this is necessary because vllm_model.embed will not apply any # templating to the prompt, and therefore lacks an image_pad diff --git a/tests/models/multimodal/pooling/test_jinavl_reranker.py b/tests/models/multimodal/pooling/test_jinavl_reranker.py index 50c91f1f81c..712b6801de4 100644 --- a/tests/models/multimodal/pooling/test_jinavl_reranker.py +++ b/tests/models/multimodal/pooling/test_jinavl_reranker.py @@ -56,7 +56,7 @@ def create_image_param(url: str) -> ChatCompletionContentPartImageParam: mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, ) as vllm_model: - outputs = vllm_model.model.score(query, documents) + outputs = vllm_model.llm.score(query, documents) return [output.outputs.score for output in outputs] diff --git a/tests/models/quantization/test_modelopt.py b/tests/models/quantization/test_modelopt.py index 6ad526cc893..e23d4d9d211 100644 --- a/tests/models/quantization/test_modelopt.py +++ b/tests/models/quantization/test_modelopt.py @@ -45,7 +45,7 @@ reason="fp8 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -68,9 +68,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. 
for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/models/quantization/test_nvfp4.py b/tests/models/quantization/test_nvfp4.py index b95dad9a4ef..b3c217e729e 100644 --- a/tests/models/quantization/test_nvfp4.py +++ b/tests/models/quantization/test_nvfp4.py @@ -46,7 +46,7 @@ reason="modelopt_fp4 is not supported on this GPU type.") @pytest.mark.parametrize("model_name", MODELS) def test_models(example_prompts, model_name) -> None: - model = LLM( + llm = LLM( model=model_name, max_model_len=MAX_MODEL_LEN, trust_remote_code=True, @@ -69,9 +69,9 @@ def test_models(example_prompts, model_name) -> None: # Note: these need to be run 1 at a time due to numerical precision, # since the expected strs were generated this way. for prompt in formatted_prompts: - outputs = model.generate(prompt, params) + outputs = llm.generate(prompt, params) generations.append(outputs[0].outputs[0].text) - del model + del llm print(model_name, generations) expected_strs = EXPECTED_STRS_MAP[model_name] diff --git a/tests/prefix_caching/test_disable_sliding_window.py b/tests/prefix_caching/test_disable_sliding_window.py index f00a8f6998c..b940ab416e6 100644 --- a/tests/prefix_caching/test_disable_sliding_window.py +++ b/tests/prefix_caching/test_disable_sliding_window.py @@ -25,25 +25,25 @@ @pytest.mark.parametrize("model_len_len", MODEL_LEN_LEN) def test_disable_sliding_window(model_len_len, ): model, sliding_len, full_len = model_len_len - vllm_disabled_model = LLM(model, disable_sliding_window=True) - vllm_disabled_model.generate("Hi my name is") - model_config = vllm_disabled_model.llm_engine.model_config + disabled_llm = LLM(model, disable_sliding_window=True) + disabled_llm.generate("Hi my name is") + model_config = disabled_llm.llm_engine.model_config assert model_config.max_model_len == sliding_len, ( "Max len expected to equal sliding_len of %s, but got %s", sliding_len, model_config.max_model_len) - del vllm_disabled_model + del disabled_llm cleanup_dist_env_and_memory() - vllm_enabled_model = LLM(model, - enforce_eager=True, - disable_sliding_window=False, - enable_prefix_caching=False) - vllm_enabled_model.generate("Hi my name is") - model_config = vllm_enabled_model.llm_engine.model_config + enabled_llm = LLM(model, + enforce_eager=True, + disable_sliding_window=False, + enable_prefix_caching=False) + enabled_llm.generate("Hi my name is") + model_config = enabled_llm.llm_engine.model_config assert model_config.max_model_len == full_len, ( "Max len expected to equal full_len of %s, but got %s", full_len, model_config.max_model_len) - del vllm_enabled_model + del enabled_llm cleanup_dist_env_and_memory() diff --git a/tests/prefix_caching/test_prefix_caching.py b/tests/prefix_caching/test_prefix_caching.py index a65fc934b16..5bf6ed957c7 100644 --- a/tests/prefix_caching/test_prefix_caching.py +++ b/tests/prefix_caching/test_prefix_caching.py @@ -93,8 +93,8 @@ def test_mixed_requests( # Run all the promopts greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - req_outputs = vllm_model.model.generate(example_prompts, - greedy_params) + req_outputs = vllm_model.llm.generate(example_prompts, + greedy_params) # Verify number of cached tokens for i in range(len(req_outputs)): @@ -161,7 +161,7 @@ def test_fully_cached_prefill_needs_uncached_token(model): 
max_num_batched_tokens=max_num_batched_tokens, max_num_seqs=max_num_batched_tokens, ) - engine: LLMEngine = runner.model.llm_engine + engine: LLMEngine = runner.llm.llm_engine scheduler: Scheduler = SchedulerProxy(engine.scheduler[0]) # type: ignore engine.scheduler[0] = scheduler diff --git a/tests/quantization/test_gptq_dynamic.py b/tests/quantization/test_gptq_dynamic.py index 23b999e7c67..aea50e99c1d 100644 --- a/tests/quantization/test_gptq_dynamic.py +++ b/tests/quantization/test_gptq_dynamic.py @@ -39,7 +39,7 @@ def test_gptq_with_dynamic(vllm_runner, model_id: str, use_marlin_kernel: bool, linear_method_cls = GPTQMarlinLinearMethod if use_marlin_kernel else ( GPTQLinearMethod) - for name, submodule in (vllm_model.model.llm_engine.model_executor. + for name, submodule in (vllm_model.llm.llm_engine.model_executor. driver_worker.model_runner.model.named_modules()): if name == "lm_head": assert isinstance(submodule.quant_method, linear_method_cls) diff --git a/tests/quantization/test_quark.py b/tests/quantization/test_quark.py index 2db11cb997d..4a0c8ba4d8a 100644 --- a/tests/quantization/test_quark.py +++ b/tests/quantization/test_quark.py @@ -107,11 +107,11 @@ def test_quark_fp8_parity(vllm_runner): } with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle, vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle): - quark_model = (quark_handle.model.llm_engine.model_executor. + quark_model = (quark_handle.llm.llm_engine.model_executor. driver_worker.model_runner.model) quark_state_dict = quark_model.state_dict() - fp8_model = (fp8_handle.model.llm_engine.model_executor.driver_worker. + fp8_model = (fp8_handle.llm.llm_engine.model_executor.driver_worker. model_runner.model) fp8_state_dict = fp8_model.state_dict() diff --git a/tests/quantization/test_register_quantization_config.py b/tests/quantization/test_register_quantization_config.py index 6c541fdbeea..84705e92c85 100644 --- a/tests/quantization/test_register_quantization_config.py +++ b/tests/quantization/test_register_quantization_config.py @@ -111,7 +111,7 @@ def test_custom_quant(vllm_runner, model, monkeypatch): quantization="custom_quant", enforce_eager=True) as llm: - model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 + model = llm.llm.llm_engine.model_executor.driver_worker.model_runner.model # noqa: E501 layer = model.model.layers[0] qkv_proj = layer.self_attn.qkv_proj diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py index 7eb9c0b5fb8..ea4a17dd230 100644 --- a/tests/samplers/test_ignore_eos.py +++ b/tests/samplers/test_ignore_eos.py @@ -36,7 +36,7 @@ def test_ignore_eos( ignore_eos=True) for prompt in example_prompts: - ignore_eos_output = vllm_model.model.generate( + ignore_eos_output = vllm_model.llm.generate( prompt, sampling_params=sampling_params) output_length = len(ignore_eos_output[0].outputs[0].token_ids) assert output_length == max_tokens diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py index 901c8759126..123f9595e97 100644 --- a/tests/samplers/test_logits_processor.py +++ b/tests/samplers/test_logits_processor.py @@ -26,7 +26,7 @@ def test_logits_processor_force_generate( dtype: str, ) -> None: with vllm_runner(model, dtype=dtype) as vllm_model: - tokenizer = vllm_model.model.get_tokenizer() + tokenizer = vllm_model.llm.get_tokenizer() repeat_times = 2 enforced_answers = " vLLM" vllm_token_ids = tokenizer.encode(enforced_answers, @@ -45,13 +45,13 @@ def pick_vllm(token_ids, logits): ) # 
test logits_processors when prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[0], params=params_with_logprobs, ) # test prompt_logprobs is not None - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[1], params=SamplingParams( prompt_logprobs=3, @@ -60,11 +60,11 @@ def pick_vllm(token_ids, logits): ) # test grouped requests - vllm_model.model._add_request( + vllm_model.llm._add_request( example_prompts[2], params=SamplingParams(max_tokens=max_tokens), ) - outputs = vllm_model.model._run_engine(use_tqdm=False) + outputs = vllm_model.llm._run_engine(use_tqdm=False) assert outputs[0].outputs[0].text == enforced_answers * repeat_times diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py index 86c8a03eee1..87f40b10053 100644 --- a/tests/samplers/test_logprobs.py +++ b/tests/samplers/test_logprobs.py @@ -64,7 +64,7 @@ def test_get_prompt_logprobs( prompt_logprobs=num_top_logprobs, temperature=0.0, detokenize=detokenize) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) # Test whether logprobs are included in the results. @@ -174,7 +174,7 @@ def test_none_logprobs(vllm_runner, model, chunked_prefill_token_size: int, logprobs=None, temperature=0.0, detokenize=detokenize) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none) for i in range(len(results_logprobs_none)): diff --git a/tests/samplers/test_no_bad_words.py b/tests/samplers/test_no_bad_words.py index 42b529ae169..11803b8d7a5 100644 --- a/tests/samplers/test_no_bad_words.py +++ b/tests/samplers/test_no_bad_words.py @@ -20,7 +20,7 @@ def v1(run_with_both_engines): def _generate( - model: LLM, + llm: LLM, prompt: str, num_prompt_tokens: int, temperature: float = 0, @@ -32,7 +32,7 @@ def _generate( ) # [([output_token_ids, ], [output_text, ]), ] - output = model.generate([prompt], sampling_params=sampling_params) + output = llm.generate([prompt], sampling_params=sampling_params) output_token_ids = output[0][0][0][num_prompt_tokens:] # [0] first (and only) request output @@ -66,10 +66,10 @@ def test_one_token_bad_word(self, vllm_runner): assert self.target_token_id not in output_token_ids def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, @@ -156,10 +156,10 @@ def test_two_token_bad_word(self, vllm_runner): or (self.neighbour_token_id2 in output_token_ids)) def _generate(self, - model: LLM, + llm: LLM, bad_words: Optional[list[str]] = None) -> list[int]: return _generate( - model=model, + llm=llm, prompt=self.PROMPT, num_prompt_tokens=self.num_prompt_tokens, bad_words=bad_words, diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py index b339b4b2ddf..5a0efd98acc 100644 --- a/tests/samplers/test_seeded_generate.py +++ b/tests/samplers/test_seeded_generate.py @@ -49,7 +49,7 @@ def test_random_sample_with_seed( sampling_params_seed_2 = copy.deepcopy(sampling_params) sampling_params_seed_2.seed = 200 - llm = vllm_model.model + llm = vllm_model.llm for prompt in example_prompts: for params in ( diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f3fe9db3f79..e4b1aa73654 100644 --- 
a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -212,8 +212,7 @@ def run_equality_correctness_test( with vllm_runner(**sd_args) as vllm_model: if ensure_all_accepted or expected_acceptance_rate is not None: # Force log interval to be 0 to catch all metrics. - stat_logger = vllm_model.model.llm_engine.stat_loggers[ - 'prometheus'] + stat_logger = vllm_model.llm.llm_engine.stat_loggers['prometheus'] stat_logger.local_interval = -100 sd_outputs = vllm_model.generate_w_logprobs(prompts, sampling_params) diff --git a/tests/tokenization/test_detokenize.py b/tests/tokenization/test_detokenize.py index f8aeba8301b..ccafc884612 100644 --- a/tests/tokenization/test_detokenize.py +++ b/tests/tokenization/test_detokenize.py @@ -393,7 +393,7 @@ def test_decode_prompt_logprobs_chunked_prefill( logprobs=5, prompt_logprobs=5, temperature=0.0) - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( example_prompts, sampling_params=vllm_sampling_params) for idx, result in enumerate(vllm_results): diff --git a/tests/v1/core/test_scheduler_e2e.py b/tests/v1/core/test_scheduler_e2e.py index 85415f6ad4b..bd0320baef8 100644 --- a/tests/v1/core/test_scheduler_e2e.py +++ b/tests/v1/core/test_scheduler_e2e.py @@ -14,7 +14,7 @@ @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: return LLM(MODEL, enforce_eager=True, enable_prefix_caching=True, @@ -24,16 +24,16 @@ def model() -> LLM: block_size=16) -def test_concurrent_partial_prefill(model): - outputs = model.generate([PROMPT] * 3) +def test_concurrent_partial_prefill(llm): + outputs = llm.generate([PROMPT] * 3) assert len(outputs) == 3 for output in outputs: assert len(output.outputs) == 1 -def test_prefix_cache_stats_is_recorded(model): +def test_prefix_cache_stats_is_recorded(llm): # 17 tokens will make sure first 16 tokens are cached in a block input_tokens = {"prompt_token_ids": [101] * 17} - _ = model.generate([input_tokens]) - outputs = model.generate([input_tokens]) + _ = llm.generate([input_tokens]) + outputs = llm.generate([input_tokens]) assert outputs[0].num_cached_tokens == 16 diff --git a/tests/v1/engine/test_llm_engine.py b/tests/v1/engine/test_llm_engine.py index 059106c62a2..f37686317fd 100644 --- a/tests/v1/engine/test_llm_engine.py +++ b/tests/v1/engine/test_llm_engine.py @@ -112,9 +112,9 @@ def test_compatibility_with_skip_tokenizer_init( example_prompts, structured_outputs=True, ) - model: LLM = vllm_model_skip_tokenizer_init.model + llm: LLM = vllm_model_skip_tokenizer_init.llm with pytest.raises(ValueError): - _ = model.generate(example_prompts, sampling_params_list) + _ = llm.generate(example_prompts, sampling_params_list) def test_parallel_sampling(vllm_model, example_prompts) -> None: @@ -125,8 +125,8 @@ def test_parallel_sampling(vllm_model, example_prompts) -> None: example_prompt: test fixture providing prompts for testing. 
""" sampling_params_list, n_list = _get_test_sampling_params(example_prompts) - model: LLM = vllm_model.model - outputs = model.generate(example_prompts, sampling_params_list) + llm: LLM = vllm_model.llm + outputs = llm.generate(example_prompts, sampling_params_list) # Validate each request response for out, n in zip(outputs, n_list): @@ -166,10 +166,10 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): speculative_config=speculative_config, disable_log_stats=False, ) as vllm_model: - model: LLM = vllm_model.model + llm: LLM = vllm_model.llm sampling_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) - outputs = model.generate(example_prompts, sampling_params) + outputs = llm.generate(example_prompts, sampling_params) n_prompts = len(example_prompts) assert len(outputs) == n_prompts @@ -180,7 +180,7 @@ def test_engine_metrics(vllm_runner, monkeypatch, example_prompts): total_tokens += len(out.outputs[0].token_ids) assert total_tokens == max_tokens * n_prompts - metrics = model.get_metrics() + metrics = llm.get_metrics() def find_metric(name) -> list[Metric]: found = [] diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 69180e6e5db..4f1f340a4cc 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -112,7 +112,7 @@ def _run_and_validate( max_tokens: int, do_apc: bool, ) -> None: - vllm_results = vllm_model.model.generate( + vllm_results = vllm_model.llm.generate( test_prompts, sampling_params=vllm_sampling_params) for vllm_result, hf_logprob, hf_output, logprob_prompt_logprob in zip( @@ -288,7 +288,7 @@ def test_get_logprobs_and_prompt_logprobs( """ with monkeypatch.context() as m: m.setenv("VLLM_USE_V1", "1") - do_apc = vllm_model.model.llm_engine.cache_config.enable_prefix_caching + do_apc = vllm_model.llm.llm_engine.cache_config.enable_prefix_caching if do_apc and (temperature < 2.0 or batch_logprobs_composition != SAMPLE_PROMPT): # Skip some test-cases to save time. @@ -378,7 +378,7 @@ def test_none_logprobs(vllm_model, example_prompts, prompt_logprobs=None, temperature=0.0, ) - results_logprobs_none = vllm_model.model.generate( + results_logprobs_none = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_none, ) @@ -408,7 +408,7 @@ def test_zero_logprobs(vllm_model, example_prompts, logprobs=0, prompt_logprobs=0, temperature=0.0) - results_logprobs_zero = vllm_model.model.generate( + results_logprobs_zero = vllm_model.llm.generate( example_prompts, sampling_params=sampling_params_logprobs_zero) for i in range(len(results_logprobs_zero)): diff --git a/tests/v1/sample/test_sampling_params_e2e.py b/tests/v1/sample/test_sampling_params_e2e.py index ac0f3eb5883..f53e1e1c485 100644 --- a/tests/v1/sample/test_sampling_params_e2e.py +++ b/tests/v1/sample/test_sampling_params_e2e.py @@ -14,30 +14,30 @@ @pytest.fixture(scope="module") -def model() -> LLM: +def llm() -> LLM: # Disable prefix caching so that we can test prompt logprobs. 
# TODO remove this after https://github.com/vllm-project/vllm/pull/13949 # is merged return LLM(MODEL, enforce_eager=True, enable_prefix_caching=False) -def test_n_gt_1(model): +def test_n_gt_1(llm): """ParallelSampling is supported.""" params = SamplingParams(n=3) - outputs = model.generate(PROMPT, params) + outputs = llm.generate(PROMPT, params) assert len(outputs[0].outputs) == 3 -def test_best_of(model): +def test_best_of(llm): """Raise a ValueError since best_of is deprecated.""" params = SamplingParams(n=2, best_of=3) with pytest.raises(ValueError): - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_penalties(model): +def test_penalties(llm): """Check that we do not get errors if applied.""" params = SamplingParams( @@ -49,18 +49,18 @@ def test_penalties(model): top_p=0.5, top_k=3, ) - _ = model.generate(PROMPT, params) + _ = llm.generate(PROMPT, params) -def test_stop(model): +def test_stop(llm): """Check that we respect the stop words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() STOP_IDX = 5 params = SamplingParams(temperature=0, stop=split_text[STOP_IDX]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should not contain the stop word. @@ -69,40 +69,40 @@ def test_stop(model): params = SamplingParams(temperature=0, stop=split_text[STOP_IDX], include_stop_str_in_output=True) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_split_text = output[0].outputs[0].text.split() # Output should contain the stop word. assert len(new_split_text) == STOP_IDX + 1 -def test_stop_token_ids(model): +def test_stop_token_ids(llm): """Check that we respect the stop token ids.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) stop_token_id_0 = output[0].outputs[0].token_ids[5] stop_token_id_1 = output[0].outputs[0].token_ids[6] stop_token_ids = [stop_token_id_1, stop_token_id_0] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 stop_token_ids = [stop_token_id_0, stop_token_id_1] params = SamplingParams(temperature=0, stop_token_ids=stop_token_ids) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) assert output[0].outputs[0].token_ids[-1] == stop_token_id_0 -def test_detokenize_false(model): +def test_detokenize_false(llm): """Check that detokenize=False option works.""" - output = model.generate(PROMPT, SamplingParams(detokenize=False)) + output = llm.generate(PROMPT, SamplingParams(detokenize=False)) assert len(output[0].outputs[0].token_ids) > 0 assert len(output[0].outputs[0].text) == 0 - output = model.generate( + output = llm.generate( PROMPT, SamplingParams(detokenize=False, logprobs=3, prompt_logprobs=3)) assert len(output[0].outputs[0].token_ids) > 0 @@ -118,28 +118,28 @@ def test_detokenize_false(model): assert all(lp.decoded_token is None for lp in logprobs.values()) -def test_bad_words(model): +def test_bad_words(llm): """Check that we respect bad words.""" - output = model.generate(PROMPT, SamplingParams(temperature=0)) + output = llm.generate(PROMPT, SamplingParams(temperature=0)) split_text = output[0].outputs[0].text.split() 
bad_words_1 = " ".join(split_text[:2]) params = SamplingParams(temperature=0, bad_words=[bad_words_1]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text bad_words_2 = new_text.split()[-1] params = SamplingParams(temperature=0, bad_words=[bad_words_1, bad_words_2]) - output = model.generate(PROMPT, params) + output = llm.generate(PROMPT, params) new_text = output[0].outputs[0].text assert bad_words_1 not in new_text assert bad_words_2 not in new_text -def test_logits_processor(model): +def test_logits_processor(llm): """Check that we reject logits processor.""" # This sample logits processor gives infinite score to the i-th token, @@ -150,47 +150,45 @@ def pick_ith(token_ids, logits): return logits with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(logits_processors=[pick_ith])) + _ = llm.generate(PROMPT, SamplingParams(logits_processors=[pick_ith])) -def test_allowed_token_ids(model): +def test_allowed_token_ids(llm): """Check that we can use allowed_token_ids.""" TOKEN_ID = 10 allowed_token_ids = [TOKEN_ID] - output = model.generate( - PROMPT, SamplingParams(allowed_token_ids=allowed_token_ids)) + output = llm.generate(PROMPT, + SamplingParams(allowed_token_ids=allowed_token_ids)) assert output[0].outputs[0].token_ids[-1] == TOKEN_ID # Reject empty allowed_token_ids. with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[])) # Reject negative token id. with pytest.raises(ValueError): - _ = model.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[-1])) # Reject out of vocabulary. with pytest.raises(ValueError): - _ = model.generate(PROMPT, - SamplingParams(allowed_token_ids=[10000000])) + _ = llm.generate(PROMPT, SamplingParams(allowed_token_ids=[10000000])) -def test_priority(model): +def test_priority(llm): """Check that we reject requests with priority.""" # Reject all allowed token ids with pytest.raises(ValueError): - _ = model.generate(PROMPT, priority=[1]) + _ = llm.generate(PROMPT, priority=[1]) -def test_seed(model): +def test_seed(llm): """Check that seed impacts randomness.""" - out_1 = model.generate(PROMPT, SamplingParams(seed=42)) - out_2 = model.generate(PROMPT, SamplingParams(seed=42)) - out_3 = model.generate(PROMPT, SamplingParams(seed=43)) + out_1 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_2 = llm.generate(PROMPT, SamplingParams(seed=42)) + out_3 = llm.generate(PROMPT, SamplingParams(seed=43)) assert out_1[0].outputs[0].text == out_2[0].outputs[0].text assert out_1[0].outputs[0].text != out_3[0].outputs[0].text diff --git a/tests/v1/test_oracle.py b/tests/v1/test_oracle.py index 7a7ba346a71..85ce9601352 100644 --- a/tests/v1/test_oracle.py +++ b/tests/v1/test_oracle.py @@ -112,9 +112,9 @@ def test_v1_llm_by_default(monkeypatch): m.delenv("VLLM_USE_V1") # Should default to V1 for supported config. - model = LLM(MODEL, enforce_eager=True, enable_lora=True) - print(model.generate("Hello my name is")) - assert hasattr(model.llm_engine, "engine_core") + llm = LLM(MODEL, enforce_eager=True, enable_lora=True) + print(llm.generate("Hello my name is")) + assert hasattr(llm.llm_engine, "engine_core") m.delenv("VLLM_USE_V1")