[Misc] unify variable for LLM instance #20996

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion docs/configuration/model_resolution.md
@@ -14,7 +14,7 @@ For example:
```python
from vllm import LLM

-model = LLM(
+llm = LLM(
model="cerebras/Cerebras-GPT-1.3B",
hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2
)
4 changes: 2 additions & 2 deletions docs/features/lora.md
@@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
return tokenizer.apply_chat_template(chat, tokenize=False)


-model = LLM(
+llm = LLM(
model=model_id,
enable_lora=True,
max_lora_rank=64,
@@ -329,7 +329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
}


-outputs = model.generate(
+outputs = llm.generate(
inputs,
sampling_params=SamplingParams(
temperature=0.2,
10 changes: 6 additions & 4 deletions docs/features/quantization/fp8.md
@@ -86,8 +86,9 @@ Load and run the model in `vllm`:

```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
result = model.generate("Hello my name is")

llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
result = llm.generate("Hello my name is")
print(result[0].outputs[0].text)
```

@@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei

```python
from vllm import LLM
model = LLM("facebook/opt-125m", quantization="fp8")

llm = LLM("facebook/opt-125m", quantization="fp8")
# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
-result = model.generate("Hello, my name is")
+result = llm.generate("Hello, my name is")
print(result[0].outputs[0].text)
```

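A minimal end-to-end sketch of the renamed usage from the two hunks above. The sampling values are illustrative assumptions, and the checkpoint path assumes the FP8 model was saved locally as described earlier in the same doc:

```python
from vllm import LLM, SamplingParams

# Load the locally saved FP8 checkpoint (path taken from the doc above).
llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")

# Illustrative sampling settings; the doc itself uses the defaults.
params = SamplingParams(temperature=0.7, max_tokens=64)
result = llm.generate("Hello my name is", params)
print(result[0].outputs[0].text)
```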
3 changes: 2 additions & 1 deletion docs/features/quantization/int4.md
@@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM:

```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")

llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
```

To evaluate accuracy, you can use `lm_eval`:
3 changes: 2 additions & 1 deletion docs/features/quantization/int8.md
@@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM:

```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")

llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
```

To evaluate accuracy, you can use `lm_eval`:
10 changes: 5 additions & 5 deletions docs/models/pooling_models.md
@@ -149,11 +149,11 @@ You can change the output dimensions of embedding models that support Matryoshka
```python
from vllm import LLM, PoolingParams

model = LLM(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
outputs = model.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32))
llm = LLM(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
outputs = llm.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32))
print(outputs[0].outputs)
```

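For reference, a short sketch of the renamed Matryoshka embedding call shown above, requesting a different output dimension. The model id and API come from the hunk; the dimension of 64 and the printed length are illustrative:

```python
from vllm import LLM, PoolingParams

llm = LLM(model="jinaai/jina-embeddings-v3",
          task="embed",
          trust_remote_code=True)

# Ask for 64-dimensional embeddings (the doc example uses 32).
outputs = llm.embed(["Follow the white rabbit."],
                    pooling_params=PoolingParams(dimensions=64))
print(len(outputs[0].outputs.embedding))
```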
4 changes: 2 additions & 2 deletions examples/offline_inference/basic/classify.py
@@ -28,10 +28,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="classify" for classification models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate logits. The output is a list of ClassificationRequestOutputs.
-outputs = model.classify(prompts)
+outputs = llm.classify(prompts)

# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
4 changes: 2 additions & 2 deletions examples/offline_inference/basic/embed.py
@@ -31,10 +31,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="embed" for embedding models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts)
+outputs = llm.embed(prompts)

# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
4 changes: 2 additions & 2 deletions examples/offline_inference/basic/score.py
@@ -27,10 +27,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="score" for cross-encoder models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate scores. The output is a list of ScoringRequestOutputs.
-outputs = model.score(text_1, texts_2)
+outputs = llm.score(text_1, texts_2)

# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
4 changes: 2 additions & 2 deletions examples/offline_inference/embed_jina_embeddings_v3.py
@@ -30,11 +30,11 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="embed" for embedding models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
# Only text matching task is supported for now. See #16120
-outputs = model.embed(prompts)
+outputs = llm.embed(prompts)

# Print the outputs.
print("\nGenerated Outputs:")
4 changes: 2 additions & 2 deletions examples/offline_inference/embed_matryoshka_fy.py
@@ -30,10 +30,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="embed" for embedding models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
+outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))

# Print the outputs.
print("\nGenerated Outputs:")
8 changes: 4 additions & 4 deletions examples/offline_inference/neuron_speculation.py
@@ -25,7 +25,7 @@ def config_buckets():
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"


-def initialize_model():
+def initialize_llm():
"""Create an LLM with speculative decoding."""
return LLM(
model="openlm-research/open_llama_7b",
@@ -43,9 +43,9 @@ def initialize_model():
)


-def process_requests(model: LLM, sampling_params: SamplingParams):
+def process_requests(llm: LLM, sampling_params: SamplingParams):
"""Generate texts from prompts and print them."""
-outputs = model.generate(prompts, sampling_params)
+outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
@@ -55,7 +55,7 @@ def process_requests(model: LLM, sampling_params: SamplingParams):
def main():
"""Main function that sets up the model and processes prompts."""
config_buckets()
-model = initialize_model()
+model = initialize_llm()
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, top_k=1)
process_requests(model, sampling_params)
4 changes: 2 additions & 2 deletions examples/offline_inference/prithvi_geospatial_mae.py
@@ -140,7 +140,7 @@
class PrithviMAE:
def __init__(self):
print("Initializing PrithviMAE model")
-self.model = LLM(
+self.llm = LLM(
model=os.path.join(os.path.dirname(__file__), "./model"),
skip_tokenizer_init=True,
dtype="float32",
@@ -158,7 +158,7 @@ def run(self, input_data, location_coords):

prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}

-outputs = self.model.encode(prompt, use_tqdm=False)
+outputs = self.llm.encode(prompt, use_tqdm=False)
print("################ Inference done (it took seconds) ##############")

return outputs[0].outputs.data
6 changes: 3 additions & 3 deletions examples/offline_inference/qwen3_reranker.py
@@ -17,13 +17,13 @@
# Models converted offline using this method can not only be more efficient
# and support the vllm score API, but also make the init parameters more
# concise, for example.
-# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
+# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

# If you want to load the official original version, the init parameters are
# as follows.


-def get_model() -> LLM:
+def get_llm() -> LLM:
"""Initializes and returns the LLM model for Qwen3-Reranker."""
return LLM(
model=model_name,
@@ -77,7 +77,7 @@ def main() -> None:
]
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]

-model = get_model()
+model = get_llm()
outputs = model.score(queries, documents)

print("-" * 30)
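For comparison, a minimal sketch of the converted-checkpoint path mentioned in the script's comment, written with the unified `llm` name. The query and document strings are illustrative, not taken from the example:

```python
from vllm import LLM

# Sequence-classification conversion referenced in the comment above.
llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

queries = ["What is the capital of China?"]       # illustrative
documents = ["The capital of China is Beijing."]  # illustrative

outputs = llm.score(queries, documents)
print(outputs[0].outputs.score)
```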
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_basic_correctness.py
@@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
-if isinstance(vllm_model.model.llm_engine, LLMEngineV1):
+if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
v1_test_failed_model_execution(vllm_model)


def v1_test_failed_model_execution(vllm_model):

-engine = vllm_model.model.llm_engine
+engine = vllm_model.llm.llm_engine
mocked_execute_model = Mock(
side_effect=RuntimeError("Mocked Critical Error"))
engine.engine_core.engine_core.model_executor.execute_model =\
10 changes: 5 additions & 5 deletions tests/basic_correctness/test_preemption.py
@@ -81,7 +81,7 @@ def test_chunked_prefill_recompute(
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)

for i in range(len(example_prompts)):
@@ -118,10 +118,10 @@ def test_preemption(
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
-vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)

check_outputs_equal(
outputs_0_lst=hf_outputs,
@@ -174,12 +174,12 @@ def test_preemption_infeasible(
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
-req_outputs = vllm_model.model.generate(
+req_outputs = vllm_model.llm.generate(
example_prompts,
sampling_params=sampling_params,
)

-assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)

# Verify the request is ignored and not hang.
32 changes: 16 additions & 16 deletions tests/conftest.py
@@ -784,7 +784,7 @@ def __init__(
enforce_eager: Optional[bool] = False,
**kwargs,
) -> None:
-self.model = LLM(
+self.llm = LLM(
model=model_name,
task=task,
tokenizer=tokenizer_name,
@@ -849,9 +849,9 @@ def generate(
videos=videos,
audios=audios)

-req_outputs = self.model.generate(inputs,
-sampling_params=sampling_params,
-**kwargs)
+req_outputs = self.llm.generate(inputs,
+sampling_params=sampling_params,
+**kwargs)

outputs: list[tuple[list[list[int]], list[str]]] = []
for req_output in req_outputs:
@@ -897,9 +897,9 @@ def generate_w_logprobs(
videos=videos,
audios=audios)

-req_outputs = self.model.generate(inputs,
-sampling_params=sampling_params,
-**kwargs)
+req_outputs = self.llm.generate(inputs,
+sampling_params=sampling_params,
+**kwargs)

toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
@@ -919,8 +919,8 @@ def generate_encoder_decoder_w_logprobs(
'''

assert sampling_params.logprobs is not None
-req_outputs = self.model.generate(encoder_decoder_prompts,
-sampling_params=sampling_params)
+req_outputs = self.llm.generate(encoder_decoder_prompts,
+sampling_params=sampling_params)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
@@ -1013,7 +1013,7 @@ def generate_beam_search(
videos=videos,
audios=audios)

-outputs = self.model.beam_search(
+outputs = self.llm.beam_search(
inputs,
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
returned_outputs = []
@@ -1024,7 +1024,7 @@ def generate_beam_search(
return returned_outputs

def classify(self, prompts: list[str]) -> list[list[float]]:
-req_outputs = self.model.classify(prompts)
+req_outputs = self.llm.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs]

def embed(self,
@@ -1039,11 +1039,11 @@ def embed(self,
videos=videos,
audios=audios)

-req_outputs = self.model.embed(inputs, *args, **kwargs)
+req_outputs = self.llm.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]

def encode(self, prompts: list[str]) -> list[list[float]]:
-req_outputs = self.model.encode(prompts)
+req_outputs = self.llm.encode(prompts)
return [req_output.outputs.data for req_output in req_outputs]

def score(
@@ -1053,18 +1053,18 @@ def score(
*args,
**kwargs,
) -> list[float]:
-req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
+req_outputs = self.llm.score(text_1, text_2, *args, **kwargs)
return [req_output.outputs.score for req_output in req_outputs]

def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-executor = self.model.llm_engine.model_executor
+executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)

def __enter__(self):
return self

def __exit__(self, exc_type, exc_value, traceback):
-del self.model
+del self.llm
cleanup_dist_env_and_memory()


2 changes: 1 addition & 1 deletion tests/core/test_num_computed_tokens_update.py
@@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
-engine: LLMEngine = runner.model.llm_engine
+engine: LLMEngine = runner.llm.llm_engine

# In multi-step + chunked-prefill there is no separate single prompt step.
# What is scheduled will run for num_scheduler_steps always.
2 changes: 1 addition & 1 deletion tests/detokenizer/test_stop_reason.py
@@ -28,7 +28,7 @@ def vllm_model(vllm_runner):
def test_stop_reason(vllm_model, example_prompts):
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
-llm = vllm_model.model
+llm = vllm_model.llm

# test stop token
outputs = llm.generate(example_prompts,