Commit d1978c3

[Misc] unify variable for LLM instance
Signed-off-by: Andy Xie <andy.xning@gmail.com>
1 parent e7e3e6d
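
The change is mechanical: every documentation snippet and offline-inference example that bound the engine to `model` now binds it to `llm`, which avoids confusing the handle with the `model=` constructor argument. A minimal sketch of the resulting convention (model name taken from the fp8 diff below; assumes a local vLLM install):

```python
from vllm import LLM

# The engine handle is consistently named "llm", so it no longer reads
# like the "model" keyword argument that names the checkpoint to load.
llm = LLM(model="facebook/opt-125m")
outputs = llm.generate("Hello, my name is")
print(outputs[0].outputs[0].text)
```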

19 files changed, +94 -90 lines changed

docs/configuration/model_resolution.md

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ For example:
 ```python
 from vllm import LLM

-model = LLM(
+llm = LLM(
     model="cerebras/Cerebras-GPT-1.3B",
     hf_overrides={"architectures": ["GPT2LMHeadModel"]},  # GPT-2
 )

docs/features/lora.md

Lines changed: 1 addition & 1 deletion
@@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
     return tokenizer.apply_chat_template(chat, tokenize=False)


-model = LLM(
+llm = LLM(
     model=model_id,
     enable_lora=True,
     max_lora_rank=64,
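
The hunk ends before the example actually applies an adapter. For orientation, a hedged sketch of how a LoRA adapter is typically attached per request with the renamed handle; `"base-model-id"` and `"path/to/adapter"` are placeholders, not from this commit:

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Placeholders: substitute a real base model and a local adapter path.
llm = LLM(model="base-model-id", enable_lora=True, max_lora_rank=64)

# LoRARequest takes a human-readable adapter name, a unique integer id,
# and the adapter path.
outputs = llm.generate(
    "Hello, my name is",
    SamplingParams(max_tokens=32),
    lora_request=LoRARequest("my_adapter", 1, "path/to/adapter"),
)
print(outputs[0].outputs[0].text)
```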

docs/features/quantization/fp8.md

Lines changed: 6 additions & 4 deletions
@@ -86,8 +86,9 @@ Load and run the model in `vllm`:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
-result = model.generate("Hello my name is")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
+result = llm.generate("Hello my name is")
 print(result[0].outputs[0].text)
 ```

@@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei

 ```python
 from vllm import LLM
-model = LLM("facebook/opt-125m", quantization="fp8")
+
+llm = LLM("facebook/opt-125m", quantization="fp8")
 # INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
-result = model.generate("Hello, my name is")
+result = llm.generate("Hello, my name is")
 print(result[0].outputs[0].text)
 ```
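
Both renamed snippets extend naturally to explicit sampling controls. A brief sketch under the same setup (the sampling settings are illustrative; the commit itself only renames the variable):

```python
from vllm import LLM, SamplingParams

llm = LLM("facebook/opt-125m", quantization="fp8")

# Illustrative sampling parameters, not part of this commit's diff.
params = SamplingParams(temperature=0.8, max_tokens=32)
result = llm.generate("Hello, my name is", params)
print(result[0].outputs[0].text)
```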

docs/features/quantization/int4.md

Lines changed: 2 additions & 1 deletion
@@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
 ```

 To evaluate accuracy, you can use `lm_eval`:

docs/features/quantization/int8.md

Lines changed: 2 additions & 1 deletion
@@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM:

 ```python
 from vllm import LLM
-model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
+
+llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
 ```

 To evaluate accuracy, you can use `lm_eval`:

docs/models/pooling_models.md

Lines changed: 5 additions & 5 deletions
@@ -149,11 +149,11 @@ You can change the output dimensions of embedding models that support Matryoshka
 ```python
 from vllm import LLM, PoolingParams

-model = LLM(model="jinaai/jina-embeddings-v3",
-            task="embed",
-            trust_remote_code=True)
-outputs = model.embed(["Follow the white rabbit."],
-                      pooling_params=PoolingParams(dimensions=32))
+llm = LLM(model="jinaai/jina-embeddings-v3",
+          task="embed",
+          trust_remote_code=True)
+outputs = llm.embed(["Follow the white rabbit."],
+                    pooling_params=PoolingParams(dimensions=32))
 print(outputs[0].outputs)
 ```
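
Since the example requests 32-dimensional Matryoshka embeddings, a quick sanity check of the result might look like this (a sketch, assuming the `.outputs.embedding` attribute of vLLM's `EmbeddingRequestOutput`; not part of this diff):

```python
# .outputs.embedding is a list of floats whose length should equal the
# requested Matryoshka dimension.
embedding = outputs[0].outputs.embedding
assert len(embedding) == 32
```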

examples/offline_inference/basic/classify.py

Lines changed: 2 additions & 2 deletions
@@ -28,10 +28,10 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="classify" for classification models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate logits. The output is a list of ClassificationRequestOutputs.
-    outputs = model.classify(prompts)
+    outputs = llm.classify(prompts)

     # Print the outputs.
     print("\nGenerated Outputs:\n" + "-" * 60)
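
For orientation, a sketch of how these classification outputs are typically consumed downstream of the rename (assuming the `.outputs.probs` attribute of vLLM's `ClassificationRequestOutput`; not part of this diff):

```python
# Each ClassificationRequestOutput carries per-class probabilities.
for prompt, output in zip(prompts, outputs):
    probs = output.outputs.probs
    print(f"{prompt!r}: {len(probs)} classes, top probability {max(probs):.4f}")
```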

examples/offline_inference/basic/embed.py

Lines changed: 2 additions & 2 deletions
@@ -31,10 +31,10 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)

     # Print the outputs.
     print("\nGenerated Outputs:\n" + "-" * 60)
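
Analogously, the embedding outputs are typically read like this (assuming the `.outputs.embedding` attribute of vLLM's `EmbeddingRequestOutput`; a sketch, not part of this diff):

```python
# Each EmbeddingRequestOutput carries the embedding vector for its prompt.
for prompt, output in zip(prompts, outputs):
    embedding = output.outputs.embedding
    print(f"{prompt!r}: embedding of dimension {len(embedding)}")
```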

examples/offline_inference/basic/score.py

Lines changed: 2 additions & 2 deletions
@@ -27,10 +27,10 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="score" for cross-encoder models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate scores. The output is a list of ScoringRequestOutputs.
-    outputs = model.score(text_1, texts_2)
+    outputs = llm.score(text_1, texts_2)

     # Print the outputs.
     print("\nGenerated Outputs:\n" + "-" * 60)
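
And the cross-encoder scores are typically read like this (assuming the `.outputs.score` attribute of vLLM's `ScoringRequestOutput`; a sketch, not part of this diff):

```python
# Each ScoringRequestOutput carries the relevance score of (text_1, text)
# for the corresponding text in texts_2.
for text, output in zip(texts_2, outputs):
    print(f"{text!r}: score {output.outputs.score:.4f}")
```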

examples/offline_inference/embed_jina_embeddings_v3.py

Lines changed: 2 additions & 2 deletions
@@ -30,11 +30,11 @@ def main(args: Namespace):

     # Create an LLM.
     # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))

     # Generate embedding. The output is a list of EmbeddingRequestOutputs.
     # Only text matching task is supported for now. See #16120
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)

     # Print the outputs.
     print("\nGenerated Outputs:")
