[Misc] unify variable for LLM instance #20996

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion docs/configuration/model_resolution.md
@@ -14,7 +14,7 @@ For example:
```python
from vllm import LLM

-model = LLM(
+llm = LLM(
model="cerebras/Cerebras-GPT-1.3B",
hf_overrides={"architectures": ["GPT2LMHeadModel"]}, # GPT-2
)
4 changes: 2 additions & 2 deletions docs/features/lora.md
@@ -302,7 +302,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
return tokenizer.apply_chat_template(chat, tokenize=False)


-model = LLM(
+llm = LLM(
model=model_id,
enable_lora=True,
max_lora_rank=64,
@@ -329,7 +329,7 @@ To this end, we allow registration of default multimodal LoRAs to handle this au
}


-outputs = model.generate(
+outputs = llm.generate(
inputs,
sampling_params=SamplingParams(
temperature=0.2,
10 changes: 6 additions & 4 deletions docs/features/quantization/fp8.md
@@ -86,8 +86,9 @@ Load and run the model in `vllm`:

```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
result = model.generate("Hello my name is")

llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")
result = llm.generate("Hello my name is")
print(result[0].outputs[0].text)
```

@@ -125,9 +126,10 @@ In this mode, all Linear modules (except for the final `lm_head`) have their wei

```python
from vllm import LLM
model = LLM("facebook/opt-125m", quantization="fp8")

llm = LLM("facebook/opt-125m", quantization="fp8")
# INFO 06-10 17:55:42 model_runner.py:157] Loading model weights took 0.1550 GB
-result = model.generate("Hello, my name is")
+result = llm.generate("Hello, my name is")
print(result[0].outputs[0].text)
```

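A minimal end-to-end sketch of the renamed usage from the two hunks above. The sampling values are illustrative assumptions, and the checkpoint path assumes the FP8 model was saved locally as described earlier in the same doc:

```python
from vllm import LLM, SamplingParams

# Load the locally saved FP8 checkpoint (path taken from the doc above).
llm = LLM("./Meta-Llama-3-8B-Instruct-FP8-Dynamic")

# Illustrative sampling settings; the doc itself uses the defaults.
params = SamplingParams(temperature=0.7, max_tokens=64)
result = llm.generate("Hello my name is", params)
print(result[0].outputs[0].text)
```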
3 changes: 2 additions & 1 deletion docs/features/quantization/int4.md
@@ -108,7 +108,8 @@ After quantization, you can load and run the model in vLLM:

```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")

llm = LLM("./Meta-Llama-3-8B-Instruct-W4A16-G128")
```

To evaluate accuracy, you can use `lm_eval`:
3 changes: 2 additions & 1 deletion docs/features/quantization/int8.md
@@ -114,7 +114,8 @@ After quantization, you can load and run the model in vLLM:

```python
from vllm import LLM
model = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")

llm = LLM("./Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Per-Token")
```

To evaluate accuracy, you can use `lm_eval`:
10 changes: 5 additions & 5 deletions docs/models/pooling_models.md
@@ -149,11 +149,11 @@ You can change the output dimensions of embedding models that support Matryoshka
```python
from vllm import LLM, PoolingParams

model = LLM(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
outputs = model.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32))
llm = LLM(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
outputs = llm.embed(["Follow the white rabbit."],
pooling_params=PoolingParams(dimensions=32))
print(outputs[0].outputs)
```

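For reference, a short sketch of the renamed Matryoshka embedding call shown above, requesting a different output dimension. The model id and API come from the hunk; the dimension of 64 and the printed length are illustrative:

```python
from vllm import LLM, PoolingParams

llm = LLM(model="jinaai/jina-embeddings-v3",
          task="embed",
          trust_remote_code=True)

# Ask for 64-dimensional embeddings (the doc example uses 32).
outputs = llm.embed(["Follow the white rabbit."],
                    pooling_params=PoolingParams(dimensions=64))
print(len(outputs[0].outputs.embedding))
```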
4 changes: 2 additions & 2 deletions examples/offline_inference/basic/classify.py
@@ -28,10 +28,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="classify" for classification models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate logits. The output is a list of ClassificationRequestOutputs.
-outputs = model.classify(prompts)
+outputs = llm.classify(prompts)

# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
4 changes: 2 additions & 2 deletions examples/offline_inference/basic/embed.py
@@ -31,10 +31,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="embed" for embedding models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts)
+outputs = llm.embed(prompts)

# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
4 changes: 2 additions & 2 deletions examples/offline_inference/basic/score.py
@@ -27,10 +27,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="score" for cross-encoder models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate scores. The output is a list of ScoringRequestOutputs.
-outputs = model.score(text_1, texts_2)
+outputs = llm.score(text_1, texts_2)

# Print the outputs.
print("\nGenerated Outputs:\n" + "-" * 60)
4 changes: 2 additions & 2 deletions examples/offline_inference/embed_jina_embeddings_v3.py
@@ -30,11 +30,11 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="embed" for embedding models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
# Only text matching task is supported for now. See #16120
-outputs = model.embed(prompts)
+outputs = llm.embed(prompts)

# Print the outputs.
print("\nGenerated Outputs:")
4 changes: 2 additions & 2 deletions examples/offline_inference/embed_matryoshka_fy.py
@@ -30,10 +30,10 @@ def main(args: Namespace):

# Create an LLM.
# You should pass task="embed" for embedding models
-model = LLM(**vars(args))
+llm = LLM(**vars(args))

# Generate embedding. The output is a list of EmbeddingRequestOutputs.
-outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
+outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))

# Print the outputs.
print("\nGenerated Outputs:")
8 changes: 4 additions & 4 deletions examples/offline_inference/neuron_speculation.py
@@ -25,7 +25,7 @@ def config_buckets():
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"


-def initialize_model():
+def initialize_llm():
"""Create an LLM with speculative decoding."""
return LLM(
model="openlm-research/open_llama_7b",
@@ -43,9 +43,9 @@ def initialize_model():
)


-def process_requests(model: LLM, sampling_params: SamplingParams):
+def process_requests(llm: LLM, sampling_params: SamplingParams):
"""Generate texts from prompts and print them."""
-outputs = model.generate(prompts, sampling_params)
+outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
@@ -55,7 +55,7 @@ def process_requests(model: LLM, sampling_params: SamplingParams):
def main():
"""Main function that sets up the model and processes prompts."""
config_buckets()
-model = initialize_model()
+model = initialize_llm()
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, top_k=1)
process_requests(model, sampling_params)
4 changes: 2 additions & 2 deletions examples/offline_inference/prithvi_geospatial_mae.py
@@ -140,7 +140,7 @@
class PrithviMAE:
def __init__(self):
print("Initializing PrithviMAE model")
-self.model = LLM(
+self.llm = LLM(
model=os.path.join(os.path.dirname(__file__), "./model"),
skip_tokenizer_init=True,
dtype="float32",
@@ -158,7 +158,7 @@ def run(self, input_data, location_coords):

prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}

-outputs = self.model.encode(prompt, use_tqdm=False)
+outputs = self.llm.encode(prompt, use_tqdm=False)
print("################ Inference done (it took seconds) ##############")

return outputs[0].outputs.data
6 changes: 3 additions & 3 deletions examples/offline_inference/qwen3_reranker.py
@@ -17,13 +17,13 @@
# Models converted offline using this method can not only be more efficient
# and support the vllm score API, but also make the init parameters more
# concise, for example.
-# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
+# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

# If you want to load the official original version, the init parameters are
# as follows.


-def get_model() -> LLM:
+def get_llm() -> LLM:
"""Initializes and returns the LLM model for Qwen3-Reranker."""
return LLM(
model=model_name,
@@ -77,7 +77,7 @@ def main() -> None:
]
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]

-model = get_model()
+model = get_llm()
outputs = model.score(queries, documents)

print("-" * 30)
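For comparison, a minimal sketch of the converted-checkpoint path mentioned in the script's comment, written with the unified `llm` name. The query and document strings are illustrative, not taken from the example:

```python
from vllm import LLM

# Sequence-classification conversion referenced in the comment above.
llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")

queries = ["What is the capital of China?"]       # illustrative
documents = ["The capital of China is Beijing."]  # illustrative

outputs = llm.score(queries, documents)
print(outputs[0].outputs.score)
```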
4 changes: 2 additions & 2 deletions tests/basic_correctness/test_basic_correctness.py
@@ -236,13 +236,13 @@ def test_failed_model_execution(vllm_runner, monkeypatch) -> None:
monkeypatch.setenv('VLLM_ENABLE_V1_MULTIPROCESSING', '0')

with vllm_runner('facebook/opt-125m', enforce_eager=True) as vllm_model:
-if isinstance(vllm_model.model.llm_engine, LLMEngineV1):
+if isinstance(vllm_model.llm.llm_engine, LLMEngineV1):
v1_test_failed_model_execution(vllm_model)


def v1_test_failed_model_execution(vllm_model):

-engine = vllm_model.model.llm_engine
+engine = vllm_model.llm.llm_engine
mocked_execute_model = Mock(
side_effect=RuntimeError("Mocked Critical Error"))
engine.engine_core.engine_core.model_executor.execute_model =\
10 changes: 5 additions & 5 deletions tests/basic_correctness/test_preemption.py
@@ -81,7 +81,7 @@ def test_chunked_prefill_recompute(
disable_log_stats=False,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)

for i in range(len(example_prompts)):
@@ -118,10 +118,10 @@ def test_preemption(
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)
total_preemption = (
-vllm_model.model.llm_engine.scheduler[0].num_cumulative_preemption)
+vllm_model.llm.llm_engine.scheduler[0].num_cumulative_preemption)

check_outputs_equal(
outputs_0_lst=hf_outputs,
@@ -174,12 +174,12 @@ def test_preemption_infeasible(
) as vllm_model:
sampling_params = SamplingParams(max_tokens=max_tokens,
ignore_eos=True)
-req_outputs = vllm_model.model.generate(
+req_outputs = vllm_model.llm.generate(
example_prompts,
sampling_params=sampling_params,
)

-assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
+assert (vllm_model.llm.llm_engine.scheduler[0].artificial_preempt_cnt
< ARTIFICIAL_PREEMPTION_MAX_CNT)

# Verify the request is ignored and not hang.
32 changes: 16 additions & 16 deletions tests/conftest.py
@@ -784,7 +784,7 @@ def __init__(
enforce_eager: Optional[bool] = False,
**kwargs,
) -> None:
-self.model = LLM(
+self.llm = LLM(
model=model_name,
task=task,
tokenizer=tokenizer_name,
@@ -849,9 +849,9 @@ def generate(
videos=videos,
audios=audios)

-req_outputs = self.model.generate(inputs,
-sampling_params=sampling_params,
-**kwargs)
+req_outputs = self.llm.generate(inputs,
+sampling_params=sampling_params,
+**kwargs)

outputs: list[tuple[list[list[int]], list[str]]] = []
for req_output in req_outputs:
@@ -897,9 +897,9 @@ def generate_w_logprobs(
videos=videos,
audios=audios)

-req_outputs = self.model.generate(inputs,
-sampling_params=sampling_params,
-**kwargs)
+req_outputs = self.llm.generate(inputs,
+sampling_params=sampling_params,
+**kwargs)

toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
@@ -919,8 +919,8 @@ def generate_encoder_decoder_w_logprobs(
'''

assert sampling_params.logprobs is not None
-req_outputs = self.model.generate(encoder_decoder_prompts,
-sampling_params=sampling_params)
+req_outputs = self.llm.generate(encoder_decoder_prompts,
+sampling_params=sampling_params)
toks_str_logsprobs_prompt_logprobs = (
self._final_steps_generate_w_logprobs(req_outputs))
# Omit prompt logprobs if not required by sampling params
@@ -1013,7 +1013,7 @@ def generate_beam_search(
videos=videos,
audios=audios)

-outputs = self.model.beam_search(
+outputs = self.llm.beam_search(
inputs,
BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens))
returned_outputs = []
@@ -1024,7 +1024,7 @@ def generate_beam_search(
return returned_outputs

def classify(self, prompts: list[str]) -> list[list[float]]:
-req_outputs = self.model.classify(prompts)
+req_outputs = self.llm.classify(prompts)
return [req_output.outputs.probs for req_output in req_outputs]

def embed(self,
@@ -1039,11 +1039,11 @@ def embed(self,
videos=videos,
audios=audios)

-req_outputs = self.model.embed(inputs, *args, **kwargs)
+req_outputs = self.llm.embed(inputs, *args, **kwargs)
return [req_output.outputs.embedding for req_output in req_outputs]

def encode(self, prompts: list[str]) -> list[list[float]]:
-req_outputs = self.model.encode(prompts)
+req_outputs = self.llm.encode(prompts)
return [req_output.outputs.data for req_output in req_outputs]

def score(
@@ -1053,18 +1053,18 @@ def score(
*args,
**kwargs,
) -> list[float]:
-req_outputs = self.model.score(text_1, text_2, *args, **kwargs)
+req_outputs = self.llm.score(text_1, text_2, *args, **kwargs)
return [req_output.outputs.score for req_output in req_outputs]

def apply_model(self, func: Callable[[nn.Module], _R]) -> list[_R]:
-executor = self.model.llm_engine.model_executor
+executor = self.llm.llm_engine.model_executor
return executor.apply_model(func)

def __enter__(self):
return self

def __exit__(self, exc_type, exc_value, traceback):
-del self.model
+del self.llm
cleanup_dist_env_and_memory()


2 changes: 1 addition & 1 deletion tests/core/test_num_computed_tokens_update.py
@@ -37,7 +37,7 @@ def test_num_computed_tokens_update(num_scheduler_steps: int,
num_scheduler_steps=num_scheduler_steps,
enable_chunked_prefill=enable_chunked_prefill,
enforce_eager=enforce_eager)
-engine: LLMEngine = runner.model.llm_engine
+engine: LLMEngine = runner.llm.llm_engine

# In multi-step + chunked-prefill there is no separate single prompt step.
# What is scheduled will run for num_scheduler_steps always.
2 changes: 1 addition & 1 deletion tests/detokenizer/test_stop_reason.py
@@ -28,7 +28,7 @@ def vllm_model(vllm_runner):
def test_stop_reason(vllm_model, example_prompts):
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL)
stop_token_id = tokenizer.convert_tokens_to_ids(STOP_STR)
-llm = vllm_model.model
+llm = vllm_model.llm

# test stop token
outputs = llm.generate(example_prompts,