
Commit aed8468

Authored by Isotr0py
[Doc] Add missing llava family multi-image examples (#19698)
Signed-off-by: Isotr0py <2037008807@qq.com>
1 parent 5c76b9c commit aed8468

File tree

1 file changed: +103 −0 lines changed


examples/offline_inference/vision_language_multi_image.py

Lines changed: 103 additions & 0 deletions
@@ -289,6 +289,106 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )


+def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
+    # NOTE: CAUTION! The original LLaVA models weren't really trained on
+    # multi-image inputs, so they will generate poor responses for them!
+    model_name = "llava-hf/llava-1.5-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"

@@ -737,6 +837,9 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
     "kimi_vl": load_kimi_vl,
+    "llava": load_llava,
+    "llava-next": load_llava_next,
+    "llava-onevision": load_llava_onevision,
     "llama4": load_llama4,
     "mistral3": load_mistral3,
     "mllama": load_mllama,
