@@ -289,6 +289,106 @@ def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
     )
 
 
+def load_llava(question: str, image_urls: list[str]) -> ModelRequestData:
+    # NOTE: CAUTION! The original LLaVA models were not trained on multi-image
+    # inputs, so they will generate poor responses for multi-image prompts!
+    model_name = "llava-hf/llava-1.5-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_next(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-v1.6-mistral-7b-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=8192,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
+def load_llava_onevision(question: str, image_urls: list[str]) -> ModelRequestData:
+    model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"
+    engine_args = EngineArgs(
+        model=model_name,
+        max_model_len=16384,
+        max_num_seqs=16,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = [{"type": "image", "image": url} for url in image_urls]
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                *placeholders,
+                {"type": "text", "text": question},
+            ],
+        }
+    ]
+
+    processor = AutoProcessor.from_pretrained(model_name)
+
+    prompt = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+        image_data=[fetch_image(url) for url in image_urls],
+    )
+
+
 def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
     model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
@@ -737,6 +837,9 @@ def load_tarsier(question: str, image_urls: list[str]) -> ModelRequestData:
     "idefics3": load_idefics3,
     "internvl_chat": load_internvl,
     "kimi_vl": load_kimi_vl,
+    "llava": load_llava,
+    "llava-next": load_llava_next,
+    "llava-onevision": load_llava_onevision,
     "llama4": load_llama4,
     "mistral3": load_mistral3,
     "mllama": load_mllama,