Skip to content

Commit e8689c7

Browse files
Kwai-Keye authored and huydhn committed
[Model][VLM] Support Keye-VL-8B-Preview (vllm-project#20126)
Signed-off-by: Kwai-Keye <Keye@kuaishou.com>
1 parent b1e5aa3 commit e8689c7

File tree

7 files changed

+1801
-2
lines changed

7 files changed

+1801
-2
lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,7 @@ Specified using `--task generate`.
559559
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
560560
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
561561
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
562+
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
562563
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
563564
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
564565
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |

examples/offline_inference/vision_language.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,37 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
429429
)
430430

431431

# Keye-VL
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine args and chat-formatted prompts for Keye-VL-8B-Preview.

    Args:
        questions: One user question per generated prompt.
        modality: Either ``"image"`` or ``"video"``; selects the media
            placeholder token wrapped in ``<|vision_start|>``/``<|vision_end|>``.

    Returns:
        ModelRequestData carrying the engine configuration and the rendered
        chat prompts.

    Raises:
        ValueError: If ``modality`` is not ``"image"`` or ``"video"``.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        # Keye-VL ships custom modeling/processing code on the Hub.
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        # An unsupported modality previously left `placeholder` unbound and
        # surfaced as a confusing NameError below; fail fast instead.
        raise ValueError(f"Unsupported modality: {modality}")

    prompts = [
        (
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
432463
# Kimi-VL
433464
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
434465
assert modality == "image"
@@ -1154,6 +1185,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
11541185
"h2ovl_chat": run_h2ovl,
11551186
"idefics3": run_idefics3,
11561187
"internvl_chat": run_internvl,
1188+
"keye_vl": run_keye_vl,
11571189
"kimi_vl": run_kimi_vl,
11581190
"llava": run_llava,
11591191
"llava-next": run_llava_next,

examples/offline_inference/vision_language_multi_image.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
423423
)
424424

425425

def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for Keye-VL-8B-Preview.

    Renders one image content entry per URL plus the text question through
    the model's own chat template, and fetches each image payload.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image content entry per URL, followed by the question text.
    content = [{"type": "image", "image": url} for url in image_urls]
    content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": content}]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
426463
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
427464
model_name = "moonshotai/Kimi-VL-A3B-Instruct"
428465

@@ -862,6 +899,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
862899
"h2ovl_chat": load_h2ovl,
863900
"idefics3": load_idefics3,
864901
"internvl_chat": load_internvl,
902+
"keye_vl": load_keye_vl,
865903
"kimi_vl": load_kimi_vl,
866904
"llava": load_llava,
867905
"llava-next": load_llava_next,

tests/models/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,8 @@ def check_available_online(
351351
trust_remote_code=True),
352352
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
353353
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
354+
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
355+
trust_remote_code=True),
354356
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
355357
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
356358
trust_remote_code=True,

vllm/entrypoints/chat_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ def _placeholder_str(self, modality: ModalityStr,
540540
return "<image>"
541541
if model_type in ("mllama", "llama4"):
542542
return "<|image|>"
543-
if model_type in ("qwen2_vl", "qwen2_5_vl"):
543+
if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
544544
return "<|vision_start|><|image_pad|><|vision_end|>"
545545
if model_type == "qwen2_5_omni":
546546
return "<|vision_start|><|IMAGE|><|vision_end|>"
@@ -570,7 +570,7 @@ def _placeholder_str(self, modality: ModalityStr,
570570
return "<video>"
571571
if model_type == "glm4v":
572572
return "<|begin_of_video|><|video|><|end_of_video|>"
573-
if model_type in ("qwen2_vl", "qwen2_5_vl"):
573+
if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
574574
return "<|vision_start|><|video_pad|><|vision_end|>"
575575
if model_type == "qwen2_5_omni":
576576
return "<|vision_start|><|VIDEO|><|vision_end|>"

0 commit comments

Comments
 (0)