Skip to content

Commit e8689c7

Browse files
Kwai-Keye authored and huydhn committed
[Model][VLM] Support Keye-VL-8B-Preview (vllm-project#20126)
Signed-off-by: Kwai-Keye <Keye@kuaishou.com>
1 parent b1e5aa3 commit e8689c7

File tree

7 files changed

+1801
-2
lines changed

7 files changed

+1801
-2
lines changed

docs/models/supported_models.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,7 @@ Specified using `--task generate`.
559559
| `H2OVLChatModel` | H2OVL | T + I<sup>E+</sup> | `h2oai/h2ovl-mississippi-800m`, `h2oai/h2ovl-mississippi-2b`, etc. | | ✅︎ | ✅︎\* |
560560
| `Idefics3ForConditionalGeneration` | Idefics3 | T + I | `HuggingFaceM4/Idefics3-8B-Llama3` etc. | ✅︎ | | ✅︎ |
561561
| `InternVLChatModel` | InternVL 3.0, InternVideo 2.5, InternVL 2.5, Mono-InternVL, InternVL 2.0 | T + I<sup>E+</sup> + (V<sup>E+</sup>) | `OpenGVLab/InternVL3-9B`, `OpenGVLab/InternVideo2_5_Chat_8B`, `OpenGVLab/InternVL2_5-4B`, `OpenGVLab/Mono-InternVL-2B`, `OpenGVLab/InternVL2-4B`, etc. | ✅︎ | ✅︎ | ✅︎ |
562+
| `KeyeForConditionalGeneration` | Keye-VL-8B-Preview | T + I<sup>E+</sup> + V<sup>E+</sup> | `Kwai-Keye/Keye-VL-8B-Preview` | | | ✅︎ |
562563
| `KimiVLForConditionalGeneration` | Kimi-VL-A3B-Instruct, Kimi-VL-A3B-Thinking | T + I<sup>+</sup> | `moonshotai/Kimi-VL-A3B-Instruct`, `moonshotai/Kimi-VL-A3B-Thinking` | | | ✅︎ |
563564
| `Llama4ForConditionalGeneration` | Llama 4 | T + I<sup>+</sup> | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | | ✅︎ | ✅︎ |
564565
| `LlavaForConditionalGeneration` | LLaVA-1.5 | T + I<sup>E+</sup> | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), etc. | | ✅︎ | ✅︎ |

examples/offline_inference/vision_language.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -429,6 +429,37 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
429429
)
430430

431431

# Keye-VL
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine args and chat-formatted prompts for Keye-VL-8B-Preview.

    Args:
        questions: One user question per generated prompt.
        modality: Either ``"image"`` or ``"video"``; selects the media
            placeholder token wrapped in ``<|vision_start|>``/``<|vision_end|>``.

    Returns:
        ModelRequestData carrying the engine configuration and the rendered
        chat prompts.

    Raises:
        ValueError: If ``modality`` is not ``"image"`` or ``"video"``.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        # Keye-VL ships custom modeling/processing code on the Hub.
        trust_remote_code=True,
        limit_mm_per_prompt={modality: 1},
    )

    if modality == "image":
        placeholder = "<|image_pad|>"
    elif modality == "video":
        placeholder = "<|video_pad|>"
    else:
        # An unsupported modality previously left `placeholder` unbound and
        # surfaced as a confusing NameError below; fail fast instead.
        raise ValueError(f"Unsupported modality: {modality}")

    prompts = [
        (
            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
432463
# Kimi-VL
433464
def run_kimi_vl(questions: list[str], modality: str) -> ModelRequestData:
434465
assert modality == "image"
@@ -1154,6 +1185,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
11541185
"h2ovl_chat": run_h2ovl,
11551186
"idefics3": run_idefics3,
11561187
"internvl_chat": run_internvl,
1188+
"keye_vl": run_keye_vl,
11571189
"kimi_vl": run_kimi_vl,
11581190
"llava": run_llava,
11591191
"llava-next": run_llava_next,

examples/offline_inference/vision_language_multi_image.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,43 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
423423
)
424424

425425

def load_keye_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Prepare a multi-image request for Keye-VL-8B-Preview.

    Renders one image content entry per URL plus the text question through
    the model's own chat template, and fetches each image payload.
    """
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=5,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # One image content entry per URL, followed by the question text.
    content = [{"type": "image", "image": url} for url in image_urls]
    content.append({"type": "text", "text": question})
    messages = [{"role": "user", "content": content}]

    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)

    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=[fetch_image(url) for url in image_urls],
    )
426463
def load_kimi_vl(question: str, image_urls: list[str]) -> ModelRequestData:
427464
model_name = "moonshotai/Kimi-VL-A3B-Instruct"
428465

@@ -862,6 +899,7 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
862899
"h2ovl_chat": load_h2ovl,
863900
"idefics3": load_idefics3,
864901
"internvl_chat": load_internvl,
902+
"keye_vl": load_keye_vl,
865903
"kimi_vl": load_kimi_vl,
866904
"llava": load_llava,
867905
"llava-next": load_llava_next,

tests/models/registry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -351,6 +351,8 @@ def check_available_online(
351351
trust_remote_code=True),
352352
"Idefics3ForConditionalGeneration": _HfExamplesInfo("HuggingFaceM4/Idefics3-8B-Llama3", # noqa: E501
353353
{"tiny": "HuggingFaceTB/SmolVLM-256M-Instruct"}), # noqa: E501
354+
"KeyeForConditionalGeneration": _HfExamplesInfo("Kwai-Keye/Keye-VL-8B-Preview", # noqa: E501
355+
trust_remote_code=True),
354356
"KimiVLForConditionalGeneration": _HfExamplesInfo("moonshotai/Kimi-VL-A3B-Instruct", # noqa: E501
355357
extras={"thinking": "moonshotai/Kimi-VL-A3B-Thinking"}, # noqa: E501
356358
trust_remote_code=True,

vllm/entrypoints/chat_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -540,7 +540,7 @@ def _placeholder_str(self, modality: ModalityStr,
540540
return "<image>"
541541
if model_type in ("mllama", "llama4"):
542542
return "<|image|>"
543-
if model_type in ("qwen2_vl", "qwen2_5_vl"):
543+
if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
544544
return "<|vision_start|><|image_pad|><|vision_end|>"
545545
if model_type == "qwen2_5_omni":
546546
return "<|vision_start|><|IMAGE|><|vision_end|>"
@@ -570,7 +570,7 @@ def _placeholder_str(self, modality: ModalityStr,
570570
return "<video>"
571571
if model_type == "glm4v":
572572
return "<|begin_of_video|><|video|><|end_of_video|>"
573-
if model_type in ("qwen2_vl", "qwen2_5_vl"):
573+
if model_type in ("qwen2_vl", "qwen2_5_vl", "keye", "Keye"):
574574
return "<|vision_start|><|video_pad|><|vision_end|>"
575575
if model_type == "qwen2_5_omni":
576576
return "<|vision_start|><|VIDEO|><|vision_end|>"

0 commit comments

Comments
 (0)