
Commit fe1e924

[Frontend] Support image object in llm.chat (vllm-project#19635)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
1 parent 4548c03 commit fe1e924

File tree

4 files changed: +97 -13 lines changed

- docs/features/multimodal_inputs.md
- examples/offline_inference/mistral-small.py
- tests/entrypoints/test_chat_utils.py
- vllm/entrypoints/chat_utils.py

docs/features/multimodal_inputs.md

Lines changed: 43 additions & 0 deletions

````diff
@@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a list
 
 Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 
+If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
+
+```python
+from vllm import LLM
+from vllm.assets.image import ImageAsset
+
+llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+image_url = "https://picsum.photos/id/32/512/512"
+image_pil = ImageAsset('cherry_blossom').pil_image
+image_embeds = torch.load(...)
+
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant"},
+    {"role": "user", "content": "Hello"},
+    {"role": "assistant", "content": "Hello! How can I assist you today?"},
+    {
+        "role": "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        },{
+            "type": "image_pil",
+            "image_pil": image_pil
+        }, {
+            "type": "image_embeds",
+            "image_embeds": image_embeds
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }],
+    },
+]
+
+# Perform inference and log output.
+outputs = llm.chat(conversation)
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
 ??? Code
````
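Note that the snippet added above calls `torch.load` without importing `torch`, so a standalone run also needs an `import torch` line, and the elided `torch.load(...)` path must point at real pre-computed embeddings. To exercise only the new `image_pil` path, a minimal sketch along these lines should work (same LLaVA checkpoint as the docs example):

```python
from vllm import LLM
from vllm.assets.image import ImageAsset

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# A bundled vLLM test asset, exposed directly as a PIL.Image.Image.
image = ImageAsset("cherry_blossom").pil_image

conversation = [{
    "role": "user",
    "content": [
        {"type": "image_pil", "image_pil": image},
        {"type": "text", "text": "Describe this image in one sentence."},
    ],
}]

# llm.chat applies the model's chat template and runs generation.
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```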

examples/offline_inference/mistral-small.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -6,6 +6,7 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
+from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
     )
 
     prompt = "Describe this image in one sentence."
-    image_url = "https://picsum.photos/id/237/200/300"
 
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
             ],
         },
     ]
```
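Swapping the demo's remote `image_url` for a bundled `image_pil` asset removes the run-time network dependency. Any in-memory PIL image can be passed the same way; a sketch with a hypothetical local file path:

```python
from PIL import Image

# Hypothetical path; any image file on disk works.
local_image = Image.open("/path/to/photo.jpg").convert("RGB")

message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image in one sentence."},
        {"type": "image_pil", "image_pil": local_image},
    ],
}
```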

tests/entrypoints/test_chat_utils.py

Lines changed: 4 additions & 8 deletions

```diff
@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
             "url": image_url
         }
     }, {
-        "type": "image_url",
-        "image_url": {
-            "url": image_url
-        }
+        "type": "image_pil",
+        "image_pil": ImageAsset('cherry_blossom').pil_image
     }, {
         "type": "text",
         "text": "What's in these images?"
@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
             "url": image_url
         }
     }, {
-        "type": "image_url",
-        "image_url": {
-            "url": image_url
-        }
+        "type": "image_pil",
+        "image_pil": ImageAsset('cherry_blossom').pil_image
     }, {
         "type": "text",
         "text": "What's in these images?"
```

vllm/entrypoints/chat_utils.py

Lines changed: 45 additions & 3 deletions
```diff
@@ -28,7 +28,8 @@
                           ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
-from pydantic import TypeAdapter
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
 from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                           ProcessorMixin)
```
```diff
@@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
     """The type of the content part."""
 
 
+class PILImage(BaseModel):
+    """
+    A PIL.Image.Image object.
+    """
+    image_pil: Image.Image
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
+    """A simpler version of the param that only accepts a PIL image.
+
+    Example:
+    {
+        "image_pil": ImageAsset('cherry_blossom').pil_image
+    }
+    """
+    image_pil: Required[PILImage]
+
+
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain image_url.
     This is supported by OpenAI API, although it is not documented.
```
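One detail in `PILImage` above: pydantic cannot build a validation schema for an arbitrary class like `PIL.Image.Image`, so without `ConfigDict(arbitrary_types_allowed=True)` the model definition itself would raise an error. A minimal standalone sketch of the same pattern:

```python
from PIL import Image
from pydantic import BaseModel, ConfigDict

class Wrapper(BaseModel):
    # With arbitrary_types_allowed, pydantic validates this field with a
    # plain isinstance() check instead of requiring a schema for the type.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    image_pil: Image.Image

Wrapper(image_pil=Image.new("RGB", (8, 8)))  # passes validation
# Wrapper(image_pil="not an image") would raise a ValidationError.
```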
```diff
@@ -129,6 +149,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
+    CustomChatCompletionContentPILImageParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
     CustomChatCompletionContentSimpleAudioParam,
```
```diff
@@ -631,6 +652,10 @@ def parse_image_embeds(self,
                        image_embeds: Union[str, dict[str, str]]) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_audio(self, audio_url: str) -> None:
         raise NotImplementedError
```
```diff
@@ -677,6 +702,10 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        placeholder = self._tracker.add("image", image_pil)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio = self._connector.fetch_audio(audio_url)
 
```
```diff
@@ -733,6 +762,13 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        future: asyncio.Future[Image.Image] = asyncio.Future()
+        future.set_result(image_pil)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
 
```
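The async parser has nothing to await for an in-memory image; it wraps the PIL object in an already-resolved `asyncio.Future` only so the result flows through the same future-based tracker interface as fetched URLs. A standalone sketch of that idiom (names here are illustrative, not the tracker's API):

```python
import asyncio

from PIL import Image

async def main() -> None:
    image = Image.new("RGB", (8, 8))

    # Wrap a value that is already available in a Future so it can be
    # awaited uniformly alongside values still being fetched.
    future: asyncio.Future[Image.Image] = asyncio.get_running_loop().create_future()
    future.set_result(image)

    assert (await future) is image  # resolves immediately

asyncio.run(main())
```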
```diff
@@ -851,12 +887,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
+_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
 # Need to validate url objects
 _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
-_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
+_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: dict[
```
```diff
@@ -869,6 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
+    "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
```
```diff
@@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds",
+                                       "image_embeds", "image_pil",
                                        "audio_url", "input_audio", "video_url")
 
 
```
```diff
@@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part(
     else:
         return str_content
 
+    if part_type == "image_pil":
+        image_content = cast(Image.Image, content)
+        mm_parser.parse_image_pil(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
```
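End to end, an `"image_pil"` content part is handled in three steps: `MM_PARSER_MAP` extracts the raw PIL payload from the part dict, `_parse_chat_message_content_part` casts it to `Image.Image`, and `mm_parser.parse_image_pil` registers it with the multimodal tracker under the `"image"` modality. A hedged sketch of just the extraction step, assuming this commit's module-level `MM_PARSER_MAP` is importable:

```python
from PIL import Image
from vllm.entrypoints.chat_utils import MM_PARSER_MAP

# Build a content part the way llm.chat receives it.
part = {"type": "image_pil", "image_pil": Image.new("RGB", (64, 64))}

# The map keys on the part's "type" string; the value is a small
# extractor that pulls the payload out of the dict.
extracted = MM_PARSER_MAP["image_pil"](part)
assert isinstance(extracted, Image.Image)
```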
