
Commit fe1e924

[Frontend] Support image object in llm.chat (vllm-project#19635)
Signed-off-by: sfeng33 <4florafeng@gmail.com>
Signed-off-by: Flora Feng <4florafeng@gmail.com>
1 parent 4548c03 commit fe1e924

File tree

4 files changed: +97 -13 lines changed

- docs/features/multimodal_inputs.md
- examples/offline_inference/mistral-small.py
- tests/entrypoints/test_chat_utils.py
- vllm/entrypoints/chat_utils.py

docs/features/multimodal_inputs.md

Lines changed: 43 additions & 0 deletions

````diff
@@ -101,6 +101,49 @@ To substitute multiple images inside the same text prompt, you can pass in a list
 
 Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
 
+If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
+
+```python
+from vllm import LLM
+from vllm.assets.image import ImageAsset
+
+llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+image_url = "https://picsum.photos/id/32/512/512"
+image_pil = ImageAsset('cherry_blossom').pil_image
+image_embeds = torch.load(...)
+
+conversation = [
+    {"role": "system", "content": "You are a helpful assistant"},
+    {"role": "user", "content": "Hello"},
+    {"role": "assistant", "content": "Hello! How can I assist you today?"},
+    {
+        "role": "user",
+        "content": [{
+            "type": "image_url",
+            "image_url": {
+                "url": image_url
+            }
+        },{
+            "type": "image_pil",
+            "image_pil": image_pil
+        }, {
+            "type": "image_embeds",
+            "image_embeds": image_embeds
+        }, {
+            "type": "text",
+            "text": "What's in these images?"
+        }],
+    },
+]
+
+# Perform inference and log output.
+outputs = llm.chat(conversation)
+
+for o in outputs:
+    generated_text = o.outputs[0].text
+    print(generated_text)
+```
+
 Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
 
 ??? Code
````
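Note that the snippet added above calls `torch.load` without importing `torch`, so a standalone run also needs an `import torch` line, and the elided `torch.load(...)` path must point at real pre-computed embeddings. To exercise only the new `image_pil` path, a minimal sketch along these lines should work (same LLaVA checkpoint as the docs example):

```python
from vllm import LLM
from vllm.assets.image import ImageAsset

llm = LLM(model="llava-hf/llava-1.5-7b-hf")

# A bundled vLLM test asset, exposed directly as a PIL.Image.Image.
image = ImageAsset("cherry_blossom").pil_image

conversation = [{
    "role": "user",
    "content": [
        {"type": "image_pil", "image_pil": image},
        {"type": "text", "text": "Describe this image in one sentence."},
    ],
}]

# llm.chat applies the model's chat template and runs generation.
outputs = llm.chat(conversation)
print(outputs[0].outputs[0].text)
```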

examples/offline_inference/mistral-small.py

Lines changed: 5 additions & 2 deletions

```diff
@@ -6,6 +6,7 @@
 
 from vllm import LLM
 from vllm.sampling_params import SamplingParams
+from vllm.assets.image import ImageAsset
 
 # This script is an offline demo for running Mistral-Small-3.1
 #
@@ -71,14 +72,16 @@ def run_simple_demo(args: argparse.Namespace):
     )
 
     prompt = "Describe this image in one sentence."
-    image_url = "https://picsum.photos/id/237/200/300"
 
     messages = [
         {
             "role": "user",
             "content": [
                 {"type": "text", "text": prompt},
-                {"type": "image_url", "image_url": {"url": image_url}},
+                {
+                    "type": "image_pil",
+                    "image_pil": ImageAsset("cherry_blossom").pil_image,
+                },
             ],
         },
     ]
```
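Swapping the demo's remote `image_url` for a bundled `image_pil` asset removes the run-time network dependency. Any in-memory PIL image can be passed the same way; a sketch with a hypothetical local file path:

```python
from PIL import Image

# Hypothetical path; any image file on disk works.
local_image = Image.open("/path/to/photo.jpg").convert("RGB")

message = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe this image in one sentence."},
        {"type": "image_pil", "image_pil": local_image},
    ],
}
```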

tests/entrypoints/test_chat_utils.py

Lines changed: 4 additions & 8 deletions

```diff
@@ -264,10 +264,8 @@ def test_parse_chat_messages_multiple_images(
             "url": image_url
         }
     }, {
-        "type": "image_url",
-        "image_url": {
-            "url": image_url
-        }
+        "type": "image_pil",
+        "image_pil": ImageAsset('cherry_blossom').pil_image
     }, {
         "type": "text",
         "text": "What's in these images?"
@@ -303,10 +301,8 @@ async def test_parse_chat_messages_multiple_images_async(
             "url": image_url
         }
     }, {
-        "type": "image_url",
-        "image_url": {
-            "url": image_url
-        }
+        "type": "image_pil",
+        "image_pil": ImageAsset('cherry_blossom').pil_image
     }, {
         "type": "text",
         "text": "What's in these images?"
```

vllm/entrypoints/chat_utils.py

Lines changed: 45 additions & 3 deletions
```diff
@@ -28,7 +28,8 @@
                           ChatCompletionToolMessageParam)
 from openai.types.chat.chat_completion_content_part_input_audio_param import (
     InputAudio)
-from pydantic import TypeAdapter
+from PIL import Image
+from pydantic import BaseModel, ConfigDict, TypeAdapter
 # yapf: enable
 from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                           ProcessorMixin)
```
```diff
@@ -91,6 +92,25 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
     """The type of the content part."""
 
 
+class PILImage(BaseModel):
+    """
+    A PIL.Image.Image object.
+    """
+    image_pil: Image.Image
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+
+class CustomChatCompletionContentPILImageParam(TypedDict, total=False):
+    """A simpler version of the param that only accepts a PIL image.
+
+    Example:
+    {
+        "image_pil": ImageAsset('cherry_blossom').pil_image
+    }
+    """
+    image_pil: Required[PILImage]
+
+
 class CustomChatCompletionContentSimpleImageParam(TypedDict, total=False):
     """A simpler version of the param that only accepts a plain image_url.
     This is supported by OpenAI API, although it is not documented.
```
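One detail in `PILImage` above: pydantic cannot build a validation schema for an arbitrary class like `PIL.Image.Image`, so without `ConfigDict(arbitrary_types_allowed=True)` the model definition itself would raise an error. A minimal standalone sketch of the same pattern:

```python
from PIL import Image
from pydantic import BaseModel, ConfigDict

class Wrapper(BaseModel):
    # With arbitrary_types_allowed, pydantic validates this field with a
    # plain isinstance() check instead of requiring a schema for the type.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    image_pil: Image.Image

Wrapper(image_pil=Image.new("RGB", (8, 8)))  # passes validation
# Wrapper(image_pil="not an image") would raise a ValidationError.
```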
```diff
@@ -129,6 +149,7 @@ class CustomChatCompletionContentSimpleVideoParam(TypedDict, total=False):
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam,
     ChatCompletionContentPartInputAudioParam,
     ChatCompletionContentPartVideoParam, ChatCompletionContentPartRefusalParam,
+    CustomChatCompletionContentPILImageParam,
     CustomChatCompletionContentSimpleImageParam,
     ChatCompletionContentPartImageEmbedsParam,
     CustomChatCompletionContentSimpleAudioParam,
```
```diff
@@ -631,6 +652,10 @@ def parse_image_embeds(self,
                        image_embeds: Union[str, dict[str, str]]) -> None:
         raise NotImplementedError
 
+    @abstractmethod
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        raise NotImplementedError
+
     @abstractmethod
     def parse_audio(self, audio_url: str) -> None:
         raise NotImplementedError
```
```diff
@@ -677,6 +702,10 @@ def parse_image_embeds(self,
 
         self._add_placeholder(placeholder)
 
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        placeholder = self._tracker.add("image", image_pil)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio = self._connector.fetch_audio(audio_url)
 
```
```diff
@@ -733,6 +762,13 @@ def parse_image_embeds(self,
         placeholder = self._tracker.add("image_embeds", future)
         self._add_placeholder(placeholder)
 
+    def parse_image_pil(self, image_pil: Image.Image) -> None:
+        future: asyncio.Future[Image.Image] = asyncio.Future()
+        future.set_result(image_pil)
+
+        placeholder = self._tracker.add("image", future)
+        self._add_placeholder(placeholder)
+
     def parse_audio(self, audio_url: str) -> None:
         audio_coro = self._connector.fetch_audio_async(audio_url)
 
```
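The async parser has nothing to await for an in-memory image; it wraps the PIL object in an already-resolved `asyncio.Future` only so the result flows through the same future-based tracker interface as fetched URLs. A standalone sketch of that idiom (names here are illustrative, not the tracker's API):

```python
import asyncio

from PIL import Image

async def main() -> None:
    image = Image.new("RGB", (8, 8))

    # Wrap a value that is already available in a Future so it can be
    # awaited uniformly alongside values still being fetched.
    future: asyncio.Future[Image.Image] = asyncio.get_running_loop().create_future()
    future.set_result(image)

    assert (await future) is image  # resolves immediately

asyncio.run(main())
```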
```diff
@@ -851,12 +887,13 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
 _ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam)
 _InputAudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 _RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam)
+_PILImageParser = partial(cast, CustomChatCompletionContentPILImageParam)
 # Need to validate url objects
 _ImageParser = TypeAdapter(ChatCompletionContentPartImageParam).validate_python
 _AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam).validate_python
 _VideoParser = TypeAdapter(ChatCompletionContentPartVideoParam).validate_python
 
-_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio]
+_ContentPart: TypeAlias = Union[str, dict[str, str], InputAudio, PILImage]
 
 # Define a mapping from part types to their corresponding parsing functions.
 MM_PARSER_MAP: dict[
```
```diff
@@ -869,6 +906,7 @@ def _get_full_multimodal_text_prompt(placeholder_counts: dict[str, int],
     lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
     "image_embeds":
     lambda part: _ImageEmbedsParser(part).get("image_embeds", None),
+    "image_pil": lambda part: _PILImageParser(part).get("image_pil", None),
     "audio_url":
     lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     "input_audio":
```
```diff
@@ -938,7 +976,7 @@ def _parse_chat_message_content_mm_part(
 
 
 VALID_MESSAGE_CONTENT_MM_PART_TYPES = ("text", "refusal", "image_url",
-                                       "image_embeds",
+                                       "image_embeds", "image_pil",
                                        "audio_url", "input_audio", "video_url")
 
 
```
```diff
@@ -1009,6 +1047,10 @@ def _parse_chat_message_content_part(
     else:
         return str_content
 
+    if part_type == "image_pil":
+        image_content = cast(Image.Image, content)
+        mm_parser.parse_image_pil(image_content)
+        return {'type': 'image'} if wrap_dicts else None
     if part_type == "image_url":
         str_content = cast(str, content)
         mm_parser.parse_image(str_content)
```
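End to end, an `"image_pil"` content part is handled in three steps: `MM_PARSER_MAP` extracts the raw PIL payload from the part dict, `_parse_chat_message_content_part` casts it to `Image.Image`, and `mm_parser.parse_image_pil` registers it with the multimodal tracker under the `"image"` modality. A hedged sketch of just the extraction step, assuming this commit's module-level `MM_PARSER_MAP` is importable:

```python
from PIL import Image
from vllm.entrypoints.chat_utils import MM_PARSER_MAP

# Build a content part the way llm.chat receives it.
part = {"type": "image_pil", "image_pil": Image.new("RGB", (64, 64))}

# The map keys on the part's "type" string; the value is a small
# extractor that pulls the payload out of the dict.
extracted = MM_PARSER_MAP["image_pil"](part)
assert isinstance(extracted, Image.Image)
```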
