Skip to content

Commit f04e76b

Browse files
authored
feat: support image url as message directly (#3246)
1 parent feab515 commit f04e76b

File tree

14 files changed

+411
-321
lines changed

14 files changed

+411
-321
lines changed

.github/ISSUE_TEMPLATE/bug_report.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ body:
2626
attributes:
2727
label: What version of camel are you using?
2828
description: Run command `python3 -c 'print(__import__("camel").__version__)'` in your shell and paste the output here.
29-
placeholder: E.g., 0.2.76a10
29+
placeholder: E.g., 0.2.76a11
3030
validations:
3131
required: true
3232

camel/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414

1515
from camel.logger import disable_logging, enable_logging, set_log_level
1616

17-
__version__ = '0.2.76a10'
17+
__version__ = '0.2.76a11'
1818

1919
__all__ = [
2020
'__version__',

camel/memories/records.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,42 @@ def from_dict(cls, record_dict: Dict[str, Any]) -> "MemoryRecord":
9494
if "role_type" in data and isinstance(data["role_type"], str):
9595
data["role_type"] = RoleType(data["role_type"])
9696

97+
# Deserialize image_list from base64 strings/URLs back to PIL Images/
98+
# URLs
99+
if "image_list" in data and data["image_list"] is not None:
100+
import base64
101+
from io import BytesIO
102+
103+
from PIL import Image
104+
105+
image_objects = []
106+
for img_item in data["image_list"]:
107+
if isinstance(img_item, dict):
108+
# New format with type indicator
109+
if img_item["type"] == "url":
110+
# URL string, keep as-is
111+
image_objects.append(img_item["data"])
112+
else: # type == "base64"
113+
# Base64 encoded image, convert to PIL Image
114+
img_bytes = base64.b64decode(img_item["data"])
115+
img = Image.open(BytesIO(img_bytes))
116+
# Restore the format attribute if it was saved
117+
if "format" in img_item:
118+
img.format = img_item["format"]
119+
image_objects.append(img)
120+
else:
121+
# Legacy format: assume it's a base64 string
122+
img_bytes = base64.b64decode(img_item)
123+
img = Image.open(BytesIO(img_bytes))
124+
image_objects.append(img)
125+
data["image_list"] = image_objects
126+
127+
# Deserialize video_bytes from base64 string
128+
if "video_bytes" in data and data["video_bytes"] is not None:
129+
import base64
130+
131+
data["video_bytes"] = base64.b64decode(data["video_bytes"])
132+
97133
# Get valid constructor parameters (cached)
98134
valid_params = cls._get_constructor_params(message_cls)
99135

camel/messages/base.py

Lines changed: 126 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,9 @@ class BaseMessage:
6464
content (str): The content of the message.
6565
video_bytes (Optional[bytes]): Optional bytes of a video associated
6666
with the message. (default: :obj:`None`)
67-
image_list (Optional[List[Image.Image]]): Optional list of PIL Image
68-
objects associated with the message. (default: :obj:`None`)
67+
image_list (Optional[List[Union[Image.Image, str]]]): Optional list of
68+
PIL Image objects or image URLs (strings) associated with the
69+
message. (default: :obj:`None`)
6970
image_detail (Literal["auto", "low", "high"]): Detail level of the
7071
images associated with the message. (default: :obj:`auto`)
7172
video_detail (Literal["auto", "low", "high"]): Detail level of the
@@ -80,7 +81,7 @@ class BaseMessage:
8081
content: str
8182

8283
video_bytes: Optional[bytes] = None
83-
image_list: Optional[List[Image.Image]] = None
84+
image_list: Optional[List[Union[Image.Image, str]]] = None
8485
image_detail: Literal["auto", "low", "high"] = "auto"
8586
video_detail: Literal["auto", "low", "high"] = "auto"
8687
parsed: Optional[Union[BaseModel, dict]] = None
@@ -92,7 +93,7 @@ def make_user_message(
9293
content: str,
9394
meta_dict: Optional[Dict[str, str]] = None,
9495
video_bytes: Optional[bytes] = None,
95-
image_list: Optional[List[Image.Image]] = None,
96+
image_list: Optional[List[Union[Image.Image, str]]] = None,
9697
image_detail: Union[
9798
OpenAIVisionDetailType, str
9899
] = OpenAIVisionDetailType.AUTO,
@@ -109,8 +110,9 @@ def make_user_message(
109110
dictionary for the message.
110111
video_bytes (Optional[bytes]): Optional bytes of a video
111112
associated with the message.
112-
image_list (Optional[List[Image.Image]]): Optional list of PIL
113-
Image objects associated with the message.
113+
image_list (Optional[List[Union[Image.Image, str]]]): Optional list
114+
of PIL Image objects or image URLs (strings) associated with
115+
the message.
114116
image_detail (Union[OpenAIVisionDetailType, str]): Detail level of
115117
the images associated with the message.
116118
video_detail (Union[OpenAIVisionDetailType, str]): Detail level of
@@ -137,7 +139,7 @@ def make_assistant_message(
137139
content: str,
138140
meta_dict: Optional[Dict[str, str]] = None,
139141
video_bytes: Optional[bytes] = None,
140-
image_list: Optional[List[Image.Image]] = None,
142+
image_list: Optional[List[Union[Image.Image, str]]] = None,
141143
image_detail: Union[
142144
OpenAIVisionDetailType, str
143145
] = OpenAIVisionDetailType.AUTO,
@@ -154,8 +156,9 @@ def make_assistant_message(
154156
dictionary for the message.
155157
video_bytes (Optional[bytes]): Optional bytes of a video
156158
associated with the message.
157-
image_list (Optional[List[Image.Image]]): Optional list of PIL
158-
Image objects associated with the message.
159+
image_list (Optional[List[Union[Image.Image, str]]]): Optional list
160+
of PIL Image objects or image URLs (strings) associated with
161+
the message.
159162
image_detail (Union[OpenAIVisionDetailType, str]): Detail level of
160163
the images associated with the message.
161164
video_detail (Union[OpenAIVisionDetailType, str]): Detail level of
@@ -436,31 +439,64 @@ def to_openai_user_message(self) -> OpenAIUserMessage:
436439
)
437440
if self.image_list and len(self.image_list) > 0:
438441
for image in self.image_list:
439-
if image.format is None:
440-
# Set default format to PNG as fallback
441-
image.format = 'PNG'
442-
443-
image_type: str = image.format.lower()
444-
if image_type not in OpenAIImageType:
445-
raise ValueError(
446-
f"Image type {image.format} "
447-
f"is not supported by OpenAI vision model"
442+
# Check if image is a URL string or PIL Image
443+
if isinstance(image, str):
444+
# Image is a URL string
445+
hybrid_content.append(
446+
{
447+
"type": "image_url",
448+
"image_url": {
449+
"url": image,
450+
"detail": self.image_detail,
451+
},
452+
}
448453
)
449-
with io.BytesIO() as buffer:
450-
image.save(fp=buffer, format=image.format)
451-
encoded_image = base64.b64encode(buffer.getvalue()).decode(
452-
"utf-8"
454+
else:
455+
# Image is a PIL Image object
456+
if image.format is None:
457+
# Set default format to PNG as fallback
458+
image.format = 'PNG'
459+
460+
image_type: str = image.format.lower()
461+
if image_type not in OpenAIImageType:
462+
raise ValueError(
463+
f"Image type {image.format} "
464+
f"is not supported by OpenAI vision model"
465+
)
466+
467+
# Normalize the image mode to match the target format:
468+
# JPEG has no alpha channel; LA/P PNGs are saved as RGBA
469+
img_to_save = image
470+
if image.mode in ('RGBA', 'LA', 'P') and image_type in (
471+
'jpeg',
472+
'jpg',
473+
):
474+
# JPEG doesn't support transparency, convert to RGB
475+
img_to_save = image.convert('RGB')
476+
elif (
477+
image.mode in ('RGBA', 'LA', 'P')
478+
and image_type == 'png'
479+
):
480+
# For PNG with transparency, convert to RGBA if needed
481+
if image.mode in ('LA', 'P'):
482+
img_to_save = image.convert('RGBA')
483+
# else: RGBA mode, keep as-is
484+
485+
with io.BytesIO() as buffer:
486+
img_to_save.save(fp=buffer, format=image.format)
487+
encoded_image = base64.b64encode(
488+
buffer.getvalue()
489+
).decode("utf-8")
490+
image_prefix = f"data:image/{image_type};base64,"
491+
hybrid_content.append(
492+
{
493+
"type": "image_url",
494+
"image_url": {
495+
"url": f"{image_prefix}{encoded_image}",
496+
"detail": self.image_detail,
497+
},
498+
}
453499
)
454-
image_prefix = f"data:image/{image_type};base64,"
455-
hybrid_content.append(
456-
{
457-
"type": "image_url",
458-
"image_url": {
459-
"url": f"{image_prefix}{encoded_image}",
460-
"detail": self.image_detail,
461-
},
462-
}
463-
)
464500

465501
if self.video_bytes:
466502
import imageio.v3 as iio
@@ -552,9 +588,66 @@ def to_dict(self) -> Dict:
552588
Returns:
553589
dict: The converted dictionary.
554590
"""
555-
return {
591+
result = {
556592
"role_name": self.role_name,
557593
"role_type": self.role_type.value,
558594
**(self.meta_dict or {}),
559595
"content": self.content,
560596
}
597+
598+
# Include image/video fields if present
599+
if self.image_list is not None:
600+
# Handle both PIL Images and URL strings
601+
import base64
602+
from io import BytesIO
603+
604+
image_data_list = []
605+
for img in self.image_list:
606+
if isinstance(img, str):
607+
# Image is a URL string, store as-is
608+
image_data_list.append({"type": "url", "data": img})
609+
else:
610+
# Image is a PIL Image, convert to base64
611+
# Preserve format, default to PNG if not set
612+
img_format = img.format if img.format else "PNG"
613+
614+
# Handle transparency for different formats
615+
img_to_save = img
616+
if img.mode in (
617+
'RGBA',
618+
'LA',
619+
'P',
620+
) and img_format.upper() in ('JPEG', 'JPG'):
621+
# JPEG doesn't support transparency, convert to RGB
622+
img_to_save = img.convert('RGB')
623+
elif (
624+
img.mode in ('LA', 'P') and img_format.upper() == 'PNG'
625+
):
626+
# For PNG with transparency, convert to RGBA if needed
627+
img_to_save = img.convert('RGBA')
628+
# else: keep as-is for other combinations
629+
630+
buffered = BytesIO()
631+
img_to_save.save(buffered, format=img_format)
632+
img_str = base64.b64encode(buffered.getvalue()).decode()
633+
image_data_list.append(
634+
{
635+
"type": "base64",
636+
"data": img_str,
637+
"format": img_format, # Preserve format
638+
}
639+
)
640+
result["image_list"] = image_data_list
641+
642+
if self.video_bytes is not None:
643+
import base64
644+
645+
result["video_bytes"] = base64.b64encode(self.video_bytes).decode()
646+
647+
if self.image_detail is not None:
648+
result["image_detail"] = self.image_detail
649+
650+
if self.video_detail is not None:
651+
result["video_detail"] = self.video_detail
652+
653+
return result

camel/tasks/task.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,9 @@ class Task(BaseModel):
237237
(default: :obj:`[]`)
238238
additional_info (Optional[Dict[str, Any]]): Additional information for
239239
the task. (default: :obj:`None`)
240-
image_list (Optional[List[Image.Image]]): Optional list of PIL Image
241-
objects associated with the task. (default: :obj:`None`)
240+
image_list (Optional[List[Union[Image.Image, str]]]): Optional list
241+
of PIL Image objects or image URLs (strings) associated with the
242+
task. (default: :obj:`None`)
242243
image_detail (Literal["auto", "low", "high"]): Detail level of the
243244
images associated with the task. (default: :obj:`auto`)
244245
video_bytes (Optional[bytes]): Optional bytes of a video associated
@@ -271,7 +272,7 @@ class Task(BaseModel):
271272

272273
additional_info: Optional[Dict[str, Any]] = None
273274

274-
image_list: Optional[List[Image.Image]] = None
275+
image_list: Optional[List[Union[Image.Image, str]]] = None
275276

276277
image_detail: Literal["auto", "low", "high"] = "auto"
277278

camel/toolkits/video_analysis_toolkit.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ def ask_question_about_video(
600600
msg = BaseMessage.make_user_message(
601601
role_name="User",
602602
content=prompt,
603-
image_list=video_frames,
603+
image_list=video_frames, # type: ignore[arg-type]
604604
)
605605
# Reset the agent to clear previous state
606606
self.vl_agent.reset()

camel/utils/token_counting.py

Lines changed: 25 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -195,24 +195,32 @@ def count_tokens_from_messages(self, messages: List[OpenAIMessage]) -> int:
195195
image_str: str = item["image_url"]["url"]
196196
detail = item["image_url"]["detail"]
197197

198-
image_prefix_format = "data:image/{};base64,"
199-
image_prefix: Optional[str] = None
200-
for image_type in list(OpenAIImageType):
201-
# Find the correct image format
202-
image_prefix = image_prefix_format.format(
203-
image_type.value
198+
# Only count tokens for base64 encoded images
199+
# For URLs, we cannot reliably determine token count without fetching the image
200+
if image_str.startswith("data:image"):
201+
# Base64 encoded image
202+
image_prefix_format = "data:image/{};base64,"
203+
image_prefix: Optional[str] = None
204+
for image_type in list(OpenAIImageType):
205+
# Find the correct image format
206+
image_prefix = image_prefix_format.format(
207+
image_type.value
208+
)
209+
if image_prefix in image_str:
210+
break
211+
assert isinstance(image_prefix, str)
212+
encoded_image = image_str.split(image_prefix)[
213+
1
214+
]
215+
image_bytes = BytesIO(
216+
base64.b64decode(encoded_image)
204217
)
205-
if image_prefix in image_str:
206-
break
207-
assert isinstance(image_prefix, str)
208-
encoded_image = image_str.split(image_prefix)[1]
209-
image_bytes = BytesIO(
210-
base64.b64decode(encoded_image)
211-
)
212-
image = Image.open(image_bytes)
213-
num_tokens += self._count_tokens_from_image(
214-
image, OpenAIVisionDetailType(detail)
215-
)
218+
image = Image.open(image_bytes)
219+
num_tokens += self._count_tokens_from_image(
220+
image, OpenAIVisionDetailType(detail)
221+
)
222+
# Note: For regular URLs, token count cannot be determined without fetching the image
223+
# The actual token usage will be reported by the API response
216224
if key == "name":
217225
num_tokens += self.tokens_per_name
218226

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
project = 'CAMEL'
2828
copyright = '2024, CAMEL-AI.org'
2929
author = 'CAMEL-AI.org'
30-
release = '0.2.76a10'
30+
release = '0.2.76a11'
3131

3232
html_favicon = (
3333
'https://raw.githubusercontent.com/camel-ai/camel/master/misc/favicon.png'

0 commit comments

Comments
 (0)