@@ -64,8 +64,9 @@ class BaseMessage:
         content (str): The content of the message.
         video_bytes (Optional[bytes]): Optional bytes of a video associated
             with the message. (default: :obj:`None`)
-        image_list (Optional[List[Image.Image]]): Optional list of PIL Image
-            objects associated with the message. (default: :obj:`None`)
+        image_list (Optional[List[Union[Image.Image, str]]]): Optional list of
+            PIL Image objects or image URLs (strings) associated with the
+            message. (default: :obj:`None`)
         image_detail (Literal["auto", "low", "high"]): Detail level of the
             images associated with the message. (default: :obj:`auto`)
         video_detail (Literal["auto", "low", "high"]): Detail level of the
@@ -80,7 +81,7 @@ class BaseMessage:
     content: str
 
     video_bytes: Optional[bytes] = None
-    image_list: Optional[List[Image.Image]] = None
+    image_list: Optional[List[Union[Image.Image, str]]] = None
     image_detail: Literal["auto", "low", "high"] = "auto"
     video_detail: Literal["auto", "low", "high"] = "auto"
     parsed: Optional[Union[BaseModel, dict]] = None
@@ -92,7 +93,7 @@ def make_user_message(
         content: str,
         meta_dict: Optional[Dict[str, str]] = None,
         video_bytes: Optional[bytes] = None,
-        image_list: Optional[List[Image.Image]] = None,
+        image_list: Optional[List[Union[Image.Image, str]]] = None,
         image_detail: Union[
             OpenAIVisionDetailType, str
         ] = OpenAIVisionDetailType.AUTO,
@@ -109,8 +110,9 @@ def make_user_message(
                 dictionary for the message.
             video_bytes (Optional[bytes]): Optional bytes of a video
                 associated with the message.
-            image_list (Optional[List[Image.Image]]): Optional list of PIL
-                Image objects associated with the message.
+            image_list (Optional[List[Union[Image.Image, str]]]): Optional list
+                of PIL Image objects or image URLs (strings) associated with
+                the message.
             image_detail (Union[OpenAIVisionDetailType, str]): Detail level of
                 the images associated with the message.
             video_detail (Union[OpenAIVisionDetailType, str]): Detail level of
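With this change, `image_list` can mix in-memory PIL images with plain URL strings. A minimal usage sketch, assuming the usual `camel.messages.BaseMessage` import path and `role_name`/`content` arguments; the file name and URL below are placeholders:

```python
from PIL import Image

from camel.messages import BaseMessage

# One local PIL image plus one remote image given as a URL string.
msg = BaseMessage.make_user_message(
    role_name="User",
    content="What do these two images have in common?",
    image_list=[
        Image.open("local_photo.png"),      # encoded to base64 on conversion
        "https://example.com/diagram.png",  # passed through as an image URL
    ],
    image_detail="low",
)
```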
@@ -137,7 +139,7 @@ def make_assistant_message(
         content: str,
         meta_dict: Optional[Dict[str, str]] = None,
         video_bytes: Optional[bytes] = None,
-        image_list: Optional[List[Image.Image]] = None,
+        image_list: Optional[List[Union[Image.Image, str]]] = None,
         image_detail: Union[
             OpenAIVisionDetailType, str
         ] = OpenAIVisionDetailType.AUTO,
@@ -154,8 +156,9 @@ def make_assistant_message(
                 dictionary for the message.
             video_bytes (Optional[bytes]): Optional bytes of a video
                 associated with the message.
-            image_list (Optional[List[Image.Image]]): Optional list of PIL
-                Image objects associated with the message.
+            image_list (Optional[List[Union[Image.Image, str]]]): Optional list
+                of PIL Image objects or image URLs (strings) associated with
+                the message.
             image_detail (Union[OpenAIVisionDetailType, str]): Detail level of
                 the images associated with the message.
             video_detail (Union[OpenAIVisionDetailType, str]): Detail level of
@@ -436,31 +439,64 @@ def to_openai_user_message(self) -> OpenAIUserMessage:
         )
         if self.image_list and len(self.image_list) > 0:
             for image in self.image_list:
-                if image.format is None:
-                    # Set default format to PNG as fallback
-                    image.format = 'PNG'
-
-                image_type: str = image.format.lower()
-                if image_type not in OpenAIImageType:
-                    raise ValueError(
-                        f"Image type {image.format} "
-                        f"is not supported by OpenAI vision model"
+                # Check if image is a URL string or PIL Image
+                if isinstance(image, str):
+                    # Image is a URL string
+                    hybrid_content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": image,
+                                "detail": self.image_detail,
+                            },
+                        }
                     )
-                with io.BytesIO() as buffer:
-                    image.save(fp=buffer, format=image.format)
-                    encoded_image = base64.b64encode(buffer.getvalue()).decode(
-                        "utf-8"
+                else:
+                    # Image is a PIL Image object
+                    if image.format is None:
+                        # Set default format to PNG as fallback
+                        image.format = 'PNG'
+
+                    image_type: str = image.format.lower()
+                    if image_type not in OpenAIImageType:
+                        raise ValueError(
+                            f"Image type {image.format} "
+                            f"is not supported by OpenAI vision model"
+                        )
+
+                    # Convert RGBA to RGB for formats that don't support
+                    # transparency or when the image has transparency channel
+                    img_to_save = image
+                    if image.mode in ('RGBA', 'LA', 'P') and image_type in (
+                        'jpeg',
+                        'jpg',
+                    ):
+                        # JPEG doesn't support transparency, convert to RGB
+                        img_to_save = image.convert('RGB')
+                    elif (
+                        image.mode in ('RGBA', 'LA', 'P')
+                        and image_type == 'png'
+                    ):
+                        # For PNG with transparency, convert to RGBA if needed
+                        if image.mode in ('LA', 'P'):
+                            img_to_save = image.convert('RGBA')
+                        # else: RGBA mode, keep as-is
+
+                    with io.BytesIO() as buffer:
+                        img_to_save.save(fp=buffer, format=image.format)
+                        encoded_image = base64.b64encode(
+                            buffer.getvalue()
+                        ).decode("utf-8")
+                    image_prefix = f"data:image/{image_type};base64,"
+                    hybrid_content.append(
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"{image_prefix}{encoded_image}",
+                                "detail": self.image_detail,
+                            },
+                        }
                     )
-                image_prefix = f"data:image/{image_type};base64,"
-                hybrid_content.append(
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": f"{image_prefix}{encoded_image}",
-                            "detail": self.image_detail,
-                        },
-                    }
-                )
 
         if self.video_bytes:
             import imageio.v3 as iio
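For reference, the two branches above produce differently shaped `image_url` entries: URL strings are forwarded verbatim, while PIL images are re-encoded into base64 data URLs after the optional mode conversion. A rough sketch of the resulting content items (values abbreviated, not taken from a real run):

```python
# URL string branch: the string is used as-is.
{
    "type": "image_url",
    "image_url": {"url": "https://example.com/diagram.png", "detail": "low"},
}

# PIL Image branch: pixels are base64-encoded into a data URL.
{
    "type": "image_url",
    "image_url": {"url": "data:image/png;base64,iVBORw0...", "detail": "low"},
}
```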
@@ -552,9 +588,66 @@ def to_dict(self) -> Dict:
         Returns:
             dict: The converted dictionary.
         """
-        return {
+        result = {
             "role_name": self.role_name,
             "role_type": self.role_type.value,
             **(self.meta_dict or {}),
             "content": self.content,
         }
+
+        # Include image/video fields if present
+        if self.image_list is not None:
+            # Handle both PIL Images and URL strings
+            import base64
+            from io import BytesIO
+
+            image_data_list = []
+            for img in self.image_list:
+                if isinstance(img, str):
+                    # Image is a URL string, store as-is
+                    image_data_list.append({"type": "url", "data": img})
+                else:
+                    # Image is a PIL Image, convert to base64
+                    # Preserve format, default to PNG if not set
+                    img_format = img.format if img.format else "PNG"
+
+                    # Handle transparency for different formats
+                    img_to_save = img
+                    if img.mode in (
+                        'RGBA',
+                        'LA',
+                        'P',
+                    ) and img_format.upper() in ('JPEG', 'JPG'):
+                        # JPEG doesn't support transparency, convert to RGB
+                        img_to_save = img.convert('RGB')
+                    elif (
+                        img.mode in ('LA', 'P') and img_format.upper() == 'PNG'
+                    ):
+                        # For PNG with transparency, convert to RGBA if needed
+                        img_to_save = img.convert('RGBA')
+                    # else: keep as-is for other combinations
+
+                    buffered = BytesIO()
+                    img_to_save.save(buffered, format=img_format)
+                    img_str = base64.b64encode(buffered.getvalue()).decode()
+                    image_data_list.append(
+                        {
+                            "type": "base64",
+                            "data": img_str,
+                            "format": img_format,  # Preserve format
+                        }
+                    )
+            result["image_list"] = image_data_list
+
+        if self.video_bytes is not None:
+            import base64
+
+            result["video_bytes"] = base64.b64encode(self.video_bytes).decode()
+
+        if self.image_detail is not None:
+            result["image_detail"] = self.image_detail
+
+        if self.video_detail is not None:
+            result["video_detail"] = self.video_detail
+
+        return result
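The serialized form keeps the two image kinds distinguishable, so a loader can tell URLs from embedded pixels. Continuing the `make_user_message` sketch above, the output would look roughly like this (field values illustrative only, abbreviated):

```python
msg.to_dict()
# {
#     "role_name": "User",
#     "role_type": "user",
#     "content": "What do these two images have in common?",
#     "image_list": [
#         {"type": "base64", "data": "iVBORw0...", "format": "PNG"},
#         {"type": "url", "data": "https://example.com/diagram.png"},
#     ],
#     "image_detail": "low",
#     "video_detail": "auto",
# }
```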