Provide a standard base class for creating custom Signature field types (#8217)

chenmoneygithub · arnavsinghvi11 · web-flow · commit 5e17bcfe6209 · 2025-05-17T09:43:39.000-07:00
* support for custom types in DSPy signatures

* fix completed demos

* rename custom formatting function

* init

* increment

* increment

* increment

* add test

* better arrangement of code

* fix test

* address comments

* add comment

---------

Co-authored-by: Arnav Singhvi <arnav11.singhvi@gmail.com>
diff --git a/dspy/__init__.py b/dspy/__init__.py
@@ -8,7 +8,7 @@
 
 from dspy.evaluate import Evaluate  # isort: skip
 from dspy.clients import *  # isort: skip
-from dspy.adapters import Adapter, ChatAdapter, JSONAdapter, TwoStepAdapter, Image, History  # isort: skip
+from dspy.adapters import Adapter, ChatAdapter, JSONAdapter, TwoStepAdapter, Image, History, BaseType  # isort: skip
 from dspy.utils.logging_utils import configure_dspy_loggers, disable_logging, enable_logging
 from dspy.utils.asyncify import asyncify
 from dspy.utils.saving import load
diff --git a/dspy/adapters/__init__.py b/dspy/adapters/__init__.py
@@ -2,11 +2,12 @@
 from dspy.adapters.chat_adapter import ChatAdapter
 from dspy.adapters.json_adapter import JSONAdapter
 from dspy.adapters.two_step_adapter import TwoStepAdapter
-from dspy.adapters.types import History, Image
+from dspy.adapters.types import History, Image, BaseType
 
 __all__ = [
     "Adapter",
     "ChatAdapter",
+    "BaseType",
     "History",
     "Image",
     "JSONAdapter",
diff --git a/dspy/adapters/base.py b/dspy/adapters/base.py
@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING, Any, Optional, Type
 
 from dspy.adapters.types import History
-from dspy.adapters.types.image import try_expand_image_tags
+from dspy.adapters.types.base_type import split_message_content_for_custom_types
 from dspy.signatures.signature import Signature
 from dspy.utils.callback import BaseCallback, with_callbacks
 
@@ -141,7 +141,7 @@ def format(
             content = self.format_user_message_content(signature, inputs_copy, main_request=True)
             messages.append({"role": "user", "content": content})
 
-        messages = try_expand_image_tags(messages)
+        messages = split_message_content_for_custom_types(messages)
         return messages
 
     def format_field_description(self, signature: Type[Signature]) -> str:
diff --git a/dspy/adapters/types/__init__.py b/dspy/adapters/types/__init__.py
@@ -1,4 +1,5 @@
 from dspy.adapters.types.history import History
 from dspy.adapters.types.image import Image
+from dspy.adapters.types.base_type import BaseType
 
-__all__ = ["History", "Image"]
+__all__ = ["History", "Image", "BaseType"]
diff --git a/dspy/adapters/types/base_type.py b/dspy/adapters/types/base_type.py
@@ -0,0 +1,104 @@
+import json
+import re
+from typing import Any
+
+import json_repair
+import pydantic
+
+CUSTOM_TYPE_START_IDENTIFIER = "<<CUSTOM-TYPE-START-IDENTIFIER>>"  # marks where a serialized custom type begins
+CUSTOM_TYPE_END_IDENTIFIER = "<<CUSTOM-TYPE-END-IDENTIFIER>>"  # marks where a serialized custom type ends
+
+
+class BaseType(pydantic.BaseModel):
+    """Common parent for custom field types used in DSPy signatures, e.g., dspy.Image.
+
+    A subclass overrides `format` to return a list of dictionaries shaped like the array of content parts
+    in the `content` field of an OpenAI API user message.
+
+    Example:
+        ```python
+        class Image(BaseType):
+            url: str
+
+            def format(self) -> list[dict[str, Any]]:
+                return [{"type": "image_url", "image_url": {"url": self.url}}]
+        ```
+    """
+
+    def format(self) -> list[dict[str, Any]]:
+        raise NotImplementedError
+
+    @pydantic.model_serializer()
+    def serialize_model(self):
+        content_parts = self.format()  # embedded via str(), i.e. as a Python repr
+        return f"{CUSTOM_TYPE_START_IDENTIFIER}{content_parts}{CUSTOM_TYPE_END_IDENTIFIER}"
+
+
+def split_message_content_for_custom_types(messages: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Split user message content into a list of content blocks.
+
+    Each user message whose string content embeds a serialized custom type such as `dspy.Image` is rewritten in
+    place so that its content becomes a list of content blocks, for example:
+
+    ```
+    [
+        {"type": "text", "text": "{text_before_image}"},
+        {"type": "image_url", "image_url": {"url": "{image_url}"}},
+        {"type": "text", "text": "{text_after_image}"},
+    ]
+    ```
+
+    Payloads are located via the reserved `<<CUSTOM-TYPE-START-IDENTIFIER>>` / `<<CUSTOM-TYPE-END-IDENTIFIER>>`
+    markers of `dspy.BaseType`. A payload that does not parse into content blocks is kept as a text block.
+
+    Args:
+        messages: a list of messages sent to the LM. The format is the same as [OpenAI API's messages
+            format](https://platform.openai.com/docs/guides/chat-completions/response-format).
+
+    Returns:
+        The same `messages` list; user messages that contained custom types now have list-valued content.
+    """
+    # The pattern is loop-invariant, so compile it once; escaping keeps the identifiers literal.
+    start_id, end_id = re.escape(CUSTOM_TYPE_START_IDENTIFIER), re.escape(CUSTOM_TYPE_END_IDENTIFIER)
+    pattern = re.compile(rf"{start_id}(.*?){end_id}", re.DOTALL)
+
+    for message in messages:
+        if message["role"] != "user":
+            # Custom type messages are only in user messages
+            continue
+
+        # DSPy adapter always formats user input into a string content before custom type splitting
+        content: str = message["content"]
+        result, last_end = [], 0
+
+        for match in pattern.finditer(content):
+            start, end = match.span()
+            # Add text before the current block
+            if start > last_end:
+                result.append({"type": "text", "text": content[last_end:start]})
+
+            # `json_repair.loads` repairs bad input rather than raising, so validate the parsed shape explicitly.
+            raw_block = match.group(1).strip()
+            try:
+                parsed = json_repair.loads(raw_block)
+            except json.JSONDecodeError:
+                parsed = None
+            blocks = [parsed] if isinstance(parsed, dict) else parsed
+            if isinstance(blocks, list) and all(isinstance(block, dict) for block in blocks):
+                result.extend(blocks)
+            elif raw_block:
+                # Fall back to the raw string if it is not a list of content blocks
+                result.append({"type": "text", "text": raw_block})
+            last_end = end
+
+        if last_end == 0:
+            # No custom type found, return the original message
+            continue
+
+        # Add any remaining text after the last match
+        if last_end < len(content):
+            result.append({"type": "text", "text": content[last_end:]})
+
+        message["content"] = result
+
+    return messages
diff --git a/dspy/adapters/types/image.py b/dspy/adapters/types/image.py
@@ -2,13 +2,14 @@
 import io
 import mimetypes
 import os
-import re
-from typing import Any, Dict, List, Union
+from typing import Any, Union
 from urllib.parse import urlparse
 
 import pydantic
 import requests
 
+from dspy.adapters.types.base_type import BaseType
+
 try:
     from PIL import Image as PILImage
 
@@ -17,7 +18,7 @@
     PIL_AVAILABLE = False
 
 
-class Image(pydantic.BaseModel):
+class Image(BaseType):
     url: str
 
     model_config = {
@@ -27,6 +28,13 @@ class Image(pydantic.BaseModel):
         "extra": "forbid",
     }
 
+    def format(self) -> list[dict[str, Any]]:  # always one `image_url` content part, as BaseType.format declares
+        try:
+            image_url = encode_image(self.url)
+        except Exception as e:
+            raise ValueError(f"Failed to format image for DSPy: {e}") from e  # chain the original cause
+        return [{"type": "image_url", "image_url": {"url": image_url}}]
+
     @pydantic.model_validator(mode="before")
     @classmethod
     def validate_input(cls, values):
@@ -55,10 +63,6 @@ def from_file(cls, file_path: str):
     def from_PIL(cls, pil_image):  # noqa: N802
         return cls(url=encode_image(pil_image))
 
-    @pydantic.model_serializer()
-    def serialize_model(self):
-        return "<DSPY_IMAGE_START>" + self.url + "<DSPY_IMAGE_END>"
-
     def __str__(self):
         return self.serialize_model()
 
@@ -197,54 +201,3 @@ def is_image(obj) -> bool:
         elif is_url(obj):
             return True
     return False
-
-
-def try_expand_image_tags(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    """Try to expand image tags in the messages."""
-    for message in messages:
-        # NOTE: Assumption that content is a string
-        if "content" in message and "<DSPY_IMAGE_START>" in message["content"]:
-            message["content"] = expand_image_tags(message["content"])
-    return messages
-
-
-def expand_image_tags(text: str) -> Union[str, List[Dict[str, Any]]]:
-    """Expand image tags in the text. If there are any image tags,
-    turn it from a content string into a content list of texts and image urls.
-
-    Args:
-        text: The text content that may contain image tags
-
-    Returns:
-        Either the original string if no image tags, or a list of content dicts
-        with text and image_url entries
-    """
-    image_tag_regex = r'"?<DSPY_IMAGE_START>(.*?)<DSPY_IMAGE_END>"?'
-
-    # If no image tags, return original text
-    if not re.search(image_tag_regex, text):
-        return text
-
-    final_list = []
-    remaining_text = text
-
-    while remaining_text:
-        match = re.search(image_tag_regex, remaining_text)
-        if not match:
-            if remaining_text.strip():
-                final_list.append({"type": "text", "text": remaining_text.strip()})
-            break
-
-        # Get text before the image tag
-        prefix = remaining_text[: match.start()].strip()
-        if prefix:
-            final_list.append({"type": "text", "text": prefix})
-
-        # Add the image
-        image_url = match.group(1)
-        final_list.append({"type": "image_url", "image_url": {"url": image_url}})
-
-        # Update remaining text
-        remaining_text = remaining_text[match.end() :].strip()
-
-    return final_list
diff --git a/dspy/signatures/signature.py b/dspy/signatures/signature.py
@@ -27,7 +27,6 @@ class MySignature(dspy.Signature):
 from pydantic import BaseModel, Field, create_model
 from pydantic.fields import FieldInfo
 
-from dspy.adapters.types.image import Image  # noqa: F401
 from dspy.signatures.field import InputField, OutputField
 
 
diff --git a/tests/adapters/test_chat_adapter.py b/tests/adapters/test_chat_adapter.py
@@ -4,6 +4,7 @@
 import pytest
 
 import dspy
+import pydantic
 
 
 @pytest.mark.parametrize(
@@ -94,3 +95,121 @@ async def test_chat_adapter_async_call():
     lm = dspy.utils.DummyLM([{"answer": "Paris"}])
     result = await adapter.acall(lm, {}, signature, [], {"question": "What is the capital of France?"})
     assert result == [{"answer": "Paris"}]
+
+
+def test_chat_adapter_formats_image():
+    """A single `dspy.Image` input becomes an `image_url` block between two text blocks."""
+
+    class MySignature(dspy.Signature):
+        image: dspy.Image = dspy.InputField()
+        text: str = dspy.OutputField()
+
+    url = "https://example.com/image.jpg"
+    messages = dspy.ChatAdapter().format(MySignature, [], {"image": dspy.Image(url=url)})
+
+    # The user message is the second of the two formatted messages
+    assert len(messages) == 2
+    blocks = messages[1]["content"]
+    assert blocks is not None
+
+    # The message should have 3 chunks of types: text, image_url, text
+    assert len(blocks) == 3
+    first, _, last = blocks
+    assert first["type"] == "text"
+    assert last["type"] == "text"
+
+    # Assert that the image is formatted correctly
+    assert {"type": "image_url", "image_url": {"url": url}} in blocks
+
+
+def test_chat_adapter_formats_image_with_few_shot_examples():
+    """Images in few-shot demos and in the query are each formatted into their own user message."""
+
+    class MySignature(dspy.Signature):
+        image: dspy.Image = dspy.InputField()
+        text: str = dspy.OutputField()
+
+    def image_block(url):
+        return {"type": "image_url", "image_url": {"url": url}}
+
+    urls = ["https://example.com/image1.jpg", "https://example.com/image2.jpg", "https://example.com/image3.jpg"]
+    demos = [
+        dspy.Example(image=dspy.Image(url=urls[0]), text="This is a test image"),
+        dspy.Example(image=dspy.Image(url=urls[1]), text="This is another test image"),
+    ]
+    adapter = dspy.ChatAdapter()
+    messages = adapter.format(MySignature, demos, {"image": dspy.Image(url=urls[2])})
+
+    # 1 system message, 2 few shot examples (1 user and assistant message for each example), 1 user message
+    assert len(messages) == 6
+
+    # User messages sit at indices 1, 3 and 5: first demo, second demo, then the query
+    assert image_block(urls[0]) in messages[1]["content"]
+    assert image_block(urls[1]) in messages[3]["content"]
+    assert image_block(urls[2]) in messages[5]["content"]
+
+
+def test_chat_adapter_formats_image_with_nested_images():
+    """Images nested inside a pydantic model field are each expanded into an `image_url` block."""
+
+    class ImageWrapper(pydantic.BaseModel):
+        images: list[dspy.Image]
+        tag: list[str]
+
+    class MySignature(dspy.Signature):
+        image: ImageWrapper = dspy.InputField()
+        text: str = dspy.OutputField()
+
+    urls = [
+        "https://example.com/image1.jpg",
+        "https://example.com/image2.jpg",
+        "https://example.com/image3.jpg",
+    ]
+    wrapper = ImageWrapper(images=[dspy.Image(url=url) for url in urls], tag=["test", "example"])
+
+    adapter = dspy.ChatAdapter()
+    messages = adapter.format(MySignature, [], {"image": wrapper})
+    user_content = messages[1]["content"]
+
+    # Every wrapped image must show up as its own content block in the user message
+    for url in urls:
+        assert {"type": "image_url", "image_url": {"url": url}} in user_content
+
+
+def test_chat_adapter_formats_image_with_few_shot_examples_with_nested_images():
+    """Nested images are expanded both in a few-shot demo and in the final query message."""
+
+    class ImageWrapper(pydantic.BaseModel):
+        images: list[dspy.Image]
+        tag: list[str]
+
+    class MySignature(dspy.Signature):
+        image: ImageWrapper = dspy.InputField()
+        text: str = dspy.OutputField()
+
+    def image_block(url):
+        return {"type": "image_url", "image_url": {"url": url}}
+
+    demo_urls = [
+        "https://example.com/image1.jpg",
+        "https://example.com/image2.jpg",
+        "https://example.com/image3.jpg",
+    ]
+    query_url = "https://example.com/image4.jpg"
+
+    demo_wrapper = ImageWrapper(images=[dspy.Image(url=url) for url in demo_urls], tag=["test", "example"])
+    query_wrapper = ImageWrapper(images=[dspy.Image(url=query_url)], tag=["test", "example"])
+    demos = [dspy.Example(image=demo_wrapper, text="This is a test image")]
+
+    adapter = dspy.ChatAdapter()
+    messages = adapter.format(MySignature, demos, {"image": query_wrapper})
+
+    assert len(messages) == 4
+
+    # Image information in the few-shot example's user message
+    demo_user_content = messages[1]["content"]
+    for url in demo_urls:
+        assert image_block(url) in demo_user_content
+
+    # The query image is formatted in the last user message
+    assert image_block(query_url) in messages[-1]["content"]
diff --git a/tests/adapters/test_json_adapter.py b/tests/adapters/test_json_adapter.py
diff --git a/tests/signatures/test_adapter_image.py b/tests/signatures/test_adapter_image.py