
Commit 7929072

Authored by NielsRogge, Wauplin, github-actions[bot], and hanouticelina
Add PIL Image support to InferenceClient (#3199)
* Add PIL Image support to InferenceClient

  - Add PIL Image to ContentT type hints for type checking
  - Update _open_as_binary to handle PIL Images by converting to bytes
  - Update _as_url to detect MIME type from PIL Image format
  - Update docstrings to indicate PIL Image support for all image methods
  - Fixes #3191: Make InferenceClient accept Pillow images

  This enables iterative image editing workflows where PIL Images returned by image_to_image can be directly passed back to image_to_image methods without the 'Unsupported input type for image' error.

* Make style
* Address comments
* Apply suggestion from @Wauplin
* Apply suggestion from @Wauplin
* Apply style fixes
* Apply suggestions from code review

  Co-authored-by: célina <hanouticelina@gmail.com>

* add tests
* style

---------

Co-authored-by: Lucain <lucainp@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: célina <hanouticelina@gmail.com>
1 parent 7660014 commit 7929072
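
For context, a minimal usage sketch of the workflow this commit enables: a PIL Image returned by `image_to_image` can be passed straight back into the next call. The file paths, prompts, and provider setup below are illustrative assumptions, not part of the commit.

from huggingface_hub import InferenceClient
from PIL import Image

client = InferenceClient()  # assumes a token/inference provider is already configured

image = Image.open("cat.png")  # any local image; the path is a placeholder
for prompt in ["turn it into a watercolor painting", "add a red scarf"]:
    # Previously, passing the returned PIL Image here raised "Unsupported input type for image";
    # it is now serialized to bytes internally before being sent.
    image = client.image_to_image(image, prompt=prompt)

image.save("cat_edited.png")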

4 files changed: 86 additions, 38 deletions

src/huggingface_hub/inference/_client.py

Lines changed: 14 additions & 14 deletions
@@ -1154,8 +1154,8 @@ def image_classification(
         Perform image classification on the given image using the specified model.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The image to classify. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The image to classify. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
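
Every docstring hunk in this file (and in the async client below) makes the same change to the accepted input types. As a quick hedged sketch of the documented behavior (the model ID is an illustrative choice, not something this commit prescribes), classification now accepts a PIL Image directly:

from huggingface_hub import InferenceClient
from PIL import Image

client = InferenceClient(model="google/vit-base-patch16-224")  # illustrative model ID
predictions = client.image_classification(Image.open("cat.png"))  # PIL Image input, path is a placeholder
for prediction in predictions:
    print(prediction.label, prediction.score)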
@@ -1212,8 +1212,8 @@ def image_segmentation(
         </Tip>

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The image to segment. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The image to segment. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
@@ -1284,8 +1284,8 @@ def image_to_image(
         </Tip>

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image for translation. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             prompt (`str`, *optional*):
                 The text prompt to guide the image generation.
             negative_prompt (`str`, *optional*):
@@ -1347,8 +1347,8 @@ def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -> Imag
         (OCR), Pix2Struct, etc). Please have a look to the model card to learn more about a model's specificities.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image to caption. It can be raw bytes, an image file, or a URL to an online image..
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to caption. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
@@ -1398,8 +1398,8 @@ def object_detection(
         </Tip>

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The image to detect objects on. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The image to detect objects on. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
@@ -2973,8 +2973,8 @@ def visual_question_answering(
         Answering open-ended questions based on an image.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image for the context. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             question (`str`):
                 Question to be answered.
             model (`str`, *optional*):
@@ -3140,8 +3140,8 @@ def zero_shot_image_classification(
         Provide input image and text labels to predict text labels for the image.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image to caption. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to caption. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             candidate_labels (`List[str]`):
                 The candidate labels for this image
             labels (`List[str]`, *optional*):

src/huggingface_hub/inference/_common.py

Lines changed: 30 additions & 10 deletions
@@ -62,7 +62,7 @@
 UrlT = str
 PathT = Union[str, Path]
 BinaryT = Union[bytes, BinaryIO]
-ContentT = Union[BinaryT, PathT, UrlT]
+ContentT = Union[BinaryT, PathT, UrlT, "Image"]

 # Use to set a Accept: image/png header
 TASKS_EXPECTING_IMAGES = {"text-to-image", "image-to-image"}
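
The `"Image"` member is added as a string so that Pillow remains an optional dependency. A sketch of the usual pattern behind such a forward reference; the exact import guard in `_common.py` is not shown in this diff, so treat it as an assumption:

from pathlib import Path
from typing import TYPE_CHECKING, BinaryIO, Union

if TYPE_CHECKING:
    from PIL.Image import Image  # assumed guard: only evaluated by static type checkers

UrlT = str
PathT = Union[str, Path]
BinaryT = Union[bytes, BinaryIO]
# The quoted "Image" stays an unevaluated forward reference at runtime,
# so importing the module never requires PIL to be installed.
ContentT = Union[BinaryT, PathT, UrlT, "Image"]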
@@ -161,11 +161,10 @@ def _open_as_binary(

 @contextmanager  # type: ignore
 def _open_as_binary(content: Optional[ContentT]) -> Generator[Optional[BinaryT], None, None]:
-    """Open `content` as a binary file, either from a URL, a local path, or raw bytes.
+    """Open `content` as a binary file, either from a URL, a local path, raw bytes, or a PIL Image.

-    Do nothing if `content` is None,
+    Do nothing if `content` is None.

-    TODO: handle a PIL.Image as input
     TODO: handle base64 as input
     """
     # If content is a string => must be either a URL or a path
@@ -186,9 +185,21 @@ def _open_as_binary(content: Optional[ContentT]) -> Generator[Optional[BinaryT],
         logger.debug(f"Opening content from {content}")
         with content.open("rb") as f:
             yield f
-    else:
-        # Otherwise: already a file-like object or None
-        yield content
+        return
+
+    # If content is a PIL Image => convert to bytes
+    if is_pillow_available():
+        from PIL import Image
+
+        if isinstance(content, Image.Image):
+            logger.debug("Converting PIL Image to bytes")
+            buffer = io.BytesIO()
+            content.save(buffer, format=content.format or "PNG")
+            yield buffer.getvalue()
+            return
+
+    # Otherwise: already a file-like object or None
+    yield content  # type: ignore


 def _b64_encode(content: ContentT) -> str:
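
In isolation, the conversion added above amounts to serializing the image into an in-memory buffer and falling back to PNG when the image carries no format (typical for images created in memory rather than opened from disk). A standalone sketch with a hypothetical helper name:

import io

from PIL import Image

def pil_image_to_bytes(image: Image.Image) -> bytes:  # hypothetical helper, not part of huggingface_hub
    buffer = io.BytesIO()
    image.save(buffer, format=image.format or "PNG")  # same fallback as _open_as_binary
    return buffer.getvalue()

generated = Image.new("RGB", (64, 64), color="red")  # in-memory image, .format is None
payload = pil_image_to_bytes(generated)
print(payload[:8])  # b'\x89PNG\r\n\x1a\n', the PNG magic bytes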
@@ -202,9 +213,18 @@ def _as_url(content: ContentT, default_mime_type: str) -> str:
     if isinstance(content, str) and (content.startswith("https://") or content.startswith("http://")):
         return content

-    mime_type = (
-        mimetypes.guess_type(content, strict=False)[0] if isinstance(content, (str, Path)) else None
-    ) or default_mime_type
+    # Handle MIME type detection for different content types
+    mime_type = None
+    if isinstance(content, (str, Path)):
+        mime_type = mimetypes.guess_type(content, strict=False)[0]
+    elif is_pillow_available():
+        from PIL import Image
+
+        if isinstance(content, Image.Image):
+            # Determine MIME type from PIL Image format, in sync with `_open_as_binary`
+            mime_type = f"image/{(content.format or 'PNG').lower()}"
+
+    mime_type = mime_type or default_mime_type
     encoded_data = _b64_encode(content)
     return f"data:{mime_type};base64,{encoded_data}"
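
The MIME detection pairs with the byte conversion above: a PIL Image advertises its format ("PNG", "JPEG", ...), which maps onto an image/<format> MIME type, and formatless images fall back to PNG so the data URL matches the bytes actually produced. A standalone sketch with a hypothetical helper name:

import base64
import io

from PIL import Image

def pil_image_to_data_url(image: Image.Image) -> str:  # hypothetical helper, not part of huggingface_hub
    fmt = image.format or "PNG"  # same fallback as the byte conversion
    buffer = io.BytesIO()
    image.save(buffer, format=fmt)
    encoded = base64.b64encode(buffer.getvalue()).decode()
    return f"data:image/{fmt.lower()};base64,{encoded}"

print(pil_image_to_data_url(Image.new("RGB", (8, 8)))[:30])  # data:image/png;base64,...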

src/huggingface_hub/inference/_generated/_async_client.py

Lines changed: 14 additions & 14 deletions
@@ -1197,8 +1197,8 @@ async def image_classification(
         Perform image classification on the given image using the specified model.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The image to classify. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The image to classify. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for image classification. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image classification will be used.
@@ -1256,8 +1256,8 @@ async def image_segmentation(
         </Tip>

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The image to segment. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The image to segment. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for image segmentation. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for image segmentation will be used.
@@ -1329,8 +1329,8 @@ async def image_to_image(
         </Tip>

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image for translation. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image for translation. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             prompt (`str`, *optional*):
                 The text prompt to guide the image generation.
             negative_prompt (`str`, *optional*):
@@ -1393,8 +1393,8 @@ async def image_to_text(self, image: ContentT, *, model: Optional[str] = None) -
         (OCR), Pix2Struct, etc). Please have a look to the model card to learn more about a model's specificities.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image to caption. It can be raw bytes, an image file, or a URL to an online image..
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to caption. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for inference. Can be a model ID hosted on the Hugging Face Hub or a URL to a deployed
                 Inference Endpoint. This parameter overrides the model defined at the instance level. Defaults to None.
@@ -1445,8 +1445,8 @@ async def object_detection(
         </Tip>

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The image to detect objects on. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The image to detect objects on. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             model (`str`, *optional*):
                 The model to use for object detection. Can be a model ID hosted on the Hugging Face Hub or a URL to a
                 deployed Inference Endpoint. If not provided, the default recommended model for object detection (DETR) will be used.
@@ -3033,8 +3033,8 @@ async def visual_question_answering(
         Answering open-ended questions based on an image.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image for the context. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image for the context. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             question (`str`):
                 Question to be answered.
             model (`str`, *optional*):
@@ -3203,8 +3203,8 @@ async def zero_shot_image_classification(
         Provide input image and text labels to predict text labels for the image.

         Args:
-            image (`Union[str, Path, bytes, BinaryIO]`):
-                The input image to caption. It can be raw bytes, an image file, or a URL to an online image.
+            image (`Union[str, Path, bytes, BinaryIO, PIL.Image.Image]`):
+                The input image to caption. It can be raw bytes, an image file, a URL to an online image, or a PIL Image.
             candidate_labels (`List[str]`):
                 The candidate labels for this image
             labels (`List[str]`, *optional*):

tests/test_inference_client.py

Lines changed: 28 additions & 0 deletions
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import base64
 import io
 import json
 import os
@@ -803,6 +804,15 @@ def test_open_as_binary_from_bytes(self) -> None:
         with _open_as_binary(content_bytes) as content:
             assert content == content_bytes

+    def test_open_as_binary_from_pil_image(self) -> None:
+        pil_image = Image.open(self.image_file)
+        with _open_as_binary(pil_image) as content:
+            assert isinstance(content, bytes)
+
+            buffer = io.BytesIO()
+            pil_image.save(buffer, format=pil_image.format or "PNG")
+            assert content == buffer.getvalue()
+

 class TestHeadersAndCookies(TestBase):
     def test_headers_and_cookies(self) -> None:
@@ -1213,3 +1223,21 @@ def test_as_url(content_input, default_mime_type, expected, is_exact_match, tmp_
         assert result == expected
     else:
         assert result.startswith(expected)
+
+
+def test_as_url_with_pil_image(image_file: str):
+    """Test `_as_url` helper with a PIL Image."""
+    pil_image = Image.open(image_file)
+
+    pil_image.format = "PNG"
+    png_url = _as_url(pil_image, default_mime_type="image/jpeg")
+    assert png_url.startswith("data:image/png;base64,")
+
+    pil_image.format = None
+    png_url = _as_url(pil_image, default_mime_type="image/jpeg")
+    assert png_url.startswith("data:image/png;base64,")
+
+    buffer = io.BytesIO()
+    pil_image.save(buffer, format="PNG")
+    b64_encoded = base64.b64encode(buffer.getvalue()).decode()
+    assert png_url == f"data:image/png;base64,{b64_encoded}"
