Skip to content

Commit bb9bc0c

Browse files
committed
Add class to describe image samples and loading logic for images from url
1 parent ecf2984 commit bb9bc0c

File tree

1 file changed

+69
-0
lines changed

1 file changed

+69
-0
lines changed

src/guidellm/utils/images.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
from PIL import Image
2+
from bs4 import BeautifulSoup
3+
from urllib.parse import urljoin, urlparse
4+
from pydantic import Field, ConfigDict
5+
from typing import List, Optional
6+
from io import BytesIO
7+
8+
from loguru import logger
9+
10+
import requests
11+
12+
from guidellm.config import settings
13+
from guidellm.core.serializable import Serializable
14+
15+
# Public API of this module.
__all__ = ["load_images", "ImageDescriptor"]
16+
17+
class ImageDescriptor(Serializable):
    """
    A class to represent image data in serializable format.

    The ``image`` field is declared with ``exclude=True``, so the decoded
    PIL image is left out when the model is serialized.
    """

    # PIL.Image.Image is not a pydantic-native type; arbitrary types must be
    # allowed for the ``image`` field annotation to be accepted.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    url: Optional[str] = Field(description="url address for image.")
    image: Image.Image = Field(description="PIL image", exclude=True)
    # Filenames are text (e.g. "cat.png"), so the field is typed as a string.
    filename: Optional[str] = Field(
        default=None,
        description="Image filename.",
    )
29+
30+
31+
def load_images(data: str) -> List[ImageDescriptor]:
    """
    Load every image referenced by ``<img>`` tags of an HTML page.

    The page at ``data`` is downloaded and parsed, each ``src`` attribute is
    resolved against the page URL, and the image it points to is downloaded
    and opened with Pillow.

    :param data: URL of the HTML page to scan; only string values starting
        with "http" are fetched, anything else yields an empty list
    :type data: str
    :return: one descriptor per downloaded image, each containing the
        absolute image url and the data in PIL.Image.Image format
    :rtype: List[ImageDescriptor]
    :raises requests.HTTPError: if the page or any of its images responds
        with an error status
    """
    images: List[ImageDescriptor] = []

    # Nothing to load for empty input or for values that are not URLs.
    # Always hand back a list so callers can iterate the result directly.
    if not data:
        return images
    if not (isinstance(data, str) and data.startswith("http")):
        return images

    response = requests.get(data, timeout=settings.request_timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    for img_tag in soup.find_all("img"):
        img_url = img_tag.get("src")
        if not img_url:
            # <img> tag without a usable src attribute
            continue

        # Handle relative URLs
        img_url = urljoin(data, img_url)

        # Download the image; use the same timeout as the page request so a
        # stalled image server cannot block the loader indefinitely.
        logger.debug("Loading image: {}", img_url)
        img_response = requests.get(img_url, timeout=settings.request_timeout)
        img_response.raise_for_status()

        # Load image into Pillow
        images.append(
            ImageDescriptor(
                url=img_url,
                image=Image.open(BytesIO(img_response.content)),
            )
        )

    return images

0 commit comments

Comments
 (0)