1
+ from PIL import Image
2
+ from bs4 import BeautifulSoup
3
+ from urllib .parse import urljoin , urlparse
4
+ from pydantic import Field , ConfigDict
5
+ from typing import List , Optional
6
+ from io import BytesIO
7
+
8
+ from loguru import logger
9
+
10
+ import requests
11
+
12
+ from guidellm .config import settings
13
+ from guidellm .core .serializable import Serializable
14
+
15
+ __all__ = ["load_images" , "ImageDescriptor" ]
16
+
17
class ImageDescriptor(Serializable):
    """
    A class to represent image data in serializable format.

    Holds the address an image was loaded from, the decoded PIL image, and
    an optional filename. The ``image`` field is declared with
    ``exclude=True``.
    """

    # PIL's Image.Image is not a pydantic-native type, so arbitrary types
    # must be allowed for the ``image`` field annotation to be accepted.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    url: Optional[str] = Field(description="url address for image.")
    image: Image.Image = Field(description="PIL image", exclude=True)
    # A filename is text; the previous ``Optional[int]`` annotation could not
    # hold an actual filename.
    filename: Optional[str] = Field(
        default=None,
        description="Image filename.",
    )
29
+
30
+
31
def load_images(data: str) -> List[ImageDescriptor]:
    """
    Load every image referenced by an HTML page.

    The page at ``data`` is downloaded and parsed for ``<img>`` tags; each
    tag's ``src`` is resolved against the page URL and fetched.

    :param data: the URL of the HTML page to scan; only values starting
        with "http" are processed, anything else yields an empty list
    :type data: str
    :return: descriptors holding each image's absolute URL and its contents
        in PIL.Image.Image format
    :rtype: List[ImageDescriptor]
    :raises requests.HTTPError: if the page request or any image request
        returns an error status
    """
    images: List[ImageDescriptor] = []

    # Empty input and non-HTTP input both produce an empty list, keeping the
    # return type consistent with the annotation.
    if not data or not (isinstance(data, str) and data.startswith("http")):
        return images

    response = requests.get(data, timeout=settings.request_timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    for img_tag in soup.find_all("img"):
        img_src = img_tag.get("src")
        if not img_src:
            continue

        # Handle relative URLs
        img_url = urljoin(data, img_src)

        # Inline ``data:`` URIs and other non-HTTP sources cannot be fetched
        # with requests; skip them instead of aborting the whole load.
        if urlparse(img_url).scheme not in ("http", "https"):
            logger.debug("Skipping non-HTTP image source: {}", img_url[:80])
            continue

        # Download the image, bounded by the same timeout as the page fetch
        logger.debug("Loading image: {}", img_url)
        img_response = requests.get(img_url, timeout=settings.request_timeout)
        img_response.raise_for_status()

        # Load image into Pillow
        images.append(
            ImageDescriptor(
                url=img_url,
                image=Image.open(BytesIO(img_response.content)),
            )
        )

    return images
0 commit comments