From f0d188f547b8b851bfc0f96c9b862d4e1b4b56de Mon Sep 17 00:00:00 2001 From: CdnCentreForChildProtection <166165190+CdnCentreForChildProtection@users.noreply.github.com> Date: Wed, 16 Jul 2025 13:40:42 -0500 Subject: [PATCH 1/2] Allow configurable timeouts for the scan methods that submit a large byte body. --- arachnid_shield_sdk/api/v1.py | 68 ++++++++++++++++++++++++++++------- 1 file changed, 56 insertions(+), 12 deletions(-) diff --git a/arachnid_shield_sdk/api/v1.py b/arachnid_shield_sdk/api/v1.py index 55e3576..fedece2 100644 --- a/arachnid_shield_sdk/api/v1.py +++ b/arachnid_shield_sdk/api/v1.py @@ -30,13 +30,21 @@ def __init__(self, username: typing.Union[str, bytes], password: typing.Union[st super().__init__(username=username, password=password) self.__client = super()._build_sync_http_client() - def scan_media_from_bytes(self, contents: typing.Union[bytes, io.BytesIO], mime_type: str) -> ScannedMedia: + def scan_media_from_bytes( + self, + contents: typing.Union[bytes, io.BytesIO], + mime_type: str, + timeout: typing.Optional[httpx.Timeout] = None, + ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. Args: contents: The raw bytes that represent the media. mime_type: The mimetype of the media. + timeout: + If provided, will set a timeout configuration for the underlying http client. + Otherwise, will disable the timeout entirely. Returns: The record of a successful media scan. @@ -45,10 +53,13 @@ def scan_media_from_bytes(self, contents: typing.Union[bytes, io.BytesIO], mime_ `ArachnidShieldError` on a failed but complete interaction with the Arachnid Shield API, and `httpx.HTTPError` on any other connection failures. """ - return self.scan_media_from_bytes_with_config(ScanMediaFromBytes(contents=contents, mime_type=mime_type)) + return self.scan_media_from_bytes_with_config(ScanMediaFromBytes(contents=contents, mime_type=mime_type), timeout=timeout) def scan_media_from_file( - self, filepath: pathlib.Path, mime_type_override: typing.Optional[str] = None + self, + filepath: pathlib.Path, + mime_type_override: typing.Optional[str] = None, + timeout: typing.Optional[httpx.Timeout] = None, ) -> ScannedMedia: """Given path to the media file to scan, and an optional value for mime_type that bypasses guessing it based of the filepath, @@ -60,6 +71,9 @@ def scan_media_from_file( mime_type_override: If provided, will use this as the mime_type instead of guessing it from the filepath. + timeout: + If provided, will set a timeout configuration for the underlying http client. + Otherwise, will disable the timeout entirely. Returns: The record of a successful media scan. @@ -78,7 +92,7 @@ def scan_media_from_file( detail=( f"Failed to identify mime_type for {filepath}. " f"You may specify it explicitly by providing " - f"`force_mime_type`." + f"`mime_type_override`." ) ) ) @@ -87,7 +101,7 @@ def scan_media_from_file( contents = f.read() config = ScanMediaFromBytes(contents=contents, mime_type=mime_type) - return self.scan_media_from_bytes_with_config(config) + return self.scan_media_from_bytes_with_config(config, timeout=timeout) def scan_media_from_url(self, url: str) -> ScannedMedia: """Given the absolute url that hosts the media we wish to scan, @@ -105,12 +119,19 @@ def scan_media_from_url(self, url: str) -> ScannedMedia: """ return self.scan_media_from_url_with_config(ScanMediaFromUrl(url=url)) - def scan_media_from_bytes_with_config(self, config: ScanMediaFromBytes) -> ScannedMedia: + def scan_media_from_bytes_with_config( + self, + config: ScanMediaFromBytes, + timeout: typing.Optional[httpx.Timeout] = httpx.Timeout(5) + ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. Args: config: The context that will be used to build the request. + timeout: + If provided explicitly, a configuration passed to the underlying http client. + It defaults to 5 seconds, and can be disabled by setting it to `None`. Returns: ScannedMedia: A record of a successful scan of the media. @@ -125,6 +146,7 @@ def scan_media_from_bytes_with_config(self, config: ScanMediaFromBytes) -> Scann url=url, headers={"Content-Type": config.mime_type}, content=config.contents, + timeout=timeout, ) if response.is_client_error or response.is_server_error: @@ -203,13 +225,21 @@ def __init__(self, username: typing.Union[str, bytes], password: typing.Union[st super().__init__(username=username, password=password) self.__client = super()._build_async_http_client() - async def scan_media_from_bytes(self, contents: typing.Union[bytes, io.BytesIO], mime_type: str) -> ScannedMedia: + async def scan_media_from_bytes( + self, + contents: typing.Union[bytes, io.BytesIO], + mime_type: str, + timeout: typing.Optional[httpx.Timeout] = None, + ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. Args: contents: The raw bytes that represent the media. mime_type: The mimetype of the media. + timeout: + If provided, will set a timeout configuration for the underlying http client. + Otherwise, will disable the timeout entirely. Returns: The record of a successful media scan. @@ -219,7 +249,7 @@ async def scan_media_from_bytes(self, contents: typing.Union[bytes, io.BytesIO], the Arachnid Shield API, and `httpx.HTTPError` on any other connection failures. """ - return await self.scan_media_from_bytes_with_config(ScanMediaFromBytes(contents=contents, mime_type=mime_type)) + return await self.scan_media_from_bytes_with_config(ScanMediaFromBytes(contents=contents, mime_type=mime_type), timeout=timeout) async def scan_media_from_url(self, url: str) -> ScannedMedia: """Given the absolute url that hosts the media we wish to scan, @@ -238,7 +268,10 @@ async def scan_media_from_url(self, url: str) -> ScannedMedia: return await self.scan_media_from_url_with_config(ScanMediaFromUrl(url=url)) async def scan_media_from_file( - self, filepath: pathlib.Path, mime_type_override: typing.Optional[str] = None + self, + filepath: pathlib.Path, + mime_type_override: typing.Optional[str] = None, + timeout: typing.Optional[httpx.Timeout] = None, ) -> ScannedMedia: """Given path to the media file to scan, and an optional value for mime_type that bypasses guessing it based of the filepath, @@ -250,6 +283,9 @@ async def scan_media_from_file( mime_type_override: If provided, will use this as the mime_type instead of guessing it from the filepath. + timeout: + If provided, will set a timeout configuration for the underlying http client. + Otherwise, will disable the timeout entirely. Returns: The record of a successful media scan. @@ -268,7 +304,7 @@ async def scan_media_from_file( detail=( f"Failed to identify mime_type for {filepath}. " f"You may specify it explicitly by providing " - f"`force_mime_type`." + f"`mime_type_override`." ) ) ) @@ -277,14 +313,21 @@ async def scan_media_from_file( contents = f.read() config = ScanMediaFromBytes(contents=contents, mime_type=mime_type) - return await self.scan_media_from_bytes_with_config(config) + return await self.scan_media_from_bytes_with_config(config, timeout=timeout) - async def scan_media_from_bytes_with_config(self, config: ScanMediaFromBytes) -> ScannedMedia: + async def scan_media_from_bytes_with_config( + self, + config: ScanMediaFromBytes, + timeout: typing.Optional[httpx.Timeout] = httpx.Timeout(5) + ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. Args: config: The context that will be used to build the request. + timeout: + If provided explicitly, a configuration passed to the underlying http client. + It defaults to 5 seconds, and can be disabled by setting it to `None`. Returns: ScannedMedia: A record of a successful scan of the media. @@ -300,6 +343,7 @@ async def scan_media_from_bytes_with_config(self, config: ScanMediaFromBytes) -> url=url, headers={"Content-Type": config.mime_type}, content=config.contents, + timeout=timeout, ) if response.is_client_error or response.is_server_error: From 258f9ca488920fa58ed62db29c3e944a80e6764a Mon Sep 17 00:00:00 2001 From: CdnCentreForChildProtection <166165190+CdnCentreForChildProtection@users.noreply.github.com> Date: Wed, 16 Jul 2025 14:34:42 -0500 Subject: [PATCH 2/2] Provide better defaults for the timeouts across all scan methods. --- arachnid_shield_sdk/api/v1.py | 102 +++++++++++++++++++++++++--------- 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/arachnid_shield_sdk/api/v1.py b/arachnid_shield_sdk/api/v1.py index fedece2..ae6eb05 100644 --- a/arachnid_shield_sdk/api/v1.py +++ b/arachnid_shield_sdk/api/v1.py @@ -18,6 +18,22 @@ ) +TIMEOUT_WRITE_PERMISSIVE = httpx.Timeout( + 60, # Default timeout for all operations unless otherwise stated. + connect=3, + # Large chunks can take arbitrarily long to complete a write + # so wait arbitrarily long to finish writes. + write=None, +) + +TIMEOUT_READ_PERMISSIVE = httpx.Timeout( + 60, # Default timeout for all operations unless otherwise stated. + connect=3, + # Allow the server enough time to process the request and to read the response back. + read=60 +) + + class ArachnidShield(_ArachnidShield): """A client to communicate with the Arachnid Shield API provided by the Canadian Centre for Child Protection. @@ -34,7 +50,7 @@ def scan_media_from_bytes( self, contents: typing.Union[bytes, io.BytesIO], mime_type: str, - timeout: typing.Optional[httpx.Timeout] = None, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_WRITE_PERMISSIVE, ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. @@ -43,8 +59,7 @@ def scan_media_from_bytes( contents: The raw bytes that represent the media. mime_type: The mimetype of the media. timeout: - If provided, will set a timeout configuration for the underlying http client. - Otherwise, will disable the timeout entirely. + If provided, will set a timeout configuration for the underlying http client. Returns: The record of a successful media scan. @@ -59,7 +74,7 @@ def scan_media_from_file( self, filepath: pathlib.Path, mime_type_override: typing.Optional[str] = None, - timeout: typing.Optional[httpx.Timeout] = None, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_WRITE_PERMISSIVE, ) -> ScannedMedia: """Given path to the media file to scan, and an optional value for mime_type that bypasses guessing it based of the filepath, @@ -72,8 +87,7 @@ def scan_media_from_file( If provided, will use this as the mime_type instead of guessing it from the filepath. timeout: - If provided, will set a timeout configuration for the underlying http client. - Otherwise, will disable the timeout entirely. + If provided, will set a timeout configuration for the underlying http client. Returns: The record of a successful media scan. @@ -103,12 +117,18 @@ def scan_media_from_file( config = ScanMediaFromBytes(contents=contents, mime_type=mime_type) return self.scan_media_from_bytes_with_config(config, timeout=timeout) - def scan_media_from_url(self, url: str) -> ScannedMedia: + def scan_media_from_url( + self, + url: str, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_READ_PERMISSIVE, + ) -> ScannedMedia: """Given the absolute url that hosts the media we wish to scan, scan the contents of that url for matches against known harmful content. Args: url: The absolute URL to scan. + timeout: + If provided, will set a timeout configuration for the underlying http client. Returns: The record of a successful media scan. @@ -117,12 +137,12 @@ def scan_media_from_url(self, url: str) -> ScannedMedia: `ArachnidShieldError` on a failed but complete interaction with the Arachnid Shield API, and `httpx.HTTPError` on any other connection failures. """ - return self.scan_media_from_url_with_config(ScanMediaFromUrl(url=url)) + return self.scan_media_from_url_with_config(ScanMediaFromUrl(url=url), timeout=timeout) def scan_media_from_bytes_with_config( self, config: ScanMediaFromBytes, - timeout: typing.Optional[httpx.Timeout] = httpx.Timeout(5) + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_WRITE_PERMISSIVE, ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. @@ -130,8 +150,7 @@ def scan_media_from_bytes_with_config( Args: config: The context that will be used to build the request. timeout: - If provided explicitly, a configuration passed to the underlying http client. - It defaults to 5 seconds, and can be disabled by setting it to `None`. + If provided, will set a timeout configuration for the underlying http client. Returns: ScannedMedia: A record of a successful scan of the media. @@ -156,12 +175,18 @@ def scan_media_from_bytes_with_config( response.raise_for_status() return ScannedMedia.from_dict(response.json()) - def scan_media_from_url_with_config(self, config: ScanMediaFromUrl) -> ScannedMedia: + def scan_media_from_url_with_config( + self, + config: ScanMediaFromUrl, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_READ_PERMISSIVE, + ) -> ScannedMedia: """Given the absolute url that hosts the media we wish to scan, scan the contents of that url for matches against known harmful content. Args: config: The context that will be used to build the request. + timeout: + If provided, will set a timeout configuration for the underlying http client. Returns: ScannedMedia: A record of a successful scan of the media. @@ -177,6 +202,7 @@ def scan_media_from_url_with_config(self, config: ScanMediaFromUrl) -> ScannedMe url=_url, headers={"Content-Type": "application/json"}, json=config.to_dict(), + timeout=timeout, ) if response.is_client_error or response.is_server_error: @@ -186,11 +212,17 @@ def scan_media_from_url_with_config(self, config: ScanMediaFromUrl) -> ScannedMe response.raise_for_status() return ScannedMedia.from_dict(response.json()) - def scan_pdq_hashes(self, config: ScanMediaFromPdq) -> ScannedPDQHashes: + def scan_pdq_hashes( + self, + config: ScanMediaFromPdq, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_READ_PERMISSIVE, + ) -> ScannedPDQHashes: """ Scan medias for CSAM based on their PDQ hashes. Args: config: The context that will be used to build the request. + timeout: + If provided, will set a timeout configuration for the underlying http client. Returns: ScannedPDQHashes: A record of a batch of PDQ hashes that have been scanned by the Arachnid Shield API @@ -205,6 +237,7 @@ def scan_pdq_hashes(self, config: ScanMediaFromPdq) -> ScannedPDQHashes: url=_url, headers={"Content-Type": "application/json"}, json=config.to_dict(), + timeout=timeout, ) if response.is_client_error or response.is_server_error: error_detail = ErrorDetail.from_dict(response.json()) @@ -229,7 +262,7 @@ async def scan_media_from_bytes( self, contents: typing.Union[bytes, io.BytesIO], mime_type: str, - timeout: typing.Optional[httpx.Timeout] = None, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_WRITE_PERMISSIVE, ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. @@ -238,8 +271,7 @@ async def scan_media_from_bytes( contents: The raw bytes that represent the media. mime_type: The mimetype of the media. timeout: - If provided, will set a timeout configuration for the underlying http client. - Otherwise, will disable the timeout entirely. + If provided, will set a timeout configuration for the underlying http client. Returns: The record of a successful media scan. @@ -251,12 +283,18 @@ async def scan_media_from_bytes( return await self.scan_media_from_bytes_with_config(ScanMediaFromBytes(contents=contents, mime_type=mime_type), timeout=timeout) - async def scan_media_from_url(self, url: str) -> ScannedMedia: + async def scan_media_from_url( + self, + url: str, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_READ_PERMISSIVE, + ) -> ScannedMedia: """Given the absolute url that hosts the media we wish to scan, scan the contents of that url for matches against known harmful content. Args: url: The absolute URL to scan. + timeout: + If provided, will set a timeout configuration for the underlying http client. Returns: The record of a successful media scan. @@ -265,13 +303,13 @@ async def scan_media_from_url(self, url: str) -> ScannedMedia: `ArachnidShieldError` on a failed but complete interaction with the Arachnid Shield API, and `httpx.HTTPError` on any other connection failures. """ - return await self.scan_media_from_url_with_config(ScanMediaFromUrl(url=url)) + return await self.scan_media_from_url_with_config(ScanMediaFromUrl(url=url), timeout=timeout) async def scan_media_from_file( self, filepath: pathlib.Path, mime_type_override: typing.Optional[str] = None, - timeout: typing.Optional[httpx.Timeout] = None, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_WRITE_PERMISSIVE, ) -> ScannedMedia: """Given path to the media file to scan, and an optional value for mime_type that bypasses guessing it based of the filepath, @@ -284,8 +322,7 @@ async def scan_media_from_file( If provided, will use this as the mime_type instead of guessing it from the filepath. timeout: - If provided, will set a timeout configuration for the underlying http client. - Otherwise, will disable the timeout entirely. + If provided, will set a timeout configuration for the underlying http client. Returns: The record of a successful media scan. @@ -318,7 +355,7 @@ async def scan_media_from_file( async def scan_media_from_bytes_with_config( self, config: ScanMediaFromBytes, - timeout: typing.Optional[httpx.Timeout] = httpx.Timeout(5) + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_WRITE_PERMISSIVE, ) -> ScannedMedia: """Given the contents of some media, along with a mime type, scan the contents for matches against known child abuse media. @@ -326,8 +363,7 @@ async def scan_media_from_bytes_with_config( Args: config: The context that will be used to build the request. timeout: - If provided explicitly, a configuration passed to the underlying http client. - It defaults to 5 seconds, and can be disabled by setting it to `None`. + If provided, will set a timeout configuration for the underlying http client. Returns: ScannedMedia: A record of a successful scan of the media. @@ -353,12 +389,18 @@ async def scan_media_from_bytes_with_config( response.raise_for_status() return ScannedMedia.from_dict(response.json()) - async def scan_media_from_url_with_config(self, config: ScanMediaFromUrl) -> ScannedMedia: + async def scan_media_from_url_with_config( + self, + config: ScanMediaFromUrl, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_READ_PERMISSIVE, + ) -> ScannedMedia: """Given the absolute url that hosts the media we wish to scan, scan the contents of that url for matches against known harmful content. Args: config: The context that will be used to build the request. + timeout: + If provided, will set a timeout configuration for the underlying http client. Returns: ScannedMedia: A record of a successful scan of the media. @@ -374,6 +416,7 @@ async def scan_media_from_url_with_config(self, config: ScanMediaFromUrl) -> Sca url=_url, headers={"Content-Type": "application/json"}, json=config.to_dict(), + timeout=timeout, ) if response.is_client_error or response.is_server_error: @@ -383,11 +426,17 @@ async def scan_media_from_url_with_config(self, config: ScanMediaFromUrl) -> Sca response.raise_for_status() return ScannedMedia.from_dict(response.json()) - async def scan_pdq_hashes(self, config: ScanMediaFromPdq) -> ScannedPDQHashes: + async def scan_pdq_hashes( + self, + config: ScanMediaFromPdq, + timeout: typing.Optional[httpx.Timeout] = TIMEOUT_READ_PERMISSIVE, + ) -> ScannedPDQHashes: """ Scan medias for CSAM based on their PDQ hashes. Args: config: The context that will be used to build the request. + timeout: + If provided, will set a timeout configuration for the underlying http client. Returns: ScannedPDQHashes: A record of a batch of PDQ hashes that have been scanned by the Arachnid Shield API @@ -402,6 +451,7 @@ async def scan_pdq_hashes(self, config: ScanMediaFromPdq) -> ScannedPDQHashes: url=_url, headers={"Content-Type": "application/json"}, json=config.to_dict(), + timeout=timeout, ) if response.is_client_error or response.is_server_error: error_detail = ErrorDetail.from_dict(response.json())