# ==============================================================================
"""Utilities for fetching documents mentioned in the part stream.

Referencing URLs of documents (web pages, images, PDFs) in a prompt is a
convenient way to provide rich context for a model. While registering `http.get`
as a tool is a more flexible and robust approach, it requires an extra model
call and a round trip for each document loaded. This approach offers a more
hardwired but faster alternative.

We split the responsibility for fetching documents and deciding what needs
fetching. A special `genai_processors.core.text.FetchRequest` part must be used
to explicitly reference the document to be fetched. The `UrlFetch` processor
then replaces such FetchRequest parts with the actual content.
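
For example, a request part could be constructed like this (a sketch; the
`ProcessorPart.from_dataclass` constructor is an assumption mirroring the
`get_dataclass` accessor used in the implementation below):

  part = content_api.ProcessorPart.from_dataclass(
      dataclass=text.FetchRequest(url='https://example.com/doc.html'))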

While it is very convenient to just mention a URL as text in the prompt, this
makes it easy to trigger the fetch unintentionally and can even be dangerous. So
it should be applied closer to the UI, where user journeys are better defined.
For example, parsing URLs directly pasted into a chat interface is probably
fine, but recursively following URLs from uncontrolled sources is dangerous. For
extra safety you may want to require the URL to be on its own line.
`genai_processors.core.text.UrlExtractor` is a processor for this task.
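
For instance, a minimal pipeline sketch (assuming processors in this library
compose with `+` and that these processors need no constructor arguments):

  pipeline = text.UrlExtractor() + UrlFetch() + text.HtmlCleaner()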

This process can be refined further: e.g. one can use a fast model
(gemini-flash-lite or gemma-nano) to decide whether the URL should be fetched
before passing the prompt to a larger LLM. This way we can reduce latency by
making decisions fast and fetching multiple documents in parallel.

You can also consider using an alternative implementation from
https://github.com/mbeacom/genai-processors-url-fetch/, which has additional
security features and supports Markdown.
"""
from collections.abc import AsyncIterable

from genai_processors import content_api
from genai_processors import mime_types
from genai_processors import processor
from genai_processors.core import text
import httpx


_HEADER_PART = 'Fetch result for URL: '


class UrlFetch(processor.PartProcessor):
  """A processor that fetches documents by URLs.

  This processor replaces genai_processors.core.text.FetchRequest with the
  referenced content. It ignores anything that is not a FetchRequest.

  It is recommended to chain genai_processors.core.text.HtmlCleaner after
  UrlFetch to simplify the HTML pages before sending them to a model.
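
  Example (a sketch; `streams.stream_content`, `from_dataclass`, and applying
  a PartProcessor directly to a stream are assumptions about the library API):

    fetcher = UrlFetch(timeout_seconds=5)
    request = content_api.ProcessorPart.from_dataclass(
        dataclass=text.FetchRequest(url='https://example.com'))
    async for part in fetcher(streams.stream_content([request])):
      print(part.text)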
  """

  def __init__(
      self,
      timeout_seconds: int = 10,
  ):
    """Initializes the UrlFetch processor.

    Args:
      timeout_seconds: The timeout in seconds for the HTTP request. Defaults
        to 10 seconds.
    """
    self._client = httpx.AsyncClient(
        follow_redirects=True,
        timeout=timeout_seconds,
    )

  def match(self, part: content_api.ProcessorPart) -> bool:
    return mime_types.is_dataclass(part.mimetype, text.FetchRequest)

  @processor.yield_exceptions_as_parts
  async def call(
      self, part: content_api.ProcessorPart
  ) -> AsyncIterable[content_api.ProcessorPartTypes]:
    fetch_request = part.get_dataclass(text.FetchRequest)
    url = fetch_request.url
    yield _HEADER_PART + url + '\n'
    # Stream the response body instead of buffering it all at once; the
    # client-level `timeout_seconds` configured in __init__ applies here.
    async with self._client.stream('GET', url) as response:
      # Raise an exception for bad status codes (4xx or 5xx).
      response.raise_for_status()

      # aiter_text() decodes incrementally using the response encoding, so
      # multi-byte characters split across chunk boundaries are handled
      # correctly.
      html_content = []
      async for chunk in response.aiter_text():
        html_content.append(chunk)
      yield content_api.ProcessorPart(
          ''.join(html_content), mimetype=mime_types.TEXT_HTML
      )