Commit eb242cd

aelissee authored and kibergus committed
Add a UrlFetch processor to fetch URLs and extract HTML or text content from web pages.

This is a thin wrapper around a direct HTTP GET method.

PiperOrigin-RevId: 830334554
1 parent c1b277c · commit eb242cd

6 files changed: +334, -8 lines changed


genai_processors/core/text.py

Lines changed: 92 additions & 1 deletion
@@ -18,10 +18,12 @@
 from collections.abc import AsyncIterable, Callable
 import dataclasses
 import re
-from typing import Mapping, Type
+from typing import Literal, Mapping, Type

+import bs4
 import dataclasses_json
 from genai_processors import content_api
+from genai_processors import mime_types
 from genai_processors import processor
 import termcolor

@@ -403,6 +405,95 @@ def transform(part: content_api.ProcessorPart):
     )


+_TAG_DENYLIST = [
+    'head',
+    'script',
+    'noscript',
+    'style',
+    'footer',
+    'aside',
+    'nav',
+    'svg',
+]
+_ATTR_ALLOWLIST = [
+    'alt',
+    'href',
+    'aria-label',
+    'aria-level',
+    'aria-roledescription',
+]
+
+
+def _clean_html(raw_html: str) -> bs4.BeautifulSoup:
+  """Cleans raw HTML into a simplified document.
+
+  Args:
+    raw_html: raw html document - can include a long prefix, e.g. containing
+      the response header of some service: all the content is stripped until
+      the first occurrence of `<html`.
+
+  Returns:
+    the parsed document with denylisted tags removed and all attributes
+    outside the allowlist stripped.
+  """
+  # Should catch <html> with and without attributes, e.g. <html locale='..'>.
+  # The document could be prefixed with some non-html header.
+  html_tag_index = raw_html.find('<html')
+  if html_tag_index >= 0:
+    html_no_header = raw_html[html_tag_index:]
+  else:
+    html_no_header = raw_html
+  soup = bs4.BeautifulSoup(html_no_header, 'html.parser')
+  for entry in soup(_TAG_DENYLIST):
+    entry.decompose()
+  for tag in soup.descendants:
+    if isinstance(tag, bs4.element.Tag):
+      tag.attrs = {
+          key: value
+          for key, value in tag.attrs.items()
+          if key in _ATTR_ALLOWLIST
+      }
+  return soup
+
+
+class HtmlCleaner(processor.PartProcessor):
+  """PartProcessor cleaning up HTML content.
+
+  This part processor returns a cleaned version of any processor part whose
+  MIME type is `text/html`.
+
+  Based on the `cleaning_mode` argument, the content is cleaned accordingly
+  and returned in a new part, that is, many tags and attributes are stripped
+  away. When `plain` mode is selected, the content is returned as plain text
+  with no formatting.
+
+  Note that the cleaning is done per part. It is recommended to collect the
+  whole html content into a single part. Otherwise, the cleaning might only
+  be done partially, i.e. there is no guarantee that the html content within
+  a single part is valid html.
+  """
+
+  def __init__(self, *, cleaning_mode: Literal['html', 'plain'] = 'plain'):
+    self._cleaning_mode = cleaning_mode
+
+  def match(self, part: content_api.ProcessorPart) -> bool:
+    return mime_types.is_html(part.mimetype)
+
+  async def call(
+      self, part: content_api.ProcessorPart
+  ) -> AsyncIterable[content_api.ProcessorPartTypes]:
+    match self._cleaning_mode:
+      case 'html':
+        yield content_api.ProcessorPart(
+            _clean_html(part.text).prettify().strip(),
+            mimetype=mime_types.TEXT_HTML,
+        )
+      case 'plain':
+        yield _clean_html(part.text).get_text().strip()
+      case _:
+        raise ValueError(f'Unsupported cleaning mode: {self._cleaning_mode}')
+
+
 @processor.source()
 async def terminal_input(
     prompt: str = '',
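
To make the new API concrete, here is a minimal usage sketch of HtmlCleaner. It relies only on the `processor.apply_async` helper used by the tests below; the inline HTML and the `main` wrapper are illustrative:

import asyncio

from genai_processors import content_api
from genai_processors import processor
from genai_processors.core import text


async def main():
  # 'plain' mode strips scripts, styles and navigation chrome and returns
  # the remaining page text without markup.
  cleaner = text.HtmlCleaner(cleaning_mode='plain')
  page = content_api.ProcessorPart(
      '<html><body><nav>menu</nav><p>Hello</p></body></html>',
      mimetype='text/html',
  )
  parts = await processor.apply_async(cleaner, [page])
  print(parts[0].text)  # -> Hello


asyncio.run(main())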

genai_processors/core/web.py

Lines changed: 68 additions & 7 deletions
@@ -14,30 +14,91 @@
 # ==============================================================================
 """Utilities for fetching documents mentioned in the part stream.

-NOTE: THIS MODULE IS UNDER DEVELOPMENT AND IS NOT COMPLETE YET.
-
 Referencing URLs of documents (web pages, images, PDFs) in a prompt is a
 convenient way to provide rich context for a model. While registering `http.get`
 as a tool is a more flexible and robust approach, it requires an extra model
-call and a round trip for each document loaded. This model offers a more
+call and a round trip for each document loaded. This approach offers a more
 hardwired but faster alternative.

 We split the responsibility for fetching documents and deciding what needs
 fetching. A special `genai_processors.core.text.FetchRequest` part must be used
 to explicitly reference the document to be fetched. Then `UrlFetch` processor
 would replace such FetchRequest Parts with the actual content.

-It is very convenient to just mention URL as text in the prompt. However it
+While it is very convenient to just mention a URL as text in the prompt, it
 becomes easy to trigger the fetch unintentionally and can even be dangerous. So
 it should be applied closer to the UI where user journeys are more well defined.
-For example parsing URLs directly pasted in-to a chat interface is probably
-fine. For extra safety you may want to require the URL be on its own line.
+For example, parsing URLs directly pasted into a chat interface is probably
+fine, but recursively following URLs from uncontrolled sources is dangerous. For
+extra safety you may want to require the URL to be on its own line.
 `genai_processors.core.text.UrlExtractor` is a processor for the task.

 This process can be refined further: e.g. one can use a fast model
 (gemini-flash-lite or gemma-nano) to decide whether the URL should be fetched
 before passing the prompt to a larger LLM. This way we can reduce latency by
 making decisions fast and fetching multiple documents in parallel.
+
+You can also consider using the alternative implementation at
+https://github.com/mbeacom/genai-processors-url-fetch/ which has additional
+security features and supports markdown.
 """
+from collections.abc import AsyncIterable
+
+from genai_processors import content_api
+from genai_processors import mime_types
+from genai_processors import processor
+from genai_processors.core import text
+import httpx
+
+
+_HEADER_PART = 'Fetch result for URL: '
+
+
+class UrlFetch(processor.PartProcessor):
+  """A processor that fetches documents by URLs.
+
+  This processor replaces genai_processors.core.text.FetchRequest parts with
+  the referenced content. It ignores anything that is not a FetchRequest.
+
+  It is recommended to chain genai_processors.core.text.HtmlCleaner after
+  UrlFetch to simplify the HTML pages before sending them to a model.
+  """
+
+  def __init__(
+      self,
+      timeout_seconds: int = 10,
+  ):
+    """Initializes the UrlFetch processor.
+
+    Args:
+      timeout_seconds: The timeout in seconds for the HTTP request. Defaults
+        to 10 seconds.
+    """
+    self._client = httpx.AsyncClient(
+        follow_redirects=True,
+        timeout=timeout_seconds,
+    )
+
+  def match(self, part: content_api.ProcessorPart) -> bool:
+    return mime_types.is_dataclass(part.mimetype, text.FetchRequest)
+
+  @processor.yield_exceptions_as_parts
+  async def call(
+      self, part: content_api.ProcessorPart
+  ) -> AsyncIterable[content_api.ProcessorPartTypes]:
+    fetch_request = part.get_dataclass(text.FetchRequest)
+    url = fetch_request.url
+    yield _HEADER_PART + url + '\n'
+    # client.stream() streams the response body instead of buffering it all;
+    # the timeout configured on the client applies to the request.
+    async with self._client.stream('GET', url) as response:
+      # Raise an exception for bad status codes (4xx or 5xx).
+      response.raise_for_status()

-# A UrlFetch processor will be added to this module later.
+      # Accumulate decoded chunks; aiter_text() decodes incrementally, so
+      # multi-byte characters split across chunk boundaries are handled.
+      html_content = []
+      async for chunk in response.aiter_text():
+        html_content.append(chunk)
+      yield content_api.ProcessorPart(
+          ''.join(html_content), mimetype=mime_types.TEXT_HTML
+      )
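
As a sketch of the end-to-end flow the module docstring describes (extract URLs, fetch them, clean the HTML), assuming the library's usual `+` chaining of processors and a no-argument `UrlExtractor()` constructor, neither of which is shown in this diff:

import asyncio

from genai_processors import processor
from genai_processors.core import text
from genai_processors.core import web


async def main():
  # UrlExtractor turns URLs in the prompt into FetchRequest parts,
  # UrlFetch replaces each FetchRequest with the fetched page, and
  # HtmlCleaner reduces the fetched HTML to plain text for a model.
  pipeline = text.UrlExtractor() + web.UrlFetch() + text.HtmlCleaner()
  parts = await processor.apply_async(
      pipeline, ['Summarize https://example.com\n']
  )
  for part in parts:
    print(part.text)


asyncio.run(main())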

genai_processors/mime_types.py

Lines changed: 5 additions & 0 deletions
@@ -181,6 +181,11 @@ def is_wav(mime: str) -> bool:
   return mime.lower() == AUDIO_WAV


+def is_html(mime: str) -> bool:
+  """Returns whether the content is HTML."""
+  return mime.lower().startswith(TEXT_HTML)
+
+
 def is_source_code(mime: str) -> bool:
   """Returns whether the content is a source code in some language."""
   # This list is incomplete and will be extended on an as-needed basis.
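
A quick sketch of the new predicate's behavior: it lowercases and prefix-matches, so parameterized MIME strings match too (assuming `TEXT_HTML` is the plain 'text/html' constant, consistent with its use in the tests below):

from genai_processors import mime_types

assert mime_types.is_html('text/html')
assert mime_types.is_html('TEXT/HTML; charset=utf-8')  # case-insensitive prefix match
assert not mime_types.is_html('text/plain')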

genai_processors/tests/text_test.py

Lines changed: 64 additions & 0 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import dataclasses
 import re
+import textwrap
 from typing import Sequence
 import unittest

@@ -656,5 +657,68 @@ async def one_hello():
       break


+class HtmlCleanerTest(parameterized.TestCase, unittest.IsolatedAsyncioTestCase):
+  HTML_CONTENT = textwrap.dedent("""\
+      <html>
+      <head><title>Title</title></head>
+      <body>
+      <script>alert('foo')</script>
+      <style>.foo {color: red;}</style>
+      <nav>menu</nav>
+      <div class="content" other-attr="value"><h1>Title</h1>
+      <p>Some text with <a href="http://example.com" target="_blank">link</a></p>
+      <img src="image.png" alt="alt text" width="100"/>
+      </div>
+      <footer>footer</footer>
+      </body>
+      </html>""")
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='html',
+          cleaning_mode='html',
+          expected_output=textwrap.dedent("""\
+              <html>
+               <body>
+                <div>
+                 <h1>
+                  Title
+                 </h1>
+                 <p>
+                  Some text with
+                  <a href="http://example.com">
+                   link
+                  </a>
+                 </p>
+                 <img alt="alt text"/>
+                </div>
+               </body>
+              </html>"""),
+      ),
+      dict(
+          testcase_name='text',
+          cleaning_mode='plain',
+          expected_output='Title\nSome text with link',
+      ),
+  )
+  async def test_html_cleaner(self, cleaning_mode, expected_output):
+    cleaner = text.HtmlCleaner(cleaning_mode=cleaning_mode)
+    input_part = content_api.ProcessorPart(
+        self.HTML_CONTENT, mimetype='text/html'
+    )
+    output = await processor.apply_async(cleaner, [input_part])
+    self.assertEqual(
+        output,
+        [
+            content_api.ProcessorPart(
+                expected_output,
+                mimetype='text/html'
+                if cleaning_mode == 'html'
+                else 'text/plain',
+            )
+        ],
+    )
+
+
 if __name__ == '__main__':
   absltest.main()
