Commit eb242cd

aelissee authored and kibergus committed
Add a UrlFetch processor to fetch URLs and extract HTML or text content from web pages.

This is a thin wrapper around a direct HTTP GET method.

PiperOrigin-RevId: 830334554
1 parent c1b277c · commit eb242cd

6 files changed: +334, -8 lines changed


genai_processors/core/text.py

Lines changed: 92 additions & 1 deletion
@@ -18,10 +18,12 @@
 from collections.abc import AsyncIterable, Callable
 import dataclasses
 import re
-from typing import Mapping, Type
+from typing import Literal, Mapping, Type

+import bs4
 import dataclasses_json
 from genai_processors import content_api
+from genai_processors import mime_types
 from genai_processors import processor
 import termcolor

@@ -403,6 +405,95 @@ def transform(part: content_api.ProcessorPart):
     )


+_TAG_DENYLIST = [
+    'head',
+    'script',
+    'noscript',
+    'style',
+    'footer',
+    'aside',
+    'nav',
+    'svg',
+]
+_ATTR_ALLOWLIST = [
+    'alt',
+    'href',
+    'aria-label',
+    'aria-level',
+    'aria-roledescription',
+]
+
+
+def _clean_html(raw_html: str) -> bs4.BeautifulSoup:
+  """Cleans raw HTML into a simplified document.
+
+  Args:
+    raw_html: raw html document - can include a long prefix, e.g. containing
+      the response header of some service: all the content is stripped until
+      the first occurrence of `<html`.
+
+  Returns:
+    the parsed document with denylisted tags removed and all attributes
+    outside the allowlist stripped.
+  """
+  # Should catch <html> with and without attributes, e.g. <html locale='..'>.
+  # The document could be prefixed with some non-html header.
+  html_tag_index = raw_html.find('<html')
+  if html_tag_index >= 0:
+    html_no_header = raw_html[html_tag_index:]
+  else:
+    html_no_header = raw_html
+  soup = bs4.BeautifulSoup(html_no_header, 'html.parser')
+  for entry in soup(_TAG_DENYLIST):
+    entry.decompose()
+  for tag in soup.descendants:
+    if isinstance(tag, bs4.element.Tag):
+      tag.attrs = {
+          key: value
+          for key, value in tag.attrs.items()
+          if key in _ATTR_ALLOWLIST
+      }
+  return soup
+
+
+class HtmlCleaner(processor.PartProcessor):
+  """PartProcessor cleaning up HTML content.
+
+  This part processor returns a cleaned version of any processor part whose
+  MIME type is `text/html`.
+
+  Based on the `cleaning_mode` argument, the content is cleaned accordingly
+  and returned in a new part, that is, many tags and attributes are stripped
+  away. When `plain` mode is selected, the content is returned as plain text
+  with no formatting.
+
+  Note that the cleaning is done per part. It is recommended to collect the
+  whole html content into a single part. Otherwise, the cleaning might only
+  be done partially, i.e. there is no guarantee that the html content within
+  a single part is valid html.
+  """
+
+  def __init__(self, *, cleaning_mode: Literal['html', 'plain'] = 'plain'):
+    self._cleaning_mode = cleaning_mode
+
+  def match(self, part: content_api.ProcessorPart) -> bool:
+    return mime_types.is_html(part.mimetype)
+
+  async def call(
+      self, part: content_api.ProcessorPart
+  ) -> AsyncIterable[content_api.ProcessorPartTypes]:
+    match self._cleaning_mode:
+      case 'html':
+        yield content_api.ProcessorPart(
+            _clean_html(part.text).prettify().strip(),
+            mimetype=mime_types.TEXT_HTML,
+        )
+      case 'plain':
+        yield _clean_html(part.text).get_text().strip()
+      case _:
+        raise ValueError(f'Unsupported cleaning mode: {self._cleaning_mode}')
+
+
 @processor.source()
 async def terminal_input(
     prompt: str = '',
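
To make the new API concrete, here is a minimal usage sketch of HtmlCleaner. It relies only on the `processor.apply_async` helper used by the tests below; the inline HTML and the `main` wrapper are illustrative:

import asyncio

from genai_processors import content_api
from genai_processors import processor
from genai_processors.core import text


async def main():
  # 'plain' mode strips scripts, styles and navigation chrome and returns
  # the remaining page text without markup.
  cleaner = text.HtmlCleaner(cleaning_mode='plain')
  page = content_api.ProcessorPart(
      '<html><body><nav>menu</nav><p>Hello</p></body></html>',
      mimetype='text/html',
  )
  parts = await processor.apply_async(cleaner, [page])
  print(parts[0].text)  # -> Hello


asyncio.run(main())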

genai_processors/core/web.py

Lines changed: 68 additions & 7 deletions
@@ -14,30 +14,91 @@
 # ==============================================================================
 """Utilities for fetching documents mentioned in the part stream.

-NOTE: THIS MODULE IS UNDER DEVELOPMENT AND IS NOT COMPLETE YET.
-
 Referencing URLs of documents (web pages, images, PDFs) in a prompt is a
 convenient way to provide rich context for a model. While registering `http.get`
 as a tool is a more flexible and robust approach, it requires an extra model
-call and a round trip for each document loaded. This model offers a more
+call and a round trip for each document loaded. This approach offers a more
 hardwired but faster alternative.

 We split the responsibility for fetching documents and deciding what needs
 fetching. A special `genai_processors.core.text.FetchRequest` part must be used
 to explicitly reference the document to be fetched. Then `UrlFetch` processor
 would replace such FetchRequest Parts with the actual content.

-It is very convenient to just mention URL as text in the prompt. However it
+While it is very convenient to just mention a URL as text in the prompt, it
 becomes easy to trigger the fetch unintentionally and can even be dangerous. So
 it should be applied closer to the UI where user journeys are more well defined.
-For example parsing URLs directly pasted in-to a chat interface is probably
-fine. For extra safety you may want to require the URL be on its own line.
+For example, parsing URLs directly pasted into a chat interface is probably
+fine, but recursively following URLs from uncontrolled sources is dangerous. For
+extra safety you may want to require the URL to be on its own line.
 `genai_processors.core.text.UrlExtractor` is a processor for the task.

 This process can be refined further: e.g. one can use a fast model
 (gemini-flash-lite or gemma-nano) to decide whether the URL should be fetched
 before passing the prompt to a larger LLM. This way we can reduce latency by
 making decisions fast and fetching multiple documents in parallel.
+
+You can also consider using the alternative implementation at
+https://github.com/mbeacom/genai-processors-url-fetch/ which has additional
+security features and supports markdown.
 """
+from collections.abc import AsyncIterable
+
+from genai_processors import content_api
+from genai_processors import mime_types
+from genai_processors import processor
+from genai_processors.core import text
+import httpx
+
+
+_HEADER_PART = 'Fetch result for URL: '
+
+
+class UrlFetch(processor.PartProcessor):
+  """A processor that fetches documents by URLs.
+
+  This processor replaces genai_processors.core.text.FetchRequest parts with
+  the referenced content. It ignores anything that is not a FetchRequest.
+
+  It is recommended to chain genai_processors.core.text.HtmlCleaner after
+  UrlFetch to simplify the HTML pages before sending them to a model.
+  """
+
+  def __init__(
+      self,
+      timeout_seconds: int = 10,
+  ):
+    """Initializes the UrlFetch processor.
+
+    Args:
+      timeout_seconds: The timeout in seconds for the HTTP request. Defaults
+        to 10 seconds.
+    """
+    self._client = httpx.AsyncClient(
+        follow_redirects=True,
+        timeout=timeout_seconds,
+    )
+
+  def match(self, part: content_api.ProcessorPart) -> bool:
+    return mime_types.is_dataclass(part.mimetype, text.FetchRequest)
+
+  @processor.yield_exceptions_as_parts
+  async def call(
+      self, part: content_api.ProcessorPart
+  ) -> AsyncIterable[content_api.ProcessorPartTypes]:
+    fetch_request = part.get_dataclass(text.FetchRequest)
+    url = fetch_request.url
+    yield _HEADER_PART + url + '\n'
+    # client.stream() streams the response body instead of buffering it all;
+    # the timeout configured on the client applies to the request.
+    async with self._client.stream('GET', url) as response:
+      # Raise an exception for bad status codes (4xx or 5xx).
+      response.raise_for_status()

-# A UrlFetch processor will be added to this module later.
+      # Accumulate decoded chunks; aiter_text() decodes incrementally, so
+      # multi-byte characters split across chunk boundaries are handled.
+      html_content = []
+      async for chunk in response.aiter_text():
+        html_content.append(chunk)
+      yield content_api.ProcessorPart(
+          ''.join(html_content), mimetype=mime_types.TEXT_HTML
+      )
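
As a sketch of the end-to-end flow the module docstring describes (extract URLs, fetch them, clean the HTML), assuming the library's usual `+` chaining of processors and a no-argument `UrlExtractor()` constructor, neither of which is shown in this diff:

import asyncio

from genai_processors import processor
from genai_processors.core import text
from genai_processors.core import web


async def main():
  # UrlExtractor turns URLs in the prompt into FetchRequest parts,
  # UrlFetch replaces each FetchRequest with the fetched page, and
  # HtmlCleaner reduces the fetched HTML to plain text for a model.
  pipeline = text.UrlExtractor() + web.UrlFetch() + text.HtmlCleaner()
  parts = await processor.apply_async(
      pipeline, ['Summarize https://example.com\n']
  )
  for part in parts:
    print(part.text)


asyncio.run(main())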

genai_processors/mime_types.py

Lines changed: 5 additions & 0 deletions
@@ -181,6 +181,11 @@ def is_wav(mime: str) -> bool:
   return mime.lower() == AUDIO_WAV


+def is_html(mime: str) -> bool:
+  """Returns whether the content is HTML."""
+  return mime.lower().startswith(TEXT_HTML)
+
+
 def is_source_code(mime: str) -> bool:
   """Returns whether the content is a source code in some language."""
   # This list is incomplete and will be extended on an as-needed basis.
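
A quick sketch of the new predicate's behavior: it lowercases and prefix-matches, so parameterized MIME strings match too (assuming `TEXT_HTML` is the plain 'text/html' constant, consistent with its use in the tests below):

from genai_processors import mime_types

assert mime_types.is_html('text/html')
assert mime_types.is_html('TEXT/HTML; charset=utf-8')  # case-insensitive prefix match
assert not mime_types.is_html('text/plain')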

genai_processors/tests/text_test.py

Lines changed: 64 additions & 0 deletions
@@ -1,6 +1,7 @@
 import asyncio
 import dataclasses
 import re
+import textwrap
 from typing import Sequence
 import unittest

@@ -656,5 +657,68 @@ async def one_hello():
       break


+class HtmlCleanerTest(parameterized.TestCase, unittest.IsolatedAsyncioTestCase):
+  HTML_CONTENT = textwrap.dedent("""\
+      <html>
+      <head><title>Title</title></head>
+      <body>
+      <script>alert('foo')</script>
+      <style>.foo {color: red;}</style>
+      <nav>menu</nav>
+      <div class="content" other-attr="value"><h1>Title</h1>
+      <p>Some text with <a href="http://example.com" target="_blank">link</a></p>
+      <img src="image.png" alt="alt text" width="100"/>
+      </div>
+      <footer>footer</footer>
+      </body>
+      </html>""")
+
+  @parameterized.named_parameters(
+      dict(
+          testcase_name='html',
+          cleaning_mode='html',
+          expected_output=textwrap.dedent("""\
+              <html>
+               <body>
+                <div>
+                 <h1>
+                  Title
+                 </h1>
+                 <p>
+                  Some text with
+                  <a href="http://example.com">
+                   link
+                  </a>
+                 </p>
+                 <img alt="alt text"/>
+                </div>
+               </body>
+              </html>"""),
+      ),
+      dict(
+          testcase_name='text',
+          cleaning_mode='plain',
+          expected_output='Title\nSome text with link',
+      ),
+  )
+  async def test_html_cleaner(self, cleaning_mode, expected_output):
+    cleaner = text.HtmlCleaner(cleaning_mode=cleaning_mode)
+    input_part = content_api.ProcessorPart(
+        self.HTML_CONTENT, mimetype='text/html'
+    )
+    output = await processor.apply_async(cleaner, [input_part])
+    self.assertEqual(
+        output,
+        [
+            content_api.ProcessorPart(
+                expected_output,
+                mimetype='text/html'
+                if cleaning_mode == 'html'
+                else 'text/plain',
+            )
+        ],
+    )
+
+
 if __name__ == '__main__':
   absltest.main()
