Skip to content

Commit 75221e2

Browse files
committed
Allow sorting tags in annotated text based on its contents or child tag contents
1 parent a06a482 commit 75221e2

File tree

7 files changed

+189
-0
lines changed

7 files changed

+189
-0
lines changed

changedetectionio/forms.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -246,6 +246,53 @@ def process_formdata(self, valuelist):
246246
self.data = {}
247247

248248

249+
class StringSelectorPairListField(StringField):
    """
    A StringField that expects each non-empty line in its input to be:
        {first_selector}{second_selector}
    or just:
        {first_selector}
    and stores them as a list of (first_selector, second_selector) tuples in self.data.
    If the second selector is omitted, it is set to an empty string.

    Lines that do not follow this syntax, or whose first selector is empty
    (e.g. ``{}``), are ignored: an empty selector cannot match anything and
    would only fail later when it is evaluated.
    """
    widget = widgets.TextArea()

    # `{first}` optionally followed by `{second}`; a selector cannot contain `}`.
    _LINE_PATTERN = re.compile(r'^\{([^}]*)\}(?:\{([^}]*)\})?$')

    # Convert self.data (the list of tuples) back into textarea lines
    def _value(self):
        """Render self.data as one `{first}{second}` (or `{first}`) line per pair."""
        if not self.data:
            return u''

        lines = []
        for first_selector, second_selector in self.data:
            if second_selector:
                lines.append(f"{{{first_selector}}}{{{second_selector}}}")
            else:
                lines.append(f"{{{first_selector}}}")
        return "\r\n".join(lines)

    # Parse the raw textarea input into a list of (first_selector, second_selector) tuples
    def process_formdata(self, valuelist):
        """Parse the submitted textarea value into self.data.

        :param valuelist: raw submitted values; only the first entry is read.
        """
        self.data = []
        if not valuelist:
            return

        for raw_line in valuelist[0].split("\n"):
            # strip() also drops the '\r' left over from '\r\n' line endings
            line = raw_line.strip()
            if not line:
                continue

            match = self._LINE_PATTERN.match(line)
            if not match:
                continue

            first_selector = match.group(1).strip()
            if not first_selector:
                # `{}` is syntactically valid but has nothing to select with
                continue

            second_selector = (match.group(2) or "").strip()
            self.data.append((first_selector, second_selector))
294+
295+
249296
class StringSelectorTagDictField(StringField):
250297
"""
251298
A StringField that expects each non-empty line in its input to be:
@@ -644,6 +691,8 @@ class processor_text_json_diff_form(commonSettingsForm):
644691
extraction_method = RadioField('Extraction method', choices=[('TEXT', 'Extract text only'),('ANNOTATED_TEXT', 'Extract annotated text')], default='TEXT')
645692
annotation_rules = StringSelectorTagDictField('Annotation Rules', [validators.Optional()])
646693

694+
annotated_sort_selectors = StringSelectorPairListField('Sort Annotated text by matched tags', [validators.Optional()])
695+
647696
filter_text_added = BooleanField('Added lines', default=True)
648697
filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
649698
filter_text_removed = BooleanField('Removed lines', default=True)

changedetectionio/html_tools.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,56 @@ def html_to_annotated_text(
653653
return "".join(tagged_content)
654654

655655

656+
def sort_annotated_text_by_selectors(annotated_xml: str, selector_pairs: List[Tuple[str, str]]) -> str:
    """Reorder sibling tags of an annotated-text document.

    For every ``(element_selector, sort_identifier_selector)`` pair, the
    elements matched by ``element_selector`` are grouped by parent and each
    group is sorted in ascending order by a text key, then re-attached at the
    end of that parent.

    The key is the stripped text of the first node matched by
    ``sort_identifier_selector`` (evaluated relative to the element), or the
    element's own leading text when that selector is empty. Elements without
    a usable key sort as the empty string.

    Selectors starting with ``xpath:``, ``xpath1:`` or ``//`` are evaluated as
    XPath; everything else is treated as a CSS selector.

    :param annotated_xml: the annotated markup to reorder.
    :param selector_pairs: pairs applied in the order given.
    :return: the serialized, reordered markup; empty or whitespace-only input
        is returned unchanged.
    """
    from collections import defaultdict
    from lxml.html import fromstring, tostring

    def find_elements(root, selector):
        if selector.startswith(("xpath:", "xpath1:", "//")):
            xpath_selector = selector.removeprefix("xpath:").removeprefix("xpath1:")
            return root.xpath(xpath_selector)
        return root.cssselect(selector)

    def get_sort_key(element, sort_identifier_selector):
        if sort_identifier_selector:
            # Use the first node matched by `sort_identifier_selector`
            matches = find_elements(element, sort_identifier_selector)
            if matches:
                first = matches[0]
                # XPath expressions such as `.//x/text()` yield strings, not elements
                text = first if isinstance(first, str) else first.text
                if text:
                    return text.strip()
        elif element.text:
            return element.text.strip()
        return ""

    stripped_xml = annotated_xml.strip()
    if not stripped_xml:
        # Nothing to parse; the parser rejects empty documents
        return annotated_xml

    html_tree = fromstring(stripped_xml)

    for element_to_sort_selector, sort_identifier_selector in selector_pairs:
        if not element_to_sort_selector:
            continue

        # Group the matched elements by parent so only siblings are reordered
        siblings_by_parent = defaultdict(list)
        for el in find_elements(html_tree, element_to_sort_selector):
            parent = el.getparent()
            if parent is None:
                # The root element has no siblings to be sorted against
                continue
            siblings_by_parent[parent].append(el)

        for parent, siblings in siblings_by_parent.items():
            ordered = sorted(
                siblings,
                key=lambda el, sel=sort_identifier_selector: get_sort_key(el, sel),
            )

            # Remove original elements
            for element in siblings:
                parent.remove(element)

            # Reattach elements in sorted order
            for element in ordered:
                parent.append(element)

    # Finally, convert the modified DOM back to a string without pretty-printing,
    # so the input's own whitespace is kept as-is
    return tostring(html_tree, pretty_print=False, method="xml").decode("utf-8")
704+
705+
656706
# Does LD+JSON exist with a @type=='product' and a .price set anywhere?
657707
def has_ldjson_product_info(content):
658708
try:

changedetectionio/model/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def __init__(self, *arg, **kw):
1313
# Requires setting to None on submit if it's the same as the default
1414
# Should be all None by default, so we use the system default in this case.
1515
'annotation_rules': {},
16+
'annotated_sort_selectors': [],
1617
'body': None,
1718
'browser_steps': [],
1819
'browser_steps_last_error_step': None,

changedetectionio/processors/text_json_diff/processor.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ def run_changedetection(self, watch):
198198
stripped_text_from_html = html_tools.html_to_annotated_text(html_content=html_content,
199199
annotation_rules=annotation_rules)
200200

201+
watch_annotated_sort_selectors = watch.get('annotated_sort_selectors', [])
202+
if watch_annotated_sort_selectors:
203+
stripped_text_from_html = html_tools.sort_annotated_text_by_selectors(stripped_text_from_html, watch_annotated_sort_selectors)
204+
201205
else:
202206
# extract text
203207
do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)

changedetectionio/templates/edit.html

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,25 @@ <h3>Text filtering</h3>
432432
{{ render_checkbox_field(form.trim_text_whitespace) }}
433433
<span class="pure-form-message-inline">Remove any whitespace before and after each line of text</span>
434434
</fieldset>
435+
<fieldset class="pure-control-group" data-visible-for="extraction_method=ANNOTATED_TEXT" >
436+
<div class="pure-control-group">
437+
{{ render_field(form.annotated_sort_selectors, rows=4, placeholder='{<Element-to-Sort Selector>}({<Sort-Identifier Selector>})
438+
{item}{item-price}
439+
{title}
440+
{submission}{time}') }}
441+
<div class="pure-form-message">Sort tags in annotated text based on their own contents or the contents of a child tag.</div>
442+
<span data-target="#advanced-help-annotated-sort" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</span><br>
443+
<div id="advanced-help-annotated-sort" class="pure-form-message-inline" style="display: none;">
444+
<ul>
445+
<li> Syntax per Line: <code>{&lt;Element-to-Sort Selector&gt;}({&lt;Sort-Identifier Selector&gt;})</code></li>
446+
<li> <strong>Element-to-Sort</strong> CSS or XPath Selector matching the annotated tag <strong>to be sorted</strong>. </li>
447+
<li> Optional: <strong>Sort-Identifier</strong> CSS or XPath Selector relative to Element-to-Sort matching a child annotated tag <strong>containing the text to base the sorting on.</strong> </li>
448+
<li> Elements are sorted in ascending order. </li>
449+
<li> XPath: Begin selector with forward-slashes or explicitly prefix with <code>xpath:</code>. </li>
450+
</ul>
451+
</div>
452+
</div>
453+
</fieldset>
435454
<fieldset>
436455
<div class="pure-control-group">
437456
{{ render_field(form.trigger_text, rows=5, placeholder="Some text to wait for in a line
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
#!/usr/bin/env python3
2+
"""Test suite for the method to sort annotated text with css selector pairs"""
3+
from ..html_tools import html_to_annotated_text, sort_annotated_text_by_selectors
4+
5+
def test_sort_annotated_text():
    """Sorting by selector pairs gives the same result for CSS, XPath and xpath1 selectors."""
    # Two 'outer' divs, each holding two 'inner' spans with a 'name' child.
    # The names appear as B,A and D,C; sorting by the name text should
    # produce A,B and C,D.
    test_html = """
    <html>
      <body>
        <div class="outer">
          <span class="inner">Y-Item <span class="name">B</span></span>
          <span class="inner">Z-Item <span class="name">A</span></span>
        </div>
        <div class="outer">
          <span class="inner">W-Item <span class="name">D</span></span>
          <span class="inner">X-Item <span class="name">C</span></span>
        </div>
      </body>
    </html>
    """

    # Map each of the three element kinds to an annotation tag of the same name
    test_annotation_rules = {
        "div[class*='outer']": ["outer"],
        "span[class*='inner']": ["inner"],
        "span[class*='name']": ["name"],
    }

    annotated_xml = html_to_annotated_text(test_html, test_annotation_rules)

    # The same sort expressed three ways; the XPath sort-identifiers use
    # `.//` so they are evaluated relative to the element being sorted.
    css_selectors = [("outer", ""), ("outer > inner", "name")]
    xpath_selectors = [("//outer", ""), ("//inner", "xpath:.//name")]
    xpath1_selectors = [("xpath1://outer", ""), ("xpath1://outer/inner", "xpath1:.//name")]

    # First outer: (A, B) instead of (B, A); second outer: (C, D) instead of (D, C)
    expected_annotated_xml = "".join([
        '<text><outer><inner>Z-Item<name>A</name></inner>\n',
        '<inner>Y-Item<name>B</name></inner></outer><outer><inner>X-Item<name>C</name></inner>\n',
        '<inner>W-Item<name>D</name></inner></outer></text>',
    ])

    for selectors in (css_selectors, xpath_selectors, xpath1_selectors):
        sorted_annotated_xml = sort_annotated_text_by_selectors(annotated_xml, selectors)
        failure_message = (
            f"Sorting failed for selectors: {selectors}\n"
            f"Got:\n{sorted_annotated_xml}\n"
            f"Expected:\n{expected_annotated_xml}"
        )
        assert sorted_annotated_xml == expected_annotated_xml, failure_message

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ flask_cors # For the Chrome extension to operate
1313
flask_wtf~=1.2
1414
flask~=2.3
1515
inscriptis~=2.2
16+
cssselect~=1.3.0
1617
pytz
1718
timeago~=1.0
1819
validators~=0.21

0 commit comments

Comments
 (0)