Allow sorting tags in annotated text based on its contents or child tag contents

nicoell · nicoell · commit 75221e280c22 · 2025-03-20T22:09:44.000+01:00
diff --git a/changedetectionio/forms.py b/changedetectionio/forms.py
@@ -246,6 +246,53 @@ def process_formdata(self, valuelist):
             self.data = {}
 
 
+class StringSelectorPairListField(StringField):
+    """
+    A StringField that expects each non-empty line in its input to be:
+        {first_selector}{second_selector}
+    or just:
+        {first_selector}
+    and stores them as a list of (first_selector, second_selector) tuples in self.data.
+    If the second selector is omitted, it is set to an empty string.
+    """
+    widget = widgets.TextArea()
+
+    # Convert self.data (the list of tuples) back into textarea lines
+    def _value(self):
+        if self.data:
+            lines = []
+            for (first_selector, second_selector) in self.data:
+                if second_selector:
+                    line = f"{{{first_selector}}}{{{second_selector}}}"
+                else:
+                    line = f"{{{first_selector}}}"
+                lines.append(line)
+            return "\r\n".join(lines)
+        else:
+            return u''
+
+    # Parse the raw textarea input into a list of (first_selector, second_selector) tuples
+    def process_formdata(self, valuelist):
+        if valuelist:
+            self.data = []
+            # Split the textarea into lines
+            lines = valuelist[0].split("\n")
+
+            # Filter out empty or whitespace-only lines
+            cleaned = [line.strip() for line in lines if line.strip()]
+
+            for line in cleaned:
+                # Use regex to capture:
+                #   { first_selector } and optionally { second_selector }
+                match = re.match(r'^\{([^}]*)\}(?:\{([^}]*)\})?$', line)
+                if match:
+                    first_selector = match.group(1).strip()
+                    second_selector = match.group(2).strip() if match.group(2) is not None else ""
+                    self.data.append((first_selector, second_selector))
+        else:
+            self.data = []
+            
+
 class StringSelectorTagDictField(StringField):
     """
     A StringField that expects each non-empty line in its input to be:
@@ -644,6 +691,8 @@ class processor_text_json_diff_form(commonSettingsForm):
     extraction_method = RadioField('Extraction method', choices=[('TEXT', 'Extract text only'),('ANNOTATED_TEXT', 'Extract annotated text')], default='TEXT')
     annotation_rules = StringSelectorTagDictField('Annotation Rules', [validators.Optional()])
 
+    annotated_sort_selectors = StringSelectorPairListField('Sort Annotated text by matched tags', [validators.Optional()])
+
     filter_text_added = BooleanField('Added lines', default=True)
     filter_text_replaced = BooleanField('Replaced/changed lines', default=True)
     filter_text_removed = BooleanField('Removed lines', default=True)
diff --git a/changedetectionio/html_tools.py b/changedetectionio/html_tools.py
@@ -653,6 +653,56 @@ def html_to_annotated_text(
     return "".join(tagged_content)
 
 
+def sort_annotated_text_by_selectors(annotated_xml: str, selector_pairs: List[Tuple[str, str]]) -> str:
+    from lxml.html import fromstring, tostring
+    from collections import defaultdict
+
+    def find_elements(root, selector):
+        if selector.startswith(("xpath:", "xpath1:", "//")):
+            xpath_selector = (selector
+                              .removeprefix("xpath:")
+                              .removeprefix("xpath1:"))
+            return root.xpath(xpath_selector)
+        else:
+            return root.cssselect(selector)
+
+    html_tree = fromstring(annotated_xml.strip())
+
+    for element_to_sort_selector, sort_identifier_selector in selector_pairs:
+        # Find all elements-to-sort
+        elements_to_sort = find_elements(html_tree, element_to_sort_selector)
+
+        # Group by parent
+        parent_map = defaultdict(list)
+        for el in elements_to_sort:
+            parent_map[el.getparent()].append(el)
+
+        # Sort each group's elements-to-sort by the text of the sort-element
+        for parent, elements_to_sort in parent_map.items():
+            def get_sort_key(element):
+                if sort_identifier_selector:
+                    # Use first sort-element matched by `sort_identifier_selector`
+                    sort_element_matches = find_elements(element, sort_identifier_selector)
+                    if sort_element_matches and sort_element_matches[0].text:
+                        return sort_element_matches[0].text.strip()
+                elif element.text:
+                    return element.text.strip()
+                return ""
+
+            sorted_elements = sorted(elements_to_sort, key=get_sort_key)
+
+            # Remove original elements
+            for element in elements_to_sort:
+                parent.remove(element)
+
+            # Reattach elements in sorted order
+            for element in sorted_elements:
+                parent.append(element)
+
+    # Finally, convert the modified DOM back to a string while maintaining the input indentation
+    return tostring(html_tree, pretty_print=False, method="xml").decode("utf-8")
+
+
 # Does LD+JSON exist with a @type=='product' and a .price set anywhere?
 def has_ldjson_product_info(content):
     try:
diff --git a/changedetectionio/model/__init__.py b/changedetectionio/model/__init__.py
@@ -13,6 +13,7 @@ def __init__(self, *arg, **kw):
             # Requires setting to None on submit if it's the same as the default
             # Should be all None by default, so we use the system default in this case.
             'annotation_rules': {},
+            'annotated_sort_selectors': [],
             'body': None,
             'browser_steps': [],
             'browser_steps_last_error_step': None,
diff --git a/changedetectionio/processors/text_json_diff/processor.py b/changedetectionio/processors/text_json_diff/processor.py
@@ -198,6 +198,10 @@ def run_changedetection(self, watch):
                     stripped_text_from_html = html_tools.html_to_annotated_text(html_content=html_content,
                                                                                 annotation_rules=annotation_rules)
 
+                    watch_annotated_sort_selectors = watch.get('annotated_sort_selectors', [])
+                    if watch_annotated_sort_selectors:
+                        stripped_text_from_html = html_tools.sort_annotated_text_by_selectors(stripped_text_from_html, watch_annotated_sort_selectors)
+
                 else:
                     # extract text
                     do_anchor = self.datastore.data["settings"]["application"].get("render_anchor_tag_content", False)
diff --git a/changedetectionio/templates/edit.html b/changedetectionio/templates/edit.html
@@ -432,6 +432,25 @@ <h3>Text filtering</h3>
                     {{ render_checkbox_field(form.trim_text_whitespace) }}
                     <span class="pure-form-message-inline">Remove any whitespace before and after each line of text</span>
                 </fieldset>
+                <fieldset class="pure-control-group" data-visible-for="extraction_method=ANNOTATED_TEXT" >
+                    <div class="pure-control-group">
+                    {{ render_field(form.annotated_sort_selectors, rows=4, placeholder='{<Element-to-Sort Selector>}({<Sort-Identifier Selector>})
+{item}{item-price}
+{title}
+{submission}{time}') }}
+                    <div class="pure-form-message">Sort tags in annotated text based on their contents or child tag contents.</div>
+                    <span data-target="#advanced-help-annotated-sort" class="toggle-show pure-button button-tag button-xsmall">Show advanced help and tips</span><br>
+                    <div id="advanced-help-annotated-sort" class="pure-form-message-inline" style="display: none;">
+                         <ul>
+                             <li> Syntax per Line: <code>{&lt;Element-to-Sort Selector&gt;}({&lt;Sort-Identifier Selector&gt;})</code></li>
+                             <li> <strong>Element-to-Sort</strong> CSS or XPath Selector matching the annotated tag <strong>to be sorted</strong>. </li>
+                             <li> Optional: <strong>Sort-Identifier</strong> CSS or XPath Selector relative to Element-to-Sort matching a child annotated tag <strong>containing the text to base the sorting on.</strong> </li>
+                             <li> Elements are sorted in ascending order. </li>
+                             <li> XPath: Begin selector with forward-slashes or explicitly prefix with <code>xpath:</code>. </li>
+                         </ul>
+                     </div>
+                    </div>
+                </fieldset>
                 <fieldset>
                     <div class="pure-control-group">
                         {{ render_field(form.trigger_text, rows=5, placeholder="Some text to wait for in a line
diff --git a/changedetectionio/tests/test_sort_annotated_text.py b/changedetectionio/tests/test_sort_annotated_text.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python3
+"""Test suite for the method to sort annotated text with CSS or XPath selector pairs"""
+from ..html_tools import html_to_annotated_text, sort_annotated_text_by_selectors
+
+def test_sort_annotated_text():
+    # Minimal HTML: two 'outer' divs, each containing 'inner' spans
+    # that have a 'name' child. The ordering is B,A and D,C so we
+    # expect it to become A,B and C,D after sorting by name text.
+    test_html = """
+        <html>
+          <body>
+            <div class="outer">
+                <span class="inner">Y-Item <span class="name">B</span></span>
+                <span class="inner">Z-Item <span class="name">A</span></span>
+            </div>
+            <div class="outer">
+                <span class="inner">W-Item <span class="name">D</span></span>
+                <span class="inner">X-Item <span class="name">C</span></span>
+            </div>
+          </body>
+        </html>
+        """
+
+    # Annotation rules: outer, inner, name
+    test_annotation_rules = \
+    {
+        "div[class*='outer']": ["outer"],
+        "span[class*='inner']": ["inner"],
+        "span[class*='name']": ["name"]
+    }
+
+    # Convert HTML to annotated text
+    annotated_xml = html_to_annotated_text(
+        test_html,
+        test_annotation_rules
+    )
+
+    # We'll test the same sorting logic with three different selector approaches:
+    # 1) CSS
+    # 2) XPath (note the second part is .// to stay within context)
+    # 3) xpath1
+    selector_groups = [
+        [("outer", ""), ("outer > inner", "name")],  # CSS direct child
+        [("//outer", ""), ("//inner", "xpath:.//name")],  # XPath
+        [("xpath1://outer", ""), ("xpath1://outer/inner", "xpath1:.//name")]  # xpath1
+    ]
+
+    # The expected order after sorting each 'outer' group by its 'name' text:
+    # First <div.outer>: (A, B) instead of (B, A)
+    # Second <div.outer>: (C, D) instead of (D, C)
+    expected_annotated_xml = (
+        '<text><outer><inner>Z-Item<name>A</name></inner>\n'
+        '<inner>Y-Item<name>B</name></inner></outer><outer><inner>X-Item<name>C</name></inner>\n'
+        '<inner>W-Item<name>D</name></inner></outer></text>'
+    )
+
+    # Check sorting with each selector approach:
+    for selectors in selector_groups:
+        sorted_annotated_xml = sort_annotated_text_by_selectors(annotated_xml, selectors)
+
+        assert sorted_annotated_xml == expected_annotated_xml, (
+            f"Sorting failed for selectors: {selectors}\n"
+            f"Got:\n{sorted_annotated_xml}\n"
+            f"Expected:\n{expected_annotated_xml}"
+        )
diff --git a/requirements.txt b/requirements.txt
@@ -13,6 +13,7 @@ flask_cors # For the Chrome extension to operate
 flask_wtf~=1.2
 flask~=2.3
 inscriptis~=2.2
+cssselect~=1.3.0
 pytz
 timeago~=1.0
 validators~=0.21