feat: regex substitution on surt rules match (#780)

mijho · web-flow · commit 59d9beac0571 · 2023-01-31T18:48:19.000-08:00
Substitution functionality already exists at a global level for matched
rules, but this causes issues when rule sets conflict in their desired
outcome. This change enables setting regex substitution at the rule
level to avoid these conflicts.
diff --git a/pywb/rules.yaml b/pywb/rules.yaml
@@ -110,7 +110,7 @@ rules:
 
       fuzzy_lookup:
         match: '("(?:cursor|cursorindex)":["\d\w]+)'
-        find_all: true
+        re_type: findall
 
     - url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
       fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
@@ -175,7 +175,7 @@ rules:
 
       fuzzy_lookup:
         match: '("q[\d]+":|after:\\"[^"]+)'
-        find_all: true
+        re_type: findall
 
     - url_prefix: 'com,facebook)/pages_reaction_units/more'
 
@@ -538,6 +538,12 @@ rules:
       rewrite:
         js_rewrite_location: urls
 
+    - url_prefix: 'com,example)/matched'
+      fuzzy_lookup:
+        re_type: sub
+        match: 'matched'
+        replace: 'replaced'          
+
     # all domain rules -- fallback to this dataset
     #=================================================================
     # Applies to all urls -- should be last
diff --git a/pywb/warcserver/index/fuzzymatcher.py b/pywb/warcserver/index/fuzzymatcher.py
@@ -15,14 +15,15 @@
 # ============================================================================
 FuzzyRule = namedtuple('FuzzyRule',
                        'url_prefix, regex, replace_after, filter_str, ' +
-                       'match_type, find_all')
+                       'match_type, re_type')
 
 
 # ============================================================================
 class FuzzyMatcher(object):
     DEFAULT_FILTER = ['urlkey:{0}']
     DEFAULT_MATCH_TYPE = 'prefix'
     DEFAULT_REPLACE_AFTER = '?'
+    DEFAULT_RE_TYPE = 'search'
 
     FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
                          'url', 'matchType', 'filter')
@@ -58,16 +59,16 @@ def parse_fuzzy_rule(self, rule):
             replace_after = self.DEFAULT_REPLACE_AFTER
             filter_str = self.DEFAULT_FILTER
             match_type = self.DEFAULT_MATCH_TYPE
-            find_all = False
+            re_type = self.DEFAULT_RE_TYPE
 
         else:
             regex = self.make_regex(config.get('match'))
             replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
             filter_str = config.get('filter', self.DEFAULT_FILTER)
             match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
-            find_all = config.get('find_all', False)
+            re_type = config.get('re_type', self.DEFAULT_RE_TYPE)
 
-        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
+        return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type)
 
     def get_fuzzy_match(self, urlkey, url, params):
         filters = set()
@@ -78,9 +79,12 @@ def get_fuzzy_match(self, urlkey, url, params):
                 continue
 
             groups = None
-            if rule.find_all:
+            if rule.re_type == 'findall':
                 groups = rule.regex.findall(urlkey)
-            else:
+            if rule.re_type == 'sub':
+                matched_rule = rule
+                break
+            elif rule.re_type == 'search':
                 m = rule.regex.search(urlkey)
                 groups = m and m.groups()
 
@@ -102,7 +106,7 @@ def get_fuzzy_match(self, urlkey, url, params):
         no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
 
         inx = url.find(matched_rule.replace_after)
-        if inx > 0:
+        if inx > 0 and matched_rule.re_type != 'sub':
             length = inx + len(matched_rule.replace_after)
             # don't include trailing '?' for default filter
             if no_filters:
@@ -111,13 +115,17 @@ def get_fuzzy_match(self, urlkey, url, params):
                 if url[length - 1] == '/':
                     length -= 1
             url = url[:length]
-        elif not no_filters:
+        elif not no_filters and matched_rule.re_type != 'sub':
             url += matched_rule.replace_after[0]
 
         if matched_rule.match_type == 'domain':
             host = urlsplit(url).netloc
             url = host.split('.', 1)[1]
 
+        if matched_rule.re_type == 'sub':
+            filters = {'urlkey:'}
+            url = re.sub(rule.regex, rule.replace_after, url)            
+
         fuzzy_params = {'url': url,
                         'matchType': matched_rule.match_type,
                         'filter': filters,
diff --git a/pywb/warcserver/index/test/test_fuzzymatcher.py b/pywb/warcserver/index/test/test_fuzzymatcher.py
@@ -234,3 +234,10 @@ def test_fuzzy_no_deep_path_mime_match(self):
         params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
         cdx_iter, errs = self.fuzzy(self.source, params)
         assert list(cdx_iter) == []
+
+    def test_fuzzy_sub_replacement(self):
+        url = 'https://example.com/matched'
+        actual_url = 'https://example.com/replaced'
+        params = self.get_params(url, actual_url)
+        cdx_iter, errs = self.fuzzy(self.source, params)
+        assert list(cdx_iter) == self.get_expected(actual_url)