Skip to content

Commit 59d9bea

Browse files
authored
feat: regex substitution on surt rules match (#780)
Substitution functionality already exists at a global level for matched rules, but this causes issues when rule sets conflict in their desired outcomes. This change enables setting regex substitution at the rule level to avoid these conflicts.
1 parent 0758e81 commit 59d9bea

File tree

3 files changed

+31
-10
lines changed

3 files changed

+31
-10
lines changed

pywb/rules.yaml

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ rules:
110110

111111
fuzzy_lookup:
112112
match: '("(?:cursor|cursorindex)":["\d\w]+)'
113-
find_all: true
113+
re_type: findall
114114

115115
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/profiletimeline'
116116
fuzzy_lookup: 'com,facebook\)/.*[?&](__adt=[^&]+).*[&]data=(?:.*?(?:[&]|(profile_id|pagelet_token)[^,]+))'
@@ -175,7 +175,7 @@ rules:
175175

176176
fuzzy_lookup:
177177
match: '("q[\d]+":|after:\\"[^"]+)'
178-
find_all: true
178+
re_type: findall
179179

180180
- url_prefix: 'com,facebook)/pages_reaction_units/more'
181181

@@ -538,6 +538,12 @@ rules:
538538
rewrite:
539539
js_rewrite_location: urls
540540

541+
- url_prefix: 'com,example)/matched'
542+
fuzzy_lookup:
543+
re_type: sub
544+
match: 'matched'
545+
replace: 'replaced'
546+
541547
# all domain rules -- fallback to this dataset
542548
#=================================================================
543549
# Applies to all urls -- should be last

pywb/warcserver/index/fuzzymatcher.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,15 @@
1515
# ============================================================================
1616
FuzzyRule = namedtuple('FuzzyRule',
1717
'url_prefix, regex, replace_after, filter_str, ' +
18-
'match_type, find_all')
18+
'match_type, re_type')
1919

2020

2121
# ============================================================================
2222
class FuzzyMatcher(object):
2323
DEFAULT_FILTER = ['urlkey:{0}']
2424
DEFAULT_MATCH_TYPE = 'prefix'
2525
DEFAULT_REPLACE_AFTER = '?'
26+
DEFAULT_RE_TYPE = 'search'
2627

2728
FUZZY_SKIP_PARAMS = ('alt_url', 'reverse', 'closest', 'end_key',
2829
'url', 'matchType', 'filter')
@@ -58,16 +59,16 @@ def parse_fuzzy_rule(self, rule):
5859
replace_after = self.DEFAULT_REPLACE_AFTER
5960
filter_str = self.DEFAULT_FILTER
6061
match_type = self.DEFAULT_MATCH_TYPE
61-
find_all = False
62+
re_type = self.DEFAULT_RE_TYPE
6263

6364
else:
6465
regex = self.make_regex(config.get('match'))
6566
replace_after = config.get('replace', self.DEFAULT_REPLACE_AFTER)
6667
filter_str = config.get('filter', self.DEFAULT_FILTER)
6768
match_type = config.get('type', self.DEFAULT_MATCH_TYPE)
68-
find_all = config.get('find_all', False)
69+
re_type = config.get('re_type', self.DEFAULT_RE_TYPE)
6970

70-
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, find_all)
71+
return FuzzyRule(url_prefix, regex, replace_after, filter_str, match_type, re_type)
7172

7273
def get_fuzzy_match(self, urlkey, url, params):
7374
filters = set()
@@ -78,9 +79,12 @@ def get_fuzzy_match(self, urlkey, url, params):
7879
continue
7980

8081
groups = None
81-
if rule.find_all:
82+
if rule.re_type == 'findall':
8283
groups = rule.regex.findall(urlkey)
83-
else:
84+
if rule.re_type == 'sub':
85+
matched_rule = rule
86+
break
87+
elif rule.re_type == 'search':
8488
m = rule.regex.search(urlkey)
8589
groups = m and m.groups()
8690

@@ -102,7 +106,7 @@ def get_fuzzy_match(self, urlkey, url, params):
102106
no_filters = (not filters or filters == {'urlkey:'}) and (matched_rule.replace_after == '?')
103107

104108
inx = url.find(matched_rule.replace_after)
105-
if inx > 0:
109+
if inx > 0 and matched_rule.re_type != 'sub':
106110
length = inx + len(matched_rule.replace_after)
107111
# don't include trailing '?' for default filter
108112
if no_filters:
@@ -111,13 +115,17 @@ def get_fuzzy_match(self, urlkey, url, params):
111115
if url[length - 1] == '/':
112116
length -= 1
113117
url = url[:length]
114-
elif not no_filters:
118+
elif not no_filters and matched_rule.re_type != 'sub':
115119
url += matched_rule.replace_after[0]
116120

117121
if matched_rule.match_type == 'domain':
118122
host = urlsplit(url).netloc
119123
url = host.split('.', 1)[1]
120124

125+
if matched_rule.re_type == 'sub':
126+
filters = {'urlkey:'}
127+
url = re.sub(rule.regex, rule.replace_after, url)
128+
121129
fuzzy_params = {'url': url,
122130
'matchType': matched_rule.match_type,
123131
'filter': filters,

pywb/warcserver/index/test/test_fuzzymatcher.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -234,3 +234,10 @@ def test_fuzzy_no_deep_path_mime_match(self):
234234
params = self.get_params(url, actual_url, mime='application/x-shockwave-flash')
235235
cdx_iter, errs = self.fuzzy(self.source, params)
236236
assert list(cdx_iter) == []
237+
238+
def test_fuzzy_sub_replacement(self):
239+
url = 'https://example.com/matched'
240+
actual_url = 'https://example.com/replaced'
241+
params = self.get_params(url, actual_url)
242+
cdx_iter, errs = self.fuzzy(self.source, params)
243+
assert list(cdx_iter) == self.get_expected(actual_url)

0 commit comments

Comments
 (0)