Skip to content

Commit 8d6845a

Browse files
committed
fuzzy match: add support for specifying regex and args separately for
fuzzy_lookup match
1 parent ffb702c commit 8d6845a

File tree

3 files changed

+62
-22
lines changed

3 files changed

+62
-22
lines changed

pywb/cdx/cdxdomainspecific.py

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,6 @@
1313

1414
#=================================================================
1515
def load_domain_specific_cdx_rules(ds_rules_file, surt_ordered):
16-
"""
17-
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
18-
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
19-
'example,example,test)/path/index.html?id=value'
20-
"""
2116
canon = None
2217
fuzzy = None
2318

@@ -162,24 +157,24 @@ def unsurt(self):
162157

163158
@staticmethod
164159
def make_regex(config):
160+
# just query args
165161
if isinstance(config, list):
166162
string = CDXDomainSpecificRule.make_query_match_regex(config)
167-
# assumes string
163+
164+
# split out base and args
165+
elif isinstance(config, dict):
166+
string = config.get('regex', '')
167+
string += CDXDomainSpecificRule.make_query_match_regex(
168+
config.get('args', []))
169+
170+
# else assume string
168171
else:
169-
string = config
172+
string = str(config)
170173

171174
return re.compile(string)
172175

173176
@staticmethod
174177
def make_query_match_regex(params_list):
175-
r"""
176-
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
177-
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
178-
179-
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
180-
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
181-
182-
"""
183178
params_list.sort()
184179

185180
def conv(value):
@@ -188,8 +183,3 @@ def conv(value):
188183
params_list = map(conv, params_list)
189184
final_str = '.*'.join(params_list)
190185
return final_str
191-
192-
193-
if __name__ == "__main__":
194-
import doctest
195-
doctest.testmod()
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
r"""
2+
Load Rules
3+
4+
>>> (canon, fuzzy) = load_domain_specific_cdx_rules(None, True)
5+
>>> canon('http://test.example.example/path/index.html?a=b&id=value&c=d')
6+
'example,example,test)/path/index.html?id=value'
7+
8+
9+
# Fuzzy Query Args Builder
10+
>>> CDXDomainSpecificRule.make_query_match_regex(['para', 'id', 'abc'])
11+
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
12+
13+
>>> CDXDomainSpecificRule.make_query_match_regex(['id[0]', 'abc()'])
14+
'[?&](abc\\(\\)=[^&]+).*[?&](id\\[0\\]=[^&]+)'
15+
16+
17+
# Fuzzy Match Query + Args
18+
19+
# list
20+
>>> CDXDomainSpecificRule.make_regex(['para', 'id', 'abc']).pattern
21+
'[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
22+
23+
# dict
24+
>>> CDXDomainSpecificRule.make_regex(dict(regex='com,test,.*\)/', args=['para', 'id', 'abc'])).pattern
25+
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
26+
27+
# string
28+
>>> CDXDomainSpecificRule.make_regex('com,test,.*\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)').pattern
29+
'com,test,.*\\)/[?&](abc=[^&]+).*[?&](id=[^&]+).*[?&](para=[^&]+)'
30+
31+
"""
32+
33+
34+
from pywb.cdx.cdxdomainspecific import CDXDomainSpecificRule
35+
from pywb.cdx.cdxdomainspecific import load_domain_specific_cdx_rules
36+
37+
38+
if __name__ == "__main__":
39+
import doctest
40+
doctest.testmod()

pywb/rules.yaml

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,12 +148,22 @@ rules:
148148

149149
- url_prefix: 'com,youtube,c'
150150

151-
fuzzy_lookup: 'com,youtube,c.*/videogoodput.*([?&]id=[^&]+)'
151+
fuzzy_lookup:
152+
match:
153+
regex: 'com,youtube,c.*/videogoodput.*'
154+
args:
155+
- id
152156

153157
- url_prefix: 'com,googlevideo,'
154158

155159
fuzzy_lookup:
156-
match: 'com,googlevideo.*/videoplayback.*([?&]id=[^&]+).*([?&]itag=[^&]+).*([?&]mime=[^&]+)'
160+
match:
161+
regex: 'com,googlevideo.*/videoplayback.*'
162+
args:
163+
- id
164+
- itag
165+
- mime
166+
157167
filter:
158168
- '~urlkey:{0}'
159169
- '!mimetype:text/plain'

0 commit comments

Comments
 (0)