Skip to content

Commit 403167f

Browse files
authored
User-Agent Detection Fix + New-Style rewriting on by default + Dependency Update (2.6.6) (#708)
* js rewriting: use modern js-proxy based rewriting by default, use legacy rewriting only if browsers are older than minimum, as suggested in #707 * user-agent detection: use ua_parser for user-agent detection instead of obsolete werkzeug.useragents, which also did not support browsers >=100 * tests: additional tests for rewriting with various user-agents, defaulting to new-style rewriting for unknown browsers * dockerfile: Update Dockerfile to use py3.8 * tests: skip s3 tests dependent on commoncrawl data (for now, need better s3 tests). * bump to 2.6.6, update CHANGES
1 parent 63ac82e commit 403167f

File tree

8 files changed

+89
-44
lines changed

8 files changed

+89
-44
lines changed

CHANGES.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
pywb 2.6.6 changelist
2+
~~~~~~~~~~~~~~~~~~~~~
3+
4+
* dependency: don't use obsolete werkzeug useragent package `#704 <https://github.com/webrecorder/pywb/pull/704>`_
5+
* fix user-agent detection: use ua-parser module, default to new js-proxy mode, unless older browser detected `#707 <https://github.com/webrecorder/pywb/pull/707>`_
6+
* fix tests: disable broken s3 tests for now
7+
* Dockerfile: use python 3.8 by default
8+
19
pywb 2.6.5 changelist
210
~~~~~~~~~~~~~~~~~~~~~
311

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG PYTHON=python:3.7.2
1+
ARG PYTHON=python:3.8
22

33
FROM $PYTHON
44

pywb/rewrite/default_rewriter.py

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from pywb import DEFAULT_RULES_FILE
2121

2222
import copy
23-
from werkzeug.useragents import UserAgent
23+
from ua_parser import user_agent_parser
2424

2525

2626
# ============================================================================
@@ -34,7 +34,7 @@ class DefaultRewriter(BaseContentRewriter):
3434

3535
'css': CSSRewriter,
3636

37-
'js': JSLocationOnlyRewriter,
37+
'js': JSWombatProxyRewriter,
3838
'js-proxy': JSNoneRewriter,
3939
'js-worker': JSWorkerRewriter,
4040

@@ -119,33 +119,44 @@ def __init__(self, *args, **kwargs):
119119
super(RewriterWithJSProxy, self).__init__(*args, **kwargs)
120120

121121
def get_rewriter(self, rw_type, rwinfo=None):
122-
if rw_type == 'js' and rwinfo:
123-
# check if UA allows this
124-
if self.ua_allows_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
125-
return JSWombatProxyRewriter
126-
127-
# otherwise, return default rewriter
128-
return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)
129-
130-
def ua_allows_obj_proxy(self, opts):
122+
if rw_type != 'js' or not rwinfo:
123+
return super(RewriterWithJSProxy, self).get_rewriter(rw_type, rwinfo)
124+
125+
# check if should use old non-proxy rewriter
126+
if self.ua_no_obj_proxy(rwinfo.url_rewriter.rewrite_opts):
127+
print("loc only")
128+
return JSLocationOnlyRewriter
129+
else:
130+
# otherwise, return default, js proxy-capable rewriter
131+
return JSWombatProxyRewriter
132+
133+
def ua_no_obj_proxy(self, opts):
131134
ua = opts.get('ua')
132135
if not ua:
133136
ua_string = opts.get('ua_string')
134137
if ua_string:
135-
ua = UserAgent(ua_string)
138+
ua = user_agent_parser.ParseUserAgent(ua_string)
136139

137140
if ua is None:
138-
return True
141+
return False
139142

140143
supported = {
141-
'chrome': '49.0',
142-
'firefox': '44.0',
143-
'safari': '10.0',
144-
'opera': '36.0',
145-
'edge': '12.0',
146-
'msie': None,
144+
'chrome': 49,
145+
'firefox': 4,
146+
'safari': 10,
147+
'opera': 36,
148+
'edge': 12,
149+
'ie': 1000,
147150
}
148151

149-
min_vers = supported.get(ua.browser)
152+
min_vers = supported.get(ua.get("family", "").lower())
153+
if not min_vers:
154+
return False
155+
156+
try:
157+
ua_version = int(ua.get("major", 0))
158+
except:
159+
return False
160+
161+
return ua_version < min_vers
150162

151-
return (min_vers and ua.version >= min_vers)

pywb/rewrite/test/test_content_rewriter.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
from pywb.rewrite.wburl import WbUrl
1515
from pywb.rewrite.url_rewriter import UrlRewriter
16-
from pywb.rewrite.default_rewriter import DefaultRewriter, RewriterWithJSProxy
16+
from pywb.rewrite.default_rewriter import RewriterWithJSProxy
1717

1818
from pywb import get_test_dir
1919

@@ -39,8 +39,7 @@ def headers(request):
3939
class TestContentRewriter(object):
4040
@classmethod
4141
def setup_class(self):
42-
self.content_rewriter = DefaultRewriter()
43-
self.js_proxy_content_rewriter = RewriterWithJSProxy()
42+
self.content_rewriter = RewriterWithJSProxy()
4443

4544
def _create_response_record(self, url, headers, payload, warc_headers):
4645
writer = BufferWARCWriter()
@@ -65,7 +64,6 @@ def rewrite_record(self, headers, content, ts, url='http://example.com/',
6564
record = self._create_response_record(url, headers, content, warc_headers)
6665

6766
wburl = WbUrl(ts + '/' + (request_url or url))
68-
url_rewriter = UrlRewriter(wburl, prefix)
6967

7068
cdx = CDXObject()
7169
cdx['url'] = url
@@ -79,11 +77,13 @@ def insert_func(rule, cdx):
7977
return ''
8078

8179
if use_js_proxy:
82-
rewriter = self.js_proxy_content_rewriter
80+
rewrite_opts = {}
8381
else:
84-
rewriter = self.content_rewriter
82+
rewrite_opts = {'ua_string': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/10.0 Safari/537.36'}
8583

86-
return rewriter(record, url_rewriter, cookie_rewriter=None,
84+
url_rewriter = UrlRewriter(wburl, prefix, rewrite_opts=rewrite_opts)
85+
86+
return self.content_rewriter(record, url_rewriter, cookie_rewriter=None,
8787
head_insert_func=insert_func,
8888
cdx=cdx,
8989
environ=environ)

pywb/utils/test/test_loaders.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@
9797

9898
test_cdx_dir = get_test_dir() + 'cdx/'
9999

100-
100+
@pytest.mark.skip("skip for now, made need different s3 source")
101101
def test_s3_read_1():
102102
pytest.importorskip('boto3')
103103

@@ -112,13 +112,14 @@ def test_s3_read_1():
112112
assert reader.readline() == b'WARC/1.0\r\n'
113113
assert reader.readline() == b'WARC-Type: response\r\n'
114114

115+
@pytest.mark.skip("skip for now, made need different s3 source")
115116
def test_s3_read_2():
116117
pytest.importorskip('boto3')
117118

118119
res = BlockLoader().load('s3://commoncrawl/crawl-data/CC-MAIN-2015-11/index.html')
119120

120121
buff = res.read()
121-
assert len(buff) == 2082
122+
assert len(buff) == 2330
122123

123124
reader = DecompressingBufferedReader(BytesIO(buff))
124125
assert reader.readline() == b'<!DOCTYPE html>\n'

pywb/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '2.6.5'
1+
__version__ = '2.6.6'
22

33
if __name__ == '__main__':
44
print(__version__)

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ jinja2<3.0.0
66
surt>=0.3.1
77
brotlipy
88
pyyaml
9-
werkzeug==1.0.1
9+
werkzeug
1010
webencodings
1111
gevent==20.9.0
1212
webassets==0.12.1
@@ -16,3 +16,4 @@ fakeredis<1.0
1616
tldextract
1717
python-dateutil
1818
markupsafe<2.1.0
19+
ua_parser

tests/test_integration.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -269,17 +269,41 @@ def test_replay_js_obj_proxy(self, fmod):
269269
assert resp.content_length != 0
270270
assert resp.content_type == 'application/x-javascript'
271271

272-
# test with Chrome user agent
273-
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
274-
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'})
275-
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
276-
277-
def test_replay_js_ie11_no_obj_proxy(self, fmod):
278-
# IE11 user-agent, no proxy
279-
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
280-
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko'})
281-
282-
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
272+
user_agents = [
273+
# chrome
274+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
275+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.3071.115 Safari/537.36'
276+
# firefox
277+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/98.0'
278+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/100.0',
279+
# safari
280+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15'
281+
# other
282+
'some-custom-browser'
283+
]
284+
285+
# test with each user-agent
286+
for ua in user_agents:
287+
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
288+
headers={'User-Agent': ua})
289+
290+
assert 'let window = _____WB$wombat$assign$function_____(' in resp.text
291+
292+
def test_replay_js_no_obj_proxy(self, fmod):
293+
user_agents = [
294+
# IE11 user-agent, no proxy
295+
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"
296+
# old chrome
297+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/19.0.3071.115 Safari/537.36'
298+
# old firefox
299+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:98.0) Gecko/20100101 Firefox/12.0'
300+
]
301+
302+
for ua in user_agents:
303+
resp = self.get('/pywb/20140126200625{0}/http://www.iana.org/_js/2013.1/jquery.js', fmod,
304+
headers={'User-Agent': ua})
305+
306+
assert 'let window = _____WB$wombat$assign$function_____(' not in resp.text
283307

284308
def test_replay_non_exact(self, fmod):
285309
# non-exact mode, don't redirect to exact capture

0 commit comments

Comments
 (0)