Skip to content

Commit 238a45b

Browse files
committed
Merge branch 'develop' for 0.6.5
2 parents cc776b6 + d31a4df commit 238a45b

17 files changed

+203
-30
lines changed

CHANGES.rst

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
pywb 0.6.5 changelist
2+
~~~~~~~~~~~~~~~~~~~~~
3+
4+
* fix static handling when content type cannot be guessed: default to 'application/octet-stream'
5+
6+
* rewrite fix: understand partially encoded urls such as http%3A// in WbUrl, decode correctly
7+
8+
* rewrite fix: rewrite \/\/example.com and \\/\\/example.com in JS same as \\example.com
9+
10+
* cookies: add exact cookie rewriter which sets cookie to exact url only, never collection or host root
11+
12+
* don't rewrite rel=canonical links for services which rely on these
13+
14+
* cdx-indexer: Detect non-chunked gzip .warc.gz/.arc.gz archive files and show a meaningful
15+
error message explaining how to fix the issue (uncompress and possibly use warctools warc2warc to recompress)
16+
17+
118
pywb 0.6.4 changelist
219
~~~~~~~~~~~~~~~~~~~~~
320

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PyWb 0.6.4
1+
PyWb 0.6.5
22
==========
33

44
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master

pywb/framework/wbrequestresponse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def normalize_post_query(self):
131131
if not self.wb_url:
132132
return
133133

134-
mime = self.env.get('CONTENT_TYPE').split(';')[0]
134+
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
135135
length = self.env.get('CONTENT_LENGTH')
136136
stream = self.env['wsgi.input']
137137

pywb/rewrite/cookie_rewriter.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,24 @@ def rewrite_cookie(self, name, morsel):
5555
return morsel
5656

5757

58+
#=================================================================
59+
class ExactPathCookieRewriter(WbUrlBaseCookieRewriter):
60+
"""
61+
Rewrite cookies only using exact path, useful for live rewrite
62+
without a timestamp and to minimize cookie pollution
63+
64+
If path or domain present, simply remove
65+
"""
66+
67+
def rewrite_cookie(self, name, morsel):
68+
if morsel.get('domain'):
69+
del morsel['domain']
70+
# likewise remove any explicit path, so the cookie applies to the exact url only
71+
if morsel.get('path'):
72+
del morsel['path']
73+
74+
self._remove_age_opts(morsel)
75+
return morsel
5876
#=================================================================
5977
class RootScopeCookieRewriter(WbUrlBaseCookieRewriter):
6078
"""
@@ -79,5 +97,7 @@ def rewrite_cookie(self, name, morsel):
7997
def get_cookie_rewriter(cookie_scope):
8098
if cookie_scope == 'root':
8199
return RootScopeCookieRewriter
100+
elif cookie_scope == 'exact':
101+
return ExactPathCookieRewriter
82102
else:
83103
return MinimalScopeCookieRewriter

pywb/rewrite/html_rewriter.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,12 @@ def _rewrite_tag_attrs(self, tag, tag_attrs):
174174
elif attr_name == 'crossorigin':
175175
attr_name = '_crossorigin'
176176

177+
# special case: don't rewrite the href of <link rel=canonical>
178+
elif tag == 'link' and attr_name == 'href':
179+
if not self.has_attr(tag_attrs, ('rel', 'canonical')):
180+
rw_mod = handler.get(attr_name)
181+
attr_value = self._rewrite_url(attr_value, rw_mod)
182+
177183
# special case: meta tag
178184
elif (tag == 'meta') and (attr_name == 'content'):
179185
if self.has_attr(tag_attrs, ('http-equiv', 'refresh')):

pywb/rewrite/test/test_cookie_rewriter.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
r"""
2+
# Default -- MinimalScopeRewriter
23
# No rewriting
34
>>> rewrite_cookie('a=b; c=d;')
45
[('Set-Cookie', 'a=b'), ('Set-Cookie', 'c=d')]
@@ -23,17 +24,24 @@
2324
>>> rewrite_cookie('abc@def=123')
2425
[]
2526
27+
# ExactPathCookieRewriter
28+
>>> rewrite_cookie('some=value; Path=/diff/path/;', urlrewriter, ExactPathCookieRewriter)
29+
[('Set-Cookie', 'some=value')]
30+
31+
>>> rewrite_cookie('some=value; Domain=.example.com; Path=/diff/path/; Max-Age=1500', urlrewriter, ExactPathCookieRewriter)
32+
[('Set-Cookie', 'some=value')]
33+
2634
"""
2735

2836

29-
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter
37+
from pywb.rewrite.cookie_rewriter import MinimalScopeCookieRewriter, ExactPathCookieRewriter
3038
from pywb.rewrite.url_rewriter import UrlRewriter
3139

3240
urlrewriter = UrlRewriter('20131226101010/http://example.com/some/path/index.html', '/pywb/')
3341

3442
urlrewriter2 = UrlRewriter('em_/http://example.com/', '/preview/')
3543

3644

37-
def rewrite_cookie(cookie_str, rewriter=urlrewriter):
38-
return MinimalScopeCookieRewriter(rewriter).rewrite(cookie_str)
45+
def rewrite_cookie(cookie_str, rewriter=urlrewriter, cookie_rewriter=MinimalScopeCookieRewriter):
46+
return cookie_rewriter(rewriter).rewrite(cookie_str)
3947

pywb/rewrite/test/test_html_rewriter.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@
102102
>>> parse('<link href="abc.txt"><div>SomeTest</div>', head_insert = '<script>load_stuff();</script>')
103103
<link href="/web/20131226101010oe_/http://example.com/some/path/abc.txt"><script>load_stuff();</script><div>SomeTest</div>
104104
105+
# don't rewrite rel=canonical
106+
>>> parse('<link rel=canonical href="http://example.com/">')
107+
<link rel="canonical" href="http://example.com/">
108+
105109
# doctype
106110
>>> parse('<!doctype html PUBLIC "public">')
107111
<!doctype html PUBLIC "public">

pywb/rewrite/test/test_regex_rewriters.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,16 @@
4545
>>> _test_js('document_domain = "anotherdomain.com"; window.document.domain = "example.com"')
4646
'document_domain = "anotherdomain.com"; window.document.WB_wombat_domain = "example.com"'
4747
48+
# protocol-rel escapes
49+
>>> _test_js('"//example.com/"')
50+
'"/web/20131010/http://example.com/"'
51+
52+
>>> _test_js(r'"\/\/example.com/"')
53+
'"/web/20131010/http:\\/\\/example.com/"'
54+
55+
>>> _test_js(r'"\\/\\/example.com/"')
56+
'"/web/20131010/http:\\\\/\\\\/example.com/"'
57+
4858
# custom rules added
4959
>>> _test_js('window.location = "http://example.com/abc.html"; some_func(); ', [('some_func\(\).*', RegexRewriter.format('/*{0}*/'), 0)])
5060
'window.WB_wombat_location = "/web/20131010/http://example.com/abc.html"; /*some_func(); */'

pywb/rewrite/test/test_url_rewriter.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,21 @@
5050
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
5151
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
5252
53+
>>> do_rewrite(r'//some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
54+
'localhost:8080/20101226101112/http://some-other-site.com'
55+
56+
>>> do_rewrite(r'\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
57+
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
58+
59+
>>> do_rewrite(r'\\/\\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
60+
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
61+
62+
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
63+
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
64+
65+
>>> do_rewrite(r'http:\/\/some-other-site.com', '20101226101112/http://example.com/index.html', 'localhost:8080/')
66+
'localhost:8080/20101226101112/http:\\\\/\\\\/some-other-site.com'
67+
5368
>>> do_rewrite('../../other.html', '2020/http://example.com/index.html', '/')
5469
'/2020/http://example.com/other.html'
5570

pywb/rewrite/test/test_wburl.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@
2626
>>> repr(WbUrl('https://example.com/xyz?a=%2f&b=%2E'))
2727
"('latest_replay', '', '', 'https://example.com/xyz?a=%2f&b=%2E', 'https://example.com/xyz?a=%2f&b=%2E')"
2828
29+
# Test scheme partially encoded urls
30+
>>> repr(WbUrl('https%3A//example.com/'))
31+
"('latest_replay', '', '', 'https://example.com/', 'https://example.com/')"
32+
33+
>>> repr(WbUrl('2014/http%3A%2F%2Fexample.com/'))
34+
"('replay', '2014', '', 'http://example.com/', '2014/http://example.com/')"
35+
2936
# Query Urls
3037
# ======================
3138
>>> repr(WbUrl('*/http://example.com/abc?def=a'))
@@ -57,6 +64,21 @@
5764
>>> repr(WbUrl('/example.com/'))
5865
"('latest_replay', '', '', 'http://example.com/', 'http://example.com/')"
5966
67+
# Is_ Tests
68+
>>> u = WbUrl('*/http://example.com/abc?def=a*')
69+
>>> u.is_url_query()
70+
True
71+
72+
>>> u.is_query()
73+
True
74+
75+
>>> u2 = WbUrl('20130102im_/https:/example.com')
76+
>>> u2.is_embed
77+
True
78+
79+
>>> u2.is_replay()
80+
True
81+
6082
6183
# Error Urls
6284
# ======================

0 commit comments

Comments
 (0)