Skip to content

Commit 71a8abe

Browse files
committed
Merge branch 'develop' for 0.6.4
2 parents a4f9138 + f6053a9 commit 71a8abe

24 files changed

+171
-64
lines changed

CHANGES.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,19 @@
1+
pywb 0.6.4 changelist
2+
~~~~~~~~~~~~~~~~~~~~~
3+
4+
* Ignore bad multiline headers in warc.
5+
6+
* Rewrite fix: Don't parse html entities in HTML rewriter.
7+
8+
* Ensure cdx iterator is closed when reading.
9+
10+
* Rewrite fix: remove pywb prefix from any query params.
11+
12+
* Rewrite fix: better JS rewriting, avoid // comments when matching protocol-relative urls.
13+
14+
* WARC metadata and resource records are included in cdx from cdx-indexer by default.
15+
16+
117
pywb 0.6.3 changelist
218
~~~~~~~~~~~~~~~~~~~~~
319

README.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
1-
PyWb 0.6.3
1+
PyWb 0.6.4
22
==========
33

44
.. image:: https://travis-ci.org/ikreymer/pywb.png?branch=master
55
:target: https://travis-ci.org/ikreymer/pywb
6-
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=master
7-
:target: https://coveralls.io/r/ikreymer/pywb?branch=master
6+
.. image:: https://coveralls.io/repos/ikreymer/pywb/badge.png?branch=develop
7+
:target: https://coveralls.io/r/ikreymer/pywb?branch=develop
88
.. image:: https://img.shields.io/gratipay/ikreymer.svg
99
:target: https://www.gratipay.com/ikreymer/
1010

pywb/cdx/cdxsource.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,17 @@ def __init__(self, filename):
2828
self.filename = filename
2929

3030
def load_cdx(self, query):
31-
source = open(self.filename)
32-
return iter_range(source, query.key, query.end_key)
31+
def do_open():
32+
try:
33+
source = open(self.filename)
34+
gen = iter_range(source, query.key, query.end_key)
35+
for line in gen:
36+
yield line
37+
finally:
38+
source.close()
39+
40+
return do_open()
41+
#return iter_range(do_open(), query.key, query.end_key)
3342

3443
def __str__(self):
3544
return 'CDX File - ' + self.filename

pywb/framework/wbrequestresponse.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ def __init__(self, env,
7878
rel_prefix,
7979
env.get('SCRIPT_NAME', '/'),
8080
cookie_scope)
81+
82+
self.urlrewriter.deprefix_url()
8183
else:
8284
# no wb_url, just store blank wb_url
8385
self.wb_url = None

pywb/framework/wsgi_wrappers.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,9 @@ def handle_exception(self, env, exc, print_trace):
136136
err_details = None
137137

138138
if error_view:
139-
if err_url:
139+
if err_url and isinstance(err_url, str):
140140
err_url = err_url.decode('utf-8', 'ignore')
141-
if err_msg:
141+
if err_msg and isinstance(err_msg, str):
142142
err_msg = err_msg.decode('utf-8', 'ignore')
143143

144144
return error_view.render_response(exc_type=type(exc).__name__,

pywb/rewrite/html_rewriter.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -263,10 +263,20 @@ def _internal_close(self): # pragma: no cover
263263

264264
#=================================================================
265265
class HTMLRewriter(HTMLRewriterMixin, HTMLParser):
266+
PARSETAG = re.compile('[<]')
267+
266268
def __init__(self, *args, **kwargs):
267269
HTMLParser.__init__(self)
268270
super(HTMLRewriter, self).__init__(*args, **kwargs)
269271

272+
def reset(self):
273+
HTMLParser.reset(self)
274+
self.interesting = self.PARSETAG
275+
276+
def clear_cdata_mode(self):
277+
HTMLParser.clear_cdata_mode(self)
278+
self.interesting = self.PARSETAG
279+
270280
def feed(self, string):
271281
try:
272282
HTMLParser.feed(self, string)
@@ -311,11 +321,12 @@ def handle_endtag(self, tag):
311321
def handle_data(self, data):
312322
self.parse_data(data)
313323

314-
def handle_entityref(self, data):
315-
self.out.write('&' + data + ';')
316-
317-
def handle_charref(self, data):
318-
self.out.write('&#' + data + ';')
324+
# overriding regex so that these are no longer called
325+
#def handle_entityref(self, data):
326+
# self.out.write('&' + data + ';')
327+
#
328+
#def handle_charref(self, data):
329+
# self.out.write('&#' + data + ';')
319330

320331
def handle_comment(self, data):
321332
self.out.write('<!--')

pywb/rewrite/regex_rewriters.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ class JSLinkOnlyRewriter(RegexRewriter):
111111
JS Rewriter which rewrites absolute http://, https:// and // urls
112112
at the beginning of a string
113113
"""
114-
JS_HTTPX = r'(?<="|\'|;)(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+'
114+
#JS_HTTPX = r'(?:(?:(?<=["\';])https?:)|(?<=["\']))\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.-]+.*(?=["\s\';&\\])'
115+
JS_HTTPX = r'(?<=["\';])(?:https?:)?\\{0,4}/\\{0,4}/[A-Za-z0-9:_@.\-/\\?&#]+(?=["\';&\\])'
115116

116117
def __init__(self, rewriter, rules=[]):
117118
rules = rules + [

pywb/rewrite/test/test_html_rewriter.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,11 @@
2828
<base href="/web/20131226101010/http://example.com/some/path/static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
2929
3030
# HTML Entities
31-
>>> parse('<a href="">&rsaquo; &nbsp; &#62;</div>')
32-
<a href="">&rsaquo; &nbsp; &#62;</div>
31+
>>> parse('<a href="">&rsaquo; &nbsp; &#62; &#63</div>')
32+
<a href="">&rsaquo; &nbsp; &#62; &#63</div>
33+
34+
>>> parse('<div>X&Y</div> </div>X&Y;</div>')
35+
<div>X&Y</div> </div>X&Y;</div>
3336
3437
# Don't rewrite anchors
3538
>>> parse('<HTML><A Href="#abc">Text</a></hTmL>')

pywb/rewrite/test/test_regex_rewriters.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,9 @@
6161
>>> _test_js('&quot;http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;')
6262
'&quot;/web/20131010/http:\\/\\/www.example.com\\/some\\/path\\/?query=1&quot;'
6363
64+
>>> _test_js('"http:\/\/sub-site.example.com\/path-dashes\/path_other\/foo_bar.txt"')
65+
'"/web/20131010/http:\\/\\/sub-site.example.com\\/path-dashes\\/path_other\\/foo_bar.txt"'
66+
6467
6568
#=================================================================
6669
# XML Rewriting

pywb/rewrite/test/test_rewrite_live.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -105,10 +105,10 @@ def test_example_1():
105105
assert '/pywb/20131226101010/http://www.iana.org/domains/example' in buff, buff
106106

107107
def test_example_2_redirect():
108-
status_headers, buff = get_rewritten('http://facebook.com/', urlrewriter)
108+
status_headers, buff = get_rewritten('http://httpbin.org/redirect-to?url=http://example.com/', urlrewriter)
109109

110110
# redirect, no content
111-
assert status_headers.get_statuscode() == '301'
111+
assert status_headers.get_statuscode() == '302'
112112
assert len(buff) == 0
113113

114114

pywb/rewrite/test/test_url_rewriter.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,18 @@
7474
>>> UrlRewriter('2013id_/example.com/file/path/blah.html', '/123/').get_new_url(timestamp='20131024')
7575
'/123/20131024id_/http://example.com/file/path/blah.html'
7676
77+
# deprefix tests
78+
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/20141226/http://example.com/', '/pywb/', 'http://localhost:8080/pywb/')
79+
'http://example.com/file/path/blah.html?param=http://example.com/'
80+
81+
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/if_/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
82+
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
83+
84+
>>> do_deprefix('2013id_/http://example.com/file/path/blah.html?param=http://localhost:8080/pywb/https://example.com/filename.html', '/pywb/', 'http://localhost:8080/pywb/')
85+
'http://example.com/file/path/blah.html?param=https://example.com/filename.html'
86+
87+
>>> do_deprefix('http://example.com/file.html?param=http://localhost:8080/pywb/https%3A//example.com/filename.html&other=value&a=b&param2=http://localhost:8080/pywb/http://test.example.com', '/pywb/', 'http://localhost:8080/pywb/')
88+
'http://example.com/file.html?param=https://example.com/filename.html&other=value&a=b&param2=http://test.example.com'
7789
7890
# HttpsUrlRewriter tests
7991
>>> HttpsUrlRewriter('http://example.com/', None).rewrite('https://example.com/abc')
@@ -86,13 +98,22 @@
8698

8799

88100
from pywb.rewrite.url_rewriter import UrlRewriter, HttpsUrlRewriter
89-
101+
import urllib
90102

91103
def do_rewrite(rel_url, base_url, prefix, mod=None, full_prefix=None):
92104
rewriter = UrlRewriter(base_url, prefix, full_prefix=full_prefix)
93105
return rewriter.rewrite(rel_url, mod)
94106

95107

108+
def do_deprefix(url, rel_prefix, full_prefix):
109+
encoded = urllib.quote_plus(full_prefix)
110+
url = url.replace(full_prefix, encoded)
111+
112+
rewriter = UrlRewriter(url, rel_prefix, full_prefix)
113+
url = rewriter.deprefix_url()
114+
return urllib.unquote_plus(url)
115+
116+
96117
if __name__ == "__main__":
97118
import doctest
98119
doctest.testmod()

pywb/rewrite/url_rewriter.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
import copy
21
import urlparse
32

43
from wburl import WbUrl
@@ -88,6 +87,9 @@ def get_cookie_rewriter(self, scope=None):
8887
cls = get_cookie_rewriter(scope)
8988
return cls(self)
9089

90+
def deprefix_url(self):
91+
return self.wburl.deprefix_url(self.full_prefix)
92+
9193
def __repr__(self):
9294
return "UrlRewriter('{0}', '{1}')".format(self.wburl, self.prefix)
9395

@@ -150,3 +152,6 @@ def rebase_rewriter(self, new_url):
150152

151153
def get_cookie_rewriter(self, scope=None):
152154
return None
155+
156+
def deprefix_url(self):
157+
return self.wburl.url

pywb/rewrite/wburl.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
"""
4040

4141
import re
42-
42+
import urllib
4343

4444
#=================================================================
4545
class BaseWbUrl(object):
@@ -149,6 +149,14 @@ def set_replay_timestamp(self, timestamp):
149149
self.timestamp = timestamp
150150
self.type = self.REPLAY
151151

152+
153+
def deprefix_url(self, prefix):
154+
prefix = urllib.quote_plus(prefix)
155+
rex_query = '=' + re.escape(prefix) + '([0-9])*([\w]{2}_)?/?'
156+
new_url = re.sub(rex_query, '=', self.url)
157+
self.url = new_url
158+
return self.url
159+
152160
# Str Representation
153161
# ====================
154162
def to_str(self, **overrides):

pywb/rules.yaml

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,18 @@ rules:
1111
# facebook rules
1212
#=================================================================
1313
- url_prefix: 'com,facebook)/ajax/pagelet/generic.php/'
14-
14+
1515
fuzzy_lookup: 'com,facebook\)/.*[?&]data=(.*?(?:[&]|query_type[^,]+))'
16-
16+
1717
- url_prefix: 'com,facebook)/ajax/ufi/'
18-
18+
1919
fuzzy_lookup:
2020
- ft_ent_identifier
2121
- lsd
2222

2323
- url_prefix: 'com,facebook)/ajax/chat/hovercard/sidebar.php'
2424

25-
fuzzy_lookup:
25+
fuzzy_lookup:
2626
- ids[0]
2727

2828
- url_prefix: 'com,facebook)/login.php'
@@ -82,20 +82,21 @@ rules:
8282
#=================================================================
8383

8484
- url_prefix: 'com,google,plus)/_/stream/getactivities'
85-
86-
fuzzy_lookup: '(egk[^"]+).*(f.sid=[^&]+)'
87-
85+
86+
# fuzzy_lookup: '(egk[^"]+)?.*(f.sid=[^&]+)'
87+
fuzzy_lookup: 'f.req=.*\]\]\]\,\"([^"]+).*(f.sid=[^&]+)'
88+
8889
- url_prefix: 'com,google,plus)/_/stream/squarestream'
89-
90+
9091
fuzzy_lookup: '(cai[^"]+).*(f.sid=[^&]+)'
91-
92+
9293
- url_prefix: 'com,google,plus)/_/communities/rt/landing'
93-
94+
9495
fuzzy_lookup: 'com,google,plus\)/_/.*?.*\,(\d{13}\])&.*(f.sid=[^&]+).*'
95-
96+
9697

9798
- url_prefix: 'com,google,plus)/_/'
98-
99+
99100
fuzzy_lookup: 'com,google,plus\)/_/.*?.*(f.sid=[^&]+)'
100101

101102

pywb/static/wombat.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -708,11 +708,11 @@ WB_wombat_init = (function() {
708708
}
709709

710710
//============================================
711-
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp) {
711+
function wombat_init(replay_prefix, capture_date, orig_scheme, orig_host, timestamp, mod) {
712712
wb_replay_prefix = replay_prefix;
713713

714714
if (wb_replay_prefix) {
715-
wb_replay_date_prefix = replay_prefix + capture_date + "/";
715+
wb_replay_date_prefix = replay_prefix + capture_date + mod + "/";
716716

717717
if (capture_date.length > 0) {
718718
wb_capture_date_part = "/" + capture_date + "/";

pywb/ui/head_insert.html

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
"{{ cdx['timestamp'] if include_ts else ''}}",
88
"{{ urlsplit.scheme }}",
99
"{{ urlsplit.netloc }}",
10-
"{{ cdx.timestamp | format_ts('%s') }}");
10+
"{{ cdx.timestamp | format_ts('%s') }}",
11+
"{{ wbrequest.wb_url.mod }}");
1112
</script>
1213
{% endif %}
1314
<script>

pywb/utils/statusandheaders.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,8 @@ def parse(self, stream, full_statusline=None):
169169

170170
# append continuation lines, if any
171171
while next_line and next_line.startswith((' ', '\t')):
172-
value += next_line
172+
if value is not None:
173+
value += next_line
173174
next_line, total_read = _strip_count(stream.readline(),
174175
total_read)
175176

pywb/utils/test/test_statusandheaders.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
# empty
3333
>>> st2 = StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_2)); x = st2.validate_statusline('204 No Content'); st2
3434
StatusAndHeaders(protocol = '', statusline = '204 No Content', headers = [])
35+
36+
37+
>>> StatusAndHeadersParser(['HTTP/1.0']).parse(BytesIO(status_headers_3))
38+
StatusAndHeaders(protocol = 'HTTP/1.0', statusline = '204 Empty', headers = [('Content-Type', 'Value'), ('Content-Length', '0')])
3539
"""
3640

3741

@@ -54,6 +58,14 @@
5458
5559
"""
5660

61+
status_headers_3 = "\
62+
HTTP/1.0 204 Empty\r\n\
63+
Content-Type: Value\r\n\
64+
%Invalid%\r\n\
65+
\tMultiline\r\n\
66+
Content-Length: 0\r\n\
67+
\r\n"
68+
5769

5870
if __name__ == "__main__":
5971
import doctest

0 commit comments

Comments
 (0)