Skip to content

Commit f4e5a7d

Browse files
committed
Merge branch 'develop'
2 parents 5024234 + 2fba976 commit f4e5a7d

20 files changed

+423
-142
lines changed

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
PyWb 0.30.1
1+
PyWb 0.31.0
22
===========
33

44
.. image:: https://travis-ci.org/ikreymer/pywb.svg?branch=master

pywb/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '0.30.1'
1+
__version__ = '0.31.0'
22

33
DEFAULT_CONFIG = 'pywb/default_config.yaml'
44

pywb/cdx/cdxobject.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def __init__(self, cdxline=b''):
153153
raise CDXException(msg)
154154

155155
for header, field in zip(cdxformat, fields):
156-
self[header] = field.decode('utf-8')
156+
self[header] = to_native_str(field, 'utf-8')
157157

158158
self.cdxline = cdxline
159159

@@ -213,7 +213,7 @@ def conv_to_json(obj, fields=None):
213213

214214
def __str__(self):
215215
if self.cdxline:
216-
return self.cdxline.decode('utf-8')
216+
return to_native_str(self.cdxline, 'utf-8')
217217

218218
if not self._from_json:
219219
return ' '.join(str(val) for val in six.itervalues(self))
@@ -263,7 +263,7 @@ def __init__(self, idxline):
263263
raise CDXException(msg.format(len(fields), self.NUM_REQ_FIELDS))
264264

265265
for header, field in zip(self.FORMAT, fields):
266-
self[header] = field.decode('utf-8')
266+
self[header] = to_native_str(field, 'utf-8')
267267

268268
self['offset'] = int(self['offset'])
269269
self['length'] = int(self['length'])
@@ -285,4 +285,4 @@ def to_json(self, fields=None):
285285
return json_encode(self) + '\n'
286286

287287
def __str__(self):
288-
return self.idxline.decode('utf-8')
288+
return to_native_str(self.idxline, 'utf-8')

pywb/framework/wbrequestresponse.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,14 +184,15 @@ def normalize_post_query(self):
184184
if not self.wb_url:
185185
return
186186

187-
mime = self.env.get('CONTENT_TYPE', '').split(';')[0]
187+
mime = self.env.get('CONTENT_TYPE', '')
188188
length = self.env.get('CONTENT_LENGTH')
189189
stream = self.env['wsgi.input']
190190

191191
buffered_stream = BytesIO()
192192

193193
post_query = extract_post_query('POST', mime, length, stream,
194-
buffered_stream=buffered_stream)
194+
buffered_stream=buffered_stream,
195+
environ=self.env)
195196

196197
if post_query:
197198
self.env['wsgi.input'] = buffered_stream

pywb/rewrite/html_rewriter.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def __init__(self, url_rewriter,
120120

121121
def _rewrite_meta_refresh(self, meta_refresh):
122122
if not meta_refresh:
123-
return None
123+
return ''
124124

125125
m = self.META_REFRESH_REGEX.match(meta_refresh)
126126
if not m:
@@ -133,6 +133,9 @@ def _rewrite_meta_refresh(self, meta_refresh):
133133
return meta_refresh
134134

135135
def _rewrite_base(self, url, mod=''):
136+
if not url:
137+
return ''
138+
136139
url = self._ensure_url_has_path(url)
137140

138141
base_url = self._rewrite_url(url, mod)
@@ -183,11 +186,11 @@ def _ensure_url_has_path(self, url):
183186

184187
def _rewrite_url(self, value, mod=None):
185188
if not value:
186-
return None
189+
return ''
187190

188191
value = value.strip()
189192
if not value:
190-
return None
193+
return ''
191194

192195
value = self.try_unescape(value)
193196
return self.url_rewriter.rewrite(value, mod)
@@ -209,21 +212,24 @@ def try_unescape(self, value):
209212
return new_value
210213

211214
def _rewrite_srcset(self, value, mod=''):
215+
if not value:
216+
return ''
217+
212218
values = value.split(',')
213-
values = map(lambda x: self._rewrite_url(x.strip()), values)
219+
values = [self._rewrite_url(v.strip()) for v in values]
214220
return ', '.join(values)
215221

216222
def _rewrite_css(self, css_content):
217223
if css_content:
218224
return self.css_rewriter.rewrite(css_content)
219225
else:
220-
return None
226+
return ''
221227

222228
def _rewrite_script(self, script_content):
223229
if script_content:
224230
return self.js_rewriter.rewrite(script_content)
225231
else:
226-
return None
232+
return ''
227233

228234
def has_attr(self, tag_attrs, attr):
229235
name, value = attr
@@ -252,6 +258,11 @@ def _rewrite_tag_attrs(self, tag, tag_attrs):
252258
self.out.write('<' + tag)
253259

254260
for attr_name, attr_value in tag_attrs:
261+
empty_attr = False
262+
if attr_value is None:
263+
attr_value = ''
264+
empty_attr = True
265+
255266
# special case: inline JS/event handler
256267
if ((attr_value and attr_value.startswith('javascript:'))
257268
or attr_name.startswith('on')):
@@ -324,7 +335,7 @@ def _rewrite_tag_attrs(self, tag, tag_attrs):
324335
attr_value = self._rewrite_url(attr_value, rw_mod)
325336

326337
# write the attr!
327-
self._write_attr(attr_name, attr_value)
338+
self._write_attr(attr_name, attr_value, empty_attr)
328339

329340
return True
330341

@@ -347,11 +358,17 @@ def _rewrite_head(self, start_end):
347358

348359
return True
349360

350-
def _write_attr(self, name, value):
351-
# parser doesn't differentiate between 'attr=""' and just 'attr'
352-
# 'attr=""' is more common, so use that form
353-
if value:
361+
def _write_attr(self, name, value, empty_attr):
362+
# if empty_attr is set, just write 'attr'!
363+
if empty_attr:
364+
self.out.write(' ' + name)
365+
366+
# write with value, if set
367+
elif value:
368+
354369
self.out.write(' ' + name + '="' + value.replace('"', '&quot;') + '"')
370+
371+
# otherwise, 'attr=""' is more common, so use that form
355372
else:
356373
self.out.write(' ' + name + '=""')
357374

@@ -421,8 +438,9 @@ def clear_cdata_mode(self):
421438
def feed(self, string):
422439
try:
423440
HTMLParser.feed(self, string)
424-
except Exception: # pragma: no cover
425-
# only raised in 2.6
441+
except Exception as e: # pragma: no cover
442+
import traceback
443+
traceback.print_exc()
426444
self.out.write(string)
427445

428446
def _internal_close(self):

pywb/rewrite/rewrite_amf.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
from io import BytesIO
2+
from six.moves import zip
3+
from pywb.rewrite.rewrite_content import RewriteContent
4+
5+
6+
# ============================================================================
7+
# Experimental: not fully tested
8+
class RewriteContentAMF(RewriteContent): #pragma: no cover
    """RewriteContent variant that patches AMF remoting responses.

    Responses whose ``Content-Type`` header is exactly ``application/x-amf``
    are decoded with ``pyamf``, have their correlation ids re-pointed at the
    message ids of the current request, and are re-encoded before being
    handed to the regular ``handle_custom_rewrite`` of the parent class.
    """

    def handle_custom_rewrite(self, text_type, status_headers, stream, env):
        """Pre-process AMF payloads, then defer to the parent implementation.

        :param text_type: text type detected for the response; passed through
            unchanged to the parent class
        :param status_headers: response status/headers object; only its
            ``get_header('Content-Type')`` is consulted here
        :param stream: readable byte stream holding the response body
        :param env: request environ (may be ``None``)
        :return: whatever the parent ``handle_custom_rewrite`` returns
        """
        if status_headers.get_header('Content-Type') == 'application/x-amf':
            stream = self.rewrite_amf(stream, env)

        return (super(RewriteContentAMF, self).
                handle_custom_rewrite(text_type, status_headers, stream, env))

    def rewrite_amf(self, stream, env):
        """Return a stream with the AMF envelope re-correlated to the request.

        The whole body is buffered, decoded via ``pyamf.remoting``, and, when
        ``env['pywb.inputdata']`` is set, each response body is paired with the
        request body at the same position: its ``correlationId`` is replaced
        by the request's ``messageId`` and it is re-keyed with the request's
        name.  (``pywb.inputdata`` is presumably the decoded request envelope
        stored by the POST handling code -- confirm against the caller.)

        :param stream: readable byte stream holding the AMF response
        :param env: request environ, or ``None``
        :return: a new ``BytesIO`` with the re-encoded envelope; on failure,
            the original bytes (never an exhausted stream)
        """
        # Stays None until we start consuming ``stream``; lets the error
        # handler tell "nothing read yet" apart from "stream already drained".
        iobuff = None

        try:
            from pyamf import remoting

            iobuff = BytesIO()
            while True:
                buff = stream.read()
                if not buff:
                    break
                iobuff.write(buff)

            iobuff.seek(0)
            res = remoting.decode(iobuff)

            inputdata = env.get('pywb.inputdata') if env else None
            if inputdata:
                new_list = []

                for src, target in zip(inputdata.bodies, res.bodies):
                    # point the recorded response at the live request message
                    target[1].body.correlationId = src[1].body[0].messageId

                    # keep the response payload, but under the request's name
                    new_list.append((src[0], target[1]))

                res.bodies = new_list

            return BytesIO(remoting.encode(res).getvalue())

        except Exception:
            import traceback
            traceback.print_exc()

            if iobuff is None:
                # failed before reading anything (e.g. pyamf not installed):
                # the original stream is still intact
                return stream

            # The original stream has been consumed by now, so returning it
            # would yield an empty body; replay the buffered bytes instead.
            return BytesIO(iobuff.getvalue())

pywb/rewrite/rewrite_content.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import yaml
55
import re
66

7-
from chardet.universaldetector import UniversalDetector
7+
#from chardet.universaldetector import UniversalDetector
88
from io import BytesIO
99

1010
from pywb.rewrite.header_rewriter import RewrittenStatusAndHeaders
@@ -21,7 +21,7 @@
2121

2222

2323
#=================================================================
24-
class RewriteContent:
24+
class RewriteContent(object):
2525
HEAD_REGEX = re.compile(b'<\s*head\\b[^>]*[>]+', re.I)
2626

2727
TAG_REGEX = re.compile(b'^\s*\<')
@@ -77,6 +77,7 @@ def _rewrite_headers(self, urlrewriter, rule, status_headers, stream,
7777

7878

7979
def _check_encoding(self, rewritten_headers, stream, enc):
80+
matched = False
8081
if (rewritten_headers.
8182
contains_removed_header('content-encoding', enc)):
8283

@@ -87,14 +88,15 @@ def _check_encoding(self, rewritten_headers, stream, enc):
8788
stream = DecompressingBufferedReader(stream, decomp_type=enc)
8889

8990
rewritten_headers.status_headers.remove_header('content-length')
91+
matched = True
9092

91-
return stream
93+
return matched, stream
9294

9395

9496

9597
def rewrite_content(self, urlrewriter, status_headers, stream,
9698
head_insert_func=None, urlkey='',
97-
cdx=None, cookie_rewriter=None):
99+
cdx=None, cookie_rewriter=None, env=None):
98100

99101
wb_url = urlrewriter.wburl
100102

@@ -118,9 +120,12 @@ def rewrite_content(self, urlrewriter, status_headers, stream,
118120

119121
status_headers = rewritten_headers.status_headers
120122

121-
# use rewritten headers, but no further rewriting needed
122-
if rewritten_headers.text_type is None:
123-
return (status_headers, self.stream_to_gen(stream), False)
123+
res = self.handle_custom_rewrite(rewritten_headers.text_type,
124+
status_headers,
125+
stream,
126+
env)
127+
if res:
128+
return res
124129

125130
# Handle text content rewriting
126131
# ====================================================================
@@ -136,8 +141,12 @@ def rewrite_content(self, urlrewriter, status_headers, stream,
136141
encoding = None
137142
first_buff = b''
138143

139-
stream = self._check_encoding(rewritten_headers, stream, 'gzip')
140-
stream = self._check_encoding(rewritten_headers, stream, 'deflate')
144+
for decomp_type in BufferedReader.get_supported_decompressors():
145+
matched, stream = self._check_encoding(rewritten_headers,
146+
stream,
147+
decomp_type)
148+
if matched:
149+
break
141150

142151
if mod == 'js_':
143152
text_type, stream = self._resolve_text_type('js',
@@ -237,6 +246,11 @@ def rewrite_content(self, urlrewriter, status_headers, stream,
237246

238247
return (status_headers, gen, True)
239248

249+
def handle_custom_rewrite(self, text_type, status_headers, stream, env):
250+
# use rewritten headers, but no further rewriting needed
251+
if text_type is None:
252+
return (status_headers, self.stream_to_gen(stream), False)
253+
240254
@staticmethod
241255
def _extract_html_charset(buff, status_headers):
242256
charset = None
@@ -360,3 +374,5 @@ def rewrite_text_stream_to_gen(stream, rewrite_func,
360374

361375
finally:
362376
stream.close()
377+
378+

pywb/rewrite/test/test_html_rewriter.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@
4949
>>> parse('<base href="static/"/><img src="image.gif"/>', urlrewriter=no_base_canon_rewriter)
5050
<base href="static/"/><img src="/web/20131226101010im_/http://example.com/some/path/static/image.gif"/>
5151
52+
# Empty url
53+
>>> parse('<base href="">')
54+
<base href="">
55+
56+
>>> parse('<base href>')
57+
<base href>
5258
5359
5460
# HTML Entities
@@ -66,6 +72,10 @@
6672
>>> parse('<input value="&amp;X&amp;&quot;">X</input>')
6773
<input value="&amp;X&amp;&quot;">X</input>
6874
75+
# Empty values should be ignored
76+
>>> parse('<input name="foo" value>')
77+
<input name="foo" value>
78+
6979
# SKIPPED
7080
# Unicode -- default with %-encoding
7181
#>>> parse(u'<a href="http://испытание.испытание/">испытание</a>')
@@ -92,7 +102,7 @@
92102
<meta http-equiv="refresh" content="text/html; charset=utf-8"/>
93103
94104
>>> parse('<META http-equiv="refresh" content>')
95-
<meta http-equiv="refresh" content="">
105+
<meta http-equiv="refresh" content>
96106
97107
>>> parse('<meta property="og:image" content="http://example.com/example.jpg">')
98108
<meta property="og:image" content="/web/20131226101010/http://example.com/example.jpg">
@@ -115,6 +125,10 @@
115125
>>> parse('<img srcset="//example.com/1x 1x, //example.com/foo 2x, https://example.com/bar 4x">')
116126
<img srcset="/web/20131226101010///example.com/1x 1x, /web/20131226101010///example.com/foo 2x, /web/20131226101010/https://example.com/bar 4x">
117127
128+
# empty srcset attrib
129+
>>> parse('<img srcset="">')
130+
<img srcset="">
131+
118132
# Script tag
119133
>>> parse('<script>window.location = "http://example.com/a/b/c.html"</script>')
120134
<script>window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html"</script>
@@ -131,7 +145,7 @@
131145
<script>/*<![CDATA[*/window.WB_wombat_location = "/web/20131226101010/http://example.com/a/b/c.html;/*]]>*/"</script>
132146
133147
>>> parse('<div style="background: url(\'abc.html\')" onblah onclick="location = \'redirect.html\'"></div>')
134-
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah="" onclick="WB_wombat_location = 'redirect.html'"></div>
148+
<div style="background: url('/web/20131226101010/http://example.com/some/path/abc.html')" onblah onclick="WB_wombat_location = 'redirect.html'"></div>
135149
136150
>>> parse('<i style="background-image: url(http://foo-.bar_.example.com/)"></i>')
137151
<i style="background-image: url(/web/20131226101010/http://foo-.bar_.example.com/)"></i>

0 commit comments

Comments
 (0)