Skip to content

Commit 1249b41

Browse files
authored
rewrite: detect edge-case where html starts with BOM characters followed by <!DOCTYPE html> as html (#758)
tests: add test that now results in correct html rewriting fixes #756
1 parent 2ccd8eb commit 1249b41

File tree

2 files changed

+12
-1
lines changed

2 files changed

+12
-1
lines changed

pywb/rewrite/content_rewriter.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,7 @@ def rewrite_text_stream_to_gen(self, stream, rwinfo):
391391
# ============================================================================
392392
class RewriteInfo(object):
393393
TAG_REGEX = re.compile(b'^(\xef\xbb\xbf)?\s*\<')
394-
TAG_REGEX2 = re.compile(b'^.*<\w+[\s>]')
394+
TAG_REGEX2 = re.compile(b'^.*<[!]?\w+[\s>]')
395395
JSON_REGEX = re.compile(b'^\s*[{[][{"]') # if it starts with this then highly likely not HTML
396396

397397
JSONP_CONTAINS = ['callback=jQuery',

pywb/rewrite/test/test_content_rewriter.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,17 @@ def test_rewrite_html_utf_8(self):
141141
assert ('Content-Type', 'text/html; charset=utf-8') in headers.headers
142142
assert b''.join(gen).decode('utf-8') == exp
143143

144+
def test_rewrite_html_ignore_bom(self):
145+
headers = {'Content-Type': 'text/html'}
146+
content = u'\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://example.com"></a></body></html>'
147+
148+
headers, gen, is_rw = self.rewrite_record(headers, content, ts='201701mp_')
149+
150+
exp = '\ufeff\ufeff\ufeff<!DOCTYPE html>\n<head>\n<a href="http://localhost:8080/prefix/201701/http://example.com"></a></body></html>'
151+
assert is_rw
152+
assert ('Content-Type', 'text/html') in headers.headers
153+
assert b''.join(gen).decode('utf-8') == exp
154+
144155
def test_rewrite_html_utf_8_anchor(self):
145156
headers = {'Content-Type': 'text/html; charset=utf-8'}
146157
content = u'<html><body><a href="#éxample-tésté"></a></body></html>'

0 commit comments

Comments
 (0)