Skip to content

Commit ffb702c

Browse files
committed
rewrite: content detection for specific case: if content type is html and mod type is css
or js, peek stream to determine actual type. Addresses #31 in part. Fix typo in wb_frame.js
1 parent 8f57ce6 commit ffb702c

File tree

3 files changed

+49
-17
lines changed

3 files changed

+49
-17
lines changed

pywb/rewrite/rewrite_content.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,15 @@
1313
from pywb.utils.dsrules import RuleSet
1414
from pywb.utils.statusandheaders import StatusAndHeaders
1515
from pywb.utils.bufferedreaders import DecompressingBufferedReader
16-
from pywb.utils.bufferedreaders import ChunkedDataReader
16+
from pywb.utils.bufferedreaders import ChunkedDataReader, BufferedReader
1717

1818

1919
#=================================================================
2020
class RewriteContent:
2121
HEAD_REGEX = re.compile(r'<\s*head\b[^>]*[>]+', re.I)
2222

23+
TAG_REGEX = re.compile(r'^\s*\<')
24+
2325
BUFF_SIZE = 16384
2426

2527
def __init__(self, ds_rules_file=None, is_framed_replay=False):
@@ -106,11 +108,6 @@ def rewrite_content(self, urlrewriter, headers, stream,
106108
# default text_type
107109
mod = wb_url.mod
108110

109-
if mod == 'js_':
110-
text_type = 'js'
111-
elif mod == 'cs_':
112-
text_type = 'css'
113-
114111
stream_raw = False
115112
encoding = None
116113
first_buff = None
@@ -124,6 +121,15 @@ def rewrite_content(self, urlrewriter, headers, stream,
124121
else:
125122
stream = DecompressingBufferedReader(stream)
126123

124+
if mod == 'js_':
125+
text_type, stream = self._resolve_text_type('js',
126+
text_type,
127+
stream)
128+
elif mod == 'cs_':
129+
text_type, stream = self._resolve_text_type('css',
130+
text_type,
131+
stream)
132+
127133
rewriter_class = rule.rewriters[text_type]
128134

129135
# for html, need to perform header insert, supply js, css, xml
@@ -173,6 +179,22 @@ def rewrite_content(self, urlrewriter, headers, stream,
173179

174180
return (status_headers, gen, True)
175181

182+
@staticmethod
183+
def _resolve_text_type(mod, text_type, stream):
184+
# only attempt to resolve between html and other text types
185+
if text_type != 'html':
186+
return mod, stream
187+
188+
buff = stream.read(128)
189+
190+
wrapped_stream = BufferedReader(stream, starting_data=buff)
191+
192+
# check if starts with a tag, then likely html
193+
if RewriteContent.TAG_REGEX.match(buff):
194+
mod = 'html'
195+
196+
return mod, wrapped_stream
197+
176198
def _head_insert_only_gen(self, insert_str, stream):
177199
max_len = 1024
178200
buff = ''

pywb/rewrite/test/test_rewrite_content.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,18 +3,28 @@
33

44
ur"""
55
# full seq
6-
>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
6+
#>>> print RewriteContent._decode_buff('\xce\xb4\xce\xbf\xce\xba', BytesIO(''), 'utf-8')
77
δοκ
88
99
# read split bytes, read rest
10-
>>> b = BytesIO('\xbf\xce\xba')
11-
>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
10+
#>>> b = BytesIO('\xbf\xce\xba')
11+
#>>> sys.stdout.write(RewriteContent._decode_buff('\xce\xb4\xce', b, 'utf-8')); sys.stdout.write(RewriteContent._decode_buff(b.read(), b, 'utf-8'))
1212
δοκ
1313
1414
# invalid seq
15-
>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
15+
#>>> print RewriteContent._decode_buff('\xce\xb4\xce', BytesIO('\xfe'), 'utf-8')
1616
Traceback (most recent call last):
17-
UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte
17+
"UnicodeDecodeError: 'utf8' codec can't decode byte 0xce in position 2: invalid continuation byte"
18+
19+
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' <html></html>'))
20+
>>> print (text_type, stream.read())
21+
('html', ' <html></html>')
22+
23+
>>> text_type, stream = RewriteContent._resolve_text_type('js', 'html', BytesIO(' function() { return 0; }'))
24+
>>> print (text_type, stream.read())
25+
('js', ' function() { return 0; }')
26+
27+
1828
"""
1929

2030
from pywb.rewrite.rewrite_content import RewriteContent

pywb/static/wb_frame.js

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@ function make_inner_url(url, ts)
4545
function push_state(url, timestamp, capture_str, is_live) {
4646
if (window.frames[0].WB_wombat_location) {
4747
curr_href = window.frames[0].WB_wombat_location.href;
48-
49-
// If not current url, don't update
48+
49+
// If not current url, don't update
5050
if (url != curr_href) {
5151
return;
5252
}
@@ -59,21 +59,21 @@ function push_state(url, timestamp, capture_str, is_live) {
5959
state.url = url;
6060
state.capture_str = capture_str;
6161
state.is_live = is_live;
62-
62+
6363
window.history.replaceState(state, "", state.inner_url);
6464

6565
set_state(state);
6666
}
6767

6868
function pop_state(state) {
6969
set_state(state);
70-
70+
7171
window.frames[0].src = state.inner_url;
7272
}
7373

7474
function extract_ts(url)
7575
{
76-
var result = value.match(TS_REGEX);
76+
var result = url.match(TS_REGEX);
7777
if (!result) {
7878
return "";
7979
}
@@ -112,7 +112,7 @@ function set_state(state) {
112112

113113
window.onpopstate = function(event) {
114114
var state = event.state;
115-
115+
116116
if (state) {
117117
pop_state(state);
118118
}

0 commit comments

Comments
 (0)