Skip to content

Commit eeb35ea

Browse files
committed
proxy: add a ProxyRouter wrapper that checks for Content-Length and, if it is missing, either fully buffers the response (HTTP/1.0) or applies chunked transfer encoding (HTTP/1.1); this is separate from replay view buffering
Add tests for buffering and chunked encoding (fixes #143); also test the no-banner, URL-rewrite-only proxy configuration (related to #142)
1 parent 0c96591 commit eeb35ea

File tree

5 files changed

+125
-8
lines changed

5 files changed

+125
-8
lines changed

pywb/framework/proxy.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,15 @@
1010
import ssl
1111

1212
from pywb.rewrite.url_rewriter import SchemeOnlyUrlRewriter
13+
from pywb.rewrite.rewrite_content import RewriteContent
1314
from pywb.utils.wbexception import BadRequestException
1415

1516
from pywb.utils.bufferedreaders import BufferedReader
1617

1718
from pywb.framework.proxy_resolvers import ProxyAuthResolver, CookieResolver, IPCacheResolver
1819

20+
from tempfile import SpooledTemporaryFile
21+
1922

2023
#=================================================================
2124
class ProxyArchivalRouter(ArchivalRouter):
@@ -55,6 +58,7 @@ class ProxyRouter(object):
5558

5659
BLOCK_SIZE = 4096
5760
DEF_MAGIC_NAME = 'pywb.proxy'
61+
BUFF_RESPONSE_MEM_SIZE = 1024*1024
5862

5963
CERT_DL_PEM = '/pywb-ca.pem'
6064
CERT_DL_P12 = '/pywb-ca.p12'
@@ -222,12 +226,63 @@ def __call__(self, env):
222226
wbrequest.wb_url.mod = 'uo_'
223227

224228
response = route.handler(wbrequest)
229+
if not response:
230+
return None
225231

232+
# add extra headers for replay responses
226233
if wbrequest.wb_url and wbrequest.wb_url.is_replay():
227234
response.status_headers.replace_headers(self.extra_headers)
228235

236+
# check for content-length
237+
res = response.status_headers.get_header('content-length')
238+
try:
239+
if int(res) > 0:
240+
return response
241+
except:
242+
pass
243+
244+
# need to either chunk or buffer to get content-length
245+
if env.get('SERVER_PROTOCOL') == 'HTTP/1.1':
246+
response.status_headers.remove_header('content-length')
247+
response.status_headers.headers.append(('Transfer-Encoding', 'chunked'))
248+
response.body = self._chunk_encode(response.body)
249+
else:
250+
response.body = self._buffer_response(response.status_headers,
251+
response.body)
252+
229253
return response
230254

255+
@staticmethod
256+
def _chunk_encode(orig_iter):
257+
for buff in orig_iter:
258+
chunk = bytes(buff)
259+
if not len(chunk):
260+
continue
261+
chunk_len = '%X\r\n' % len(chunk)
262+
yield chunk_len
263+
yield chunk
264+
yield '\r\n'
265+
266+
yield '0\r\n\r\n'
267+
268+
@staticmethod
def _buffer_response(status_headers, iterator):
    """Buffer an entire response body so that its length can be declared.

    The body is drained into a ``SpooledTemporaryFile`` created with
    ``ProxyRouter.BUFF_RESPONSE_MEM_SIZE`` as its in-memory size limit.
    Once the total size is known, the ``Content-Length`` header on
    ``status_headers`` is replaced with it, and the buffered data is
    returned as a new generator.

    :param status_headers: header container exposing ``replace_header()``
    :param iterator: iterable of buffers, each convertible with ``bytes()``
    :return: generator over the buffered body, as produced by
        ``RewriteContent.stream_to_gen``
    """
    out = SpooledTemporaryFile(ProxyRouter.BUFF_RESPONSE_MEM_SIZE)

    try:
        size = 0

        for buff in iterator:
            buff = bytes(buff)
            size += len(buff)
            out.write(buff)

        # replace any existing content length with the measured size
        status_headers.replace_header('Content-Length', str(size))

        out.seek(0)
    except BaseException:
        # Do not leave the temporary file open if buffering fails part-way
        # through; the error is re-raised unchanged.
        out.close()
        raise

    # NOTE(review): stream_to_gen presumably closes `out` once it has been
    # fully read -- confirm against RewriteContent.
    return RewriteContent.stream_to_gen(out)
285+
231286
def get_request_socket(self, env):
232287
if not self.ca:
233288
return None
@@ -259,7 +314,8 @@ def handle_connect(self, env):
259314
return WbResponse.text_response('HTTPS Proxy Not Supported',
260315
'405 HTTPS Proxy Not Supported')
261316

262-
sock.send('HTTP/1.0 200 Connection Established\r\n')
317+
sock.send('HTTP/1.1 200 Connection Established\r\n')
318+
sock.send('Proxy-Connection: close\r\n')
263319
sock.send('Server: pywb proxy\r\n')
264320
sock.send('\r\n')
265321

pywb/rewrite/test/test_rewrite_live.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def test_example_1():
203203
status_headers, buff = get_rewritten('http://example.com/', urlrewriter, req_headers={'Connection': 'close'})
204204

205205
# verify header rewriting
206-
assert (('X-Archive-Orig-Content-Length', '1270') in status_headers.headers), status_headers
206+
assert status_headers.get_header('x-archive-orig-content-length') == '1270', status_headers
207207

208208

209209
# verify utf-8 charset detection

pywb/webapp/replay_views.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -187,12 +187,7 @@ def replay_capture(self, wbrequest, cdx, cdx_loader, failed_files):
187187
content_len = 0
188188

189189
if content_len <= 0:
190-
# if proxy mode, must set content-length (or use chunked)
191-
if wbrequest.options.get('is_proxy'):
192-
max_size = 0
193-
else:
194-
max_size = self.buffer_max_size
195-
190+
max_size = self.buffer_max_size
196191
response_iter = self.buffered_response(status_headers,
197192
response_iter,
198193
max_size)
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
collections:
2+
all:
3+
- ./sample_archive/cdx/iana.cdx
4+
5+
archive_paths: ./sample_archive/warcs/
6+
7+
enable_http_proxy: true
8+
9+
buffer_response: false
10+
11+
proxy_options:
12+
enable_https_proxy: false
13+
14+
cookie_resolver: ip
15+
use_default_coll: all
16+
17+
use_banner: false
18+
use_client_rewrite: false

tests/test_proxy_http_no_banner.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from pytest import raises
2+
import webtest
3+
import base64
4+
5+
from pywb.webapp.pywb_init import create_wb_router
6+
from pywb.framework.wsgi_wrappers import init_app
7+
from pywb.cdx.cdxobject import CDXObject
8+
9+
from urlparse import urlsplit
10+
11+
from server_mock import make_setup_module, BaseIntegration
12+
13+
setup_module = make_setup_module('tests/test_config_proxy_no_banner.yaml')
14+
15+
class TestProxyNoBanner(BaseIntegration):
    """Proxy-mode integration tests using the no-banner test config.

    Each test issues a simulated proxy request and checks how the response
    body length is communicated: via ``Transfer-Encoding: chunked`` when
    the request is HTTP/1.1, or without that header when it is HTTP/1.0.
    In both cases the reported ``Content-Length`` must equal the body size.
    """

    def get_url(self, uri, addr='127.0.0.1', server_protocol='HTTP/1.0'):
        """Fetch ``uri`` through the test app as if it were a proxy request."""
        # 'Simulating' a proxy by setting REQUEST_URI explicitly to the full
        # url, with an empty SCRIPT_NAME
        environ = {
            'REQUEST_URI': uri,
            'QUERY_STRING': urlsplit(uri).query,
            'SCRIPT_NAME': '',
            'SERVER_PROTOCOL': server_protocol,
            'REMOTE_ADDR': addr,
        }
        return self.testapp.get('/x-ignore-this-x', extra_environ=environ)

    @staticmethod
    def _check_length_matches_body(resp):
        """Assert that the Content-Length header equals the body size."""
        assert int(resp.headers['Content-Length']) == len(resp.body)

    def test_proxy_chunked(self):
        resp = self.get_url('http://www.iana.org/_img/2013.1/icann-logo.svg',
                            server_protocol='HTTP/1.1')
        assert resp.content_type == 'image/svg+xml'
        assert resp.headers['Transfer-Encoding'] == 'chunked'
        self._check_length_matches_body(resp)

    def test_proxy_buffered(self):
        resp = self.get_url('http://www.iana.org/_img/2013.1/icann-logo.svg',
                            server_protocol='HTTP/1.0')
        assert resp.content_type == 'image/svg+xml'
        assert 'Transfer-Encoding' not in resp.headers
        self._check_length_matches_body(resp)

    def test_proxy_html_url_only_rewrite_buffered(self):
        resp = self.get_url('http://www.iana.org/',
                            server_protocol='HTTP/1.0')
        assert 'Transfer-Encoding' not in resp.headers
        self._check_length_matches_body(resp)

    def test_proxy_js_url_only_rewrite_buffered(self):
        resp = self.get_url('http://www.iana.org/_js/2013.1/iana.js',
                            server_protocol='HTTP/1.0')
        assert 'Transfer-Encoding' not in resp.headers
        self._check_length_matches_body(resp)

    def test_proxy_js_url_only_rewrite_chunked(self):
        resp = self.get_url('http://www.iana.org/_js/2013.1/iana.js',
                            server_protocol='HTTP/1.1')
        assert resp.headers['Transfer-Encoding'] == 'chunked'
        self._check_length_matches_body(resp)

0 commit comments

Comments
 (0)