Skip to content

Commit c121198

Browse files
authored
revisit of redirect optimization: (#753)
- if a revisit is of a redirect (3xx response) and the revisit has HTTP headers, return the HTTP headers with an empty payload -- don't bother loading the original record
- builds on changes in #751
- cleanup redirect revisit tests from #751
1 parent 0cc912d commit c121198

File tree

3 files changed

+16
-7
lines changed

3 files changed

+16
-7
lines changed

pywb/warcserver/resource/resolvingloader.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,14 @@ def load_headers_and_payload(self, cdx, failed_files, cdx_loader):
7575
# two index lookups
7676
# Case 1: if mimetype is still warc/revisit
7777
if cdx.get('mime') == 'warc/revisit' and headers_record:
78+
if headers_record.http_headers:
79+
status = headers_record.http_headers.get_statuscode()
80+
# optimization: if redirect, don't load payload record, as it'll be ignored by browser
81+
# always replay zero-length payload
82+
if status and status.startswith('3'):
83+
headers_record.http_headers.replace_header('Content-Length', '0')
84+
return headers_record, headers_record
85+
7886
payload_record = self._load_different_url_payload(cdx,
7987
headers_record,
8088
failed_files,

tests/test_integration.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,13 +104,11 @@ def test_replay_content_head(self, fmod):
104104
def test_replay_content_head_non_zero_content_length_match(self):
105105
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
106106
length = resp.content_length
107-
print('length', length)
108107

109108
# Content-Length included if non-zero
110109
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
111110

112111
#assert resp.headers['Content-Length'] == length
113-
print('length', resp.content_length)
114112
assert resp.content_length == length
115113

116114
def test_replay_content(self, fmod):

tests/test_redirect_revisits.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
1+
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
12

23
from io import BytesIO
34
import os
45

56
from warcio import WARCWriter, StatusAndHeaders
67
from pywb.manager.manager import main as wb_manager
78

8-
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
9-
109

1110
# ============================================================================
1211
class TestRevisits(CollsDirMixin, BaseConfigTest):
@@ -125,18 +124,22 @@ def test_different_url_revisit_orig_headers(self, fmod):
125124
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
126125
assert res.headers["Custom"] == "4"
127126
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
128-
assert res.text == 'some\ntext'
127+
assert res.content_length == 0
128+
assert res.text == ''
129129

130-
def test_different_url_revisit_and_response(self, fmod):
130+
def test_different_url_response_and_revisit(self, fmod):
131+
# response
131132
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
132133
assert res.headers["Custom"] == "2"
133134
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
134135
assert res.text == 'some\ntext'
135136

137+
# revisit
136138
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
137139
assert res.headers["Custom"] == "3"
138140
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
139-
assert res.text == 'some\ntext'
141+
assert res.content_length == 0
142+
assert res.text == ''
140143

141144
def test_orig(self, fmod):
142145
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)

0 commit comments

Comments
 (0)