Skip to content

Commit f190190

Browse files
authored
Revisit headers load fix (#751)
* revisit loading fix for revisit records with http headers: - if revisit record has http headers, always use those headers - otherwise, continue to use http headers from payload record - parse headers of http and payload records on initial lookup, to simplify loading - tests: add test for loading revisit records with different urls, different headers but same payload - fix for sul-dlss/was-pywb#64 * also bump version to 2.6.8
1 parent 49393ce commit f190190

File tree

4 files changed

+168
-18
lines changed

4 files changed

+168
-18
lines changed

pywb/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = '2.6.7'
1+
__version__ = '2.6.8'
22

33
if __name__ == '__main__':
44
print(__version__)

pywb/warcserver/resource/responseloader.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ def __init__(self, paths, cdx_source):
172172
self.resolvers = self.make_resolvers(self.paths)
173173

174174
self.resolve_loader = ResolvingLoader(self.resolvers,
175-
no_record_parse=True)
175+
no_record_parse=False)
176176

177177
self.headers_parser = StatusAndHeadersParser([], verify=False)
178178

@@ -206,18 +206,20 @@ def local_index_query(local_params):
206206
local_index_query))
207207

208208
http_headers_buff = None
209+
209210
if payload.rec_type in ('response', 'revisit'):
210211
status = cdx.get('status')
211212

213+
try:
214+
orig_size = payload.raw_stream.tell()
215+
except:
216+
orig_size = 0
217+
218+
http_headers = headers.http_headers or payload.http_headers
219+
212220
# if status is not set and not, 2xx, 4xx, 5xx
213221
# go through self-redirect check just in case
214222
if not status or not status.startswith(('2', '4', '5')):
215-
http_headers = self.headers_parser.parse(payload.raw_stream)
216-
try:
217-
orig_size = payload.raw_stream.tell()
218-
except:
219-
orig_size = 0
220-
221223
try:
222224
self.raise_on_self_redirect(params, cdx,
223225
http_headers.get_statuscode(),
@@ -227,15 +229,15 @@ def local_index_query(local_params):
227229
no_except_close(payload.raw_stream)
228230
raise
229231

230-
http_headers_buff = http_headers.to_bytes()
232+
http_headers_buff = http_headers and http_headers.to_bytes()
231233

232-
# if new http_headers_buff is different length,
233-
# attempt to adjust content-length on the WARC record
234-
if orig_size and len(http_headers_buff) != orig_size:
235-
orig_cl = payload.rec_headers.get_header('Content-Length')
236-
if orig_cl:
237-
new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
238-
payload.rec_headers.replace_header('Content-Length', str(new_cl))
234+
# if new http_headers_buff is different length,
235+
# attempt to adjust content-length on the WARC record
236+
if http_headers and orig_size and len(http_headers_buff) != orig_size:
237+
orig_cl = payload.rec_headers.get_header('Content-Length')
238+
if orig_cl:
239+
new_cl = int(orig_cl) + (len(http_headers_buff) - orig_size)
240+
payload.rec_headers.replace_header('Content-Length', str(new_cl))
239241

240242
warc_headers = payload.rec_headers
241243

tests/test_integration.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,15 @@ def test_replay_content_head(self, fmod):
102102
assert not resp.headers.get('Content-Length')
103103

104104
def test_replay_content_head_non_zero_content_length_match(self):
105-
resp = self.testapp.get('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
105+
resp = self.testapp.get('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
106106
length = resp.content_length
107+
print('length', length)
107108

108109
# Content-Length included if non-zero
109-
resp = self.testapp.head('/pywb/id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
110+
resp = self.testapp.head('/pywb/20140126200625id_/http://www.iana.org/_js/2013.1/jquery.js', status=200)
110111

111112
#assert resp.headers['Content-Length'] == length
113+
print('length', resp.content_length)
112114
assert resp.content_length == length
113115

114116
def test_replay_content(self, fmod):

tests/test_redirect_revisits.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
2+
from io import BytesIO
3+
import os
4+
5+
from warcio import WARCWriter, StatusAndHeaders
6+
from pywb.manager.manager import main as wb_manager
7+
8+
from .base_config_test import BaseConfigTest, CollsDirMixin, fmod
9+
10+
11+
# ============================================================================
12+
class TestRevisits(CollsDirMixin, BaseConfigTest):
13+
@classmethod
14+
def setup_class(cls):
15+
super(TestRevisits, cls).setup_class('config_test.yaml')
16+
17+
18+
def create_revisit_record(self, url, date, headers, refers_to_uri, refers_to_date):
19+
http_headers = StatusAndHeaders(
20+
"301 Permanent Redirect", headers, protocol="HTTP/1.0"
21+
)
22+
23+
return self.writer.create_revisit_record(
24+
url,
25+
digest="sha1:B6QJ6BNJ3R4B23XXMRKZKHLPGJY2VE4O",
26+
refers_to_uri=refers_to_uri,
27+
refers_to_date=refers_to_date,
28+
warc_headers_dict={"WARC-Date": date},
29+
http_headers=http_headers,
30+
)
31+
32+
33+
def create_response_record(self, url, date, headers, payload):
34+
http_headers = StatusAndHeaders(
35+
"301 Permanent Redirect", headers, protocol="HTTP/1.0"
36+
)
37+
38+
return self.writer.create_warc_record(
39+
url,
40+
record_type="response",
41+
http_headers=http_headers,
42+
payload=BytesIO(payload),
43+
warc_headers_dict={"WARC-Date": date},
44+
length=len(payload),
45+
)
46+
47+
def create(self):
48+
payload = b"some\ntext"
49+
50+
# record 1
51+
self.writer.write_record(
52+
self.create_response_record(
53+
"http://example.com/orig-1",
54+
"2020-01-01T00:00:00Z",
55+
[
56+
("Content-Type", 'text/plain; charset="UTF-8"'),
57+
("Location", "https://example.com/redirect-1"),
58+
("Content-Length", str(len(payload))),
59+
("Custom", "1"),
60+
],
61+
payload,
62+
)
63+
)
64+
65+
# record 2
66+
self.writer.write_record(
67+
self.create_response_record(
68+
"http://example.com/orig-2",
69+
"2020-01-01T00:00:00Z",
70+
[
71+
("Content-Type", 'text/plain; charset="UTF-8"'),
72+
("Location", "https://example.com/redirect-2"),
73+
("Content-Length", str(len(payload))),
74+
("Custom", "2"),
75+
],
76+
payload,
77+
)
78+
)
79+
80+
# record 3
81+
self.writer.write_record(
82+
self.create_revisit_record(
83+
"http://example.com/orig-2",
84+
"2022-01-01T00:00:00Z",
85+
[
86+
("Content-Type", 'text/plain; charset="UTF-8"'),
87+
("Location", "https://example.com/redirect-3"),
88+
("Content-Length", str(len(payload))),
89+
("Custom", "3"),
90+
],
91+
refers_to_uri="http://example.com/orig-1",
92+
refers_to_date="2020-01-01T00:00:00Z",
93+
)
94+
)
95+
96+
# record 4
97+
self.writer.write_record(
98+
self.create_revisit_record(
99+
"http://example.com/",
100+
"2022-01-01T00:00:00Z",
101+
[
102+
("Content-Type", 'text/plain; charset="UTF-8"'),
103+
("Location", "https://example.com/redirect-4"),
104+
("Content-Length", str(len(payload))),
105+
("Custom", "4"),
106+
],
107+
refers_to_uri="http://example.com/orig-2",
108+
refers_to_date="2020-01-01T00:00:00Z",
109+
)
110+
)
111+
112+
def test_init(self):
113+
filename = os.path.join(self.root_dir, 'redir.warc.gz')
114+
with open(filename, 'wb') as fh:
115+
self.writer = WARCWriter(fh, gzip=True)
116+
self.create()
117+
118+
wb_manager(['init', 'revisits'])
119+
120+
wb_manager(['add', 'revisits', filename])
121+
122+
assert os.path.isfile(os.path.join(self.root_dir, self.COLLS_DIR, 'revisits', 'indexes', 'index.cdxj'))
123+
124+
def test_different_url_revisit_orig_headers(self, fmod):
125+
res = self.get('/revisits/20220101{0}/http://example.com/', fmod, status=301)
126+
assert res.headers["Custom"] == "4"
127+
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-4".format(fmod))
128+
assert res.text == 'some\ntext'
129+
130+
def test_different_url_revisit_and_response(self, fmod):
131+
res = self.get('/revisits/20200101{0}/http://example.com/orig-2', fmod, status=301)
132+
assert res.headers["Custom"] == "2"
133+
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-2".format(fmod))
134+
assert res.text == 'some\ntext'
135+
136+
res = self.get('/revisits/20220101{0}/http://example.com/orig-2', fmod, status=301)
137+
assert res.headers["Custom"] == "3"
138+
assert res.headers["Location"].endswith("/20220101{0}/https://example.com/redirect-3".format(fmod))
139+
assert res.text == 'some\ntext'
140+
141+
def test_orig(self, fmod):
142+
res = self.get('/revisits/20200101{0}/http://example.com/orig-1', fmod, status=301)
143+
assert res.headers["Custom"] == "1"
144+
assert res.headers["Location"].endswith("/20200101{0}/https://example.com/redirect-1".format(fmod))
145+
assert res.text == 'some\ntext'
146+

0 commit comments

Comments
 (0)