Skip to content

Commit 4303ce4

Browse files
committed
warc indexing: better handling of records with content-length too small, read first line to get to warc end (fixes indexing of warc in ikreymer/webarchiveplayer#14)
1 parent 2922801 commit 4303ce4

File tree

1 file changed

+24
-2
lines changed

1 file changed

+24
-2
lines changed

pywb/warc/archiveiterator.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,14 @@ class ArchiveIterator(object):
4343
package which will create a properly chunked gzip file:
4444
4545
warc2warc -Z myfile.{0} > myfile.{0}.gz
46-
"""
46+
"""
47+
48+
INC_RECORD = """\
49+
WARNING: Record not followed by newline, perhaps Content-Length is invalid
50+
Offset: {0}
51+
Remainder: {1}
52+
"""
53+
4754

4855
def __init__(self, fileobj, no_record_parse=False,
4956
verify_http=False):
@@ -130,15 +137,30 @@ def _consume_blanklines(self):
130137
131138
count empty_size so that it can be subtracted from
132139
the record length for uncompressed
140+
141+
if first line read is not blank, likely error in WARC/ARC,
142+
display a warning
133143
"""
134144
empty_size = 0
145+
first_line = True
146+
135147
while True:
136148
line = self.reader.readline()
137149
if len(line) == 0:
138150
return None, empty_size
139151

140-
if line.rstrip() == '':
152+
stripped = line.rstrip()
153+
154+
if stripped == '' or first_line:
141155
empty_size += len(line)
156+
157+
if stripped != '':
158+
# if first line is not blank,
159+
# likely content-length was invalid, display warning
160+
err_offset = self.fh.tell() - self.reader.rem_length() - empty_size
161+
sys.stderr.write(self.INC_RECORD.format(err_offset, line))
162+
163+
first_line = False
142164
continue
143165

144166
return line, empty_size

0 commit comments

Comments
 (0)