Skip to content

Commit 454486b

Browse files
authored
[#799] wb-manager: Add wacz archives to collection with --uncompress-wacz (#800)
Add WACZ support for `wb-manager add` by unpacking WACZ files with --uncompress-wacz. A future commit will add pywb support for WACZ files without requiring them to be unpacked.
1 parent b869330 commit 454486b

File tree

6 files changed

+225
-23
lines changed

6 files changed

+225
-23
lines changed

docs/manual/apps.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
4545
It can be used to:
4646

4747
* Create a new collection -- ``wb-manager init <coll>``
48-
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
48+
* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
4949
* Add override templates
5050
* Add and remove metadata in a collection's ``metadata.yaml``
5151
* List all collections

pywb/manager/manager.py

Lines changed: 127 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,15 @@
55
import heapq
66
import yaml
77
import re
8+
import gzip
89
import six
910

1011
from distutils.util import strtobool
1112
from pkg_resources import resource_string, get_distribution
1213

1314
from argparse import ArgumentParser, RawTextHelpFormatter
15+
from tempfile import mkdtemp
16+
from zipfile import ZipFile
1417

1518
from pywb.utils.loaders import load_yaml_config
1619
from warcio.timeutils import timestamp20_now
@@ -47,6 +50,9 @@ class CollectionsManager(object):
4750

4851
COLLS_DIR = 'collections'
4952

53+
WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$')
54+
WACZ_RX = re.compile(r'.*\.wacz$')
55+
5056
def __init__(self, coll_name, colls_dir=None, must_exist=True):
5157
colls_dir = colls_dir or self.COLLS_DIR
5258
self.default_config = load_yaml_config(DEFAULT_CONFIG)
@@ -115,29 +121,127 @@ def _assert_coll_exists(self):
115121
'To create a new collection, run\n\n{1} init {0}')
116122
raise IOError(msg.format(self.coll_name, sys.argv[0]))
117123

118-
def add_warcs(self, warcs):
124+
def add_archives(self, archives, uncompress_wacz=False):
    """Add WARC/ARC and WACZ files to the collection.

    Paths matching ``WARC_RX`` are copied into the archive directory via
    ``_add_warc`` and then indexed into ``DEF_INDEX_FILE``. Paths matching
    ``WACZ_RX`` are unpacked via ``_add_wacz_uncompressed``. Paths matching
    neither pattern are skipped and reported in a single warning.

    :param archives: iterable of paths to the archive files to add
    :param bool uncompress_wacz: unpack WACZ files into the collection;
        adding them without unpacking is not implemented
    :raises IOError: if the collection's archive directory does not exist
    :raises NotImplementedError: if a WACZ file is passed while
        ``uncompress_wacz`` is false
    """
    if not os.path.isdir(self.archive_dir):
        raise IOError('Directory {0} does not exist'.
                      format(self.archive_dir))

    # Materialize once so a one-shot iterable survives both passes below.
    archives = list(archives)

    # Reject unsupported WACZ input before touching the collection: raising
    # in the middle of the loop would leave the WARCs copied so far in the
    # archive directory without ever indexing them.
    if not uncompress_wacz:
        for archive in archives:
            if not self.WARC_RX.match(archive) and self.WACZ_RX.match(archive):
                raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
                                          '\'--uncompress-wacz\' flag to add the wacz\'s content.')

    invalid_archives = []
    warc_paths = []
    for archive in archives:
        if self.WARC_RX.match(archive):
            # _add_warc returns None when the file was skipped as a duplicate
            full_path = self._add_warc(archive)
            if full_path:
                warc_paths.append(full_path)
        elif self.WACZ_RX.match(archive):
            self._add_wacz_uncompressed(archive)
        else:
            invalid_archives.append(archive)

    self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE)

    if invalid_archives:
        logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
149+
150+
def _add_warc(self, warc):
    """Copy a single WARC/ARC file into the collection's archive directory.

    A file whose base name already exists in the archive directory is never
    overwritten; a warning is logged and nothing is copied.

    :param str warc: path to the WARC/ARC file to add
    :return: path of the copy inside the archive directory, or ``None`` if
        the file was skipped because of a duplicate name
    """
    filename = os.path.abspath(warc)
    destination = os.path.join(self.archive_dir, os.path.basename(filename))

    # don't overwrite existing warcs with duplicate names
    if os.path.exists(destination):
        logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
        return None

    shutil.copy2(filename, self.archive_dir)
    logging.info('Copied ' + filename + ' to ' + self.archive_dir)

    # Return the location of the copy. Joining archive_dir with the absolute
    # source path would discard archive_dir and hand back the source path.
    return destination
162+
163+
def _add_wacz_uncompressed(self, wacz):
    """Unpack a WACZ file and add its WARCs and indices to the collection.

    Members ending in ``.warc``/``.warc.gz`` are copied into the archive
    directory as ``<wacz name>-<n>.warc[.gz]``; members ending in
    ``.cdx``/``.cdx.gz`` are merged into the collection's default index via
    ``_add_wacz_index`` with their filenames rewritten to the new WARC names.
    If the WACZ contains no WARC files or no indices, a warning is logged and
    nothing is added.

    :param str wacz: path to the WACZ file
    """
    wacz = os.path.abspath(wacz)
    warc_regex = re.compile(r'.+\.warc(\.gz)?$')
    cdx_regex = re.compile(r'.+\.cdx(\.gz)?$')

    temp_dir = mkdtemp()
    try:
        with ZipFile(wacz, 'r') as wacz_zip_file:
            archive_members = wacz_zip_file.namelist()

            warc_files = [file for file in archive_members if warc_regex.match(file)]
            if not warc_files:
                logging.warning(f'WACZ {wacz} does not contain any warc files.')
                return

            cdx_files = [file for file in archive_members if cdx_regex.match(file)]
            if not cdx_files:
                logging.warning(f'WACZ {wacz} does not contain any indices.')
                return

            # extract warc and index files
            for member in warc_files + cdx_files:
                wacz_zip_file.extract(member, temp_dir)

        # copy extracted warc files to collections archive dir, use wacz filename as filename with added index if
        # multiple warc files exist
        wacz_basename, _ = os.path.splitext(os.path.basename(wacz))
        warc_filename_mapping = {}
        for idx, extracted_warc_file in enumerate(warc_files):
            _, warc_ext = os.path.splitext(extracted_warc_file)
            if warc_ext == '.gz':
                # splitext only yields the last suffix; restore the full one
                warc_ext = '.warc.gz'
            warc_filename = f'{wacz_basename}-{idx}{warc_ext}'
            warc_destination_path = os.path.join(self.archive_dir, warc_filename)

            if os.path.exists(warc_destination_path):
                logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.')
                continue

            warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
            shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)

        # rewrite filenames in wacz indices and merge them with collection index file
        for cdx_file in cdx_files:
            self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE),
                                 os.path.join(temp_dir, cdx_file),
                                 warc_filename_mapping)
    finally:
        # delete temporary files on every exit path, including the early
        # returns above and any exception raised while unpacking or merging
        shutil.rmtree(temp_dir, ignore_errors=True)
215+
216+
@staticmethod
def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping):
    """Append the entries of a WACZ index to the collection index.

    The collection index (if it exists) is copied to a temporary directory,
    every line of the WACZ index is parsed as a ``CDXObject``, its
    ``filename`` field is replaced according to ``filename_mapping`` when a
    mapping exists, and the entry is appended in CDXJ form. The result is
    then moved back over ``collection_index_path``.

    :param str collection_index_path: path of the collection's index file
    :param str wacz_index_path: path of the extracted WACZ index; read
        through ``gzip`` when it ends in ``.gz``
    :param dict filename_mapping: original WARC base name -> name of the
        copy in the collection's archive directory
    """
    from pywb.warcserver.index.cdxobject import CDXObject

    # work on a copy of the collection index in a temporary directory
    tempdir = mkdtemp()
    try:
        collection_index_name = os.path.basename(collection_index_path)
        collection_index_temp_path = os.path.join(tempdir, collection_index_name)

        if os.path.exists(collection_index_path):
            shutil.copy2(collection_index_path, collection_index_temp_path)

        open_wacz_index = gzip.open if wacz_index_path.endswith('.gz') else open

        # both handles are closed even if parsing or writing fails
        with open(collection_index_temp_path, 'a') as collection_index_temp_file, \
                open_wacz_index(wacz_index_path, 'rb') as wacz_index_file:
            # Written unconditionally so appended entries start on a new line.
            # NOTE(review): this leaves an empty line when the existing index
            # already ends with a newline or is empty -- confirm index readers
            # tolerate that.
            collection_index_temp_file.write('\n')
            for line in wacz_index_file:
                cdx_object = CDXObject(cdxline=line)
                if cdx_object['filename'] in filename_mapping:
                    cdx_object['filename'] = filename_mapping[cdx_object['filename']]
                # presumably to_cdxj() output is newline-terminated -- verify
                collection_index_temp_file.write(cdx_object.to_cdxj())

        # NOTE(review): entries are appended, not merged in sorted order;
        # confirm whether the collection index has to stay sorted for lookups.

        # move temporary index back to original location
        shutil.move(collection_index_temp_path, collection_index_path)
    finally:
        # delete temporary directory on success and on failure
        shutil.rmtree(tempdir, ignore_errors=True)
141245

142246
def reindex(self):
143247
cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
@@ -383,16 +487,17 @@ def do_list(r):
383487
listcmd = subparsers.add_parser('list', help=list_help)
384488
listcmd.set_defaults(func=do_list)
385489

386-
# Add Warcs
490+
# Add Warcs or Waczs
387491
def do_add(r):
388492
m = CollectionsManager(r.coll_name)
389-
m.add_warcs(r.files)
390-
391-
addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex'
392-
addwarc = subparsers.add_parser('add', help=addwarc_help)
393-
addwarc.add_argument('coll_name')
394-
addwarc.add_argument('files', nargs='+')
395-
addwarc.set_defaults(func=do_add)
493+
m.add_archives(r.files, r.uncompress_wacz)
494+
495+
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
496+
add_archives = subparsers.add_parser('add', help=add_archives_help)
497+
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
498+
add_archives.add_argument('coll_name')
499+
add_archives.add_argument('files', nargs='+')
500+
add_archives.set_defaults(func=do_add)
396501

397502
# Reindex All
398503
def do_reindex(r):

sample_archive/cdxj/example.cdx.gz

194 Bytes
Binary file not shown.
485 Bytes
Binary file not shown.
4.09 KB
Binary file not shown.

tests/test_manager.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
import os
2+
3+
import pytest
4+
5+
from pywb.manager.manager import CollectionsManager
6+
7+
VALID_WACZ_PATH = 'sample_archive/waczs/valid_example_1.wacz'
8+
INVALID_WACZ_PATH = 'sample_archive/waczs/invalid_example_1.wacz'
9+
10+
TEST_COLLECTION_NAME = 'test-col'
11+
12+
13+
class TestManager:
    """Exercise ``CollectionsManager`` when adding WARC and WACZ archives."""

    @staticmethod
    def get_test_collections_manager(collections_path):
        """Create the test collection below ``collections_path`` and return its manager."""
        collections_manager = CollectionsManager(
            TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)
        collections_manager.add_collection()
        return collections_manager

    @staticmethod
    def _index_path(manager):
        """Return the path of the collection's default index file."""
        return os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)

    def _read_index(self, manager):
        """Return the full text of the collection's default index file."""
        with open(self._index_path(manager), 'r') as index_file:
            return index_file.read()

    def test_add_valid_wacz_uncompressed(self, tmp_path):
        """Test if adding a valid wacz file to a collection succeeds"""
        manager = self.get_test_collections_manager(tmp_path)

        manager._add_wacz_uncompressed(VALID_WACZ_PATH)

        assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
        assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
        assert '"filename": "valid_example_1-0.warc"' in self._read_index(manager)

    def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
        """Test if adding an invalid wacz file to a collection fails"""
        manager = self.get_test_collections_manager(tmp_path)

        manager._add_wacz_uncompressed(INVALID_WACZ_PATH)

        assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
        assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text

        # the index may not exist at all; if it does, it must not mention the warc
        if os.path.exists(self._index_path(manager)):
            assert '"filename": "invalid_example_1-0.warc"' not in self._read_index(manager)

    def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
        """Adding WARCs/ARCs plus an unpacked WACZ stores and indexes every archive."""
        manager = self.get_test_collections_manager(tmp_path)
        archives = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]

        manager.add_archives(archives, uncompress_wacz=True)

        index_text = self._read_index(manager)

        for archive_path in archives:
            stored_name = os.path.basename(archive_path)
            # the WACZ itself is not stored, only the WARC unpacked from it
            if stored_name.endswith('wacz'):
                stored_name = 'valid_example_1-0.warc'

            assert stored_name in os.listdir(manager.archive_dir)
            assert stored_name in index_text

    def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
        """Adding a WACZ without unpacking raises ``NotImplementedError``."""
        manager = self.get_test_collections_manager(tmp_path)
        archives = [
            'sample_archive/warcs/example.arc',
            'sample_archive/warcs/example.arc.gz',
            'sample_archive/warcs/example.warc',
            'sample_archive/warcs/example.warc.gz',
            'sample_archive/waczs/valid_example_1.wacz',
        ]

        with pytest.raises(NotImplementedError):
            manager.add_archives(archives, uncompress_wacz=False)

    def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
        """Files that are neither WARC nor WACZ are skipped with a warning."""
        manager = self.get_test_collections_manager(tmp_path)

        manager.add_archives(
            ['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
            uncompress_wacz=True)

        stored_files = os.listdir(manager.archive_dir)
        assert 'sample.html' not in stored_files
        assert 'example.warc' in stored_files
        assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages

    def test_merge_wacz_index(self, tmp_path):
        """Merging a plain CDXJ index rewrites the mapped filenames."""
        manager = self.get_test_collections_manager(tmp_path)

        manager._add_wacz_index(self._index_path(manager),
                                'sample_archive/cdxj/example.cdxj',
                                {'example.warc.gz': 'rewritten.warc.gz'})

        index_content = self._read_index(manager)
        assert 'example.warc.gz' not in index_content
        assert 'rewritten.warc.gz' in index_content

    def test_merge_wacz_index_gzip(self, tmp_path):
        """Merging a gzipped CDX index rewrites the mapped filenames."""
        manager = self.get_test_collections_manager(tmp_path)

        manager._add_wacz_index(self._index_path(manager),
                                'sample_archive/cdxj/example.cdx.gz',
                                {'example-collection.warc': 'rewritten.warc'})

        index_content = self._read_index(manager)
        assert 'example-collection.warc' not in index_content
        assert 'rewritten.warc' in index_content

0 commit comments

Comments
 (0)