Skip to content

Commit e89924b

Browse files
authored
Rename --uncompress-wacz to --unpack-wacz and add docs (#901)
Also adds help text for wb-manager add --unpack-wacz option in CLI
1 parent b4c91c6 commit e89924b

File tree

4 files changed

+33
-19
lines changed

4 files changed

+33
-19
lines changed

docs/manual/apps.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@ The tool can be used while ``wayback`` is running, and pywb will detect many cha
4545
It can be used to:
4646

4747
* Create a new collection -- ``wb-manager init <coll>``
48-
* Add WARCs or WACZs to collection -- ``wb-manager add <coll> <warc/wacz>``
48+
* Add WARCs to collection -- ``wb-manager add <coll> <warc>``
49+
* Unpack WACZs to add their WARCs and indices to collection -- ``wb-manager add --unpack-wacz <coll> <wacz>``
4950
* Add override templates
5051
* Add or remove metadata in a collection's ``metadata.yaml``
5152
* List all collections

docs/manual/usage.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ Using Existing Web Archive Collections
114114
Existing archives of WARC/ARC files can be used with pywb with a minimal amount of setup. By using ``wb-manager add``,
115115
WARC/ARC files will automatically be placed in the collection archive directory and indexed.
116116

117+
In pywb 2.8.0 and later, preliminary support for WACZ files is also available via ``wb-manager add --unpack-wacz``. This will unpack the provided WACZ file, adding its WARCs and indices to the collection.
118+
117119
By default, ``wb-manager`` places new collections in the ``collections/<coll name>`` subdirectory of the current working directory. To specify a different root directory, use ``wb-manager -d <dir>``. Other options can be set in the config file.
118120

119121
If you have a large number of existing CDX index files, pywb will be able to read them as well after running through a simple conversion process.

pywb/manager/manager.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def _assert_coll_exists(self):
121121
'To create a new collection, run\n\n{1} init {0}')
122122
raise IOError(msg.format(self.coll_name, sys.argv[0]))
123123

124-
def add_archives(self, archives, uncompress_wacz=False):
124+
def add_archives(self, archives, unpack_wacz=False):
125125
if not os.path.isdir(self.archive_dir):
126126
raise IOError('Directory {0} does not exist'.
127127
format(self.archive_dir))
@@ -134,11 +134,11 @@ def add_archives(self, archives, uncompress_wacz=False):
134134
if full_path:
135135
warc_paths.append(full_path)
136136
elif self.WACZ_RX.match(archive):
137-
if uncompress_wacz:
138-
self._add_wacz_uncompressed(archive)
137+
if unpack_wacz:
138+
self._add_wacz_unpacked(archive)
139139
else:
140140
raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use '
141-
'\'--uncompress-wacz\' flag to add the wacz\'s content.')
141+
'\'--unpack-wacz\' flag to add the wacz\'s content.')
142142
else:
143143
invalid_archives.append(archive)
144144

@@ -160,7 +160,7 @@ def _add_warc(self, warc):
160160
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
161161
return full_path
162162

163-
def _add_wacz_uncompressed(self, wacz):
163+
def _add_wacz_unpacked(self, wacz):
164164
wacz = os.path.abspath(wacz)
165165
temp_dir = mkdtemp()
166166
warc_regex = re.compile(r'.+\.warc(\.gz)?$')
@@ -494,11 +494,17 @@ def do_list(r):
494494
# Add Warcs or Waczs
495495
def do_add(r):
496496
m = CollectionsManager(r.coll_name)
497-
m.add_archives(r.files, r.uncompress_wacz)
497+
m.add_archives(r.files, r.unpack_wacz)
498498

499-
add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex'
499+
add_archives_help = 'Copy ARCs/WARCs to collection directory and reindex'
500+
add_unpack_wacz_help = 'Copy WARCs from WACZ to collection directory and reindex'
500501
add_archives = subparsers.add_parser('add', help=add_archives_help)
501-
add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true')
502+
add_archives.add_argument(
503+
'--unpack-wacz',
504+
dest='unpack_wacz',
505+
action='store_true',
506+
help=add_unpack_wacz_help
507+
)
502508
add_archives.add_argument('coll_name')
503509
add_archives.add_argument('files', nargs='+')
504510
add_archives.set_defaults(func=do_add)

tests/test_manager.py

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,19 @@
1111

1212

1313
class TestManager:
14-
def test_add_valid_wacz_uncompressed(self, tmp_path):
14+
def test_add_valid_wacz_unpacked(self, tmp_path):
1515
"""Test if adding a valid wacz file to a collection succeeds"""
1616
manager = self.get_test_collections_manager(tmp_path)
17-
manager._add_wacz_uncompressed(VALID_WACZ_PATH)
17+
manager._add_wacz_unpacked(VALID_WACZ_PATH)
1818
assert 'valid_example_1-0.warc' in os.listdir(manager.archive_dir)
1919
assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)
2020
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
2121
assert '"filename": "valid_example_1-0.warc"' in f.read()
2222

23-
def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
23+
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
2424
"""Test if adding an invalid wacz file to a collection fails"""
2525
manager = self.get_test_collections_manager(tmp_path)
26-
manager._add_wacz_uncompressed(INVALID_WACZ_PATH)
26+
manager._add_wacz_unpacked(INVALID_WACZ_PATH)
2727
assert 'invalid_example_1-0.warc' not in os.listdir(manager.archive_dir)
2828
assert 'sample_archive/waczs/invalid_example_1.wacz does not contain any warc files.' in caplog.text
2929

@@ -32,12 +32,12 @@ def test_add_invalid_wacz_uncompressed(self, tmp_path, caplog):
3232
with open(index_path, 'r') as f:
3333
assert '"filename": "invalid_example_1-0.warc"' not in f.read()
3434

35-
def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
35+
def test_add_valid_archives_unpack_wacz(self, tmp_path):
3636
manager = self.get_test_collections_manager(tmp_path)
3737
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
3838
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
3939
'sample_archive/waczs/valid_example_1.wacz']
40-
manager.add_archives(archives, uncompress_wacz=True)
40+
manager.add_archives(archives, unpack_wacz=True)
4141

4242
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
4343
index_text = f.read()
@@ -51,19 +51,19 @@ def test_add_valid_archives_uncompressed_wacz(self, tmp_path):
5151
assert archive in os.listdir(manager.archive_dir)
5252
assert archive in index_text
5353

54-
def test_add_valid_archives_dont_uncompress_wacz(self, tmp_path):
54+
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
5555
manager = self.get_test_collections_manager(tmp_path)
5656
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',
5757
'sample_archive/warcs/example.warc', 'sample_archive/warcs/example.warc.gz',
5858
'sample_archive/waczs/valid_example_1.wacz']
5959

6060
with pytest.raises(NotImplementedError):
61-
manager.add_archives(archives, uncompress_wacz=False)
61+
manager.add_archives(archives, unpack_wacz=False)
6262

63-
def test_add_invalid_archives_uncompress_wacz(self, tmp_path, caplog):
63+
def test_add_invalid_archives_unpack_wacz(self, tmp_path, caplog):
6464
manager = self.get_test_collections_manager(tmp_path)
6565
manager.add_archives(['sample_archive/warcs/example.warc', 'sample_archive/text_content/sample.html'],
66-
uncompress_wacz=True)
66+
unpack_wacz=True)
6767
assert 'sample.html' not in os.listdir(manager.archive_dir)
6868
assert 'example.warc' in os.listdir(manager.archive_dir)
6969
assert "Invalid archives weren't added: sample_archive/text_content/sample.html" in caplog.messages
@@ -91,10 +91,15 @@ def test_merge_wacz_index_gzip(self, tmp_path):
9191
{'example-collection.warc': 'rewritten.warc'})
9292
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
9393
index_content = f.read()
94+
index_content = index_content.strip()
9495

9596
assert 'example-collection.warc' not in index_content
9697
assert 'rewritten.warc' in index_content
9798

99+
# check that collection index is sorted
100+
index_lines = index_content.split('\n')
101+
assert sorted(index_lines) == index_lines
102+
98103
@staticmethod
99104
def get_test_collections_manager(collections_path):
100105
manager = CollectionsManager(TEST_COLLECTION_NAME, colls_dir=collections_path, must_exist=False)

0 commit comments

Comments
 (0)