Skip to content

Commit b9f1609

Browse files
authored
Handle WARC filename conflicts with wb-manager add (#902)
Append -index to the end of the filename, prior to the extension, until there is no conflict. Also makes sure this behavior is documented in tests.
1 parent e89924b commit b9f1609

File tree

2 files changed

+54
-10
lines changed

2 files changed

+54
-10
lines changed

pywb/manager/manager.py

Lines changed: 26 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import re
88
import gzip
99
import six
10+
import pathlib
1011

1112
from distutils.util import strtobool
1213
from pkg_resources import resource_string, get_distribution
@@ -147,18 +148,32 @@ def add_archives(self, archives, unpack_wacz=False):
147148
if invalid_archives:
148149
logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
149150

151+
def _rename_warc(self, warc_basename):
    """Return a variant of ``warc_basename`` not yet present in the archive dir.

    A numeric index (``-1``, ``-2``, ...) is inserted between the name and
    its extension(s) and incremented until no file with the resulting name
    exists in ``self.archive_dir``. For example ``example.warc.gz`` becomes
    ``example-1.warc.gz``, then ``example-2.warc.gz``.

    :param str warc_basename: file name (without directory) that conflicts
        with an existing file
    :return: the first candidate name that does not exist in
        ``self.archive_dir``
    :rtype: str
    """
    # All suffixes are treated as the extension so '.warc.gz' stays intact.
    ext = ''.join(pathlib.Path(warc_basename).suffixes)

    # Strip the extension by length instead of ``str.split(ext)``: splitting
    # on an empty string raises ValueError for names without an extension,
    # and split() cuts at the first occurrence of ``ext`` rather than the
    # trailing one.
    pre_ext_name = warc_basename[:len(warc_basename) - len(ext)]

    dupe_idx = 1
    while True:
        new_basename = f'{pre_ext_name}-{dupe_idx}{ext}'
        if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
            return new_basename
        dupe_idx += 1
163+
150164
def _add_warc(self, warc):
151-
filename = os.path.abspath(warc)
165+
warc_source = os.path.abspath(warc)
166+
source_dir, warc_basename = os.path.split(warc_source)
152167

153168
# don't overwrite existing warcs with duplicate names
154-
if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))):
155-
logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.')
156-
return None
169+
if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
170+
warc_basename = self._rename_warc(warc_basename)
171+
logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')
157172

158-
shutil.copy2(filename, self.archive_dir)
159-
full_path = os.path.join(self.archive_dir, filename)
160-
logging.info('Copied ' + filename + ' to ' + self.archive_dir)
161-
return full_path
173+
warc_dest = os.path.join(self.archive_dir, warc_basename)
174+
shutil.copy2(warc_source, warc_dest)
175+
logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
176+
return warc_dest
162177

163178
def _add_wacz_unpacked(self, wacz):
164179
wacz = os.path.abspath(wacz)
@@ -198,8 +213,9 @@ def _add_wacz_unpacked(self, wacz):
198213
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
199214

200215
if os.path.exists(warc_destination_path):
201-
logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.')
202-
continue
216+
warc_filename = self._rename_warc(warc_filename)
217+
logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.')
218+
warc_destination_path = os.path.join(self.archive_dir, warc_filename)
203219

204220
warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
205221
shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)

tests/test_manager.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,20 @@ def test_add_valid_wacz_unpacked(self, tmp_path):
2020
with open(os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE), 'r') as f:
2121
assert '"filename": "valid_example_1-0.warc"' in f.read()
2222

23+
def test_add_valid_wacz_unpacked_dupe_name(self, tmp_path):
    """Unpacking the same WACZ twice keeps the first WARC and adds a -1 copy."""
    manager = self.get_test_collections_manager(tmp_path)

    # The second pass collides with the WARC name written by the first one.
    for _ in range(2):
        manager._add_wacz_unpacked(VALID_WACZ_PATH)

    archived_names = os.listdir(manager.archive_dir)
    assert 'valid_example_1-0.warc' in archived_names
    assert 'valid_example_1-0-1.warc' in archived_names

    assert manager.DEF_INDEX_FILE in os.listdir(manager.indexes_dir)

    # Both the original and the renamed WARC must be referenced by the index.
    index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
    with open(index_path, 'r') as index_file:
        index_contents = index_file.read()

    assert '"filename": "valid_example_1-0.warc"' in index_contents
    assert '"filename": "valid_example_1-0-1.warc"' in index_contents
36+
2337
def test_add_invalid_wacz_unpacked(self, tmp_path, caplog):
2438
"""Test if adding an invalid wacz file to a collection fails"""
2539
manager = self.get_test_collections_manager(tmp_path)
@@ -51,6 +65,20 @@ def test_add_valid_archives_unpack_wacz(self, tmp_path):
5165
assert archive in os.listdir(manager.archive_dir)
5266
assert archive in index_text
5367

68+
def test_add_valid_archives_dupe_name(self, tmp_path):
    """Adding one WARC twice stores it under its own name and a -1 name."""
    manager = self.get_test_collections_manager(tmp_path)
    warc_filename = 'sample_archive/warcs/example.warc.gz'

    # Same path listed twice: the second copy conflicts with the first.
    manager.add_archives([warc_filename] * 2)

    index_path = os.path.join(manager.indexes_dir, manager.DEF_INDEX_FILE)
    with open(index_path, 'r') as index_file:
        index_text = index_file.read()

    archived_names = os.listdir(manager.archive_dir)
    for expected_name in ('example.warc.gz', 'example-1.warc.gz'):
        assert expected_name in archived_names
        assert expected_name in index_text
81+
5482
def test_add_valid_archives_dont_unpack_wacz(self, tmp_path):
5583
manager = self.get_test_collections_manager(tmp_path)
5684
archives = ['sample_archive/warcs/example.arc', 'sample_archive/warcs/example.arc.gz',

0 commit comments

Comments
 (0)