|
7 | 7 | import re
|
8 | 8 | import gzip
|
9 | 9 | import six
|
| 10 | +import pathlib |
10 | 11 |
|
11 | 12 | from distutils.util import strtobool
|
12 | 13 | from pkg_resources import resource_string, get_distribution
|
@@ -147,18 +148,32 @@ def add_archives(self, archives, unpack_wacz=False):
|
147 | 148 | if invalid_archives:
|
148 | 149 | logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}')
|
149 | 150 |
|
def _rename_warc(self, warc_basename):
    """Return a variant of ``warc_basename`` that is unused in ``self.archive_dir``.

    A numeric counter is inserted between the stem and the (possibly
    multi-part) extension, e.g. ``example.warc.gz`` -> ``example-1.warc.gz``.
    The counter starts at 1 and is incremented until no file with the
    candidate name exists in ``self.archive_dir``.

    :param warc_basename: file name (without directory) to derive a new name from
    :return: the first candidate basename that does not exist in ``self.archive_dir``
    """
    ext = ''.join(pathlib.Path(warc_basename).suffixes)

    # Slice the extension off the end instead of using ``str.split(ext)``:
    # split() raises ValueError for an empty separator (names without any
    # suffix) and would cut at an earlier occurrence of ``ext`` in the name.
    pre_ext_name = warc_basename[:len(warc_basename) - len(ext)]

    dupe_idx = 1
    while True:
        new_basename = f'{pre_ext_name}-{dupe_idx}{ext}'
        if not os.path.exists(os.path.join(self.archive_dir, new_basename)):
            return new_basename
        dupe_idx += 1
| 163 | + |
def _add_warc(self, warc):
    """Copy a WARC file into ``self.archive_dir`` and return the copy's path.

    If a file with the same basename already exists in the archive
    directory, the copy is stored under the name returned by
    ``self._rename_warc`` rather than overwriting the existing file.

    :param warc: path to the source WARC file; it is made absolute before copying
    :return: path of the copied file inside ``self.archive_dir``
    """
    warc_source = os.path.abspath(warc)
    warc_basename = os.path.basename(warc_source)

    # don't overwrite existing warcs with duplicate names
    if os.path.exists(os.path.join(self.archive_dir, warc_basename)):
        warc_basename = self._rename_warc(warc_basename)
        logging.info(f'Warc {os.path.basename(warc)} already exists - renamed to {warc_basename}.')

    warc_dest = os.path.join(self.archive_dir, warc_basename)
    shutil.copy2(warc_source, warc_dest)
    logging.info(f'Copied {warc} to {self.archive_dir} as {warc_basename}')
    return warc_dest
162 | 177 |
|
163 | 178 | def _add_wacz_unpacked(self, wacz):
|
164 | 179 | wacz = os.path.abspath(wacz)
|
@@ -198,8 +213,9 @@ def _add_wacz_unpacked(self, wacz):
|
198 | 213 | warc_destination_path = os.path.join(self.archive_dir, warc_filename)
|
199 | 214 |
|
200 | 215 | if os.path.exists(warc_destination_path):
|
201 |
| - logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.') |
202 |
| - continue |
| 216 | + warc_filename = self._rename_warc(warc_filename) |
| 217 | + logging.info(f'Warc {warc_destination_path} already exists - renamed to {warc_filename}.') |
| 218 | + warc_destination_path = os.path.join(self.archive_dir, warc_filename) |
203 | 219 |
|
204 | 220 | warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename
|
205 | 221 | shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path)
|
|
0 commit comments