|
5 | 5 | import heapq
|
6 | 6 | import yaml
|
7 | 7 | import re
|
| 8 | +import gzip |
8 | 9 | import six
|
9 | 10 |
|
10 | 11 | from distutils.util import strtobool
|
11 | 12 | from pkg_resources import resource_string, get_distribution
|
12 | 13 |
|
13 | 14 | from argparse import ArgumentParser, RawTextHelpFormatter
|
| 15 | +from tempfile import mkdtemp |
| 16 | +from zipfile import ZipFile |
14 | 17 |
|
15 | 18 | from pywb.utils.loaders import load_yaml_config
|
16 | 19 | from warcio.timeutils import timestamp20_now
|
@@ -47,6 +50,9 @@ class CollectionsManager(object):
|
47 | 50 |
|
48 | 51 | COLLS_DIR = 'collections'
|
49 | 52 |
|
| 53 | + WARC_RX = re.compile(r'.*\.w?arc(\.gz)?$') |
| 54 | + WACZ_RX = re.compile(r'.*\.wacz$') |
| 55 | + |
50 | 56 | def __init__(self, coll_name, colls_dir=None, must_exist=True):
|
51 | 57 | colls_dir = colls_dir or self.COLLS_DIR
|
52 | 58 | self.default_config = load_yaml_config(DEFAULT_CONFIG)
|
@@ -115,29 +121,127 @@ def _assert_coll_exists(self):
|
115 | 121 | 'To create a new collection, run\n\n{1} init {0}')
|
116 | 122 | raise IOError(msg.format(self.coll_name, sys.argv[0]))
|
117 | 123 |
|
118 |
| - def add_warcs(self, warcs): |
| 124 | + def add_archives(self, archives, uncompress_wacz=False): |
119 | 125 | if not os.path.isdir(self.archive_dir):
|
120 | 126 | raise IOError('Directory {0} does not exist'.
|
121 | 127 | format(self.archive_dir))
|
122 | 128 |
|
123 |
| - full_paths = [] |
124 |
| - duplicate_warcs = [] |
125 |
| - for filename in warcs: |
126 |
| - filename = os.path.abspath(filename) |
| 129 | + invalid_archives = [] |
| 130 | + warc_paths = [] |
| 131 | + for archive in archives: |
| 132 | + if self.WARC_RX.match(archive): |
| 133 | + full_path = self._add_warc(archive) |
| 134 | + if full_path: |
| 135 | + warc_paths.append(full_path) |
| 136 | + elif self.WACZ_RX.match(archive): |
| 137 | + if uncompress_wacz: |
| 138 | + self._add_wacz_uncompressed(archive) |
| 139 | + else: |
| 140 | + raise NotImplementedError('Adding waczs without unpacking is not yet implemented. Use ' |
| 141 | + '\'--uncompress-wacz\' flag to add the wacz\'s content.') |
| 142 | + else: |
| 143 | + invalid_archives.append(archive) |
| 144 | + |
| 145 | + self._index_merge_warcs(warc_paths, self.DEF_INDEX_FILE) |
| 146 | + |
| 147 | + if invalid_archives: |
| 148 | + logging.warning(f'Invalid archives weren\'t added: {", ".join(invalid_archives)}') |
| 149 | + |
| 150 | + def _add_warc(self, warc): |
| 151 | + filename = os.path.abspath(warc) |
| 152 | + |
| 153 | + # don't overwrite existing warcs with duplicate names |
| 154 | + if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): |
| 155 | + logging.warning(f'Warc {filename} wasn\'t added because of duplicate name.') |
| 156 | + return None |
| 157 | + |
| 158 | + shutil.copy2(filename, self.archive_dir) |
| 159 | + full_path = os.path.join(self.archive_dir, filename) |
| 160 | + logging.info('Copied ' + filename + ' to ' + self.archive_dir) |
| 161 | + return full_path |
| 162 | + |
| 163 | + def _add_wacz_uncompressed(self, wacz): |
| 164 | + wacz = os.path.abspath(wacz) |
| 165 | + temp_dir = mkdtemp() |
| 166 | + warc_regex = re.compile(r'.+\.warc(\.gz)?$') |
| 167 | + cdx_regex = re.compile(r'.+\.cdx(\.gz)?$') |
| 168 | + with ZipFile(wacz, 'r') as wacz_zip_file: |
| 169 | + archive_members = wacz_zip_file.namelist() |
| 170 | + warc_files = [file for file in archive_members if warc_regex.match(file)] |
| 171 | + if not warc_files: |
| 172 | + logging.warning(f'WACZ {wacz} does not contain any warc files.') |
| 173 | + return |
| 174 | + |
| 175 | + # extract warc files |
| 176 | + for warc_file in warc_files: |
| 177 | + wacz_zip_file.extract(warc_file, temp_dir) |
127 | 178 |
|
128 |
| - # don't overwrite existing warcs with duplicate names |
129 |
| - if os.path.exists(os.path.join(self.archive_dir, os.path.basename(filename))): |
130 |
| - duplicate_warcs.append(filename) |
| 179 | + cdx_files = [file for file in archive_members if cdx_regex.match(file)] |
| 180 | + if not cdx_files: |
| 181 | + logging.warning(f'WACZ {wacz} does not contain any indices.') |
| 182 | + return |
| 183 | + |
| 184 | + for cdx_file in cdx_files: |
| 185 | + wacz_zip_file.extract(cdx_file, temp_dir) |
| 186 | + |
| 187 | + # copy extracted warc files to collections archive dir, use wacz filename as filename with added index if |
| 188 | + # multiple warc files exist |
| 189 | + warc_filename_mapping = {} |
| 190 | + full_paths = [] |
| 191 | + for idx, extracted_warc_file in enumerate(warc_files): |
| 192 | + _, warc_ext = os.path.splitext(extracted_warc_file) |
| 193 | + if warc_ext == '.gz': |
| 194 | + warc_ext = '.warc.gz' |
| 195 | + warc_filename = os.path.basename(wacz) |
| 196 | + warc_filename, _ = os.path.splitext(warc_filename) |
| 197 | + warc_filename = f'{warc_filename}-{idx}{warc_ext}' |
| 198 | + warc_destination_path = os.path.join(self.archive_dir, warc_filename) |
| 199 | + |
| 200 | + if os.path.exists(warc_destination_path): |
| 201 | + logging.warning(f'Warc {warc_filename} wasn\'t added because of duplicate name.') |
131 | 202 | continue
|
132 | 203 |
|
133 |
| - shutil.copy2(filename, self.archive_dir) |
134 |
| - full_paths.append(os.path.join(self.archive_dir, filename)) |
135 |
| - logging.info('Copied ' + filename + ' to ' + self.archive_dir) |
| 204 | + warc_filename_mapping[os.path.basename(extracted_warc_file)] = warc_filename |
| 205 | + shutil.copy2(os.path.join(temp_dir, extracted_warc_file), warc_destination_path) |
| 206 | + full_paths.append(warc_destination_path) |
136 | 207 |
|
137 |
| - self._index_merge_warcs(full_paths, self.DEF_INDEX_FILE) |
| 208 | + # rewrite filenames in wacz indices and merge them with collection index file |
| 209 | + for cdx_file in cdx_files: |
| 210 | + self._add_wacz_index(os.path.join(self.indexes_dir, self.DEF_INDEX_FILE), os.path.join(temp_dir, cdx_file), |
| 211 | + warc_filename_mapping) |
138 | 212 |
|
139 |
| - if duplicate_warcs: |
140 |
| - logging.warning(f'Warcs {", ".join(duplicate_warcs)} weren\'t added because of duplicate names.') |
| 213 | + # delete temporary files |
| 214 | + shutil.rmtree(temp_dir) |
| 215 | + |
| 216 | + @staticmethod |
| 217 | + def _add_wacz_index(collection_index_path, wacz_index_path, filename_mapping): |
| 218 | + from pywb.warcserver.index.cdxobject import CDXObject |
| 219 | + |
| 220 | + # copy collection index to temporary directory |
| 221 | + tempdir = mkdtemp() |
| 222 | + collection_index_name = os.path.basename(collection_index_path) |
| 223 | + collection_index_temp_path = os.path.join(tempdir, collection_index_name) |
| 224 | + |
| 225 | + if os.path.exists(collection_index_path): |
| 226 | + shutil.copy2(collection_index_path, collection_index_temp_path) |
| 227 | + |
| 228 | + with open(collection_index_temp_path, 'a') as collection_index_temp_file: |
| 229 | + if wacz_index_path.endswith('.gz'): |
| 230 | + wacz_index_file = gzip.open(wacz_index_path, 'rb') |
| 231 | + else: |
| 232 | + wacz_index_file = open(wacz_index_path, 'rb') |
| 233 | + collection_index_temp_file.write('\n') |
| 234 | + for line in wacz_index_file.readlines(): |
| 235 | + cdx_object = CDXObject(cdxline=line) |
| 236 | + if cdx_object['filename'] in filename_mapping: |
| 237 | + cdx_object['filename'] = filename_mapping[cdx_object['filename']] |
| 238 | + collection_index_temp_file.write(cdx_object.to_cdxj()) |
| 239 | + |
| 240 | + wacz_index_file.close() |
| 241 | + |
| 242 | + # copy temporary index back to original location and delete temporary directory |
| 243 | + shutil.move(collection_index_temp_path, collection_index_path) |
| 244 | + shutil.rmtree(tempdir) |
141 | 245 |
|
142 | 246 | def reindex(self):
|
143 | 247 | cdx_file = os.path.join(self.indexes_dir, self.DEF_INDEX_FILE)
|
@@ -383,16 +487,17 @@ def do_list(r):
|
383 | 487 | listcmd = subparsers.add_parser('list', help=list_help)
|
384 | 488 | listcmd.set_defaults(func=do_list)
|
385 | 489 |
|
386 |
| - # Add Warcs |
| 490 | + # Add Warcs or Waczs |
387 | 491 | def do_add(r):
|
388 | 492 | m = CollectionsManager(r.coll_name)
|
389 |
| - m.add_warcs(r.files) |
390 |
| - |
391 |
| - addwarc_help = 'Copy ARCS/WARCS to collection directory and reindex' |
392 |
| - addwarc = subparsers.add_parser('add', help=addwarc_help) |
393 |
| - addwarc.add_argument('coll_name') |
394 |
| - addwarc.add_argument('files', nargs='+') |
395 |
| - addwarc.set_defaults(func=do_add) |
| 493 | + m.add_archives(r.files, r.uncompress_wacz) |
| 494 | + |
| 495 | + add_archives_help = 'Copy ARCS/WARCS/WACZ to collection directory and reindex' |
| 496 | + add_archives = subparsers.add_parser('add', help=add_archives_help) |
| 497 | + add_archives.add_argument('--uncompress-wacz', dest='uncompress_wacz', action='store_true') |
| 498 | + add_archives.add_argument('coll_name') |
| 499 | + add_archives.add_argument('files', nargs='+') |
| 500 | + add_archives.set_defaults(func=do_add) |
396 | 501 |
|
397 | 502 | # Reindex All
|
398 | 503 | def do_reindex(r):
|
|
0 commit comments