29 | 29 | import datetime  | 
30 | 30 | import itertools  | 
31 | 31 | import logging  | 
32 |  | -import openpyxl  | 
33 | 32 | import os  | 
34 | 33 | import shutil  | 
35 |  | -import stat  | 
36 | 34 | import subprocess  | 
37 | 35 | import sys  | 
38 | 36 | import time  | 
@@ -258,174 +256,6 @@ def create_spreadsheet(args, sips, volumes, logger):  | 
258 | 256 |     logger.info("Description CSV created.")  | 
259 | 257 |  | 
260 | 258 |  | 
261 |  | -def create_aspace_excel_sheet(args, sips, volumes, logger):  | 
262 |  | -    """Create new copy of ASpace XLSX and append rows describing disk images."""  | 
263 |  | -    xlsx_path = os.path.abspath(os.path.join(args.destination, "description.xlsx"))  | 
264 |  | -    template_path = os.path.abspath(  | 
265 |  | -        os.path.join(THIS_DIR, "aspace_template", "aspace_import_template.xlsx")  | 
266 |  | -    )  | 
267 |  | - | 
268 |  | -    try:  | 
269 |  | -        shutil.copyfile(template_path, xlsx_path)  | 
270 |  | -    except OSError as err:  | 
271 |  | -        logger.error(f"Unable to copy ASpace template to destination: {err}")  | 
272 |  | - | 
273 |  | -    # Set ASpace file permissions  | 
274 |  | -    try:  | 
275 |  | -        os.chmod(  | 
276 |  | -            xlsx_path,  | 
277 |  | -            stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH,  | 
278 |  | -        )  | 
279 |  | -    except OSError as err:  | 
280 |  | -        logger.error(f"Error setting permissions: {err}")  | 
281 |  | - | 
282 |  | -    workbook = openpyxl.load_workbook(filename=xlsx_path)  | 
283 |  | -    worksheet = workbook["Data"]  | 
284 |  | - | 
285 |  | -    # TODO: Deduplicate with create_spreadsheet  | 
286 |  | -    # Maybe create a separate method that builds a dict of the info, and  | 
287 |  | -    # handle opening/writing the CSV or XLSX separately  | 
288 |  | -    for item in sorted(os.listdir(sips)):  | 
289 |  | -        sip_path = os.path.join(sips, item)  | 
290 |  | - | 
291 |  | -        if not os.path.isdir(sip_path):  | 
292 |  | -            continue  | 
293 |  | - | 
294 |  | -        disk_volumes = volumes[item]  | 
295 |  | -        number_volumes = len(disk_volumes)  | 
296 |  | - | 
297 |  | -        date_earliest = ""  | 
298 |  | -        date_latest = ""  | 
299 |  | - | 
300 |  | -        # Get and sum information from all DFXML files generated  | 
301 |  | -        dfxml_files = []  | 
302 |  | -        subdoc_dir = os.path.join(sip_path, "metadata", "submissionDocumentation")  | 
303 |  | -        if args.bagfiles:  | 
304 |  | -            subdoc_dir = os.path.join(  | 
305 |  | -                sip_path, "data", "metadata", "submissionDocumentation"  | 
306 |  | -            )  | 
307 |  | -        for root, _, files in os.walk(subdoc_dir):  | 
308 |  | -            for file in files:  | 
309 |  | -                if file.startswith("dfxml"):  | 
310 |  | -                    dfxml_files.append(os.path.join(root, file))  | 
311 |  | - | 
312 |  | -        dfxml_files_info = []  | 
313 |  | -        for dfxml_file in dfxml_files:  | 
314 |  | -            dfxml_info = _parse_dfxml(dfxml_file, logger)  | 
315 |  | -            if not dfxml_info:  | 
316 |  | -                logger.warning(  | 
317 |  | -                    "No fileobjects in DFXML file {} - possibly file system fiwalk doesn't recognize".format(  | 
318 |  | -                        dfxml_file  | 
319 |  | -                    )  | 
320 |  | -                )  | 
321 |  | -                continue  | 
322 |  | -            dfxml_files_info.append(dfxml_info)  | 
323 |  | - | 
324 |  | -        file_count = sum([dfxml_info["files"] for dfxml_info in dfxml_files_info])  | 
325 |  | -        total_bytes = sum([dfxml_info["bytes"] for dfxml_info in dfxml_files_info])  | 
326 |  | -        file_systems = [volume["file_system"] for volume in disk_volumes]  | 
327 |  | -        # Deduplicate list  | 
328 |  | -        file_systems = list(dict.fromkeys(file_systems))  | 
329 |  | -        file_systems_str = ", ".join(file_systems)  | 
330 |  | - | 
331 |  | -        for dfxml_info in dfxml_files_info:  | 
332 |  | -            if not date_earliest or dfxml_info["date_earliest"] < date_earliest:  | 
333 |  | -                date_earliest = dfxml_info["date_earliest"]  | 
334 |  | -            if not date_latest or dfxml_info["date_latest"] > date_latest:  | 
335 |  | -                date_latest = dfxml_info["date_latest"]  | 
336 |  | - | 
337 |  | -        # Create list with empty string for each of template's columns  | 
338 |  | -        row_to_write = []  | 
339 |  | -        for _ in range(173):  | 
340 |  | -            row_to_write.append("")  | 
341 |  | - | 
342 |  | -        # Column indices (within the row) for fields to write  | 
343 |  | -        INDEX_FILENAME = 6  | 
344 |  | -        INDEX_LEVEL_OF_DESCRIPTION = 8  | 
345 |  | -        INDEX_DATE_START = 23  | 
346 |  | -        INDEX_DATE_END = 24  | 
347 |  | -        INDEX_EXTENT_NUMBER = 34  | 
348 |  | -        INDEX_EXTENT_TYPE = 35  | 
349 |  | -        INDEX_SIZE = 36  | 
350 |  | -        INDEX_SCOPE_CONTENTS = 170  | 
351 |  | - | 
352 |  | -        # Fields that are always constant  | 
353 |  | -        row_to_write[INDEX_FILENAME] = item  | 
354 |  | -        row_to_write[INDEX_LEVEL_OF_DESCRIPTION] = "File"  | 
355 |  | - | 
356 |  | -        if file_count == 0:  | 
357 |  | -            row_to_write[  | 
358 |  | -                INDEX_SCOPE_CONTENTS  | 
359 |  | -            ] = "Error gathering statistics from SIP directory"  | 
360 |  | - | 
361 |  | -            worksheet.append(row_to_write)  | 
362 |  | - | 
363 |  | -            logger.error("Unable to read DFXML files for {}".format(sip_path))  | 
364 |  | -            continue  | 
365 |  | - | 
366 |  | -        # Get file formats from Brunnhilde  | 
367 |  | -        file_formats = []  | 
368 |  | -        file_format_csv = os.path.join(  | 
369 |  | -            sip_path,  | 
370 |  | -            "metadata",  | 
371 |  | -            "submissionDocumentation",  | 
372 |  | -            "brunnhilde",  | 
373 |  | -            "csv_reports",  | 
374 |  | -            "formats.csv",  | 
375 |  | -        )  | 
376 |  | -        if args.bagfiles:  | 
377 |  | -            file_format_csv = os.path.join(  | 
378 |  | -                sip_path,  | 
379 |  | -                "data",  | 
380 |  | -                "metadata",  | 
381 |  | -                "submissionDocumentation",  | 
382 |  | -                "brunnhilde",  | 
383 |  | -                "csv_reports",  | 
384 |  | -                "formats.csv",  | 
385 |  | -            )  | 
386 |  | - | 
387 |  | -        try:  | 
388 |  | -            with open(file_format_csv, "r") as f:  | 
389 |  | -                reader = csv.reader(f)  | 
390 |  | -                next(reader)  | 
391 |  | -                for row in itertools.islice(reader, 5):  | 
392 |  | -                    file_formats.append(row[0])  | 
393 |  | -        except Exception:  | 
394 |  | -            file_formats.append(  | 
395 |  | -                "ERROR! No Brunnhilde formats.csv file to pull formats from."  | 
396 |  | -            )  | 
397 |  | - | 
398 |  | -        file_formats = [element or "Unidentified" for element in file_formats]  | 
399 |  | -        file_formats_str = ", ".join(file_formats)  | 
400 |  | - | 
401 |  | -        if number_volumes > 1:  | 
402 |  | -            scope_content = "Files exported from {} volumes with file systems: {}. File formats: {}".format(  | 
403 |  | -                number_volumes, file_systems_str, file_formats_str  | 
404 |  | -            )  | 
405 |  | -        else:  | 
406 |  | -            scope_content = (  | 
407 |  | -                "Files exported from {} file system volume. File formats: {}".format(  | 
408 |  | -                    disk_volumes[0]["file_system"], file_formats_str  | 
409 |  | -                )  | 
410 |  | -            )  | 
411 |  | - | 
412 |  | -        row_to_write[INDEX_DATE_START] = str(date_earliest[:4])  | 
413 |  | -        row_to_write[INDEX_DATE_END] = str(date_latest[:4])  | 
414 |  | -        row_to_write[INDEX_EXTENT_NUMBER] = str(file_count)  | 
415 |  | -        row_to_write[INDEX_EXTENT_TYPE] = "digital files"  | 
416 |  | -        row_to_write[INDEX_SIZE] = str(human_readable_size(total_bytes))  | 
417 |  | -        row_to_write[INDEX_SCOPE_CONTENTS] = scope_content  | 
418 |  | - | 
419 |  | -        worksheet.append(row_to_write)  | 
420 |  | - | 
421 |  | -        logger.info(f"Described {sip_path} successfully.")  | 
422 |  | - | 
423 |  | -    workbook.save(filename=xlsx_path)  | 
424 |  | -    workbook.close()  | 
425 |  | - | 
426 |  | -    logger.info("ArchivesSpace description XLSX created.")  | 
427 |  | - | 
428 |  | - | 
429 | 259 | def _parse_dfxml(dfxml_path, logger, export_all=False):  | 
430 | 260 |     """Parse DFXML and return dict of information for spreadsheet."""  | 
431 | 261 |     volume_info = {  | 
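Note on the TODO at old line 285: the removed function duplicated most of the statistics-gathering in create_spreadsheet. A minimal sketch of the refactor that comment suggests, using a hypothetical _gather_sip_info helper (the name and the returned keys are assumptions, not part of this change), so that each writer only handles serialization:

    def _gather_sip_info(args, sip_path, disk_volumes, logger):
        """Collect the per-SIP fields both the CSV and XLSX writers need.

        Hypothetical helper sketched from the removed code; not in this change.
        """
        subdoc_dir = os.path.join(sip_path, "metadata", "submissionDocumentation")
        if args.bagfiles:
            subdoc_dir = os.path.join(
                sip_path, "data", "metadata", "submissionDocumentation"
            )

        # Parse every DFXML report found under submissionDocumentation.
        dfxml_info = []
        for root, _, files in os.walk(subdoc_dir):
            for file in files:
                if file.startswith("dfxml"):
                    info = _parse_dfxml(os.path.join(root, file), logger)
                    if info:
                        dfxml_info.append(info)

        return {
            "files": sum(info["files"] for info in dfxml_info),
            "bytes": sum(info["bytes"] for info in dfxml_info),
            "date_earliest": min(
                (info["date_earliest"] for info in dfxml_info), default=""
            ),
            "date_latest": max(
                (info["date_latest"] for info in dfxml_info), default=""
            ),
            # Deduplicate file systems while preserving order.
            "file_systems": list(
                dict.fromkeys(v["file_system"] for v in disk_volumes)
            ),
        }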
@@ -593,12 +423,6 @@ def _make_parser():  | 
593 | 423 |         help="Export AppleDouble resource forks from HFS-formatted disks",  | 
594 | 424 |         action="store_true",  | 
595 | 425 |     )  | 
596 |  | -    parser.add_argument(  | 
597 |  | -        "-c",  | 
598 |  | -        "--csv",  | 
599 |  | -        help="Write description CSV (old default) instead of ArchivesSpace XLSX",  | 
600 |  | -        action="store_true",  | 
601 |  | -    )  | 
602 | 426 |     parser.add_argument("--quiet", action="store_true", help="Write only errors to log")  | 
603 | 427 |     parser.add_argument(  | 
604 | 428 |         "source", help="Source directory containing disk images (and related files)"  | 
@@ -740,16 +564,10 @@ def main():  | 
740 | 564 |             )  | 
741 | 565 |  | 
742 | 566 |     # write description  | 
743 |  | -    if args.csv:  | 
744 |  | -        try:  | 
745 |  | -            create_spreadsheet(args, sips, volumes, logger)  | 
746 |  | -        except Exception as err:  | 
747 |  | -            logger.error(f"Error creating description csv: {err}")  | 
748 |  | -    else:  | 
749 |  | -        try:  | 
750 |  | -            create_aspace_excel_sheet(args, sips, volumes, logger)  | 
751 |  | -        except Exception as err:  | 
752 |  | -            logger.error(f"Error creating ArchivesSpace description xlsx: {err}")  | 
 | 567 | +    try:  | 
 | 568 | +        create_spreadsheet(args, sips, volumes, logger)  | 
 | 569 | +    except Exception as err:  | 
 | 570 | +        logger.error(f"Error creating description CSV: {err}")  | 
753 | 571 |  | 
754 | 572 |     # print unprocessed list  | 
755 | 573 |     if unprocessed:  | 
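For reference, the deleted XLSX path was built on openpyxl's load_workbook / append / save cycle. A standalone sketch of that pattern as the removed code used it (the sample row values are placeholders; the column indices match the removed INDEX_* constants):

    import openpyxl

    # Open the copied template, append one row to its "Data" sheet, and save.
    workbook = openpyxl.load_workbook(filename="description.xlsx")
    worksheet = workbook["Data"]

    row = [""] * 173           # one empty cell per template column
    row[6] = "disk-image-001"  # INDEX_FILENAME in the removed code
    row[8] = "File"            # INDEX_LEVEL_OF_DESCRIPTION
    worksheet.append(row)      # appended after the template's existing rows

    workbook.save(filename="description.xlsx")
    workbook.close()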
 | 