Commit 24de23e

download size estimate and parallel gzipping
1 parent 5f10fd8 commit 24de23e

File tree

4 files changed: +154 -77 lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
@@ -1,5 +1,10 @@
 # Changelog
 
+## 1.2.1
+
+- added parallel gzipping of assemblies in `bactopia-atb-formatter`
+- added size estimation to `bactopia-atb-formatter` output
+
 ## 1.2.0
 
 - added `bactopia-atb-downloader` to download All-the-Bacteria assemblies

bactopia/cli/atb/atb_downloader.py

Lines changed: 112 additions & 76 deletions
@@ -11,7 +11,14 @@
 import bactopia
 from bactopia.atb import parse_atb_file_list
 from bactopia.ncbi import is_biosample, taxid2name
-from bactopia.utils import download_url, execute, file_exists, mkdir, validate_file
+from bactopia.utils import (
+    download_url,
+    execute,
+    file_exists,
+    mkdir,
+    pgzip,
+    validate_file,
+)
 
 # Set up Rich
 stderr = rich.console.Console(stderr=True)
@@ -32,6 +39,7 @@
                "--atb-file-list-url",
                "--dry-run",
                "--progress",
+                "--cpus",
                "--uncompressed",
                "--remove-archives",
            ],
@@ -91,6 +99,11 @@
     is_flag=True,
     help="Show download progress bar",
 )
+@click.option(
+    "--cpus",
+    default=4,
+    help="The total number of cpus to use for downloading and compressing",
+)
 @click.option(
     "--uncompressed",
     "-u",
@@ -125,6 +138,7 @@ def atb_downloader(
     atb_file_list_url,
     dry_run,
     progress,
+    cpus,
     uncompressed,
     remove_archives,
     ncbi_api_key,
@@ -194,102 +208,124 @@ def atb_downloader(
         logging.error(f"Species not found in ATB file list: {query_species}")
         sys.exit(1)
     matched_samples = species[query_species]
+
+    # Estimate total size of downloads
+    total_size = 0
+    for archive, info in archives_to_download.items():
+        total_size += float(info["size"])
+
     logging.info(f"Found {len(matched_samples)} samples to extract")
     logging.debug(f"Samples: {matched_samples}")
-    logging.info(f"Found {len(archives_to_download)} archives to download")
+    logging.info(
+        f"Found {len(archives_to_download)} archives (~{int(total_size):,} MB) to download"
+    )
     if verbose:
         for archive in archives_to_download:
             logging.debug(f"Archive: {archive}")
 
     # Check if archives exist, otherwise download
-    logging.info(f"Downloading archives to: {outdir}/archives")
-    mkdir(f"{outdir}/archives")
-    for archive, info in archives_to_download.items():
-        archive_path = f"{str(outdir)}/archives/{archive}"
-        if file_exists(archive_path):
-            logging.info(f"Using existing archive: {archive_path}")
-            archive_path = validate_file(archive_path)
-        else:
-            logging.info(f"Downloading archive to: {archive_path}")
-            if dry_run:
-                logging.info(f"Would download: {info['url']} to {archive_path}")
+    if not dry_run:
+        logging.info(f"Downloading archives to: {outdir}/archives")
+        mkdir(f"{outdir}/archives")
+        for archive, info in archives_to_download.items():
+            archive_path = f"{str(outdir)}/archives/{archive}"
+            if file_exists(archive_path):
+                logging.info(f"Using existing archive: {archive_path}")
+                archive_path = validate_file(archive_path)
             else:
-                archive_path = download_url(info["url"], archive_path, progress)
+                logging.info(f"Downloading archive to: {archive_path}")
+                if dry_run:
+                    logging.info(f"Would download: {info['url']} to {archive_path}")
+                else:
+                    archive_path = download_url(info["url"], archive_path, progress)
 
-    # Extract each of the archives
-    cleanup = []
-    for archive, info in archives_to_download.items():
-        archive_path = f"{str(outdir)}/archives/{archive}"
-        if dry_run:
-            logging.info(f"Would have extracted: {archive_path}")
-        else:
-            logging.info(f"Extracting: {archive_path}")
-            stdout, stderr = execute(
-                f"tar xf {archive_path} -C {outdir}", capture=True, allow_fail=True
-            )
-            cleanup_dir = f"{outdir}/{archive.replace('.tar.xz', '')}"
-            logging.debug(f"Adding {cleanup_dir} to cleanup list")
-            cleanup.append(f"{outdir}/{archive.replace('.tar.xz', '')}")
+        # Extract each of the archives
+        cleanup = []
+        for archive, info in archives_to_download.items():
+            archive_path = f"{str(outdir)}/archives/{archive}"
+            if dry_run:
+                logging.info(f"Would have extracted: {archive_path}")
+            else:
+                logging.info(f"Extracting: {archive_path}")
+                stdout, stderr = execute(
+                    f"tar xf {archive_path} -C {outdir}", capture=True, allow_fail=True
+                )
+                cleanup_dir = f"{outdir}/{archive.replace('.tar.xz', '')}"
+                logging.debug(f"Adding {cleanup_dir} to cleanup list")
+                cleanup.append(f"{outdir}/{archive.replace('.tar.xz', '')}")
+    else:
+        logging.info("Would have downloaded and extracted archives")
 
     # Move samples into species directories, then compress
     species_dirs = {}
-    logging.info(f"Moving {len(matched_samples)} samples to: {outdir}")
-    for i, sample in enumerate(matched_samples):
-        if uncompressed:
-            logging.info(f"Moving sample {i+1} of {len(matched_samples)}: {sample}")
-        else:
-            logging.info(
-                f"Moving and compressing sample {i+1} of {len(matched_samples)}: {sample}"
-            )
-        info = samples[sample]
-        species = info["species_sylph"].lower().replace(" ", "_")
-        if species not in species_dirs:
-            species_dirs[species] = True
-            mkdir(f"{outdir}/{species}")
-
-        archive_file = f"{outdir}/{info['filename']}"
-        if file_exists(archive_file):
-            sample_filename = info["filename"].split("/")[-1]
-            sample_out = f"{outdir}/{species}/{sample_filename}"
+    needs_compression = []
+    if not dry_run:
+        logging.info(f"Moving {len(matched_samples)} samples to: {outdir}")
+        for i, sample in enumerate(matched_samples):
+            logging.debug(f"Moving sample {i+1} of {len(matched_samples)}: {sample}")
+            info = samples[sample]
+            species = info["species_sylph"].lower().replace(" ", "_")
+            if species not in species_dirs:
+                species_dirs[species] = True
+                mkdir(f"{outdir}/{species}")
 
+            archive_file = f"{outdir}/{info['filename']}"
             if file_exists(archive_file):
-                if (file_exists(sample_out) and not force) or ():
-                    logging.debug(
-                        f"Sample already exists: {sample_out}...skipping unless --force provided"
-                    )
-                elif file_exists(f"{sample_out}.gz") and not force:
-                    logging.debug(
-                        f"Sample already exists: {sample_out}.gz...skipping unless --force provided"
-                    )
-                else:
-                    logging.debug(f"Moving {archive_file} to {sample_out}")
-                    stdout, stderr = execute(
-                        f"mv {archive_file} {sample_out}", capture=True, allow_fail=True
-                    )
+                sample_filename = info["filename"].split("/")[-1]
+                sample_out = f"{outdir}/{species}/{sample_filename}"
 
-                    # Compress unless --uncompressed provided
-                    if not uncompressed:
-                        logging.debug(f"Compressing {sample_out}")
+                if file_exists(archive_file):
+                    if (file_exists(sample_out) and not force) or ():
+                        logging.debug(
+                            f"Sample already exists: {sample_out}...skipping unless --force provided"
+                        )
+
+                        # Compress unless --uncompressed provided
+                        if not uncompressed:
+                            needs_compression.append(sample_out)
+                    elif file_exists(f"{sample_out}.gz") and not force:
+                        logging.debug(
+                            f"Sample already exists: {sample_out}.gz...skipping unless --force provided"
+                        )
+                    else:
+                        logging.debug(f"Moving {archive_file} to {sample_out}")
                         stdout, stderr = execute(
-                            f"gzip --force {sample_out}", capture=True, allow_fail=True
+                            f"mv {archive_file} {sample_out}",
+                            capture=True,
+                            allow_fail=True,
                         )
+
+                        # Compress unless --uncompressed provided
+                        if not uncompressed:
+                            needs_compression.append(sample_out)
+                else:
+                    logging.warning(f"Unable to find {info['filename']}")
             else:
-                logging.warning(f"Unable to find {info['filename']}")
-        else:
-            logging.warning(f"{outdir}/{info['filename']}")
+                logging.warning(f"{outdir}/{info['filename']}")
+
+        # Compress samples
+        if len(needs_compression):
+            logging.info(f"Compressing {len(needs_compression)} samples")
+            pgzip(needs_compression, cpus)
 
-    # Cleanup
-    for archive in cleanup:
-        if file_exists(archive):
-            logging.info(f"Removing extracted files: {archive}")
-            stdout, stderr = execute(f"rm -rf {archive}", capture=True, allow_fail=True)
+        # Cleanup
+        for archive in cleanup:
+            if file_exists(archive):
+                logging.info(f"Removing extracted files: {archive}")
+                stdout, stderr = execute(
+                    f"rm -rf {archive}", capture=True, allow_fail=True
+                )
 
-    if remove_archives:
+        if remove_archives:
+            logging.info(
+                "Provided --remove-archives, removing all downloaded archives in {outdir}/archives"
+            )
+            stdout, stderr = execute(
+                f"rm -rf {outdir}/archives", capture=True, allow_fail=True
+            )
+    else:
         logging.info(
-            "Provided --remove-archives, removing all downloaded archives in {outdir}/archives"
-        )
-        stdout, stderr = execute(
-            f"rm -rf {outdir}/archives", capture=True, allow_fail=True
+            "Would have moved samples to species directories and cleaned up archives"
        )
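
A minimal sketch of the size estimate added above, assuming each entry in archives_to_download carries a "size" field expressed in megabytes, as the new log line suggests (the dictionary contents below are made-up placeholders; real entries are derived from the ATB file list):

    # Placeholder metadata; "size" is assumed to be megabytes
    archives_to_download = {
        "group_a.tar.xz": {"url": "https://example.com/group_a.tar.xz", "size": "1536.2"},
        "group_b.tar.xz": {"url": "https://example.com/group_b.tar.xz", "size": "812.5"},
    }

    # Sum the per-archive sizes before anything is fetched
    total_size = 0
    for archive, info in archives_to_download.items():
        total_size += float(info["size"])

    # Mirrors the new log line: "Found 2 archives (~2,348 MB) to download"
    print(f"Found {len(archives_to_download)} archives (~{int(total_size):,} MB) to download")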

bactopia/utils.py

Lines changed: 36 additions & 0 deletions
@@ -7,6 +7,7 @@
 import requests
 import tqdm
 from executor import ExternalCommand, ExternalCommandFailed
+from tqdm.contrib.concurrent import process_map
 
 NCBI_GENOME_SIZE_URL = (
     "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/species_genome_size.txt.gz"
@@ -47,6 +48,41 @@ def execute(
         return None
 
 
+def pgzip(files: list, cpus: int) -> list:
+    """
+    Parallel gzip a list of files
+
+    Args:
+        files (list): A list of files to gzip
+        cpus (int): The number of cpus to use
+
+    Returns:
+        list: A list of gzipped files
+    """
+    return process_map(
+        _gzip,
+        files,
+        max_workers=cpus,
+        chunksize=1,
+        bar_format="{l_bar}{bar:80}{r_bar}{bar:-80b}",
+        desc="Gzipping",
+    )
+
+
+def _gzip(filename: str) -> str:
+    """
+    Gzip a file
+
+    Args:
+        filename (str): The file to gzip
+
+    Returns:
+        str: The path to the gzipped file
+    """
+    stdout, stderr = execute(f"gzip --force {filename}", capture=True, allow_fail=True)
+    return f"{filename}.gz"
+
+
 def get_platform() -> str:
     """
     Get the platform of the executing machine
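
The new pgzip() helper fans individual gzip calls out over a process pool with tqdm's process_map, compressing up to `cpus` files at a time behind a shared progress bar. A hypothetical usage sketch (the file paths are placeholders and must exist on disk, since each worker shells out to `gzip --force`):

    from bactopia.utils import pgzip

    # Placeholder paths to uncompressed assemblies
    fastas = [
        "bactopia-atb/escherichia_coli/SAMN00000001.fa",
        "bactopia-atb/escherichia_coli/SAMN00000002.fa",
    ]

    if __name__ == "__main__":
        # Returns the expected output paths, e.g. ".../SAMN00000001.fa.gz"
        gzipped = pgzip(fastas, 4)

The `if __name__ == "__main__":` guard matters here because process_map spawns worker processes, which re-import the calling script on platforms that use the spawn start method.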

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "bactopia"
-version = "1.2.0"
+version = "1.2.1"
 description = "A Python package for working with Bactopia"
 authors = [
     "Robert A. Petit III <robbie.petit@gmail.com>",
