@@ -11,7 +11,14 @@
 import bactopia
 from bactopia.atb import parse_atb_file_list
 from bactopia.ncbi import is_biosample, taxid2name
-from bactopia.utils import download_url, execute, file_exists, mkdir, validate_file
+from bactopia.utils import (
+    download_url,
+    execute,
+    file_exists,
+    mkdir,
+    pgzip,
+    validate_file,
+)

 # Set up Rich
 stderr = rich.console.Console(stderr=True)
@@ -32,6 +39,7 @@
         "--atb-file-list-url",
         "--dry-run",
         "--progress",
+        "--cpus",
         "--uncompressed",
         "--remove-archives",
     ],
@@ -91,6 +99,11 @@
     is_flag=True,
     help="Show download progress bar",
 )
+@click.option(
+    "--cpus",
+    default=4,
+    help="The total number of cpus to use for downloading and compressing",
+)
 @click.option(
     "--uncompressed",
     "-u",
@@ -125,6 +138,7 @@ def atb_downloader(
     atb_file_list_url,
     dry_run,
     progress,
+    cpus,
     uncompressed,
     remove_archives,
     ncbi_api_key,
@@ -194,102 +208,118 @@ def atb_downloader(
         logging.error(f"Species not found in ATB file list: {query_species}")
         sys.exit(1)
     matched_samples = species[query_species]
+
+    # Estimate total size of downloads
+    total_size = 0
+    for archive, info in archives_to_download.items():
+        total_size += float(info["size"])
+
     logging.info(f"Found {len(matched_samples)} samples to extract")
     logging.debug(f"Samples: {matched_samples}")
-    logging.info(f"Found {len(archives_to_download)} archives to download")
+    logging.info(
+        f"Found {len(archives_to_download)} archives (~{int(total_size):,} MB) to download"
+    )
     if verbose:
         for archive in archives_to_download:
             logging.debug(f"Archive: {archive}")

     # Check if archives exist, otherwise download
-    logging.info(f"Downloading archives to: {outdir}/archives")
-    mkdir(f"{outdir}/archives")
-    for archive, info in archives_to_download.items():
-        archive_path = f"{str(outdir)}/archives/{archive}"
-        if file_exists(archive_path):
-            logging.info(f"Using existing archive: {archive_path}")
-            archive_path = validate_file(archive_path)
-        else:
-            logging.info(f"Downloading archive to: {archive_path}")
-            if dry_run:
-                logging.info(f"Would download: {info['url']} to {archive_path}")
+    if not dry_run:
+        logging.info(f"Downloading archives to: {outdir}/archives")
+        mkdir(f"{outdir}/archives")
+        for archive, info in archives_to_download.items():
+            archive_path = f"{str(outdir)}/archives/{archive}"
+            if file_exists(archive_path):
+                logging.info(f"Using existing archive: {archive_path}")
+                archive_path = validate_file(archive_path)
             else:
-                archive_path = download_url(info["url"], archive_path, progress)
+                logging.info(f"Downloading archive to: {archive_path}")
+                archive_path = download_url(info["url"], archive_path, progress)

-    # Extract each of the archives
-    cleanup = []
-    for archive, info in archives_to_download.items():
-        archive_path = f"{str(outdir)}/archives/{archive}"
-        if dry_run:
-            logging.info(f"Would have extracted: {archive_path}")
-        else:
-            logging.info(f"Extracting: {archive_path}")
-            stdout, stderr = execute(
-                f"tar xf {archive_path} -C {outdir}", capture=True, allow_fail=True
-            )
-            cleanup_dir = f"{outdir}/{archive.replace('.tar.xz', '')}"
-            logging.debug(f"Adding {cleanup_dir} to cleanup list")
-            cleanup.append(f"{outdir}/{archive.replace('.tar.xz', '')}")
+        # Extract each of the archives
+        cleanup = []
+        for archive, info in archives_to_download.items():
+            archive_path = f"{str(outdir)}/archives/{archive}"
+            logging.info(f"Extracting: {archive_path}")
+            stdout, stderr = execute(
+                f"tar xf {archive_path} -C {outdir}", capture=True, allow_fail=True
+            )
+            cleanup_dir = f"{outdir}/{archive.replace('.tar.xz', '')}"
+            logging.debug(f"Adding {cleanup_dir} to cleanup list")
+            cleanup.append(cleanup_dir)
+    else:
+        logging.info("Would have downloaded and extracted archives")

     # Move samples into species directories, then compress
     species_dirs = {}
-    logging.info(f"Moving {len(matched_samples)} samples to: {outdir}")
-    for i, sample in enumerate(matched_samples):
-        if uncompressed:
-            logging.info(f"Moving sample {i+1} of {len(matched_samples)}: {sample}")
-        else:
-            logging.info(
-                f"Moving and compressing sample {i+1} of {len(matched_samples)}: {sample}"
-            )
-        info = samples[sample]
-        species = info["species_sylph"].lower().replace(" ", "_")
-        if species not in species_dirs:
-            species_dirs[species] = True
-            mkdir(f"{outdir}/{species}")
-
-        archive_file = f"{outdir}/{info['filename']}"
-        if file_exists(archive_file):
-            sample_filename = info["filename"].split("/")[-1]
-            sample_out = f"{outdir}/{species}/{sample_filename}"
+    needs_compression = []
+    if not dry_run:
+        logging.info(f"Moving {len(matched_samples)} samples to: {outdir}")
+        for i, sample in enumerate(matched_samples):
+            logging.debug(f"Moving sample {i+1} of {len(matched_samples)}: {sample}")
+            info = samples[sample]
+            species = info["species_sylph"].lower().replace(" ", "_")
+            if species not in species_dirs:
+                species_dirs[species] = True
+                mkdir(f"{outdir}/{species}")

+            archive_file = f"{outdir}/{info['filename']}"
             if file_exists(archive_file):
-                if (file_exists(sample_out) and not force) or ():
-                    logging.debug(
-                        f"Sample already exists: {sample_out}...skipping unless --force provided"
-                    )
-                elif file_exists(f"{sample_out}.gz") and not force:
-                    logging.debug(
-                        f"Sample already exists: {sample_out}.gz...skipping unless --force provided"
-                    )
-                else:
-                    logging.debug(f"Moving {archive_file} to {sample_out}")
-                    stdout, stderr = execute(
-                        f"mv {archive_file} {sample_out}", capture=True, allow_fail=True
-                    )
+                sample_filename = info["filename"].split("/")[-1]
+                sample_out = f"{outdir}/{species}/{sample_filename}"

-                    # Compress unless --uncompressed provided
-                    if not uncompressed:
-                        logging.debug(f"Compressing {sample_out}")
+                if file_exists(archive_file):
+                    if file_exists(sample_out) and not force:
+                        logging.debug(
+                            f"Sample already exists: {sample_out}...skipping unless --force provided"
+                        )
+
+                        # Compress unless --uncompressed provided
+                        if not uncompressed:
+                            needs_compression.append(sample_out)
+                    elif file_exists(f"{sample_out}.gz") and not force:
+                        logging.debug(
+                            f"Sample already exists: {sample_out}.gz...skipping unless --force provided"
+                        )
+                    else:
+                        logging.debug(f"Moving {archive_file} to {sample_out}")
                         stdout, stderr = execute(
-                            f"gzip --force {sample_out}", capture=True, allow_fail=True
+                            f"mv {archive_file} {sample_out}",
+                            capture=True,
+                            allow_fail=True,
                         )
+
+                        # Compress unless --uncompressed provided
+                        if not uncompressed:
+                            needs_compression.append(sample_out)
+                else:
+                    logging.warning(f"Unable to find {info['filename']}")
             else:
-                logging.warning(f"Unable to find {info['filename']}")
-        else:
-            logging.warning(f"{outdir}/{info['filename']}")
+                logging.warning(f"Unable to find {outdir}/{info['filename']}")
+
+        # Compress samples
+        if len(needs_compression):
+            logging.info(f"Compressing {len(needs_compression)} samples")
+            pgzip(needs_compression, cpus)

-    # Cleanup
-    for archive in cleanup:
-        if file_exists(archive):
-            logging.info(f"Removing extracted files: {archive}")
-            stdout, stderr = execute(f"rm -rf {archive}", capture=True, allow_fail=True)
+        # Cleanup
+        for archive in cleanup:
+            if file_exists(archive):
+                logging.info(f"Removing extracted files: {archive}")
+                stdout, stderr = execute(
+                    f"rm -rf {archive}", capture=True, allow_fail=True
+                )

-    if remove_archives:
+        if remove_archives:
+            logging.info(
+                f"Provided --remove-archives, removing all downloaded archives in {outdir}/archives"
+            )
+            stdout, stderr = execute(
+                f"rm -rf {outdir}/archives", capture=True, allow_fail=True
+            )
+    else:
         logging.info(
-            "Provided --remove-archives, removing all downloaded archives in {outdir}/archives"
-        )
-        stdout, stderr = execute(
-            f"rm -rf {outdir}/archives", capture=True, allow_fail=True
+            "Would have moved samples to species directories and cleaned up archives"
         )

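
The commit replaces the old per-sample "gzip --force" call with a single batched call to a new pgzip helper imported from bactopia.utils, driven by the new --cpus option. The helper's implementation is not shown in this diff, so the following is only a minimal sketch, assuming pgzip(files, cpus) gzips a list of files across a pool of worker processes:

# Hypothetical sketch only -- the real bactopia.utils.pgzip is not shown
# in this diff, so the name, signature, and behavior here are assumptions.
import gzip
import shutil
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


def _gzip_file(path: str) -> str:
    """Compress `path` to `path.gz`, then remove the original (like gzip --force)."""
    gz_path = f"{path}.gz"
    with open(path, "rb") as src, gzip.open(gz_path, "wb") as dst:
        shutil.copyfileobj(src, dst)
    Path(path).unlink()
    return gz_path


def pgzip(files, cpus=4):
    """Gzip each file in `files` using up to `cpus` worker processes."""
    with ProcessPoolExecutor(max_workers=cpus) as pool:
        # list() drains the iterator so any worker exception is raised here
        return list(pool.map(_gzip_file, files))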
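Under that sketch, the batched call in the diff, pgzip(needs_compression, cpus), would compress every staged sample at once instead of shelling out to gzip one file at a time. Process-based workers are assumed here because gzip compression is CPU-bound, which also matches the new option's help text covering both downloading and compressing.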