From 3abe21f2536d286783b400a97a3f48c927775d8d Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Mon, 24 Jun 2024 14:47:32 -0400 Subject: [PATCH] unzip submission summary before join --- v03_pipeline/lib/reference_data/clinvar.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/v03_pipeline/lib/reference_data/clinvar.py b/v03_pipeline/lib/reference_data/clinvar.py index 405f44ab5..46d06708f 100644 --- a/v03_pipeline/lib/reference_data/clinvar.py +++ b/v03_pipeline/lib/reference_data/clinvar.py @@ -1,5 +1,6 @@ import gzip import os +import shutil import subprocess import tempfile import urllib @@ -176,14 +177,21 @@ def download_and_import_clinvar_submission_summary() -> hl.Table: with tempfile.NamedTemporaryFile( suffix='.txt.gz', delete=False, - ) as tmp_file: + ) as tmp_file, tempfile.NamedTemporaryFile( + suffix='.txt', + delete=False, + ) as unzipped_tmp_file: urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, tmp_file.name) # noqa: S310 + # Unzip the gzipped file first to fix gzip files being read by hail with single partition + with gzip.open(tmp_file.name, 'rb') as f_in, open(unzipped_tmp_file.name, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) + gcs_tmp_file_name = os.path.join( Env.HAIL_TMPDIR, - os.path.basename(tmp_file.name), + os.path.basename(unzipped_tmp_file.name), ) - safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name) - ht = hl.import_table( + safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name) + return hl.import_table( gcs_tmp_file_name, force=True, filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line @@ -193,7 +201,5 @@ def download_and_import_clinvar_submission_summary() -> hl.Table: 'ReportedPhenotypeInfo': hl.tstr, }, missing='-', + min_partitions=MIN_HT_PARTITIONS, ) - # NB: min_partitions fails with force=True during `import_table`, but - # an immediate repartition here works. - return ht.repartition(MIN_HT_PARTITIONS)