Skip to content

Commit 73b2a68

Browse files
authored
unzip submission summary before join (#816)
1 parent 7a9094c commit 73b2a68

File tree

1 file changed

+13
-7
lines changed

1 file changed

+13
-7
lines changed

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 13 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import gzip
22
import os
3+
import shutil
34
import subprocess
45
import tempfile
56
import urllib
@@ -176,14 +177,21 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
176177
with tempfile.NamedTemporaryFile(
177178
suffix='.txt.gz',
178179
delete=False,
179-
) as tmp_file:
180+
) as tmp_file, tempfile.NamedTemporaryFile(
181+
suffix='.txt',
182+
delete=False,
183+
) as unzipped_tmp_file:
180184
urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, tmp_file.name) # noqa: S310
185+
# Unzip the gzipped file first: Hail reads a gzip-compressed file into a single partition, so import the uncompressed copy instead
186+
with gzip.open(tmp_file.name, 'rb') as f_in, open(unzipped_tmp_file.name, 'wb') as f_out:
187+
shutil.copyfileobj(f_in, f_out)
188+
181189
gcs_tmp_file_name = os.path.join(
182190
Env.HAIL_TMPDIR,
183-
os.path.basename(tmp_file.name),
191+
os.path.basename(unzipped_tmp_file.name),
184192
)
185-
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
186-
ht = hl.import_table(
193+
safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name)
194+
return hl.import_table(
187195
gcs_tmp_file_name,
188196
force=True,
189197
filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line
@@ -193,7 +201,5 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
193201
'ReportedPhenotypeInfo': hl.tstr,
194202
},
195203
missing='-',
204+
min_partitions=MIN_HT_PARTITIONS,
196205
)
197-
# NB: min_partitions fails with force=True during `import_table`, but
198-
# an immediate repartition here works.
199-
return ht.repartition(MIN_HT_PARTITIONS)

0 commit comments

Comments
 (0)