1
1
import gzip
2
2
import os
3
+ import shutil
3
4
import subprocess
4
5
import tempfile
5
6
import urllib
@@ -176,14 +177,21 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
176
177
with tempfile .NamedTemporaryFile (
177
178
suffix = '.txt.gz' ,
178
179
delete = False ,
179
- ) as tmp_file :
180
+ ) as tmp_file , tempfile .NamedTemporaryFile (
181
+ suffix = '.txt' ,
182
+ delete = False ,
183
+ ) as unzipped_tmp_file :
180
184
urllib .request .urlretrieve (CLINVAR_SUBMISSION_SUMMARY_URL , tmp_file .name ) # noqa: S310
185
+ # Decompress the gzipped download first: hail reads a gzip file as a single partition, so the plain-text copy is imported instead.
186
+ with gzip .open (tmp_file .name , 'rb' ) as f_in , open (unzipped_tmp_file .name , 'wb' ) as f_out :
187
+ shutil .copyfileobj (f_in , f_out )
188
+
181
189
gcs_tmp_file_name = os .path .join (
182
190
Env .HAIL_TMPDIR ,
183
- os .path .basename (tmp_file .name ),
191
+ os .path .basename (unzipped_tmp_file .name ),
184
192
)
185
- safely_move_to_gcs (tmp_file .name , gcs_tmp_file_name )
186
- ht = hl .import_table (
193
+ safely_move_to_gcs (unzipped_tmp_file .name , gcs_tmp_file_name )
194
+ return hl .import_table (
187
195
gcs_tmp_file_name ,
188
196
force = True ,
189
197
filter = '^(#[^:]*:|^##).*$' , # removes all comments except for the header line
@@ -193,7 +201,5 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
193
201
'ReportedPhenotypeInfo' : hl .tstr ,
194
202
},
195
203
missing = '-' ,
204
+ min_partitions = MIN_HT_PARTITIONS ,
196
205
)
197
- # NB: min_partitions fails with force=True during `import_table`, but
198
- # an immediate repartition here works.
199
- return ht .repartition (MIN_HT_PARTITIONS )
0 commit comments