Skip to content

Commit e5290ed

Browse files
authored
use_new_shuffle=None (#818)
* use_new_shuffle=None
* try
* lint
1 parent f6346e4 commit e5290ed

File tree

2 files changed

+7
-12
lines changed

2 files changed

+7
-12
lines changed

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 3 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,5 @@
11
import gzip
22
import os
3-
import shutil
43
import subprocess
54
import tempfile
65
import urllib
@@ -166,7 +165,6 @@ def join_to_submission_summary_ht(vcf_ht: hl.Table) -> hl.Table:
166165
Submitters=hl.agg.collect(ht.Submitter),
167166
Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo),
168167
)
169-
ht = ht.key_by('VariationID')
170168
return vcf_ht.annotate(
171169
submitters=ht[vcf_ht.rsid].Submitters,
172170
conditions=ht[vcf_ht.rsid].Conditions,
@@ -177,20 +175,13 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
177175
with tempfile.NamedTemporaryFile(
178176
suffix='.txt.gz',
179177
delete=False,
180-
) as tmp_file, tempfile.NamedTemporaryFile(
181-
suffix='.txt',
182-
delete=False,
183-
) as unzipped_tmp_file:
178+
) as tmp_file:
184179
urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, tmp_file.name) # noqa: S310
185-
# Unzip the gzipped file first to fix gzip files being read by hail with single partition
186-
with gzip.open(tmp_file.name, 'rb') as f_in, open(unzipped_tmp_file.name, 'wb') as f_out:
187-
shutil.copyfileobj(f_in, f_out)
188-
189180
gcs_tmp_file_name = os.path.join(
190181
Env.HAIL_TMPDIR,
191-
os.path.basename(unzipped_tmp_file.name),
182+
os.path.basename(tmp_file.name),
192183
)
193-
safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name)
184+
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
194185
return hl.import_table(
195186
gcs_tmp_file_name,
196187
force=True,

v03_pipeline/lib/reference_data/dataset_table_operations.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,10 @@ def update_or_create_joined_ht(
3636
continue
3737

3838
# Join the new one!
39+
hl._set_flags(use_new_shuffle=None, no_whole_stage_codegen='1') # noqa: SLF001
3940
dataset_ht = get_dataset_ht(dataset, reference_genome)
4041
dataset_ht, _ = checkpoint(dataset_ht)
42+
hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') # noqa: SLF001
4143
joined_ht = joined_ht.join(dataset_ht, 'outer')
4244
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
4345

@@ -214,8 +216,10 @@ def join_hts(
214216
),
215217
)
216218
for dataset in reference_dataset_collection.datasets(dataset_type):
219+
hl._set_flags(use_new_shuffle=None, no_whole_stage_codegen='1') # noqa: SLF001
217220
dataset_ht = get_dataset_ht(dataset, reference_genome)
218221
dataset_ht, _ = checkpoint(dataset_ht)
222+
hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') # noqa: SLF001
219223
joined_ht = joined_ht.join(dataset_ht, 'outer')
220224
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
221225
return joined_ht

0 commit comments

Comments
 (0)