merge

bpblanken · bpblanken · commit 0c5d8fc81d49 · 2024-06-24T17:15:09.000-04:00
diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py
@@ -98,13 +98,22 @@ def remove_family_guids(
     family_guids: hl.SetExpression,
 ) -> hl.Table:
     # Remove families from the existing project table structure (both the entries arrays and the globals are mutated)
-    family_indexes_to_keep = hl.array(
-        hl.enumerate(ht.globals.family_guids)
-        .filter(lambda item: ~family_guids.contains(item[1]))
-        .map(lambda item: item[0]),
+    family_indexes_to_keep = hl.eval(
+        hl.array(
+            hl.enumerate(ht.globals.family_guids)
+            .filter(lambda item: ~family_guids.contains(item[1]))
+            .map(lambda item: item[0]),
+        ),
     )
     ht = ht.annotate(
-        family_entries=family_indexes_to_keep.map(lambda i: ht.family_entries[i]),
+        # NB: this "should" work without the extra if statement (and does in the tests);
+        # however, experiments on Dataproc showed this statement hanging when the
+        # indexes array was empty and left unevaluated.
+        family_entries=hl.array(family_indexes_to_keep).map(
+            lambda i: ht.family_entries[i],
+        )
+        if len(family_indexes_to_keep) > 0
+        else hl.empty_array(ht.family_entries.dtype.element_type),
     )
     ht = ht.filter(hl.any(ht.family_entries.map(hl.is_defined)))
     return ht.annotate_globals(
diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py
@@ -128,22 +128,23 @@ def remove_project(
     existing_project_guids = hl.eval(ht.globals.project_guids)
     if project_guid not in existing_project_guids:
         return ht
-    project_indexes_to_keep = (
+    project_indexes_to_keep = hl.eval(
         hl.enumerate(existing_project_guids)
         .filter(lambda item: item[1] != project_guid)
-        .map(lambda item: item[0])
+        .map(lambda item: item[0]),
     )
     ht = ht.annotate(
         project_stats=(
-            project_indexes_to_keep.map(
-                lambda i: ht.project_stats[i],
-            )
+            # See "remove_family_guids" func for why this was necessary
+            hl.array(project_indexes_to_keep).map(lambda i: ht.project_stats[i])
+            if len(project_indexes_to_keep) > 0
+            else hl.empty_array(ht.project_stats.dtype.element_type)
         ),
     )
     ht = ht.filter(hl.any(ht.project_stats.map(hl.is_defined)))
     return ht.annotate_globals(
-        project_guids=project_indexes_to_keep.map(
-            lambda i: ht.project_guids[i],
+        project_guids=ht.project_guids.filter(
+            lambda p: p != project_guid,
         ),
         project_families=hl.dict(
             ht.project_families.items().filter(lambda item: item[0] != project_guid),
diff --git a/v03_pipeline/lib/reference_data/clinvar.py b/v03_pipeline/lib/reference_data/clinvar.py
@@ -1,6 +1,5 @@
 import gzip
 import os
-import shutil
 import subprocess
 import tempfile
 import urllib
@@ -166,7 +165,6 @@ def join_to_submission_summary_ht(vcf_ht: hl.Table) -> hl.Table:
         Submitters=hl.agg.collect(ht.Submitter),
         Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo),
     )
-    ht = ht.key_by('VariationID')
     return vcf_ht.annotate(
         submitters=ht[vcf_ht.rsid].Submitters,
         conditions=ht[vcf_ht.rsid].Conditions,
@@ -177,23 +175,13 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
     with tempfile.NamedTemporaryFile(
         suffix='.txt.gz',
         delete=False,
-    ) as tmp_file, tempfile.NamedTemporaryFile(
-        suffix='.txt',
-        delete=False,
-    ) as unzipped_tmp_file:
+    ) as tmp_file:
         urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, tmp_file.name)  # noqa: S310
-        # Unzip the gzipped file first to fix gzip files being read by hail with single partition
-        with gzip.open(tmp_file.name, 'rb') as f_in, open(
-            unzipped_tmp_file.name,
-            'wb',
-        ) as f_out:
-            shutil.copyfileobj(f_in, f_out)
-
         gcs_tmp_file_name = os.path.join(
             Env.HAIL_TMPDIR,
-            os.path.basename(unzipped_tmp_file.name),
+            os.path.basename(tmp_file.name),
         )
-        safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name)
+        safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
         return hl.import_table(
             gcs_tmp_file_name,
             force=True,
diff --git a/v03_pipeline/lib/reference_data/dataset_table_operations.py b/v03_pipeline/lib/reference_data/dataset_table_operations.py
@@ -36,8 +36,10 @@ def update_or_create_joined_ht(
             continue
 
         # Join the new one!
+        hl._set_flags(use_new_shuffle=None, no_whole_stage_codegen='1')  # noqa: SLF001
         dataset_ht = get_dataset_ht(dataset, reference_genome)
         dataset_ht, _ = checkpoint(dataset_ht)
+        hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1')  # noqa: SLF001
         joined_ht = joined_ht.join(dataset_ht, 'outer')
         joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
 
@@ -214,8 +216,10 @@ def join_hts(
         ),
     )
     for dataset in reference_dataset_collection.datasets(dataset_type):
+        hl._set_flags(use_new_shuffle=None, no_whole_stage_codegen='1')  # noqa: SLF001
         dataset_ht = get_dataset_ht(dataset, reference_genome)
         dataset_ht, _ = checkpoint(dataset_ht)
+        hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1')  # noqa: SLF001
         joined_ht = joined_ht.join(dataset_ht, 'outer')
         joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
     return joined_ht