Merge pull request #910 from broadinstitute/dev

bpblanken · web-flow · commit 5ffa3e31c9b6 · 2024-09-27T13:59:06.000-04:00
Dev
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -36,7 +36,6 @@ jobs:
         run: ruff . --output-format github
       - name: Unit Tests
         run: |
-          export HAIL_TMP_DIR=/tmp
           export GRCH37_TO_GRCH38_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz
           export GRCH38_TO_GRCH37_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz
           export ACCESS_PRIVATE_REFERENCE_DATASETS=1
diff --git a/v03_pipeline/lib/misc/callsets.py b/v03_pipeline/lib/misc/callsets.py
@@ -30,7 +30,7 @@ def get_callset_ht(
     return callset_ht.distinct()
 
 
-def additional_row_fields(
+def get_additional_row_fields(
     mt: hl.MatrixTable,
     dataset_type: DatasetType,
     skip_check_sex_and_relatedness: bool,
diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
@@ -1,6 +1,19 @@
+from typing import Any
+
 import hail as hl
 
-from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
+from v03_pipeline.lib.model import (
+    CachedReferenceDatasetQuery,
+    DatasetType,
+    Env,
+    ReferenceGenome,
+    SampleType,
+    Sex,
+)
+from v03_pipeline.lib.paths import (
+    cached_reference_dataset_query_path,
+    sex_check_table_path,
+)
 
 AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
 MIN_ROWS_PER_CONTIG = 100
@@ -11,9 +24,40 @@ class SeqrValidationError(Exception):
     pass
 
 
+def get_validation_dependencies(
+    dataset_type: DatasetType,
+    reference_genome: ReferenceGenome,
+    callset_path: str,
+    skip_check_sex_and_relatedness: bool,
+    **_: Any,
+) -> dict[str, hl.Table]:
+    deps = {}
+    deps['coding_and_noncoding_variants_ht'] = hl.read_table(
+        cached_reference_dataset_query_path(
+            reference_genome,
+            dataset_type,
+            CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
+        ),
+    )
+    if (
+        Env.CHECK_SEX_AND_RELATEDNESS
+        and dataset_type.check_sex_and_relatedness
+        and not skip_check_sex_and_relatedness
+    ):
+        deps['sex_check_ht'] = hl.read_table(
+            sex_check_table_path(
+                reference_genome,
+                dataset_type,
+                callset_path,
+            ),
+        )
+    return deps
+
+
 def validate_allele_type(
     mt: hl.MatrixTable,
     dataset_type: DatasetType,
+    **_: Any,
 ) -> None:
     ht = mt.rows()
     ht = ht.filter(
@@ -31,6 +75,7 @@ def validate_allele_type(
 
 def validate_no_duplicate_variants(
     mt: hl.MatrixTable,
+    **_: Any,
 ) -> None:
     ht = mt.rows()
     ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
@@ -44,6 +89,7 @@ def validate_expected_contig_frequency(
     mt: hl.MatrixTable,
     reference_genome: ReferenceGenome,
     min_rows_per_contig: int = MIN_ROWS_PER_CONTIG,
+    **_: Any,
 ) -> None:
     rows_per_contig = mt.aggregate_rows(hl.agg.counter(mt.locus.contig))
     missing_contigs = (
@@ -69,6 +115,7 @@ def validate_imported_field_types(
     mt: hl.MatrixTable,
     dataset_type: DatasetType,
     additional_row_fields: dict[str, hl.expr.types.HailType | set],
+    **_: Any,
 ) -> None:
     def _validate_field(
         mt_schema: hl.StructExpression,
@@ -104,8 +151,12 @@ def _validate_field(
 
 def validate_imputed_sex_ploidy(
     mt: hl.MatrixTable,
-    sex_check_ht: hl.Table,
+    # NB: sex_check_ht will be undefined if sex checking is disabled for the run
+    sex_check_ht: hl.Table | None = None,
+    **_: Any,
 ) -> None:
+    if not sex_check_ht:
+        return
     mt = mt.select_cols(
         discrepant=(
             (
@@ -132,6 +183,7 @@ def validate_sample_type(
     reference_genome: ReferenceGenome,
     sample_type: SampleType,
     sample_type_match_threshold: float = SAMPLE_TYPE_MATCH_THRESHOLD,
+    **_: Any,
 ) -> None:
     coding_variants_ht = coding_and_noncoding_variants_ht.filter(
         coding_and_noncoding_variants_ht.coding,
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
@@ -1,4 +1,5 @@
 import unittest
+from unittest.mock import Mock, patch
 
 import hail as hl
 
@@ -80,7 +81,9 @@ def test_validate_allele_type(self) -> None:
             DatasetType.SNV_INDEL,
         )
 
-    def test_validate_imputed_sex_ploidy(self) -> None:
+    @patch('v03_pipeline.lib.misc.validation.Env')
+    def test_validate_imputed_sex_ploidy(self, mock_env: Mock) -> None:
+        mock_env.CHECK_SEX_AND_RELATEDNESS = True
         sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
         mt = hl.MatrixTable.from_parts(
             rows={
diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py
@@ -16,6 +16,7 @@ class BaseLoadingRunParams(luigi.Task):
     # - The "Loading Pipeline" params are shared with
     # tasks that may remove data from or change the
     # structure of the persisted Hail Tables.
+    run_id = luigi.Parameter()
     sample_type = luigi.EnumParameter(enum=SampleType)
     callset_path = luigi.Parameter()
     ignore_missing_samples_when_remapping = luigi.BoolParameter(
diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py
@@ -24,7 +24,6 @@ class UpdateLookupTableTask(BaseUpdateLookupTableTask):
     project_guids = luigi.ListParameter()
     project_remap_paths = luigi.ListParameter()
     project_pedigree_paths = luigi.ListParameter()
-    run_id = luigi.Parameter()
 
     def complete(self) -> bool:
         return super().complete() and hl.eval(
diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py
@@ -13,13 +13,16 @@
     'v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv'
 )
 
+TEST_RUN_ID = 'manual__2024-04-03'
+
 
 class UpdateProjectTableTaskTest(MockedDatarootTestCase):
     def test_update_project_table_task(self) -> None:
         worker = luigi.worker.Worker()
         upt_task = UpdateProjectTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
+            run_id=TEST_RUN_ID,
             sample_type=SampleType.WGS,
             callset_path=TEST_VCF,
             project_guid='R0113_test_project',
@@ -128,6 +131,7 @@ def test_update_project_table_task_different_pedigree(self) -> None:
         upt_task = UpdateProjectTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
+            run_id=TEST_RUN_ID,
             sample_type=SampleType.WGS,
             callset_path=TEST_VCF,
             project_guid='R0113_test_project',
@@ -140,6 +144,7 @@ def test_update_project_table_task_different_pedigree(self) -> None:
         upt_task = UpdateProjectTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
+            run_id=TEST_RUN_ID,
             sample_type=SampleType.WGS,
             callset_path=TEST_VCF,
             project_guid='R0113_test_project',
diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py
@@ -23,7 +23,6 @@ class UpdateVariantAnnotationsTableWithNewSamplesTask(
     project_guids = luigi.ListParameter()
     project_remap_paths = luigi.ListParameter()
     project_pedigree_paths = luigi.ListParameter()
-    run_id = luigi.Parameter()
 
     def requires(self) -> list[luigi.Task]:
         return [
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
@@ -2,21 +2,18 @@
 import luigi
 import luigi.util
 
-from v03_pipeline.lib.misc.callsets import additional_row_fields
 from v03_pipeline.lib.misc.validation import (
+    get_validation_dependencies,
     validate_allele_type,
     validate_expected_contig_frequency,
-    validate_imported_field_types,
     validate_imputed_sex_ploidy,
     validate_no_duplicate_variants,
     validate_sample_type,
 )
 from v03_pipeline.lib.model import CachedReferenceDatasetQuery
 from v03_pipeline.lib.model.environment import Env
 from v03_pipeline.lib.paths import (
-    cached_reference_dataset_query_path,
     imported_callset_path,
-    sex_check_table_path,
 )
 from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
@@ -63,8 +60,8 @@ def requires(self) -> list[luigi.Task]:
             ]
         if (
             Env.CHECK_SEX_AND_RELATEDNESS
-            and not self.skip_check_sex_and_relatedness
             and self.dataset_type.check_sex_and_relatedness
+            and not self.skip_check_sex_and_relatedness
         ):
             requirements = [
                 *requirements,
@@ -83,17 +80,6 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
                 self.callset_path,
             ),
         )
-        # This validation isn't override-able.  If a field is the wrong
-        # type, the pipeline will likely hard-fail downstream.
-        validate_imported_field_types(
-            mt,
-            self.dataset_type,
-            additional_row_fields(
-                mt,
-                self.dataset_type,
-                self.skip_check_sex_and_relatedness,
-            ),
-        )
         if self.dataset_type.can_run_validation:
             # Rather than throwing an error, we silently remove invalid contigs.
             # This happens fairly often for AnVIL requests.
@@ -104,38 +90,34 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
             )
 
         if not self.skip_validation and self.dataset_type.can_run_validation:
-            validate_allele_type(mt, self.dataset_type)
-            validate_no_duplicate_variants(mt)
-            validate_expected_contig_frequency(mt, self.reference_genome)
-            coding_and_noncoding_ht = hl.read_table(
-                cached_reference_dataset_query_path(
-                    self.reference_genome,
-                    self.dataset_type,
-                    CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
-                ),
+            validation_dependencies = get_validation_dependencies(
+                **self.param_kwargs,
+            )
+            validate_allele_type(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
+            )
+            validate_no_duplicate_variants(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
+            )
+            validate_expected_contig_frequency(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
             )
             validate_sample_type(
                 mt,
-                coding_and_noncoding_ht,
-                self.reference_genome,
-                self.sample_type,
+                **self.param_kwargs,
+                **validation_dependencies,
+            )
+            validate_imputed_sex_ploidy(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
             )
-            if (
-                Env.CHECK_SEX_AND_RELATEDNESS
-                and not self.skip_check_sex_and_relatedness
-                and self.dataset_type.check_sex_and_relatedness
-            ):
-                sex_check_ht = hl.read_table(
-                    sex_check_table_path(
-                        self.reference_genome,
-                        self.dataset_type,
-                        self.callset_path,
-                    ),
-                )
-                validate_imputed_sex_ploidy(
-                    mt,
-                    sex_check_ht,
-                )
         return mt.select_globals(
             callset_path=self.callset_path,
             validated_sample_type=self.sample_type.value,
diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py
@@ -12,13 +12,16 @@
 TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
 TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv'
 
+TEST_RUN_ID = 'manual__2024-04-03'
+
 
 class WriteFamilyTableTaskTest(MockedDatarootTestCase):
     def test_snv_write_family_table_task(self) -> None:
         worker = luigi.worker.Worker()
         wft_task = WriteFamilyTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
+            run_id=TEST_RUN_ID,
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guid='R0113_test_project',
@@ -156,6 +159,7 @@ def test_sv_write_family_table_task(self) -> None:
         write_family_table_task = WriteFamilyTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SV,
+            run_id=TEST_RUN_ID,
             sample_type=SampleType.WGS,
             callset_path=TEST_SV_VCF,
             project_guid='R0115_test_project2',
@@ -408,6 +412,7 @@ def test_gcnv_write_family_table_task(self) -> None:
         write_family_table_task = WriteFamilyTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.GCNV,
+            run_id=TEST_RUN_ID,
             sample_type=SampleType.WES,
             callset_path=TEST_GCNV_BED_FILE,
             project_guid='R0115_test_project2',
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -2,13 +2,16 @@
 import luigi
 import luigi.util
 
-from v03_pipeline.lib.misc.callsets import additional_row_fields
+from v03_pipeline.lib.misc.callsets import get_additional_row_fields
 from v03_pipeline.lib.misc.io import (
     import_callset,
     import_vcf,
     select_relevant_fields,
     split_multi_hts,
 )
+from v03_pipeline.lib.misc.validation import (
+    validate_imported_field_types,
+)
 from v03_pipeline.lib.misc.vets import annotate_vets
 from v03_pipeline.lib.model.environment import Env
 from v03_pipeline.lib.paths import (
@@ -79,14 +82,22 @@ def create_table(self) -> hl.MatrixTable:
             )
             filters_ht = import_vcf(filters_path, self.reference_genome).rows()
             mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
+        additional_row_fields = get_additional_row_fields(
+            mt,
+            self.dataset_type,
+            self.skip_check_sex_and_relatedness,
+        )
         mt = select_relevant_fields(
             mt,
             self.dataset_type,
-            additional_row_fields(
-                mt,
-                self.dataset_type,
-                self.skip_check_sex_and_relatedness,
-            ),
+            additional_row_fields,
+        )
+        # This validation isn't override-able by the skip option.
+        # If a field is the wrong type, the pipeline will likely hard-fail downstream.
+        validate_imported_field_types(
+            mt,
+            self.dataset_type,
+            additional_row_fields,
         )
         if self.dataset_type.has_multi_allelic_variants:
             mt = split_multi_hts(mt)
diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py
@@ -17,7 +17,6 @@ class WriteMetadataForRunTask(luigi.Task):
     project_guids = luigi.ListParameter()
     project_remap_paths = luigi.ListParameter()
     project_pedigree_paths = luigi.ListParameter()
-    run_id = luigi.Parameter()
 
     def output(self) -> luigi.Target:
         return GCSorLocalTarget(
diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py
diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py
diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py