Validation refactoring (#904)

bpblanken · web-flow · commit 2bef2ff2a9ae · 2024-09-23T17:31:22.000-04:00
* make run id shared

* A fix

* another test

* A few more

* Few more

* Unnecessary

* ruf

* run id

* string

* unused

* missed one

* Fix it correctly

* missed one

* Last one!

* Validation refactoring

* Fix it

* case this function better

* missing comma

* change arg name

* Moving

* PR comments
diff --git a/v03_pipeline/lib/misc/callsets.py b/v03_pipeline/lib/misc/callsets.py
@@ -30,7 +30,7 @@ def get_callset_ht(
     return callset_ht.distinct()
 
 
-def additional_row_fields(
+def get_additional_row_fields(
     mt: hl.MatrixTable,
     dataset_type: DatasetType,
     skip_check_sex_and_relatedness: bool,
diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
@@ -1,6 +1,19 @@
+from typing import Any
+
 import hail as hl
 
-from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
+from v03_pipeline.lib.model import (
+    CachedReferenceDatasetQuery,
+    DatasetType,
+    Env,
+    ReferenceGenome,
+    SampleType,
+    Sex,
+)
+from v03_pipeline.lib.paths import (
+    cached_reference_dataset_query_path,
+    sex_check_table_path,
+)
 
 AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
 MIN_ROWS_PER_CONTIG = 100
@@ -11,9 +24,40 @@ class SeqrValidationError(Exception):
     pass
 
 
+def get_validation_dependencies(
+    dataset_type: DatasetType,
+    reference_genome: ReferenceGenome,
+    callset_path: str,
+    skip_check_sex_and_relatedness: bool,
+    **_: Any,
+) -> dict[str, hl.Table]:
+    deps = {}
+    deps['coding_and_noncoding_variants_ht'] = hl.read_table(
+        cached_reference_dataset_query_path(
+            reference_genome,
+            dataset_type,
+            CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
+        ),
+    )
+    if (
+        Env.CHECK_SEX_AND_RELATEDNESS
+        and dataset_type.check_sex_and_relatedness
+        and not skip_check_sex_and_relatedness
+    ):
+        deps['sex_check_ht'] = hl.read_table(
+            sex_check_table_path(
+                reference_genome,
+                dataset_type,
+                callset_path,
+            ),
+        )
+    return deps
+
+
 def validate_allele_type(
     mt: hl.MatrixTable,
     dataset_type: DatasetType,
+    **_: Any,
 ) -> None:
     ht = mt.rows()
     ht = ht.filter(
@@ -31,6 +75,7 @@ def validate_allele_type(
 
 def validate_no_duplicate_variants(
     mt: hl.MatrixTable,
+    **_: Any,
 ) -> None:
     ht = mt.rows()
     ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
@@ -44,6 +89,7 @@ def validate_expected_contig_frequency(
     mt: hl.MatrixTable,
     reference_genome: ReferenceGenome,
     min_rows_per_contig: int = MIN_ROWS_PER_CONTIG,
+    **_: Any,
 ) -> None:
     rows_per_contig = mt.aggregate_rows(hl.agg.counter(mt.locus.contig))
     missing_contigs = (
@@ -69,6 +115,7 @@ def validate_imported_field_types(
     mt: hl.MatrixTable,
     dataset_type: DatasetType,
     additional_row_fields: dict[str, hl.expr.types.HailType | set],
+    **_: Any,
 ) -> None:
     def _validate_field(
         mt_schema: hl.StructExpression,
@@ -104,8 +151,12 @@ def _validate_field(
 
 def validate_imputed_sex_ploidy(
     mt: hl.MatrixTable,
-    sex_check_ht: hl.Table,
+    # NB: sex_check_ht is left as None when sex checking is disabled for the run
+    sex_check_ht: hl.Table | None = None,
+    **_: Any,
 ) -> None:
+    if not sex_check_ht:
+        return
     mt = mt.select_cols(
         discrepant=(
             (
@@ -132,6 +183,7 @@ def validate_sample_type(
     reference_genome: ReferenceGenome,
     sample_type: SampleType,
     sample_type_match_threshold: float = SAMPLE_TYPE_MATCH_THRESHOLD,
+    **_: Any,
 ) -> None:
     coding_variants_ht = coding_and_noncoding_variants_ht.filter(
         coding_and_noncoding_variants_ht.coding,
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
@@ -1,4 +1,5 @@
 import unittest
+from unittest.mock import Mock, patch
 
 import hail as hl
 
@@ -80,7 +81,9 @@ def test_validate_allele_type(self) -> None:
             DatasetType.SNV_INDEL,
         )
 
-    def test_validate_imputed_sex_ploidy(self) -> None:
+    @patch('v03_pipeline.lib.misc.validation.Env')
+    def test_validate_imputed_sex_ploidy(self, mock_env: Mock) -> None:
+        mock_env.CHECK_SEX_AND_RELATEDNESS = True
         sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
         mt = hl.MatrixTable.from_parts(
             rows={
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
@@ -2,21 +2,18 @@
 import luigi
 import luigi.util
 
-from v03_pipeline.lib.misc.callsets import additional_row_fields
 from v03_pipeline.lib.misc.validation import (
+    get_validation_dependencies,
     validate_allele_type,
     validate_expected_contig_frequency,
-    validate_imported_field_types,
     validate_imputed_sex_ploidy,
     validate_no_duplicate_variants,
     validate_sample_type,
 )
 from v03_pipeline.lib.model import CachedReferenceDatasetQuery
 from v03_pipeline.lib.model.environment import Env
 from v03_pipeline.lib.paths import (
-    cached_reference_dataset_query_path,
     imported_callset_path,
-    sex_check_table_path,
 )
 from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
@@ -63,8 +60,8 @@ def requires(self) -> list[luigi.Task]:
             ]
         if (
             Env.CHECK_SEX_AND_RELATEDNESS
-            and not self.skip_check_sex_and_relatedness
             and self.dataset_type.check_sex_and_relatedness
+            and not self.skip_check_sex_and_relatedness
         ):
             requirements = [
                 *requirements,
@@ -83,17 +80,6 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
                 self.callset_path,
             ),
         )
-        # This validation isn't override-able.  If a field is the wrong
-        # type, the pipeline will likely hard-fail downstream.
-        validate_imported_field_types(
-            mt,
-            self.dataset_type,
-            additional_row_fields(
-                mt,
-                self.dataset_type,
-                self.skip_check_sex_and_relatedness,
-            ),
-        )
         if self.dataset_type.can_run_validation:
             # Rather than throwing an error, we silently remove invalid contigs.
             # This happens fairly often for AnVIL requests.
@@ -104,38 +90,34 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
             )
 
         if not self.skip_validation and self.dataset_type.can_run_validation:
-            validate_allele_type(mt, self.dataset_type)
-            validate_no_duplicate_variants(mt)
-            validate_expected_contig_frequency(mt, self.reference_genome)
-            coding_and_noncoding_ht = hl.read_table(
-                cached_reference_dataset_query_path(
-                    self.reference_genome,
-                    self.dataset_type,
-                    CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
-                ),
+            validation_dependencies = get_validation_dependencies(
+                **self.param_kwargs,
+            )
+            validate_allele_type(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
+            )
+            validate_no_duplicate_variants(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
+            )
+            validate_expected_contig_frequency(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
             )
             validate_sample_type(
                 mt,
-                coding_and_noncoding_ht,
-                self.reference_genome,
-                self.sample_type,
+                **self.param_kwargs,
+                **validation_dependencies,
+            )
+            validate_imputed_sex_ploidy(
+                mt,
+                **self.param_kwargs,
+                **validation_dependencies,
             )
-            if (
-                Env.CHECK_SEX_AND_RELATEDNESS
-                and not self.skip_check_sex_and_relatedness
-                and self.dataset_type.check_sex_and_relatedness
-            ):
-                sex_check_ht = hl.read_table(
-                    sex_check_table_path(
-                        self.reference_genome,
-                        self.dataset_type,
-                        self.callset_path,
-                    ),
-                )
-                validate_imputed_sex_ploidy(
-                    mt,
-                    sex_check_ht,
-                )
         return mt.select_globals(
             callset_path=self.callset_path,
             validated_sample_type=self.sample_type.value,
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -2,13 +2,16 @@
 import luigi
 import luigi.util
 
-from v03_pipeline.lib.misc.callsets import additional_row_fields
+from v03_pipeline.lib.misc.callsets import get_additional_row_fields
 from v03_pipeline.lib.misc.io import (
     import_callset,
     import_vcf,
     select_relevant_fields,
     split_multi_hts,
 )
+from v03_pipeline.lib.misc.validation import (
+    validate_imported_field_types,
+)
 from v03_pipeline.lib.misc.vets import annotate_vets
 from v03_pipeline.lib.model.environment import Env
 from v03_pipeline.lib.paths import (
@@ -79,14 +82,22 @@ def create_table(self) -> hl.MatrixTable:
             )
             filters_ht = import_vcf(filters_path, self.reference_genome).rows()
             mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
+        additional_row_fields = get_additional_row_fields(
+            mt,
+            self.dataset_type,
+            self.skip_check_sex_and_relatedness,
+        )
         mt = select_relevant_fields(
             mt,
             self.dataset_type,
-            additional_row_fields(
-                mt,
-                self.dataset_type,
-                self.skip_check_sex_and_relatedness,
-            ),
+            additional_row_fields,
+        )
+        # This validation isn't override-able by the skip option.
+        # If a field is the wrong type, the pipeline will likely hard-fail downstream.
+        validate_imported_field_types(
+            mt,
+            self.dataset_type,
+            additional_row_fields,
         )
         if self.dataset_type.has_multi_allelic_variants:
             mt = split_multi_hts(mt)
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
@@ -62,8 +62,8 @@ def requires(self) -> list[luigi.Task]:
         ]
         if (
             Env.CHECK_SEX_AND_RELATEDNESS
-            and not self.skip_check_sex_and_relatedness
             and self.dataset_type.check_sex_and_relatedness
+            and not self.skip_check_sex_and_relatedness
         ):
             requirements = [
                 *requirements,
@@ -98,8 +98,8 @@ def create_table(self) -> hl.MatrixTable:
         families_failed_sex_check = {}
         if (
             Env.CHECK_SEX_AND_RELATEDNESS
-            and not self.skip_check_sex_and_relatedness
             and self.dataset_type.check_sex_and_relatedness
+            and not self.skip_check_sex_and_relatedness
         ):
             relatedness_check_ht = hl.read_table(self.input()[2].path)
             sex_check_ht = hl.read_table(self.input()[3].path)