broadinstitute
diff --git a/v03_pipeline/lib/misc/sample_ids.py b/v03_pipeline/lib/misc/sample_ids.py
Lines changed: 5 additions & 17 deletions
diff --git a/v03_pipeline/lib/misc/sample_ids_test.py b/v03_pipeline/lib/misc/sample_ids_test.py
Lines changed: 5 additions & 5 deletions
diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py
Lines changed: 1 addition & 0 deletions
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
Lines changed: 49 additions & 59 deletions
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
Lines changed: 30 additions & 31 deletions
@@ -3,28 +3,16 @@
 import hail as hl
 
 from v03_pipeline.lib.logger import get_logger
+from v03_pipeline.lib.misc.validation import SeqrValidationError
 
 logger = get_logger(__name__)
 
 
-class MatrixTableSampleSetError(Exception):
-    def __init__(self, message, missing_samples):
-        super().__init__(message)
-        self.missing_samples = missing_samples
-
-
-def vcf_remap(mt: hl.MatrixTable) -> hl.MatrixTable:
-    # TODO: add logic from Mike to remap vcf samples delivered from Broad WGS
-    return mt
-
-
 def remap_sample_ids(
     mt: hl.MatrixTable,
     project_remap_ht: hl.Table,
     ignore_missing_samples_when_remapping: bool,
 ) -> hl.MatrixTable:
-    mt = vcf_remap(mt)
-
     collected_remap = project_remap_ht.collect()
     s_dups = [k for k, v in Counter([r.s for r in collected_remap]).items() if v > 1]
     seqr_dups = [
@@ -33,7 +21,7 @@ def remap_sample_ids(
 
     if len(s_dups) > 0 or len(seqr_dups) > 0:
         msg = f'Duplicate s or seqr_id entries in remap file were found. Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.'
-        raise ValueError(msg)
+        raise SeqrValidationError(msg)
 
     missing_samples = project_remap_ht.anti_join(mt.cols()).collect()
     remap_count = len(collected_remap)
@@ -48,7 +36,7 @@ def remap_sample_ids(
         if ignore_missing_samples_when_remapping:
             logger.info(message)
         else:
-            raise MatrixTableSampleSetError(message, missing_samples)
+            raise SeqrValidationError(message)
 
     mt = mt.annotate_cols(**project_remap_ht[mt.s])
     remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id)
@@ -67,7 +55,7 @@ def subset_samples(
     anti_join_ht_count = anti_join_ht.count()
     if subset_count == 0:
         message = '0 sample ids found the subset HT, something is probably wrong.'
-        raise MatrixTableSampleSetError(message, [])
+        raise SeqrValidationError(message)
 
     if anti_join_ht_count != 0:
         missing_samples = anti_join_ht.s.collect()
@@ -77,7 +65,7 @@ def subset_samples(
             f"IDs that aren't in the callset: {missing_samples}\n"
             f'All callset sample IDs:{mt.s.collect()}'
         )
-        raise MatrixTableSampleSetError(message, missing_samples)
+        raise SeqrValidationError(message)
     logger.info(f'Subsetted to {subset_count} sample ids')
     mt = mt.semi_join_cols(sample_subset_ht)
     return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT)))
@@ -3,10 +3,10 @@
 import hail as hl
 
 from v03_pipeline.lib.misc.sample_ids import (
-    MatrixTableSampleSetError,
     remap_sample_ids,
     subset_samples,
 )
+from v03_pipeline.lib.misc.validation import SeqrValidationError
 
 CALLSET_MT = hl.MatrixTable.from_parts(
     rows={'variants': [1, 2]},
@@ -76,7 +76,7 @@ def test_remap_sample_ids_remap_has_duplicate(self) -> None:
             key='s',
         )
 
-        with self.assertRaises(ValueError):
+        with self.assertRaises(SeqrValidationError):
             remap_sample_ids(
                 CALLSET_MT,
                 project_remap_ht,
@@ -99,7 +99,7 @@ def test_remap_sample_ids_remap_has_missing_samples(self) -> None:
             key='s',
         )
 
-        with self.assertRaises(MatrixTableSampleSetError):
+        with self.assertRaises(SeqrValidationError):
             remap_sample_ids(
                 CALLSET_MT,
                 project_remap_ht,
@@ -114,7 +114,7 @@ def test_subset_samples_zero_samples(self):
             key='s',
         )
 
-        with self.assertRaises(MatrixTableSampleSetError):
+        with self.assertRaises(SeqrValidationError):
             subset_samples(
                 CALLSET_MT,
                 sample_subset_ht,
@@ -132,7 +132,7 @@ def test_subset_samples_missing_samples(self):
             key='s',
         )
 
-        with self.assertRaises(MatrixTableSampleSetError):
+        with self.assertRaises(SeqrValidationError):
             subset_samples(
                 CALLSET_MT,
                 sample_subset_ht,
 
@@ -83,5 +83,6 @@ def test_validate_callset_multiple_exceptions(
                         'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',
                         'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',
                     ],
+                    'failed_family_samples': {},
                 },
             )
@@ -10,7 +10,6 @@
     split_multi_hts,
 )
 from v03_pipeline.lib.misc.validation import (
-    SeqrValidationError,
     validate_imported_field_types,
 )
 from v03_pipeline.lib.misc.vets import annotate_vets
@@ -24,7 +23,7 @@
 from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget
 from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
 from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
-    WriteValidationErrorsForRunTask,
+    with_persisted_validation_errors,
 )
 
 
@@ -77,65 +76,56 @@ def requires(self) -> list[luigi.Task]:
             CallsetTask(self.callset_path),
         ]
 
+    @with_persisted_validation_errors
     def create_table(self) -> hl.MatrixTable:
-        try:
-            # NB: throws SeqrValidationError
-            mt = import_callset(
-                self.callset_path,
-                self.reference_genome,
-                self.dataset_type,
-            )
-            filters_path = None
-            if (
-                FeatureFlag.EXPECT_WES_FILTERS
-                and not self.skip_expect_filters
-                and self.dataset_type.expect_filters(
-                    self.sample_type,
-                )
-            ):
-                filters_path = valid_filters_path(
-                    self.dataset_type,
-                    self.sample_type,
-                    self.callset_path,
-                )
-                filters_ht = import_vcf(filters_path, self.reference_genome).rows()
-                mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
-            additional_row_fields = get_additional_row_fields(
-                mt,
-                self.dataset_type,
-                self.skip_check_sex_and_relatedness,
+        # NB: throws SeqrValidationError
+        mt = import_callset(
+            self.callset_path,
+            self.reference_genome,
+            self.dataset_type,
+        )
+        filters_path = None
+        if (
+            FeatureFlag.EXPECT_WES_FILTERS
+            and not self.skip_expect_filters
+            and self.dataset_type.expect_filters(
+                self.sample_type,
             )
-            # NB: throws SeqrValidationError
-            mt = select_relevant_fields(
-                mt,
+        ):
+            filters_path = valid_filters_path(
                 self.dataset_type,
-                additional_row_fields,
+                self.sample_type,
+                self.callset_path,
             )
-            # This validation isn't override-able by the skip option.
-            # If a field is the wrong type, the pipeline will likely hard-fail downstream.
+            filters_ht = import_vcf(filters_path, self.reference_genome).rows()
+            mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
+        additional_row_fields = get_additional_row_fields(
+            mt,
+            self.dataset_type,
+            self.skip_check_sex_and_relatedness,
+        )
+        # NB: throws SeqrValidationError
+        mt = select_relevant_fields(
+            mt,
+            self.dataset_type,
+            additional_row_fields,
+        )
+        # This validation isn't override-able by the skip option.
+        # If a field is the wrong type, the pipeline will likely hard-fail downstream.
+        # NB: throws SeqrValidationError
+        validate_imported_field_types(
+            mt,
+            self.dataset_type,
+            additional_row_fields,
+        )
+        if self.dataset_type.has_multi_allelic_variants:
             # NB: throws SeqrValidationError
-            validate_imported_field_types(
-                mt,
-                self.dataset_type,
-                additional_row_fields,
-            )
-            if self.dataset_type.has_multi_allelic_variants:
-                # NB: throws SeqrValidationError
-                mt = split_multi_hts(mt, self.skip_validation)
-            # Special handling of variant-level filter annotation for VETs filters.
-            # The annotations are present on the sample-level FT field but are
-            # expected upstream on "filters".
-            mt = annotate_vets(mt)
-            return mt.select_globals(
-                callset_path=self.callset_path,
-                filters_path=filters_path or hl.missing(hl.tstr),
-            )
-        except SeqrValidationError as e:
-            write_validation_errors_for_run_task = self.clone(
-                WriteValidationErrorsForRunTask,
-                error_messages=[str(e)],
-            )
-            write_validation_errors_for_run_task.run()
-            raise SeqrValidationError(
-                write_validation_errors_for_run_task.to_single_error_message(),
-            ) from e
+            mt = split_multi_hts(mt, self.skip_validation)
+        # Special handling of variant-level filter annotation for VETs filters.
+        # The annotations are present on the sample-level FT field but are
+        # expected upstream on "filters".
+        mt = annotate_vets(mt)
+        return mt.select_globals(
+            callset_path=self.callset_path,
+            filters_path=filters_path or hl.missing(hl.tstr),
+        )
@@ -2,7 +2,6 @@
 import luigi
 import luigi.util
 
-from v03_pipeline.lib.logger import get_logger
 from v03_pipeline.lib.misc.family_loading_failures import (
     get_families_failed_missing_samples,
     get_families_failed_relatedness_check,
@@ -16,6 +15,7 @@
 )
 from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
 from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples
+from v03_pipeline.lib.misc.validation import SeqrValidationError
 from v03_pipeline.lib.model.feature_flag import FeatureFlag
 from v03_pipeline.lib.paths import (
     relatedness_check_table_path,
@@ -29,8 +29,19 @@
     WriteRelatednessCheckTsvTask,
 )
 from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
+from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
+    with_persisted_validation_errors,
+)
+
 
-logger = get_logger(__name__)
+def format_failures(failed_families):
+    return {
+        f.family_guid: {
+            'samples': sorted(f.samples.keys()),
+            'reasons': reasons,
+        }
+        for f, reasons in failed_families.items()
+    }
 
 
 @luigi.util.inherits(BaseLoadingRunParams)
@@ -73,6 +84,7 @@ def requires(self) -> list[luigi.Task]:
             ]
         return requirements
 
+    @with_persisted_validation_errors
     def create_table(self) -> hl.MatrixTable:
         callset_mt = hl.read_matrix_table(self.input()[0].path)
         pedigree_ht = import_pedigree(self.input()[1].path)
@@ -130,16 +142,21 @@ def create_table(self) -> hl.MatrixTable:
             - families_failed_sex_check.keys()
         )
         if not len(loadable_families):
-            msg = (
-                f'families_failed_missing_samples: {families_failed_missing_samples}\n'
-                f'families_failed_relatedness_check: {families_failed_relatedness_check}\n'
-                f'families_failed_sex_check: {families_failed_sex_check}'
-            )
-            logger.info(
+            msg = 'All families failed validation checks'
+            raise SeqrValidationError(
                 msg,
+                {
+                    'failed_family_samples': {
+                        'missing_samples': format_failures(
+                            families_failed_missing_samples,
+                        ),
+                        'relatedness_check': format_failures(
+                            families_failed_relatedness_check,
+                        ),
+                        'sex_check': format_failures(families_failed_sex_check),
+                    },
+                },
             )
-            msg = 'All families failed checks'
-            raise RuntimeError(msg)
 
         mt = subset_samples(
             callset_mt,
@@ -172,33 +189,15 @@ def create_table(self) -> hl.MatrixTable:
             ),
             failed_family_samples=hl.Struct(
                 missing_samples=(
-                    {
-                        f.family_guid: {
-                            'samples': sorted(f.samples.keys()),
-                            'reasons': reasons,
-                        }
-                        for f, reasons in families_failed_missing_samples.items()
-                    }
+                    format_failures(families_failed_missing_samples)
                     or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr)))
                 ),
                 relatedness_check=(
-                    {
-                        f.family_guid: {
-                            'samples': sorted(f.samples.keys()),
-                            'reasons': reasons,
-                        }
-                        for f, reasons in families_failed_relatedness_check.items()
-                    }
+                    format_failures(families_failed_relatedness_check)
                     or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr)))
                 ),
                 sex_check=(
-                    {
-                        f.family_guid: {
-                            'samples': sorted(f.samples.keys()),
-                            'reasons': reasons,
-                        }
-                        for f, reasons in families_failed_sex_check.items()
-                    }
+                    format_failures(families_failed_sex_check)
                     or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr)))
                 ),
             ),
Original file line number	Diff line number	Diff line change
`@@ -83,5 +83,6 @@ def test_validate_callset_multiple_exceptions(`
`83`	`83`	`'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',`
`84`	`84`	`'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',`
`85`	`85`	`],`
	`86`	`+ 'failed_family_samples': {},`
`86`	`87`	`},`
`87`	`88`	`)`