broadinstitute
diff --git a/‎v03_pipeline/lib/misc/io.py
Lines changed: 6 additions & 1 deletion b/‎v03_pipeline/lib/misc/io.py
Lines changed: 6 additions & 1 deletion
diff --git a/‎v03_pipeline/lib/misc/io_test.py
Lines changed: 1 addition & 0 deletions b/‎v03_pipeline/lib/misc/io_test.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎v03_pipeline/lib/misc/validation.py
Lines changed: 6 additions & 2 deletions b/‎v03_pipeline/lib/misc/validation.py
Lines changed: 6 additions & 2 deletions
diff --git a/‎v03_pipeline/lib/misc/validation_test.py
Lines changed: 9 additions & 2 deletions b/‎v03_pipeline/lib/misc/validation_test.py
Lines changed: 9 additions & 2 deletions
diff --git a/‎v03_pipeline/lib/misc/vets_test.py
Lines changed: 1 addition & 1 deletion b/‎v03_pipeline/lib/misc/vets_test.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎v03_pipeline/lib/model/dataset_type.py
Lines changed: 10 additions & 0 deletions b/‎v03_pipeline/lib/model/dataset_type.py
Lines changed: 10 additions & 0 deletions
diff --git a/‎v03_pipeline/lib/paths.py
Lines changed: 15 additions & 0 deletions b/‎v03_pipeline/lib/paths.py
Lines changed: 15 additions & 0 deletions
diff --git a/‎v03_pipeline/lib/paths_test.py
Lines changed: 11 additions & 0 deletions b/‎v03_pipeline/lib/paths_test.py
Lines changed: 11 additions & 0 deletions
diff --git a/‎v03_pipeline/lib/tasks/validate_callset.py
Lines changed: 34 additions & 27 deletions b/‎v03_pipeline/lib/tasks/validate_callset.py
Lines changed: 34 additions & 27 deletions
diff --git a/‎v03_pipeline/lib/tasks/validate_callset_test.py
Lines changed: 92 additions & 0 deletions b/‎v03_pipeline/lib/tasks/validate_callset_test.py
Lines changed: 92 additions & 0 deletions
@@ -80,6 +80,7 @@ def compute_hail_n_partitions(file_size_b: int) -> int:
 )
 def split_multi_hts(
     mt: hl.MatrixTable,
+    skip_validation: bool,
     max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
 ) -> hl.MatrixTable:
     bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC)
@@ -94,7 +95,11 @@ def split_multi_hts(
         permit_shuffle=mt.count()[1] < max_samples_split_multi_shuffle,
     )
     mt = split.union_rows(bi)
-    return mt.distinct_by_row()
+    # If we've disabled validation (which is expected to throw an exception
+    # for duplicate variants), deduplicate the rows here instead.
+    if skip_validation:
+        return mt.distinct_by_row()
+    return mt
 
 
 def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
 
@@ -170,5 +170,6 @@ def test_split_multi_failure(self) -> None:
             )
             .key_rows_by('locus', 'alleles')
             .repartition(1),
+            False,
             1,
         )
@@ -75,13 +75,17 @@ def validate_allele_type(
 
 def validate_no_duplicate_variants(
     mt: hl.MatrixTable,
+    reference_genome: ReferenceGenome,
+    dataset_type: DatasetType,
     **_: Any,
 ) -> None:
     ht = mt.rows()
     ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
     ht = ht.filter(ht.n > 1)
+    ht = ht.select()
     if ht.count() > 0:
-        msg = f'Variants are present multiple times in the callset: {ht.collect()}'
+        variant_format = dataset_type.table_key_format_fn(reference_genome)
+        msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.collect()]}'
         raise SeqrValidationError(msg)
 
 
@@ -99,7 +103,7 @@ def validate_expected_contig_frequency(
     )
     if missing_contigs:
         msg = 'Missing the following expected contigs:{}'.format(
-            ', '.join(missing_contigs),
+            ', '.join(sorted(missing_contigs)),
         )
         raise SeqrValidationError(msg)
 
 
@@ -171,15 +171,22 @@ def test_validate_no_duplicate_variants(self) -> None:
                         reference_genome='GRCh38',
                     ),
                 ],
+                'alleles': [
+                    ['A', 'C'],
+                    ['A', 'C'],
+                    ['A', 'C'],
+                ],
             },
             cols={'s': ['sample_1']},
             entries={'HL': [[0.0], [0.0], [0.0]]},
-        ).key_rows_by('locus')
+        ).key_rows_by('locus', 'alleles')
         self.assertRaisesRegex(
             SeqrValidationError,
-            'Variants are present multiple times in the callset',
+            "Variants are present multiple times in the callset: \\['1-2-A-C'\\]",
             validate_no_duplicate_variants,
             mt,
+            ReferenceGenome.GRCh38,
+            DatasetType.SNV_INDEL,
         )
 
     def test_validate_expected_contig_frequency(self) -> None:
 
@@ -89,7 +89,7 @@ def test_annotate_vets(self) -> None:
             cols={'s': ['sample_1']},
             entries={'HL': [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]},
         ).key_rows_by('locus', 'alleles')
-        dragen_mt = split_multi_hts(dragen_mt)
+        dragen_mt = split_multi_hts(dragen_mt, False)
         dragen_mt = annotate_vets(dragen_mt)
         self.assertListEqual(
             dragen_mt.filters.collect(),
 
@@ -29,6 +29,16 @@ def table_key_type(
             DatasetType.SV: hl.tstruct(variant_id=hl.tstr),
         }.get(self, default_key)
 
+    def table_key_format_fn(
+        self,
+        reference_genome: ReferenceGenome,
+    ) -> Callable[[hl.StructExpression], str]:
+        if self in {DatasetType.GCNV, DatasetType.SV}:
+            return lambda s: s.variant_id
+        return (
+            lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
+        )
+
     @property
     def col_fields(
         self,
 
@@ -109,6 +109,21 @@ def imported_callset_path(
     )
 
 
+def validation_errors_for_run_path(
+    reference_genome: ReferenceGenome,
+    dataset_type: DatasetType,
+    run_id: str,
+) -> str:
+    return os.path.join(
+        runs_path(
+            reference_genome,
+            dataset_type,
+        ),
+        run_id,
+        'validation_errors.json',
+    )
+
+
 def metadata_for_run_path(
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,
 
@@ -24,6 +24,7 @@
     sex_check_table_path,
     valid_filters_path,
     valid_reference_dataset_collection_path,
+    validation_errors_for_run_path,
     variant_annotations_table_path,
 )
 
@@ -141,6 +142,16 @@ def test_relatedness_check_table_path(self) -> None:
             '/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
         )
 
+    def test_validation_errors_for_run_path(self) -> None:
+        self.assertEqual(
+            validation_errors_for_run_path(
+                ReferenceGenome.GRCh38,
+                DatasetType.SNV_INDEL,
+                'manual__2023-06-26T18:30:09.349671+00:00',
+            ),
+            '/seqr/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/validation_errors.json',
+        )
+
     def test_metadata_for_run_path(self) -> None:
         self.assertEqual(
             metadata_for_run_path(
 
@@ -3,6 +3,7 @@
 import luigi.util
 
 from v03_pipeline.lib.misc.validation import (
+    SeqrValidationError,
     get_validation_dependencies,
     validate_allele_type,
     validate_expected_contig_frequency,
@@ -23,6 +24,9 @@
 )
 from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask
 from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
+from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
+    WriteValidationErrorsForRunTask,
+)
 
 
 @luigi.util.inherits(BaseLoadingRunParams)
@@ -88,35 +92,38 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
                     mt.locus.contig,
                 ),
             )
-
-        if not self.skip_validation and self.dataset_type.can_run_validation:
-            validation_dependencies = get_validation_dependencies(
-                **self.param_kwargs,
-            )
-            validate_allele_type(
-                mt,
-                **self.param_kwargs,
-                **validation_dependencies,
+        validation_exceptions = []
+        if self.skip_validation or not self.dataset_type.can_run_validation:
+            return mt.select_globals(
+                callset_path=self.callset_path,
+                validated_sample_type=self.sample_type.value,
             )
-            validate_no_duplicate_variants(
-                mt,
-                **self.param_kwargs,
-                **validation_dependencies,
-            )
-            validate_expected_contig_frequency(
-                mt,
-                **self.param_kwargs,
-                **validation_dependencies,
-            )
-            validate_sample_type(
-                mt,
-                **self.param_kwargs,
-                **validation_dependencies,
+        validation_dependencies = get_validation_dependencies(
+            **self.param_kwargs,
+        )
+        for validation_f in [
+            validate_allele_type,
+            validate_imputed_sex_ploidy,
+            validate_no_duplicate_variants,
+            validate_expected_contig_frequency,
+            validate_sample_type,
+        ]:
+            try:
+                validation_f(
+                    mt,
+                    **self.param_kwargs,
+                    **validation_dependencies,
+                )
+            except SeqrValidationError as e:  # noqa: PERF203
+                validation_exceptions.append(e)
+        if validation_exceptions:
+            write_validation_errors_for_run_task = self.clone(
+                WriteValidationErrorsForRunTask,
+                error_messages=[str(e) for e in validation_exceptions],
             )
-            validate_imputed_sex_ploidy(
-                mt,
-                **self.param_kwargs,
-                **validation_dependencies,
+            write_validation_errors_for_run_task.run()
+            raise SeqrValidationError(
+                write_validation_errors_for_run_task.to_single_error_message(),
             )
         return mt.select_globals(
             callset_path=self.callset_path,
 
@@ -0,0 +1,92 @@
+import json
+import shutil
+from unittest.mock import Mock, patch
+
+import luigi.worker
+
+from v03_pipeline.lib.model import (
+    CachedReferenceDatasetQuery,
+    DatasetType,
+    ReferenceGenome,
+    SampleType,
+)
+from v03_pipeline.lib.paths import (
+    cached_reference_dataset_query_path,
+)
+from v03_pipeline.lib.tasks.validate_callset import (
+    ValidateCallsetTask,
+)
+from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
+    WriteValidationErrorsForRunTask,
+)
+from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
+from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
+
+TEST_CODING_NONCODING_CRDQ_1 = (
+    'v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht'
+)
+MULTIPLE_VALIDATION_EXCEPTIONS_VCF = (
+    'v03_pipeline/var/test/callsets/multiple_validation_exceptions.vcf'
+)
+
+TEST_RUN_ID = 'manual__2024-04-03'
+
+
+class ValidateCallsetTest(MockedDatarootTestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        shutil.copytree(
+            TEST_CODING_NONCODING_CRDQ_1,
+            cached_reference_dataset_query_path(
+                ReferenceGenome.GRCh38,
+                DatasetType.SNV_INDEL,
+                CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
+            ),
+        )
+
+    @patch(
+        'v03_pipeline.lib.tasks.validate_callset.UpdatedCachedReferenceDatasetQuery',
+    )
+    def test_validate_callset_multiple_exceptions(
+        self,
+        mock_updated_cached_reference_dataset_query: Mock,
+    ) -> None:
+        mock_updated_cached_reference_dataset_query.return_value = MockCompleteTask()
+        worker = luigi.worker.Worker()
+        validate_callset_task = ValidateCallsetTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WES,
+            # NB:
+            # This callset contains duplicate rows for chr1:902088 and
+            # a NON_REF allele type at position chr1:902024, is missing
+            # all contigs but chr1, and contains non-coding variants.
+            callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF,
+            skip_validation=False,
+            run_id=TEST_RUN_ID,
+        )
+        worker.add(validate_callset_task)
+        worker.run()
+        self.assertFalse(validate_callset_task.complete())
+
+        write_validation_errors_task = WriteValidationErrorsForRunTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WES,
+            callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF,
+            skip_validation=False,
+            run_id=TEST_RUN_ID,
+        )
+        self.assertTrue(write_validation_errors_task.complete())
+        with write_validation_errors_task.output().open('r') as f:
+            self.assertDictEqual(
+                json.load(f),
+                {
+                    'error_messages': [
+                        "Alleles with invalid AlleleType are present in the callset: [('G', '<NON_REF>')]",
+                        "Variants are present multiple times in the callset: ['1-902088-G-A']",
+                        'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',
+                        'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',
+                    ],
+                },
+            )
Original file line number	Diff line number	Diff line change
`@@ -170,5 +170,6 @@ def test_split_multi_failure(self) -> None:`
`170`	`170`	`)`
`171`	`171`	`.key_rows_by('locus', 'alleles')`
`172`	`172`	`.repartition(1),`
	`173`	`+ False,`
`173`	`174`	`1,`
`174`	`175`	`)`