Skip to content

Commit 1ab1a25

Browse files
authored
Update pipeline to deliver a validation_errors file, with all validation exceptions. (#899)
* Make functions error checkable * First test passing * whacking * Fix tests * testing the methods! * another test working * cleanup * tweak * Run all validations * First pass * ruff * Scaffold done * format * ruff * reformat * cleanup * cleanup * make run id shared * A fix * another test * A few more * Few more * Unnecessary * ruf * run id * string * unused * missed one * Fix it correctly * missed one * Last one! * Validation refactoring * Fix it * case this function better * missing comma * change arg name * Moving * no yield * Add test file * Include VCF with multiple validation exceptions * improve validation formatting * Finish off test * Remove caching block * Fix merge * add comments
1 parent 2bef2ff commit 1ab1a25

38 files changed

+445
-80
lines changed

v03_pipeline/lib/misc/io.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def compute_hail_n_partitions(file_size_b: int) -> int:
8080
)
8181
def split_multi_hts(
8282
mt: hl.MatrixTable,
83+
skip_validation: bool,
8384
max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
8485
) -> hl.MatrixTable:
8586
bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC)
@@ -94,7 +95,11 @@ def split_multi_hts(
9495
permit_shuffle=mt.count()[1] < max_samples_split_multi_shuffle,
9596
)
9697
mt = split.union_rows(bi)
97-
return mt.distinct_by_row()
98+
# If we've disabled validation (which is expected to throw an exception
99+
# for duplicate variants, we would like to distinc )
100+
if skip_validation:
101+
return mt.distinct_by_row()
102+
return mt
98103

99104

100105
def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:

v03_pipeline/lib/misc/io_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,5 +170,6 @@ def test_split_multi_failure(self) -> None:
170170
)
171171
.key_rows_by('locus', 'alleles')
172172
.repartition(1),
173+
False,
173174
1,
174175
)

v03_pipeline/lib/misc/validation.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,13 +75,17 @@ def validate_allele_type(
7575

7676
def validate_no_duplicate_variants(
7777
mt: hl.MatrixTable,
78+
reference_genome: ReferenceGenome,
79+
dataset_type: DatasetType,
7880
**_: Any,
7981
) -> None:
8082
ht = mt.rows()
8183
ht = ht.group_by(*ht.key).aggregate(n=hl.agg.count())
8284
ht = ht.filter(ht.n > 1)
85+
ht = ht.select()
8386
if ht.count() > 0:
84-
msg = f'Variants are present multiple times in the callset: {ht.collect()}'
87+
variant_format = dataset_type.table_key_format_fn(reference_genome)
88+
msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.collect()]}'
8589
raise SeqrValidationError(msg)
8690

8791

@@ -99,7 +103,7 @@ def validate_expected_contig_frequency(
99103
)
100104
if missing_contigs:
101105
msg = 'Missing the following expected contigs:{}'.format(
102-
', '.join(missing_contigs),
106+
', '.join(sorted(missing_contigs)),
103107
)
104108
raise SeqrValidationError(msg)
105109

v03_pipeline/lib/misc/validation_test.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,15 +171,22 @@ def test_validate_no_duplicate_variants(self) -> None:
171171
reference_genome='GRCh38',
172172
),
173173
],
174+
'alleles': [
175+
['A', 'C'],
176+
['A', 'C'],
177+
['A', 'C'],
178+
],
174179
},
175180
cols={'s': ['sample_1']},
176181
entries={'HL': [[0.0], [0.0], [0.0]]},
177-
).key_rows_by('locus')
182+
).key_rows_by('locus', 'alleles')
178183
self.assertRaisesRegex(
179184
SeqrValidationError,
180-
'Variants are present multiple times in the callset',
185+
"Variants are present multiple times in the callset: \\['1-2-A-C'\\]",
181186
validate_no_duplicate_variants,
182187
mt,
188+
ReferenceGenome.GRCh38,
189+
DatasetType.SNV_INDEL,
183190
)
184191

185192
def test_validate_expected_contig_frequency(self) -> None:

v03_pipeline/lib/misc/vets_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ def test_annotate_vets(self) -> None:
8989
cols={'s': ['sample_1']},
9090
entries={'HL': [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0]]},
9191
).key_rows_by('locus', 'alleles')
92-
dragen_mt = split_multi_hts(dragen_mt)
92+
dragen_mt = split_multi_hts(dragen_mt, False)
9393
dragen_mt = annotate_vets(dragen_mt)
9494
self.assertListEqual(
9595
dragen_mt.filters.collect(),

v03_pipeline/lib/model/dataset_type.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,16 @@ def table_key_type(
2929
DatasetType.SV: hl.tstruct(variant_id=hl.tstr),
3030
}.get(self, default_key)
3131

32+
def table_key_format_fn(
33+
self,
34+
reference_genome: ReferenceGenome,
35+
) -> Callable[[hl.StructExpression], str]:
36+
if self in {DatasetType.GCNV, DatasetType.SV}:
37+
return lambda s: s.variant_id
38+
return (
39+
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
40+
)
41+
3242
@property
3343
def col_fields(
3444
self,

v03_pipeline/lib/paths.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,21 @@ def imported_callset_path(
109109
)
110110

111111

112+
def validation_errors_for_run_path(
113+
reference_genome: ReferenceGenome,
114+
dataset_type: DatasetType,
115+
run_id: str,
116+
) -> str:
117+
return os.path.join(
118+
runs_path(
119+
reference_genome,
120+
dataset_type,
121+
),
122+
run_id,
123+
'validation_errors.json',
124+
)
125+
126+
112127
def metadata_for_run_path(
113128
reference_genome: ReferenceGenome,
114129
dataset_type: DatasetType,

v03_pipeline/lib/paths_test.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
sex_check_table_path,
2525
valid_filters_path,
2626
valid_reference_dataset_collection_path,
27+
validation_errors_for_run_path,
2728
variant_annotations_table_path,
2829
)
2930

@@ -141,6 +142,16 @@ def test_relatedness_check_table_path(self) -> None:
141142
'/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
142143
)
143144

145+
def test_validation_errors_for_run_path(self) -> None:
146+
self.assertEqual(
147+
validation_errors_for_run_path(
148+
ReferenceGenome.GRCh38,
149+
DatasetType.SNV_INDEL,
150+
'manual__2023-06-26T18:30:09.349671+00:00',
151+
),
152+
'/seqr/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/validation_errors.json',
153+
)
154+
144155
def test_metadata_for_run_path(self) -> None:
145156
self.assertEqual(
146157
metadata_for_run_path(

v03_pipeline/lib/tasks/validate_callset.py

Lines changed: 34 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import luigi.util
44

55
from v03_pipeline.lib.misc.validation import (
6+
SeqrValidationError,
67
get_validation_dependencies,
78
validate_allele_type,
89
validate_expected_contig_frequency,
@@ -23,6 +24,9 @@
2324
)
2425
from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask
2526
from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
27+
from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
28+
WriteValidationErrorsForRunTask,
29+
)
2630

2731

2832
@luigi.util.inherits(BaseLoadingRunParams)
@@ -88,35 +92,38 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
8892
mt.locus.contig,
8993
),
9094
)
91-
92-
if not self.skip_validation and self.dataset_type.can_run_validation:
93-
validation_dependencies = get_validation_dependencies(
94-
**self.param_kwargs,
95-
)
96-
validate_allele_type(
97-
mt,
98-
**self.param_kwargs,
99-
**validation_dependencies,
95+
validation_exceptions = []
96+
if self.skip_validation or not self.dataset_type.can_run_validation:
97+
return mt.select_globals(
98+
callset_path=self.callset_path,
99+
validated_sample_type=self.sample_type.value,
100100
)
101-
validate_no_duplicate_variants(
102-
mt,
103-
**self.param_kwargs,
104-
**validation_dependencies,
105-
)
106-
validate_expected_contig_frequency(
107-
mt,
108-
**self.param_kwargs,
109-
**validation_dependencies,
110-
)
111-
validate_sample_type(
112-
mt,
113-
**self.param_kwargs,
114-
**validation_dependencies,
101+
validation_dependencies = get_validation_dependencies(
102+
**self.param_kwargs,
103+
)
104+
for validation_f in [
105+
validate_allele_type,
106+
validate_imputed_sex_ploidy,
107+
validate_no_duplicate_variants,
108+
validate_expected_contig_frequency,
109+
validate_sample_type,
110+
]:
111+
try:
112+
validation_f(
113+
mt,
114+
**self.param_kwargs,
115+
**validation_dependencies,
116+
)
117+
except SeqrValidationError as e: # noqa: PERF203
118+
validation_exceptions.append(e)
119+
if validation_exceptions:
120+
write_validation_errors_for_run_task = self.clone(
121+
WriteValidationErrorsForRunTask,
122+
error_messages=[str(e) for e in validation_exceptions],
115123
)
116-
validate_imputed_sex_ploidy(
117-
mt,
118-
**self.param_kwargs,
119-
**validation_dependencies,
124+
write_validation_errors_for_run_task.run()
125+
raise SeqrValidationError(
126+
write_validation_errors_for_run_task.to_single_error_message(),
120127
)
121128
return mt.select_globals(
122129
callset_path=self.callset_path,
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import json
2+
import shutil
3+
from unittest.mock import Mock, patch
4+
5+
import luigi.worker
6+
7+
from v03_pipeline.lib.model import (
8+
CachedReferenceDatasetQuery,
9+
DatasetType,
10+
ReferenceGenome,
11+
SampleType,
12+
)
13+
from v03_pipeline.lib.paths import (
14+
cached_reference_dataset_query_path,
15+
)
16+
from v03_pipeline.lib.tasks.validate_callset import (
17+
ValidateCallsetTask,
18+
)
19+
from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
20+
WriteValidationErrorsForRunTask,
21+
)
22+
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
23+
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
24+
25+
TEST_CODING_NONCODING_CRDQ_1 = (
26+
'v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht'
27+
)
28+
MULTIPLE_VALIDATION_EXCEPTIONS_VCF = (
29+
'v03_pipeline/var/test/callsets/multiple_validation_exceptions.vcf'
30+
)
31+
32+
TEST_RUN_ID = 'manual__2024-04-03'
33+
34+
35+
class ValidateCallsetTest(MockedDatarootTestCase):
36+
def setUp(self) -> None:
37+
super().setUp()
38+
shutil.copytree(
39+
TEST_CODING_NONCODING_CRDQ_1,
40+
cached_reference_dataset_query_path(
41+
ReferenceGenome.GRCh38,
42+
DatasetType.SNV_INDEL,
43+
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
44+
),
45+
)
46+
47+
@patch(
48+
'v03_pipeline.lib.tasks.validate_callset.UpdatedCachedReferenceDatasetQuery',
49+
)
50+
def test_validate_callset_multiple_exceptions(
51+
self,
52+
mock_updated_cached_reference_dataset_query: Mock,
53+
) -> None:
54+
mock_updated_cached_reference_dataset_query.return_value = MockCompleteTask()
55+
worker = luigi.worker.Worker()
56+
validate_callset_task = ValidateCallsetTask(
57+
reference_genome=ReferenceGenome.GRCh38,
58+
dataset_type=DatasetType.SNV_INDEL,
59+
sample_type=SampleType.WES,
60+
# NB:
61+
# This callset contains duplicate rows for chr1:902088,
62+
# a NON_REF allele type at position chr1: 902024, missing
63+
# all contigs but chr1, and contains non-coding variants.
64+
callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF,
65+
skip_validation=False,
66+
run_id=TEST_RUN_ID,
67+
)
68+
worker.add(validate_callset_task)
69+
worker.run()
70+
self.assertFalse(validate_callset_task.complete())
71+
72+
write_validation_errors_task = WriteValidationErrorsForRunTask(
73+
reference_genome=ReferenceGenome.GRCh38,
74+
dataset_type=DatasetType.SNV_INDEL,
75+
sample_type=SampleType.WES,
76+
callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF,
77+
skip_validation=False,
78+
run_id=TEST_RUN_ID,
79+
)
80+
self.assertTrue(write_validation_errors_task.complete())
81+
with write_validation_errors_task.output().open('r') as f:
82+
self.assertDictEqual(
83+
json.load(f),
84+
{
85+
'error_messages': [
86+
"Alleles with invalid AlleleType are present in the callset: [('G', '<NON_REF>')]",
87+
"Variants are present multiple times in the callset: ['1-902088-G-A']",
88+
'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',
89+
'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',
90+
],
91+
},
92+
)

0 commit comments

Comments
 (0)