Skip to content

Commit d010aa0

Browse files
authored
Merge pull request #886 from broadinstitute/dev
Raise a validation error if there are SNV_INDEL symbolic alleles (#884)
2 parents 2dd9316 + fcf165b commit d010aa0

File tree

4 files changed

+32
-6
lines changed

4 files changed

+32
-6
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,19 @@ class SeqrValidationError(Exception):
1313

1414
def validate_allele_type(
1515
mt: hl.MatrixTable,
16+
dataset_type: DatasetType,
1617
) -> None:
1718
ht = mt.rows()
1819
ht = ht.filter(
19-
hl.numeric_allele_type(ht.alleles[0], ht.alleles[1])
20-
== hl.genetics.allele_type.AlleleType.UNKNOWN,
20+
dataset_type.invalid_allele_types.contains(
21+
hl.numeric_allele_type(ht.alleles[0], ht.alleles[1]),
22+
),
2123
)
2224
if ht.count() > 0:
23-
msg = f'Alleles with Unknown AlleleType are present in the callset: {ht.alleles.collect()}'
25+
collected_alleles = sorted(
26+
[tuple(x) for x in ht.aggregate(hl.agg.collect_as_set(ht.alleles))],
27+
)
28+
msg = f'Alleles with invalid AlleleType are present in the callset: {collected_alleles}'
2429
raise SeqrValidationError(msg)
2530

2631

v03_pipeline/lib/misc/validation_test.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,23 +54,30 @@ def test_validate_allele_type(self) -> None:
5454
position=3,
5555
reference_genome='GRCh38',
5656
),
57+
hl.Locus(
58+
contig='chr1',
59+
position=4,
60+
reference_genome='GRCh38',
61+
),
5762
],
5863
'alleles': [
5964
['A', 'T'],
6065
# NB: star alleles should pass through this validation just fine,
6166
# but are eventually filtered out upstream.
6267
['A', '*'],
6368
['A', '-'],
69+
['A', '<NON_REF>'],
6470
],
6571
},
6672
cols={'s': ['sample_1']},
67-
entries={'HL': [[0.0], [0.0], [0.0]]},
73+
entries={'HL': [[0.0], [0.0], [0.0], [0.0]]},
6874
).key_rows_by('locus', 'alleles')
6975
self.assertRaisesRegex(
7076
SeqrValidationError,
71-
"Alleles with Unknown AlleleType are present in the callset: \\[\\['A', '-'\\]\\]",
77+
"Alleles with invalid AlleleType are present in the callset: \\[\\('A', '-'\\), \\('A', '<NON_REF>'\\)\\]",
7278
validate_allele_type,
7379
mt,
80+
DatasetType.SNV_INDEL,
7481
)
7582

7683
def test_validate_imputed_sex_ploidy(self) -> None:

v03_pipeline/lib/model/dataset_type.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,20 @@ def excluded_filters(self) -> hl.SetExpression:
144144
DatasetType.GCNV: hl.empty_set(hl.tstr),
145145
}[self]
146146

147+
@property
148+
def invalid_allele_types(self) -> hl.SetExpression:
149+
return {
150+
DatasetType.SV: hl.set([hl.genetics.allele_type.AlleleType.UNKNOWN]),
151+
}.get(
152+
self,
153+
hl.set(
154+
[
155+
hl.genetics.allele_type.AlleleType.UNKNOWN,
156+
hl.genetics.allele_type.AlleleType.SYMBOLIC,
157+
],
158+
),
159+
)
160+
147161
@property
148162
def has_lookup_table(self) -> bool:
149163
return self in {DatasetType.SNV_INDEL, DatasetType.MITO}

v03_pipeline/lib/tasks/validate_callset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
104104
)
105105

106106
if not self.skip_validation and self.dataset_type.can_run_validation:
107-
validate_allele_type(mt)
107+
validate_allele_type(mt, self.dataset_type)
108108
validate_no_duplicate_variants(mt)
109109
validate_expected_contig_frequency(mt, self.reference_genome)
110110
coding_and_noncoding_ht = hl.read_table(

0 commit comments

Comments
 (0)