Skip to content

Commit 56ced7a

Browse files
authored
validate AD length (#1058)
* validate AD length * change error message language * handle empties * format * ruff
1 parent aa0027b commit 56ced7a

File tree

4 files changed

+81
-1
lines changed

4 files changed

+81
-1
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,26 @@ def validate_allele_type(
4848
raise SeqrValidationError(msg)
4949

5050

51+
def validate_allele_depth_length(
    mt: hl.MatrixTable,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    **_: Any,
) -> None:
    """Fail validation when a variant's AD arrays differ in length across samples.

    For every row, the set of distinct ``hl.len(mt.AD)`` values over all
    entries is collected (missing lengths are removed from the set). A row
    whose set holds more than one length is reported.

    Args:
        mt: MatrixTable with an ``AD`` entry field.
        reference_genome: Passed to ``dataset_type.table_key_format_fn`` to
            build the variant labels used in the error message.
        dataset_type: Supplies the row-key formatter for the error message.
        **_: Extra validation dependencies; ignored here.

    Raises:
        SeqrValidationError: If at least one row has more than one distinct
            AD length. The message lists up to 10 offending variants together
            with the lengths found.
    """
    ht = mt.select_rows(
        found_ad_lengths=hl.agg.collect_as_set(hl.len(mt.AD)).remove(
            hl.missing(hl.tint32),
        ),
    ).rows()
    ht = ht.filter(
        hl.len(ht.found_ad_lengths) > 1,
    )
    # A single take(10) both answers "is anything wrong?" and yields the rows
    # to report, so the pipeline is not executed a second time (previously a
    # count() was followed by a separate take(10)).
    offending_rows = ht.take(10)
    if offending_rows:
        variant_format = dataset_type.table_key_format_fn(reference_genome)
        found = {variant_format(v): v.found_ad_lengths for v in offending_rows}
        msg = f'Found variants with unequal Allele Depth array lengths over samples (first 10, if applicable): {found}'
        raise SeqrValidationError(msg)
69+
70+
5171
def validate_no_duplicate_variants(
5272
t: hl.Table | hl.MatrixTable,
5373
reference_genome: ReferenceGenome,

v03_pipeline/lib/misc/validation_test.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
7+
validate_allele_depth_length,
78
validate_allele_type,
89
validate_expected_contig_frequency,
910
validate_imported_field_types,
@@ -122,6 +123,63 @@ def test_validate_allele_type(self) -> None:
122123
DatasetType.SNV_INDEL,
123124
)
124125

126+
def test_validate_allele_depth_length(self) -> None:
    """Variants whose AD lengths differ between samples must be reported.

    Four chr1 variants (positions 1-4) are built for two samples. The AD
    arrays at positions 2 and 3 have different lengths across the samples,
    so those two variants are expected in the error message; positions 1
    and 4 are consistent.
    """
    loci = [
        hl.Locus(contig='chr1', position=position, reference_genome='GRCh38')
        for position in range(1, 5)
    ]
    alleles = [
        ['A', 'T'],
        # NB: star alleles should pass through this validation just fine,
        # but are eventually filtered out upstream.
        ['A', 'TC', 'TG'],
        ['A', 'TTT'],
        ['A', 'CCC'],
    ]
    # One inner list per variant, holding one AD array per sample.
    allele_depths = [
        [[1, 0], [1, 0]],
        [[1], [1, 0, 1]],
        [[1, 0], [1]],
        [[1, 0], [1, 0]],
    ]
    mt = hl.MatrixTable.from_parts(
        rows={'locus': loci, 'alleles': alleles},
        cols={'s': ['sample_1', 'sample_2']},
        entries={'AD': allele_depths},
    )
    mt = mt.key_rows_by('locus', 'alleles').key_cols_by('s')
    expected_message = "Found variants with unequal Allele Depth array lengths over samples \\(first 10, if applicable\\): \\{'1-2-A-TC-TG': \\{1, 3\\}, '1-3-A-TTT': \\{1, 2\\}\\}"
    with self.assertRaisesRegex(SeqrValidationError, expected_message):
        validate_allele_depth_length(
            mt,
            ReferenceGenome.GRCh38,
            DatasetType.SNV_INDEL,
        )
182+
125183
def test_validate_imputed_sex_ploidy(self) -> None:
126184
female_sample = 'HG00731_1'
127185
male_sample_1 = 'HG00732_1'

v03_pipeline/lib/model/dataset_type.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def table_key_format_fn(
3636
if self in {DatasetType.GCNV, DatasetType.SV}:
3737
return lambda s: s.variant_id
3838
return (
39-
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
39+
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{"-".join(s.alleles)}'
4040
)
4141

4242
@property

v03_pipeline/lib/tasks/validate_callset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
7+
validate_allele_depth_length,
78
validate_allele_type,
89
validate_expected_contig_frequency,
910
validate_imputed_sex_ploidy,
@@ -123,6 +124,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
123124
)
124125
validation_dependencies = self.get_validation_dependencies()
125126
for validation_f in [
127+
validate_allele_depth_length,
126128
validate_allele_type,
127129
validate_imputed_sex_ploidy,
128130
validate_no_duplicate_variants,

0 commit comments

Comments
 (0)