Skip to content

Commit d5d323a

Browse files
committed
Merge remote-tracking branch 'origin/dev' into relax-ploidy-validation
2 parents 049c208 + 56ced7a commit d5d323a

File tree

4 files changed

+81
-1
lines changed

4 files changed

+81
-1
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,26 @@ def validate_allele_type(
4747
raise SeqrValidationError(msg)
4848

4949

50+
def validate_allele_depth_length(
    mt: hl.MatrixTable,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    **_: Any,
) -> None:
    """Raise if any variant has AD arrays of differing lengths across samples.

    For each row, the distinct lengths of the ``AD`` entry arrays are
    collected over all samples, with the missing length removed from the set
    (presumably produced by samples whose ``AD`` is missing). A row is
    reported when more than one distinct length remains.

    Args:
        mt: MatrixTable carrying an ``AD`` array entry field.
        reference_genome: Passed to ``dataset_type.table_key_format_fn`` to
            obtain the variant formatter used in the error message.
        dataset_type: Supplies ``table_key_format_fn``.
        **_: Additional keyword arguments are accepted and ignored.

    Raises:
        SeqrValidationError: If at least one row has more than one distinct
            AD length; the message lists up to 10 offending variants.
    """
    ht = mt.select_rows(
        found_ad_lengths=hl.agg.collect_as_set(hl.len(mt.AD)).remove(
            hl.missing(hl.tint32),
        ),
    ).rows()
    ht = ht.filter(hl.len(ht.found_ad_lengths) > 1)
    # One take(10) both detects failures and provides the rows to report,
    # instead of evaluating the pipeline once for count() and again for take().
    offending_rows = ht.take(10)
    if offending_rows:
        variant_format = dataset_type.table_key_format_fn(reference_genome)
        msg = f'Found variants with unequal Allele Depth array lengths over samples (first 10, if applicable): {({variant_format(v): v.found_ad_lengths for v in offending_rows})}'
        raise SeqrValidationError(msg)
68+
69+
5070
def validate_no_duplicate_variants(
5171
t: hl.Table | hl.MatrixTable,
5272
reference_genome: ReferenceGenome,

v03_pipeline/lib/misc/validation_test.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
7+
validate_allele_depth_length,
78
validate_allele_type,
89
validate_expected_contig_frequency,
910
validate_imported_field_types,
@@ -120,6 +121,63 @@ def test_validate_allele_type(self) -> None:
120121
DatasetType.SNV_INDEL,
121122
)
122123

124+
def test_validate_allele_depth_length(self) -> None:
    """Rows whose AD arrays differ in length across samples are reported.

    Builds a four-variant, two-sample MatrixTable in which the variants at
    positions 2 and 3 have AD arrays of different lengths between the two
    samples, and checks that exactly those two appear in the error message.
    """
    # Four consecutive loci on chr1 (positions 1 through 4).
    loci = [
        hl.Locus(
            contig='chr1',
            position=position,
            reference_genome='GRCh38',
        )
        for position in range(1, 5)
    ]
    alleles = [
        ['A', 'T'],
        # NB: star alleles should pass through this validation just fine,
        # but are eventually filtered out upstream.
        ['A', 'TC', 'TG'],
        ['A', 'TTT'],
        ['A', 'CCC'],
    ]
    # One inner list per variant, holding one AD array per sample.
    allele_depths = [
        [[1, 0], [1, 0]],
        [[1], [1, 0, 1]],  # lengths {1, 3}: expected in the error
        [[1, 0], [1]],  # lengths {1, 2}: expected in the error
        [[1, 0], [1, 0]],
    ]
    mt = hl.MatrixTable.from_parts(
        rows={'locus': loci, 'alleles': alleles},
        cols={'s': ['sample_1', 'sample_2']},
        entries={'AD': allele_depths},
    )
    mt = mt.key_rows_by('locus', 'alleles').key_cols_by('s')
    expected_error = "Found variants with unequal Allele Depth array lengths over samples \\(first 10, if applicable\\): \\{'1-2-A-TC-TG': \\{1, 3\\}, '1-3-A-TTT': \\{1, 2\\}\\}"
    with self.assertRaisesRegex(SeqrValidationError, expected_error):
        validate_allele_depth_length(
            mt,
            ReferenceGenome.GRCh38,
            DatasetType.SNV_INDEL,
        )
180+
123181
def test_validate_imported_field_types(self) -> None:
124182
mt = hl.read_matrix_table(TEST_MITO_MT)
125183
validate_imported_field_types(mt, DatasetType.MITO, {})

v03_pipeline/lib/model/dataset_type.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def table_key_format_fn(
3636
if self in {DatasetType.GCNV, DatasetType.SV}:
3737
return lambda s: s.variant_id
3838
return (
39-
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
39+
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{"-".join(s.alleles)}'
4040
)
4141

4242
@property

v03_pipeline/lib/tasks/validate_callset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
7+
validate_allele_depth_length,
78
validate_allele_type,
89
validate_expected_contig_frequency,
910
validate_no_duplicate_variants,
@@ -91,6 +92,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
9192
),
9293
)
9394
for validation_f in [
95+
validate_allele_depth_length,
9496
validate_allele_type,
9597
validate_no_duplicate_variants,
9698
validate_expected_contig_frequency,

0 commit comments

Comments
 (0)