Skip to content

Commit 19b81ca

Browse files
committed
handle set of dataset types during allele type validation
1 parent bfde429 commit 19b81ca

File tree

3 files changed

+27
-19
lines changed

3 files changed

+27
-19
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 18 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from collections.abc import Iterable
12
from typing import Any
23

34
import hail as hl
@@ -27,25 +28,29 @@ def __init__(
2728

2829
def validate_allele_type(
    t: hl.Table | hl.MatrixTable,
    dataset_type: DatasetType | Iterable[DatasetType],
    **_: Any,
) -> None:
    """Raise if the callset has alleles whose type is invalid for a dataset type.

    Args:
        t: A Hail Table, or a MatrixTable whose rows table is validated.
            An ``alleles`` field with at least two elements is read from it.
        dataset_type: A single ``DatasetType`` or an iterable of them. Every
            supplied dataset type is checked against the full, unfiltered
            table.
        **_: Extra keyword arguments are accepted and ignored.

    Raises:
        SeqrValidationError: If, for any supplied dataset type, at least one
            row has an allele type contained in that dataset type's
            ``invalid_allele_types``. A dedicated message is used when every
            offending allele pair contains ``<NON_REF>``.
    """
    ht = t.rows() if isinstance(t, hl.MatrixTable) else t
    dataset_types = (
        [dataset_type] if isinstance(dataset_type, DatasetType) else dataset_type
    )
    for candidate_type in dataset_types:
        # Filter into a fresh local on every iteration. Reassigning ``ht``
        # here would leave an empty table after the first passing dataset
        # type, so every later dataset type would be trivially "valid".
        invalid_ht = ht.filter(
            candidate_type.invalid_allele_types.contains(
                hl.numeric_allele_type(ht.alleles[0], ht.alleles[1]),
            ),
        )
        if invalid_ht.count() > 0:
            collected_alleles = sorted(
                [
                    tuple(x)
                    for x in invalid_ht.aggregate(
                        hl.agg.collect_as_set(invalid_ht.alleles),
                    )
                ],
            )
            # Handle case where all invalid alleles are NON_REF, indicating a gvcf:
            if all('<NON_REF>' in alleles for alleles in collected_alleles):
                msg = 'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.'
                raise SeqrValidationError(msg)
            # Only the first ten (sorted) offenders are reported to keep the
            # error message bounded.
            msg = f'Alleles with invalid AlleleType are present in the callset: {collected_alleles[:10]}'
            raise SeqrValidationError(msg)
4954

5055

5156
def validate_no_duplicate_variants(

v03_pipeline/lib/misc/validation_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def test_validate_allele_type(self) -> None:
8585
"Alleles with invalid AlleleType are present in the callset: \\[\\('A', '-'\\), \\('A', '<NON_REF>'\\)\\]",
8686
validate_allele_type,
8787
mt,
88-
DatasetType.SNV_INDEL,
88+
[DatasetType.SNV_INDEL],
8989
)
9090

9191
mt = (
@@ -119,7 +119,7 @@ def test_validate_allele_type(self) -> None:
119119
'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.',
120120
validate_allele_type,
121121
mt,
122-
DatasetType.SNV_INDEL,
122+
[DatasetType.SNV_INDEL],
123123
)
124124

125125
def test_validate_imputed_sex_ploidy(self) -> None:

v03_pipeline/lib/reference_datasets/reference_dataset.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,12 @@ def version(self, reference_genome: ReferenceGenome) -> str:
104104
)
105105
return version
106106

107+
def dataset_types(
    self,
    reference_genome: ReferenceGenome,
) -> frozenset[DatasetType]:
    """Return the dataset types configured for this dataset and genome build.

    This is a regular method, not a property: it must be called with a
    ``reference_genome`` to obtain the value. The result is looked up in
    ``CONFIG`` under this dataset, then the reference genome, then the
    ``DATASET_TYPES`` key.
    """
    genome_config = CONFIG[self][reference_genome]
    return genome_config[DATASET_TYPES]
112+
107113
@property
108114
def enums(self) -> dict | None:
109115
return CONFIG[self].get(ENUMS)
@@ -143,10 +149,7 @@ def get_ht(
143149
if enum_selects:
144150
ht = ht.transmute(**enum_selects)
145151
ht = filter_contigs(ht, reference_genome)
146-
# Reference Datasets are DatasetType agnostic, but these
147-
# methods (in theory) support SV/GCNV. SNV_INDEL
148-
# is passed as a proxy for non-SV/GCNV.
149-
validate_allele_type(ht, DatasetType.SNV_INDEL)
152+
validate_allele_type(ht, self.dataset_types)
150153
validate_no_duplicate_variants(ht, reference_genome, DatasetType.SNV_INDEL)
151154
# NB: we do not filter with "filter" here
152155
# ReferenceDatasets are DatasetType agnostic and that

0 commit comments

Comments
 (0)