Skip to content

Commit a5330e5

Browse files
committed
this is a cleaner approach
1 parent 19b81ca commit a5330e5

File tree

3 files changed

+17
-21
lines changed

3 files changed

+17
-21
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
from collections.abc import Iterable
21
from typing import Any
32

43
import hail as hl
@@ -28,29 +27,25 @@ def __init__(
2827

2928
def validate_allele_type(
3029
t: hl.Table | hl.MatrixTable,
31-
dataset_type: DatasetType | Iterable[DatasetType],
30+
dataset_type: DatasetType,
3231
**_: Any,
3332
) -> None:
3433
ht = t.rows() if isinstance(t, hl.MatrixTable) else t
35-
dataset_types = (
36-
[dataset_type] if isinstance(dataset_type, DatasetType) else dataset_type
34+
ht = ht.filter(
35+
dataset_type.invalid_allele_types.contains(
36+
hl.numeric_allele_type(ht.alleles[0], ht.alleles[1]),
37+
),
3738
)
38-
for dataset_type in dataset_types:
39-
ht = ht.filter(
40-
dataset_type.invalid_allele_types.contains(
41-
hl.numeric_allele_type(ht.alleles[0], ht.alleles[1]),
42-
),
39+
if ht.count() > 0:
40+
collected_alleles = sorted(
41+
[tuple(x) for x in ht.aggregate(hl.agg.collect_as_set(ht.alleles))],
4342
)
44-
if ht.count() > 0:
45-
collected_alleles = sorted(
46-
[tuple(x) for x in ht.aggregate(hl.agg.collect_as_set(ht.alleles))],
47-
)
48-
# Handle case where all invalid alleles are NON_REF, indicating a gvcf:
49-
if all('<NON_REF>' in alleles for alleles in collected_alleles):
50-
msg = 'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.'
51-
raise SeqrValidationError(msg)
52-
msg = f'Alleles with invalid AlleleType are present in the callset: {collected_alleles[:10]}'
43+
# Handle case where all invalid alleles are NON_REF, indicating a gvcf:
44+
if all('<NON_REF>' in alleles for alleles in collected_alleles):
45+
msg = 'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.'
5346
raise SeqrValidationError(msg)
47+
msg = f'Alleles with invalid AlleleType are present in the callset: {collected_alleles[:10]}'
48+
raise SeqrValidationError(msg)
5449

5550

5651
def validate_no_duplicate_variants(

v03_pipeline/lib/misc/validation_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def test_validate_allele_type(self) -> None:
8585
"Alleles with invalid AlleleType are present in the callset: \\[\\('A', '-'\\), \\('A', '<NON_REF>'\\)\\]",
8686
validate_allele_type,
8787
mt,
88-
[DatasetType.SNV_INDEL],
88+
DatasetType.SNV_INDEL,
8989
)
9090

9191
mt = (
@@ -119,7 +119,7 @@ def test_validate_allele_type(self) -> None:
119119
'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.',
120120
validate_allele_type,
121121
mt,
122-
[DatasetType.SNV_INDEL],
122+
DatasetType.SNV_INDEL,
123123
)
124124

125125
def test_validate_imputed_sex_ploidy(self) -> None:

v03_pipeline/lib/reference_datasets/reference_dataset.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,8 @@ def get_ht(
149149
if enum_selects:
150150
ht = ht.transmute(**enum_selects)
151151
ht = filter_contigs(ht, reference_genome)
152-
validate_allele_type(ht, self.dataset_types)
152+
for dataset_type in self.dataset_types(reference_genome):
153+
validate_allele_type(ht, dataset_type)
153154
validate_no_duplicate_variants(ht, reference_genome, DatasetType.SNV_INDEL)
154155
# NB: we do not filter with "filter" here
155156
# ReferenceDatasets are DatasetType agnostic and that

0 commit comments

Comments
 (0)