Skip to content

Commit cccf45a

Browse files
authored
Dev (#931)
* Caid refactor (#927)
* CAID refactor
* lint
* lint
* remove mock
* string defaults
* format
* ruff
* add mock values
* missed a test
* ruff
* Improve allele type validation (#925)
* Improve allele type validation
* ruff
* cleanup
1 parent b572fb5 commit cccf45a

File tree

3 files changed

+37
-3
lines changed

3 files changed

+37
-3
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,11 @@ def validate_allele_type(
6969
collected_alleles = sorted(
7070
[tuple(x) for x in ht.aggregate(hl.agg.collect_as_set(ht.alleles))],
7171
)
72-
msg = f'Alleles with invalid AlleleType are present in the callset: {collected_alleles}'
72+
# Handle case where all invalid alleles are NON_REF, indicating a gvcf:
73+
if all('<NON_REF>' in alleles for alleles in collected_alleles):
74+
msg = 'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.'
75+
raise SeqrValidationError(msg)
76+
msg = f'Alleles with invalid AlleleType are present in the callset: {collected_alleles[:10]}'
7377
raise SeqrValidationError(msg)
7478

7579

@@ -85,7 +89,7 @@ def validate_no_duplicate_variants(
8589
ht = ht.select()
8690
if ht.count() > 0:
8791
variant_format = dataset_type.table_key_format_fn(reference_genome)
88-
msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.collect()]}'
92+
msg = f'Variants are present multiple times in the callset: {[variant_format(v) for v in ht.collect()][:10]}'
8993
raise SeqrValidationError(msg)
9094

9195

v03_pipeline/lib/misc/validation_test.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -81,6 +81,36 @@ def test_validate_allele_type(self) -> None:
8181
DatasetType.SNV_INDEL,
8282
)
8383

84+
mt = hl.MatrixTable.from_parts(
85+
rows={
86+
'locus': [
87+
hl.Locus(
88+
contig='chr1',
89+
position=1,
90+
reference_genome='GRCh38',
91+
),
92+
hl.Locus(
93+
contig='chr1',
94+
position=2,
95+
reference_genome='GRCh38',
96+
),
97+
],
98+
'alleles': [
99+
['C', '<NON_REF>'],
100+
['A', '<NON_REF>'],
101+
],
102+
},
103+
cols={'s': ['sample_1']},
104+
entries={'HL': [[0.0], [0.0]]},
105+
).key_rows_by('locus', 'alleles')
106+
self.assertRaisesRegex(
107+
SeqrValidationError,
108+
'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.',
109+
validate_allele_type,
110+
mt,
111+
DatasetType.SNV_INDEL,
112+
)
113+
84114
@patch('v03_pipeline.lib.misc.validation.Env')
85115
def test_validate_imputed_sex_ploidy(self, mock_env: Mock) -> None:
86116
mock_env.CHECK_SEX_AND_RELATEDNESS = True

v03_pipeline/lib/tasks/validate_callset_test.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,7 @@ def test_validate_callset_multiple_exceptions(
8383
json.load(f),
8484
{
8585
'error_messages': [
86-
"Alleles with invalid AlleleType are present in the callset: [('G', '<NON_REF>')]",
86+
'Alleles with invalid allele <NON_REF> are present in the callset. This appears to be a GVCF containing records for sites with no variants.',
8787
"Variants are present multiple times in the callset: ['1-902088-G-A']",
8888
'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',
8989
'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',

0 commit comments

Comments
 (0)