Skip to content

Commit 0600039

Browse files
authored
Benb/validate with allele type (#785)
* Bump requirements * add validation * format
1 parent 8c94cc9 commit 0600039

File tree

5 files changed

+58
-7
lines changed

5 files changed

+58
-7
lines changed

requirements.in

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
elasticsearch==7.9.1
22
google-api-python-client>=1.8.0
3-
hail==0.2.128
3+
hail==0.2.130
44
luigi>=3.4.0
55
gnomad==0.6.4
66
google-cloud-storage>=2.14.0

requirements.txt

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ google-resumable-media==2.7.0
129129
# via google-cloud-storage
130130
googleapis-common-protos==1.61.0
131131
# via google-api-core
132-
hail==0.2.128
132+
hail==0.2.130
133133
# via -r requirements.in
134134
hdbscan==0.8.33
135135
# via gnomad
@@ -202,7 +202,7 @@ numpy==1.26.2
202202
# scipy
203203
oauthlib==3.2.2
204204
# via requests-oauthlib
205-
orjson==3.9.11
205+
orjson==3.9.10
206206
# via hail
207207
packaging==23.2
208208
# via
@@ -230,7 +230,6 @@ protobuf==3.20.2
230230
# via
231231
# google-api-core
232232
# googleapis-common-protos
233-
# hail
234233
ptyprocess==0.7.0
235234
# via pexpect
236235
pure-eval==0.2.2
@@ -252,9 +251,7 @@ pygments==2.17.2
252251
# ipython
253252
# rich
254253
pyjwt[crypto]==2.8.0
255-
# via
256-
# msal
257-
# pyjwt
254+
# via msal
258255
pyparsing==3.1.1
259256
# via httplib2
260257
pyspark==3.3.3

v03_pipeline/lib/misc/validation.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,19 @@ class SeqrValidationError(Exception):
1111
pass
1212

1313

14+
def validate_allele_type(
    mt: hl.MatrixTable,
) -> None:
    """Reject callsets containing alleles whose type Hail reports as UNKNOWN.

    Each row's first two alleles (``alleles[0]`` and ``alleles[1]``) are
    classified with ``hl.numeric_allele_type``; rows classified as
    ``AlleleType.UNKNOWN`` are gathered and reported together.

    Args:
        mt: MatrixTable whose rows carry an ``alleles`` field.

    Raises:
        SeqrValidationError: if at least one row has an UNKNOWN allele type.
            The message lists the ``alleles`` value of every offending row.
    """
    ht = mt.rows()
    ht = ht.filter(
        hl.numeric_allele_type(ht.alleles[0], ht.alleles[1])
        == hl.genetics.allele_type.AlleleType.UNKNOWN,
    )
    # Hail tables are lazy, so every action re-executes the filter. Collecting
    # once (instead of count() followed by collect()) avoids evaluating the
    # pipeline a second time when invalid alleles are present.
    unknown_alleles = ht.alleles.collect()
    if unknown_alleles:
        msg = f'Alleles with Unknown AlleleType are present in the callset: {unknown_alleles}'
        raise SeqrValidationError(msg)
25+
26+
1427
def validate_no_duplicate_variants(
1528
mt: hl.MatrixTable,
1629
) -> None:

v03_pipeline/lib/misc/validation_test.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
7+
validate_allele_type,
78
validate_expected_contig_frequency,
89
validate_imputed_sex_ploidy,
910
validate_no_duplicate_variants,
@@ -32,6 +33,44 @@ def _mt_from_contigs(contigs):
3233

3334

3435
class ValidationTest(unittest.TestCase):
36+
def test_validate_allele_type(self) -> None:
    """A row whose alt allele is '-' must make validate_allele_type raise."""
    loci = [
        hl.Locus(
            contig='chr1',
            position=position,
            reference_genome='GRCh38',
        )
        for position in (1, 2, 3)
    ]
    alleles = [
        ['A', 'T'],
        # NB: star alleles should pass through this validation just fine,
        # but are eventually filtered out upstream.
        ['A', '*'],
        ['A', '-'],
    ]
    mt = hl.MatrixTable.from_parts(
        rows={'locus': loci, 'alleles': alleles},
        cols={'s': ['sample_1']},
        entries={'HL': [[0.0], [0.0], [0.0]]},
    ).key_rows_by('locus', 'alleles')
    # Only the ['A', '-'] row is expected in the reported list.
    expected_message = "Alleles with Unknown AlleleType are present in the callset: \\[\\['A', '-'\\]\\]"
    with self.assertRaisesRegex(SeqrValidationError, expected_message):
        validate_allele_type(mt)
73+
3574
def test_validate_imputed_sex_ploidy(self) -> None:
3675
sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
3776
mt = hl.MatrixTable.from_parts(

v03_pipeline/lib/tasks/write_imported_callset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
split_multi_hts,
88
)
99
from v03_pipeline.lib.misc.validation import (
10+
validate_allele_type,
1011
validate_expected_contig_frequency,
1112
validate_imputed_sex_ploidy,
1213
validate_no_duplicate_variants,
@@ -134,6 +135,7 @@ def create_table(self) -> hl.MatrixTable:
134135
),
135136
)
136137
if self.validate and self.dataset_type.can_run_validation:
138+
validate_allele_type(mt)
137139
validate_no_duplicate_variants(mt)
138140
validate_expected_contig_frequency(mt, self.reference_genome)
139141
coding_and_noncoding_ht = hl.read_table(

0 commit comments

Comments
 (0)