Skip to content

Commit d1519f0

Browse files
authored
Add new SV annotations for VCF export. (#857)
* Add SV annotations * ruff * push * ruff * Update update_variant_annotations_table_with_new_samples_test.py
1 parent ffa4313 commit d1519f0

File tree

4 files changed

+233
-0
lines changed

4 files changed

+233
-0
lines changed

v03_pipeline/lib/annotations/sv.py

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,41 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression:
8181
return ht.alleles[1].replace('[<>]', '').split(':', 2)
8282

8383

84+
def alleles(ht: hl.Table, **_: Any) -> hl.ArrayExpression:
    """Build the two-element ``alleles`` array for an SV row.

    The first element is always the literal ``'N'``.  The second is a
    symbolic allele string: ``<TYPE:DETAIL>`` when ``sv_type_detail_id`` is
    defined and the SV type is not ``CPX``, otherwise ``<TYPE>``.

    Args:
        ht: Table exposing ``sv_type_id`` and ``sv_type_detail_id``.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        An array expression ``['N', <symbolic allele>]``.
    """
    sv_type = hl.array(SV_TYPES)[ht.sv_type_id]
    sv_type_detail = hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id]
    # CPX rows never get the detail suffix, even when a detail id is present.
    use_detail = hl.is_defined(ht.sv_type_detail_id) & (sv_type != 'CPX')
    alt_allele = hl.if_else(
        use_detail,
        hl.format('<%s:%s>', sv_type, sv_type_detail),
        hl.format('<%s>', sv_type),
    )
    return hl.array(['N', alt_allele])
102+
103+
104+
def info(ht: hl.Table, **_: Any) -> hl.StructExpression:
    """Build the ``info`` struct for an SV row.

    Args:
        ht: Table exposing ``algorithms``, ``start_locus``, ``end_locus``,
            ``sv_type_id`` and ``sv_len``.
        **_: Extra keyword arguments are accepted and ignored.

    Returns:
        A struct expression with the fields ``ALGORITHMS``, ``END``,
        ``CHR2``, ``END2``, ``SVTYPE`` and ``SVLEN``, in that order.
    """
    # Use the expression constructor `hl.struct` rather than the Python-side
    # `hl.Struct` value class, so the result is the `StructExpression` the
    # signature declares.
    return hl.struct(
        ALGORITHMS=ht.algorithms,
        # NOTE(review): END is read from the *start* locus, while CHR2/END2
        # come from the end locus -- confirm this is the intended encoding.
        END=ht.start_locus.position,
        CHR2=ht.end_locus.contig,
        END2=ht.end_locus.position,
        SVTYPE=hl.array(SV_TYPES)[ht.sv_type_id],
        SVLEN=ht.sv_len,
    )
113+
114+
115+
def locus(ht: hl.Table, **_: Any) -> hl.LocusExpression:
    """Return the row's ``start_locus`` as the locus.

    Extra keyword arguments are accepted and ignored.
    """
    start = ht.start_locus
    return start
117+
118+
84119
def algorithms(ht: hl.Table, **_: Any) -> hl.Expression:
    """Join the values of ``info.ALGORITHMS`` into one comma-separated string.

    Extra keyword arguments are accepted and ignored.
    """
    separator = hl.str(',')
    return separator.join(ht['info.ALGORITHMS'])
86121

@@ -205,6 +240,10 @@ def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression:
205240
return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE']))
206241

207242

243+
def sv_len(ht: hl.Table, **_: Any) -> hl.Expression:
    """Return the ``info.SVLEN`` field of ``ht`` unchanged.

    Extra keyword arguments are accepted and ignored.
    """
    length = ht['info.SVLEN']
    return length
245+
246+
208247
def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression:
    """Look up ``SV_TYPES_LOOKUP`` by the first element of ``_sv_types(ht)``.

    Extra keyword arguments are accepted and ignored.
    """
    primary_type = _sv_types(ht)[0]
    return SV_TYPES_LOOKUP[primary_type]
210249

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import unittest
2+
3+
import hail as hl
4+
5+
from v03_pipeline.lib.annotations.fields import get_fields
6+
from v03_pipeline.lib.model import DatasetType
7+
8+
9+
class SVTest(unittest.TestCase):
    """Tests for the SV annotation functions used for VCF export."""

    def test_sv_export_annotations(self) -> None:
        """Apply the SV export annotation fns to four rows and check the result."""

        def grch38(contig: str, position: int) -> hl.Locus:
            # Every locus in this test lives on GRCh38.
            return hl.Locus(
                contig=contig,
                position=position,
                reference_genome='GRCh38',
            )

        def input_row(  # noqa: PLR0913
            row_id,
            end,
            start,
            length,
            type_id,
            detail_id,
        ) -> hl.Struct:
            return hl.Struct(
                id=row_id,
                algorithms='manta',
                end_locus=end,
                start_locus=start,
                sv_len=length,
                sv_type_id=type_id,
                sv_type_detail_id=detail_id,
            )

        def expected_row(  # noqa: PLR0913
            row_id,
            start,
            alt,
            chr2,
            end2,
            sv_type,
            length,
        ) -> hl.Struct:
            return hl.Struct(
                id=row_id,
                locus=start,
                alleles=['N', alt],
                info=hl.Struct(
                    ALGORITHMS='manta',
                    END=start.position,
                    CHR2=chr2,
                    END2=end2,
                    SVTYPE=sv_type,
                    SVLEN=length,
                ),
            )

        rows = [
            input_row(0, grch38('chr5', 20404), grch38('chr1', 180928), 123, 2, None),
            input_row(1, grch38('chr1', 789481), grch38('chr1', 789481), 245, 2, None),
            input_row(2, grch38('chr1', 6559723), grch38('chr1', 6558902), 245, 3, 2),
            input_row(3, grch38('chr1', 6559723), grch38('chr1', 6558902), 245, 7, 6),
        ]
        schema = hl.tstruct(
            id=hl.tint32,
            algorithms=hl.tstr,
            end_locus=hl.tlocus('GRCh38'),
            start_locus=hl.tlocus('GRCh38'),
            sv_len=hl.tint32,
            sv_type_id=hl.tint32,
            sv_type_detail_id=hl.tint32,
        )
        ht = hl.Table.parallelize(rows, schema, key='id')
        export_fields = get_fields(
            ht,
            DatasetType.SV.export_vcf_annotation_fns,
        )
        ht = ht.select(**export_fields)

        expected = [
            expected_row(0, grch38('chr1', 180928), '<BND>', 'chr5', 20404, 'BND', 123),
            expected_row(1, grch38('chr1', 789481), '<BND>', 'chr1', 789481, 'BND', 245),
            expected_row(2, grch38('chr1', 6558902), '<CPX>', 'chr1', 6559723, 'CPX', 245),
            expected_row(
                3,
                grch38('chr1', 6558902),
                '<INS:ME:SVA>',
                'chr1',
                6559723,
                'INS',
                245,
            ),
        ]
        self.assertEqual(ht.collect(), expected)

v03_pipeline/lib/model/dataset_type.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,7 @@ def row_fields(
116116
'info.N_HET': hl.tint32,
117117
'info.N_HOMALT': hl.tint32,
118118
'info.StrVCTVRE': hl.tstr,
119+
'info.SVLEN': hl.tint32,
119120
**sv.CONSEQ_PREDICTED_GENE_COLS,
120121
},
121122
DatasetType.GCNV: {
@@ -239,6 +240,7 @@ def formatting_annotation_fns(
239240
sv.strvctvre,
240241
sv.sv_type_id,
241242
sv.sv_type_detail_id,
243+
sv.sv_len,
242244
shared.xpos,
243245
],
244246
DatasetType.GCNV: [
@@ -335,3 +337,13 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
335337
@property
def should_send_to_allele_registry(self):
    """True only when this dataset type is ``DatasetType.SNV_INDEL``."""
    is_snv_indel = self == DatasetType.SNV_INDEL
    return is_snv_indel
340+
341+
@property
def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]:
    """Annotation functions that build the VCF export fields.

    Only ``DatasetType.SV`` has an entry (``sv.locus``, ``sv.alleles``,
    ``sv.info``); accessing this on any other dataset type raises
    ``KeyError``.
    """
    fns_by_dataset_type = {
        DatasetType.SV: [
            sv.locus,
            sv.alleles,
            sv.info,
        ],
    }
    return fns_by_dataset_type[self]

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,7 @@ def test_sv_update_vat(
11721172
reference_genome='GRCh38',
11731173
),
11741174
strvctvre=hl.Struct(score=None),
1175+
sv_len=-1,
11751176
sv_type_id=2,
11761177
sv_type_detail_id=None,
11771178
xpos=1000180928,
@@ -1214,6 +1215,7 @@ def test_sv_update_vat(
12141215
reference_genome='GRCh38',
12151216
),
12161217
strvctvre=hl.Struct(score=None),
1218+
sv_len=223225007,
12171219
sv_type_id=2,
12181220
sv_type_detail_id=None,
12191221
xpos=1000789481,
@@ -1266,6 +1268,7 @@ def test_sv_update_vat(
12661268
reference_genome='GRCh38',
12671269
),
12681270
strvctvre=hl.Struct(score=None),
1271+
sv_len=821,
12691272
sv_type_id=3,
12701273
sv_type_detail_id=2,
12711274
xpos=1006558902,
@@ -1321,6 +1324,7 @@ def test_sv_update_vat(
13211324
reference_genome='GRCh38',
13221325
),
13231326
strvctvre=hl.Struct(score=None),
1327+
sv_len=534718,
13241328
sv_type_id=3,
13251329
sv_type_detail_id=9,
13261330
xpos=1180540234,
@@ -1373,6 +1377,7 @@ def test_sv_update_vat(
13731377
reference_genome='GRCh38',
13741378
),
13751379
strvctvre=hl.Struct(score=None),
1380+
sv_len=841,
13761381
sv_type_id=3,
13771382
sv_type_detail_id=12,
13781383
xpos=1016088760,
@@ -1430,6 +1435,7 @@ def test_sv_update_vat(
14301435
reference_genome='GRCh38',
14311436
),
14321437
strvctvre=hl.Struct(score=None),
1438+
sv_len=52921,
14331439
sv_type_id=3,
14341440
sv_type_detail_id=13,
14351441
xpos=1021427498,
@@ -1471,6 +1477,7 @@ def test_sv_update_vat(
14711477
reference_genome='GRCh38',
14721478
),
14731479
strvctvre=hl.Struct(score=None),
1480+
sv_len=14532,
14741481
sv_type_id=5,
14751482
sv_type_detail_id=None,
14761483
xpos=1000413968,
@@ -1508,6 +1515,7 @@ def test_sv_update_vat(
15081515
reference_genome='GRCh38',
15091516
),
15101517
strvctvre=hl.Struct(score=None),
1518+
sv_len=6000,
15111519
sv_type_id=6,
15121520
sv_type_detail_id=None,
15131521
xpos=1000257666,
@@ -1549,6 +1557,7 @@ def test_sv_update_vat(
15491557
reference_genome='GRCh38',
15501558
),
15511559
strvctvre=hl.Struct(score=None),
1560+
sv_len=955,
15521561
sv_type_id=7,
15531562
sv_type_detail_id=6,
15541563
xpos=1017465707,
@@ -1593,6 +1602,7 @@ def test_sv_update_vat(
15931602
reference_genome='GRCh38',
15941603
),
15951604
strvctvre=hl.Struct(score=hl.eval(hl.float32(0.1255))),
1605+
sv_len=298,
15961606
sv_type_id=7,
15971607
sv_type_detail_id=4,
15981608
xpos=1004228405,
@@ -1634,6 +1644,7 @@ def test_sv_update_vat(
16341644
reference_genome='GRCh38',
16351645
),
16361646
strvctvre=hl.Struct(score=None),
1647+
sv_len=5520,
16371648
sv_type_id=7,
16381649
sv_type_detail_id=5,
16391650
xpos=1048963084,

0 commit comments

Comments
 (0)