Skip to content

Commit 9e19acb

Browse files
authored
Benb/vep annotations (#788)
* vep config updates * vep work * more vep * Rename mock data * remove am_class * fix all the types * add utr annotation * add check_ref, fix fields test * update * remove maxDiff * update schema again * A few bugfixes * Fix type * rename the constant * large batch of new changes * add as annotations * add more new fields to mock data * format * ws * ruff * fix test for new structure * ruff * more annotations work * use correct mock data * cleanup * allow mito to not use new select * refactor * cleanup * annotations tests correct! * another test * fix missing slash * some small bugfixes * regulatory biotypes are disjoint * ruff format * unused * fix enums * Update vep.py * Update vep_test.py * merged * move tests * remove merged references * remove transcript_rank * ruff * Add ensembl to refseq transcript id mapping. (#801) * rename function * kickstarter * annotations * add mock * ruff * Fix merge issue * ruff
1 parent 686c2cf commit 9e19acb

21 files changed

+1513
-298
lines changed

v03_pipeline/bin/vep-110-GRCh38.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/A
4747

4848
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ &
4949

50-
# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_merged_vep_110_${ASSEMBLY}.tar.gz
50+
# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz
5151
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &
5252

5353
# Generated with:

v03_pipeline/lib/annotations/enums.py

Lines changed: 37 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,5 @@
1-
from __future__ import annotations
2-
3-
from typing import TYPE_CHECKING
4-
51
import hail as hl
62

7-
if TYPE_CHECKING:
8-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
9-
103
BIOTYPES = [
114
'IG_C_gene',
125
'IG_D_gene',
@@ -77,25 +70,38 @@
7770
'disrupted_domain',
7871
'vaultRNA/vault_RNA',
7972
'vaultRNA',
73+
'vault_RNA',
8074
'bidirectional_promoter_lncRNA',
8175
'3prime_overlapping_ncrna',
8276
]
8377

84-
CONSEQUENCE_TERMS = [
78+
REGULATORY_BIOTYPES = [
79+
'enhancer',
80+
'promoter',
81+
'CTCF_binding_site',
82+
'TF_binding_site',
83+
'open_chromatin_region',
84+
]
85+
86+
TRANSCRIPT_CONSEQUENCE_TERMS = [
8587
'transcript_ablation',
8688
'splice_acceptor_variant',
8789
'splice_donor_variant',
8890
'stop_gained',
8991
'frameshift_variant',
9092
'stop_lost',
91-
'start_lost', # new in v81
92-
'initiator_codon_variant', # deprecated
93+
'start_lost',
9394
'transcript_amplification',
95+
'feature_elongation',
96+
'feature_truncation',
9497
'inframe_insertion',
9598
'inframe_deletion',
9699
'missense_variant',
97-
'protein_altering_variant', # new in v79
100+
'protein_altering_variant',
101+
'splice_donor_5th_base_variant',
98102
'splice_region_variant',
103+
'splice_donor_region_variant',
104+
'splice_polypyrimidine_tract_variant',
99105
'incomplete_terminal_codon_variant',
100106
'start_retained_variant',
101107
'stop_retained_variant',
@@ -105,22 +111,37 @@
105111
'5_prime_UTR_variant',
106112
'3_prime_UTR_variant',
107113
'non_coding_transcript_exon_variant',
108-
'non_coding_exon_variant', # deprecated
109114
'intron_variant',
110115
'NMD_transcript_variant',
111116
'non_coding_transcript_variant',
112-
'nc_transcript_variant', # deprecated
117+
'coding_transcript_variant',
113118
'upstream_gene_variant',
114119
'downstream_gene_variant',
120+
'intergenic_variant',
121+
'sequence_variant',
122+
]
123+
124+
MOTIF_CONSEQUENCE_TERMS = [
115125
'TFBS_ablation',
116126
'TFBS_amplification',
117127
'TF_binding_site_variant',
128+
'TFBS_fusion',
129+
'TFBS_translocation',
130+
]
131+
132+
REGULATORY_CONSEQUENCE_TERMS = [
118133
'regulatory_region_ablation',
119134
'regulatory_region_amplification',
120-
'feature_elongation',
121135
'regulatory_region_variant',
122-
'feature_truncation',
123-
'intergenic_variant',
136+
'regulatory_region_fusion',
137+
]
138+
139+
FIVEUTR_CONSEQUENCES = [
140+
'5_prime_UTR_premature_start_codon_gain_variant', # uAUG_gained
141+
'5_prime_UTR_premature_start_codon_loss_variant', # uAUG_lost
142+
'5_prime_UTR_stop_codon_gain_variant', # uSTOP_gained
143+
'5_prime_UTR_stop_codon_loss_variant', # uSTOP_lost
144+
'5_prime_UTR_uORF_frameshift_variant', # uFrameshift
124145
]
125146

126147
LOF_FILTERS = [
@@ -219,50 +240,3 @@
219240
CLINVAR_PATHOGENICITIES_LOOKUP = hl.dict(
220241
hl.enumerate(CLINVAR_PATHOGENICITIES, index_first=False),
221242
)
222-
223-
224-
def annotate_enums(
225-
ht: hl.Table,
226-
reference_genome: ReferenceGenome,
227-
dataset_type: DatasetType,
228-
) -> hl.Table:
229-
formatting_annotation_names = {
230-
fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome)
231-
}
232-
if 'sorted_transcript_consequences' in formatting_annotation_names:
233-
ht = ht.annotate_globals(
234-
enums=ht.enums.annotate(
235-
sorted_transcript_consequences=hl.Struct(
236-
biotype=BIOTYPES,
237-
consequence_term=CONSEQUENCE_TERMS,
238-
lof_filter=LOF_FILTERS,
239-
),
240-
),
241-
)
242-
if 'mitotip' in formatting_annotation_names:
243-
ht = ht.annotate_globals(
244-
enums=ht.enums.annotate(
245-
mitotip=hl.Struct(
246-
trna_prediction=MITOTIP_PATHOGENICITIES,
247-
),
248-
),
249-
)
250-
if 'sv_type_id' in formatting_annotation_names:
251-
ht = ht.annotate_globals(
252-
enums=ht.enums.annotate(
253-
sv_type=SV_TYPES,
254-
),
255-
)
256-
if 'sv_type_detail_id' in formatting_annotation_names:
257-
ht = ht.annotate_globals(
258-
enums=ht.enums.annotate(sv_type_detail=SV_TYPE_DETAILS),
259-
)
260-
if 'sorted_gene_consequences' in formatting_annotation_names:
261-
ht = ht.annotate_globals(
262-
enums=ht.enums.annotate(
263-
sorted_gene_consequences=hl.Struct(
264-
major_consequence=SV_CONSEQUENCE_RANKS,
265-
),
266-
),
267-
)
268-
return ht

v03_pipeline/lib/annotations/expression_helpers.py

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
import hail as hl
22

3-
from v03_pipeline.lib.annotations.enums import CONSEQUENCE_TERMS
3+
from v03_pipeline.lib.annotations.enums import TRANSCRIPT_CONSEQUENCE_TERMS
44

5-
CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
6-
hl.enumerate(CONSEQUENCE_TERMS, index_first=False),
5+
TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
6+
hl.enumerate(TRANSCRIPT_CONSEQUENCE_TERMS, index_first=False),
77
)
88
HGVSC_CONSEQUENCES = hl.set(
99
['splice_donor_variant', 'splice_acceptor_variant', 'splice_region_variant'],
1010
)
11-
OMIT_CONSEQUENCE_TERMS = [
11+
OMIT_TRANSCRIPT_CONSEQUENCE_TERMS = [
1212
'upstream_gene_variant',
1313
'downstream_gene_variant',
1414
]
@@ -124,7 +124,7 @@ def get_expr_for_xpos(locus: hl.expr.LocusExpression) -> hl.expr.Int64Expression
124124
def get_expr_for_vep_sorted_transcript_consequences_array(
125125
vep_root,
126126
include_coding_annotations=True,
127-
omit_consequences=OMIT_CONSEQUENCE_TERMS,
127+
omit_consequences=OMIT_TRANSCRIPT_CONSEQUENCE_TERMS,
128128
):
129129
"""Sort transcripts by 3 properties:
130130
@@ -193,7 +193,7 @@ def get_expr_for_vep_sorted_transcript_consequences_array(
193193
c.consequence_terms.size() > 0,
194194
hl.sorted(
195195
c.consequence_terms,
196-
key=lambda t: CONSEQUENCE_TERM_RANK_LOOKUP.get(t),
196+
key=lambda t: TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(t),
197197
)[0],
198198
hl.null(hl.tstr),
199199
),
@@ -205,24 +205,30 @@ def get_expr_for_vep_sorted_transcript_consequences_array(
205205
category=(
206206
hl.case()
207207
.when(
208-
CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
209-
<= CONSEQUENCE_TERM_RANK_LOOKUP.get('frameshift_variant'),
208+
TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
209+
<= TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
210+
'frameshift_variant',
211+
),
210212
'lof',
211213
)
212214
.when(
213-
CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
214-
<= CONSEQUENCE_TERM_RANK_LOOKUP.get('missense_variant'),
215+
TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
216+
<= TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
217+
'missense_variant',
218+
),
215219
'missense',
216220
)
217221
.when(
218-
CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
219-
<= CONSEQUENCE_TERM_RANK_LOOKUP.get('synonymous_variant'),
222+
TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
223+
<= TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
224+
'synonymous_variant',
225+
),
220226
'synonymous',
221227
)
222228
.default('other')
223229
),
224230
hgvs=_get_expr_for_formatted_hgvs(c),
225-
major_consequence_rank=CONSEQUENCE_TERM_RANK_LOOKUP.get(
231+
major_consequence_rank=TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
226232
c.major_consequence,
227233
),
228234
),

v03_pipeline/lib/annotations/fields_test.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
1313
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1414
from v03_pipeline.lib.vep import run_vep
15-
from v03_pipeline.var.test.vep.mock_vep_data import MOCK_VEP_DATA
15+
from v03_pipeline.var.test.vep.mock_vep_data import MOCK_37_VEP_DATA, MOCK_38_VEP_DATA
1616

1717
TEST_COMBINED_1 = 'v03_pipeline/var/test/reference_data/test_combined_1.ht'
1818
TEST_INTERVAL_1 = 'v03_pipeline/var/test/reference_data/test_interval_1.ht'
@@ -34,23 +34,20 @@ def setUp(self) -> None:
3434
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
3535
@patch('v03_pipeline.lib.vep.hl.vep')
3636
def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> None:
37-
ht = hl.read_table(TEST_COMBINED_1)
38-
mock_vep.return_value = ht.annotate(vep=MOCK_VEP_DATA)
3937
mock_validate.return_value = None
40-
ht = run_vep(
41-
ht,
42-
DatasetType.SNV_INDEL,
43-
ReferenceGenome.GRCh38,
44-
)
38+
ht = hl.read_table(TEST_COMBINED_1)
4539
ht = ht.annotate(rsid='abcd')
4640
for reference_genome, expected_fields in [
4741
(
4842
ReferenceGenome.GRCh38,
4943
[
44+
'check_ref',
5045
'screen',
5146
'gnomad_non_coding_constraint',
5247
'rg37_locus',
5348
'rsid',
49+
'sorted_motif_feature_consequences',
50+
'sorted_regulatory_feature_consequences',
5451
'sorted_transcript_consequences',
5552
'variant_id',
5653
'xpos',
@@ -66,6 +63,16 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non
6663
],
6764
),
6865
]:
66+
mock_vep.return_value = ht.annotate(
67+
vep=MOCK_37_VEP_DATA
68+
if reference_genome == ReferenceGenome.GRCh37
69+
else MOCK_38_VEP_DATA,
70+
)
71+
ht = run_vep(
72+
ht,
73+
DatasetType.SNV_INDEL,
74+
reference_genome,
75+
)
6976
self.assertCountEqual(
7077
list(
7178
get_fields(
@@ -87,6 +94,15 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non
8794
)
8895
if rdc.requires_annotation
8996
},
97+
**(
98+
{
99+
'gencode_ensembl_to_refseq_id_mapping': hl.dict(
100+
{'a': 'b'},
101+
),
102+
}
103+
if reference_genome == ReferenceGenome.GRCh38
104+
else {}
105+
),
90106
dataset_type=DatasetType.SNV_INDEL,
91107
reference_genome=reference_genome,
92108
liftover_ref_path=LIFTOVER,

v03_pipeline/lib/annotations/misc.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.annotations.enums import (
4+
BIOTYPES,
5+
FIVEUTR_CONSEQUENCES,
6+
LOF_FILTERS,
7+
MITOTIP_PATHOGENICITIES,
8+
MOTIF_CONSEQUENCE_TERMS,
9+
REGULATORY_BIOTYPES,
10+
REGULATORY_CONSEQUENCE_TERMS,
11+
SV_CONSEQUENCE_RANKS,
12+
SV_TYPE_DETAILS,
13+
SV_TYPES,
14+
TRANSCRIPT_CONSEQUENCE_TERMS,
15+
)
16+
from v03_pipeline.lib.model import DatasetType
17+
from v03_pipeline.lib.model.definitions import ReferenceGenome
18+
19+
20+
def annotate_enums(
21+
ht: hl.Table,
22+
reference_genome: ReferenceGenome,
23+
dataset_type: DatasetType,
24+
) -> hl.Table:
25+
formatting_annotation_names = {
26+
fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome)
27+
}
28+
if 'sorted_motif_feature_consequences' in formatting_annotation_names:
29+
ht = ht.annotate_globals(
30+
enums=ht.enums.annotate(
31+
sorted_motif_feature_consequences=hl.Struct(
32+
consequence_term=MOTIF_CONSEQUENCE_TERMS,
33+
),
34+
),
35+
)
36+
if 'sorted_regulatory_feature_consequences' in formatting_annotation_names:
37+
ht = ht.annotate_globals(
38+
enums=ht.enums.annotate(
39+
sorted_regulatory_feature_consequences=hl.Struct(
40+
biotype=REGULATORY_BIOTYPES,
41+
consequence_term=REGULATORY_CONSEQUENCE_TERMS,
42+
),
43+
),
44+
)
45+
if 'sorted_transcript_consequences' in formatting_annotation_names:
46+
ht = ht.annotate_globals(
47+
enums=ht.enums.annotate(
48+
sorted_transcript_consequences=hl.Struct(
49+
biotype=BIOTYPES,
50+
consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS,
51+
**(
52+
{
53+
'loftee': hl.Struct(
54+
lof_filter=LOF_FILTERS,
55+
),
56+
'utrannotator': hl.Struct(
57+
fiveutr_consequence=FIVEUTR_CONSEQUENCES,
58+
),
59+
}
60+
if reference_genome == ReferenceGenome.GRCh38
61+
and dataset_type == DatasetType.SNV_INDEL
62+
else {
63+
'lof_filter': LOF_FILTERS,
64+
}
65+
),
66+
),
67+
),
68+
)
69+
if 'mitotip' in formatting_annotation_names:
70+
ht = ht.annotate_globals(
71+
enums=ht.enums.annotate(
72+
mitotip=hl.Struct(
73+
trna_prediction=MITOTIP_PATHOGENICITIES,
74+
),
75+
),
76+
)
77+
if 'sv_type_id' in formatting_annotation_names:
78+
ht = ht.annotate_globals(
79+
enums=ht.enums.annotate(
80+
sv_type=SV_TYPES,
81+
),
82+
)
83+
if 'sv_type_detail_id' in formatting_annotation_names:
84+
ht = ht.annotate_globals(
85+
enums=ht.enums.annotate(sv_type_detail=SV_TYPE_DETAILS),
86+
)
87+
if 'sorted_gene_consequences' in formatting_annotation_names:
88+
ht = ht.annotate_globals(
89+
enums=ht.enums.annotate(
90+
sorted_gene_consequences=hl.Struct(
91+
major_consequence=SV_CONSEQUENCE_RANKS,
92+
),
93+
),
94+
)
95+
return ht

0 commit comments

Comments
 (0)