broadinstitute
diff --git a/v03_pipeline/bin/vep-110-GRCh38.sh b/v03_pipeline/bin/vep-110-GRCh38.sh
Lines changed: 1 addition & 1 deletion
diff --git a/v03_pipeline/lib/annotations/enums.py b/v03_pipeline/lib/annotations/enums.py
Lines changed: 37 additions & 63 deletions
diff --git a/v03_pipeline/lib/annotations/expression_helpers.py b/v03_pipeline/lib/annotations/expression_helpers.py
Lines changed: 19 additions & 13 deletions
diff --git a/v03_pipeline/lib/annotations/fields_test.py b/v03_pipeline/lib/annotations/fields_test.py
Lines changed: 24 additions & 8 deletions
diff --git a/v03_pipeline/lib/annotations/misc.py b/v03_pipeline/lib/annotations/misc.py
Lines changed: 95 additions & 0 deletions
@@ -47,7 +47,7 @@ gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/A
 
 gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ &
 
-# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_merged_vep_110_${ASSEMBLY}.tar.gz
+# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz
 gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &
 
 # Generated with:
 
@@ -1,12 +1,5 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
 import hail as hl
 
-if TYPE_CHECKING:
-    from v03_pipeline.lib.model import DatasetType, ReferenceGenome
-
 BIOTYPES = [
     'IG_C_gene',
     'IG_D_gene',
@@ -77,25 +70,38 @@
     'disrupted_domain',
     'vaultRNA/vault_RNA',
     'vaultRNA',
+    'vault_RNA',
     'bidirectional_promoter_lncRNA',
     '3prime_overlapping_ncrna',
 ]
 
-CONSEQUENCE_TERMS = [
+REGULATORY_BIOTYPES = [  # `biotype` enum of sorted_regulatory_feature_consequences
+    'enhancer',
+    'promoter',
+    'CTCF_binding_site',
+    'TF_binding_site',
+    'open_chromatin_region',
+]
+
+TRANSCRIPT_CONSEQUENCE_TERMS = [
     'transcript_ablation',
     'splice_acceptor_variant',
     'splice_donor_variant',
     'stop_gained',
     'frameshift_variant',
     'stop_lost',
-    'start_lost',  # new in v81
-    'initiator_codon_variant',  # deprecated
+    'start_lost',
     'transcript_amplification',
+    'feature_elongation',
+    'feature_truncation',
     'inframe_insertion',
     'inframe_deletion',
     'missense_variant',
-    'protein_altering_variant',  # new in v79
+    'protein_altering_variant',
+    'splice_donor_5th_base_variant',
     'splice_region_variant',
+    'splice_donor_region_variant',
+    'splice_polypyrimidine_tract_variant',
     'incomplete_terminal_codon_variant',
     'start_retained_variant',
     'stop_retained_variant',
@@ -105,22 +111,37 @@
     '5_prime_UTR_variant',
     '3_prime_UTR_variant',
     'non_coding_transcript_exon_variant',
-    'non_coding_exon_variant',  # deprecated
     'intron_variant',
     'NMD_transcript_variant',
     'non_coding_transcript_variant',
-    'nc_transcript_variant',  # deprecated
+    'coding_transcript_variant',
     'upstream_gene_variant',
     'downstream_gene_variant',
+    'intergenic_variant',
+    'sequence_variant',
+]
+
+MOTIF_CONSEQUENCE_TERMS = [  # `consequence_term` enum of sorted_motif_feature_consequences
     'TFBS_ablation',
     'TFBS_amplification',
     'TF_binding_site_variant',
+    'TFBS_fusion',
+    'TFBS_translocation',
+]
+
+REGULATORY_CONSEQUENCE_TERMS = [  # `consequence_term` enum for regulatory features
     'regulatory_region_ablation',
     'regulatory_region_amplification',
-    'feature_elongation',
     'regulatory_region_variant',
-    'feature_truncation',
-    'intergenic_variant',
+    'regulatory_region_fusion',
+]
+
+FIVEUTR_CONSEQUENCES = [  # `utrannotator.fiveutr_consequence` enum values
+    '5_prime_UTR_premature_start_codon_gain_variant',  # uAUG_gained
+    '5_prime_UTR_premature_start_codon_loss_variant',  # uAUG_lost
+    '5_prime_UTR_stop_codon_gain_variant',  # uSTOP_gained
+    '5_prime_UTR_stop_codon_loss_variant',  # uSTOP_lost
+    '5_prime_UTR_uORF_frameshift_variant',  # uFrameshift
 ]
 
 LOF_FILTERS = [
@@ -219,50 +240,3 @@
 CLINVAR_PATHOGENICITIES_LOOKUP = hl.dict(
     hl.enumerate(CLINVAR_PATHOGENICITIES, index_first=False),
 )
-
-
-def annotate_enums(
-    ht: hl.Table,
-    reference_genome: ReferenceGenome,
-    dataset_type: DatasetType,
-) -> hl.Table:
-    formatting_annotation_names = {
-        fa.__name__ for fa in dataset_type.formatting_annotation_fns(reference_genome)
-    }
-    if 'sorted_transcript_consequences' in formatting_annotation_names:
-        ht = ht.annotate_globals(
-            enums=ht.enums.annotate(
-                sorted_transcript_consequences=hl.Struct(
-                    biotype=BIOTYPES,
-                    consequence_term=CONSEQUENCE_TERMS,
-                    lof_filter=LOF_FILTERS,
-                ),
-            ),
-        )
-    if 'mitotip' in formatting_annotation_names:
-        ht = ht.annotate_globals(
-            enums=ht.enums.annotate(
-                mitotip=hl.Struct(
-                    trna_prediction=MITOTIP_PATHOGENICITIES,
-                ),
-            ),
-        )
-    if 'sv_type_id' in formatting_annotation_names:
-        ht = ht.annotate_globals(
-            enums=ht.enums.annotate(
-                sv_type=SV_TYPES,
-            ),
-        )
-    if 'sv_type_detail_id' in formatting_annotation_names:
-        ht = ht.annotate_globals(
-            enums=ht.enums.annotate(sv_type_detail=SV_TYPE_DETAILS),
-        )
-    if 'sorted_gene_consequences' in formatting_annotation_names:
-        ht = ht.annotate_globals(
-            enums=ht.enums.annotate(
-                sorted_gene_consequences=hl.Struct(
-                    major_consequence=SV_CONSEQUENCE_RANKS,
-                ),
-            ),
-        )
-    return ht
@@ -1,14 +1,14 @@
 import hail as hl
 
-from v03_pipeline.lib.annotations.enums import CONSEQUENCE_TERMS
+from v03_pipeline.lib.annotations.enums import TRANSCRIPT_CONSEQUENCE_TERMS
 
-CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
-    hl.enumerate(CONSEQUENCE_TERMS, index_first=False),
+TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
+    hl.enumerate(TRANSCRIPT_CONSEQUENCE_TERMS, index_first=False),
 )
 HGVSC_CONSEQUENCES = hl.set(
     ['splice_donor_variant', 'splice_acceptor_variant', 'splice_region_variant'],
 )
-OMIT_CONSEQUENCE_TERMS = [
+OMIT_TRANSCRIPT_CONSEQUENCE_TERMS = [
     'upstream_gene_variant',
     'downstream_gene_variant',
 ]
@@ -124,7 +124,7 @@ def get_expr_for_xpos(locus: hl.expr.LocusExpression) -> hl.expr.Int64Expression
 def get_expr_for_vep_sorted_transcript_consequences_array(
     vep_root,
     include_coding_annotations=True,
-    omit_consequences=OMIT_CONSEQUENCE_TERMS,
+    omit_consequences=OMIT_TRANSCRIPT_CONSEQUENCE_TERMS,
 ):
     """Sort transcripts by 3 properties:
 
@@ -193,7 +193,7 @@ def get_expr_for_vep_sorted_transcript_consequences_array(
                     c.consequence_terms.size() > 0,
                     hl.sorted(
                         c.consequence_terms,
-                        key=lambda t: CONSEQUENCE_TERM_RANK_LOOKUP.get(t),
+                        key=lambda t: TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(t),
                     )[0],
                     hl.null(hl.tstr),
                 ),
@@ -205,24 +205,30 @@ def get_expr_for_vep_sorted_transcript_consequences_array(
                 category=(
                     hl.case()
                     .when(
-                        CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
-                        <= CONSEQUENCE_TERM_RANK_LOOKUP.get('frameshift_variant'),
+                        TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
+                        <= TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
+                            'frameshift_variant',
+                        ),
                         'lof',
                     )
                     .when(
-                        CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
-                        <= CONSEQUENCE_TERM_RANK_LOOKUP.get('missense_variant'),
+                        TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
+                        <= TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
+                            'missense_variant',
+                        ),
                         'missense',
                     )
                     .when(
-                        CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
-                        <= CONSEQUENCE_TERM_RANK_LOOKUP.get('synonymous_variant'),
+                        TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence)
+                        <= TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
+                            'synonymous_variant',
+                        ),
                         'synonymous',
                     )
                     .default('other')
                 ),
                 hgvs=_get_expr_for_formatted_hgvs(c),
-                major_consequence_rank=CONSEQUENCE_TERM_RANK_LOOKUP.get(
+                major_consequence_rank=TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP.get(
                     c.major_consequence,
                 ),
             ),
 
@@ -12,7 +12,7 @@
 from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
 from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
 from v03_pipeline.lib.vep import run_vep
-from v03_pipeline.var.test.vep.mock_vep_data import MOCK_VEP_DATA
+from v03_pipeline.var.test.vep.mock_vep_data import MOCK_37_VEP_DATA, MOCK_38_VEP_DATA
 
 TEST_COMBINED_1 = 'v03_pipeline/var/test/reference_data/test_combined_1.ht'
 TEST_INTERVAL_1 = 'v03_pipeline/var/test/reference_data/test_interval_1.ht'
@@ -34,23 +34,20 @@ def setUp(self) -> None:
     @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
     @patch('v03_pipeline.lib.vep.hl.vep')
     def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> None:
-        ht = hl.read_table(TEST_COMBINED_1)
-        mock_vep.return_value = ht.annotate(vep=MOCK_VEP_DATA)
         mock_validate.return_value = None
-        ht = run_vep(
-            ht,
-            DatasetType.SNV_INDEL,
-            ReferenceGenome.GRCh38,
-        )
+        ht = hl.read_table(TEST_COMBINED_1)
         ht = ht.annotate(rsid='abcd')
         for reference_genome, expected_fields in [
             (
                 ReferenceGenome.GRCh38,
                 [
+                    'check_ref',
                     'screen',
                     'gnomad_non_coding_constraint',
                     'rg37_locus',
                     'rsid',
+                    'sorted_motif_feature_consequences',
+                    'sorted_regulatory_feature_consequences',
                     'sorted_transcript_consequences',
                     'variant_id',
                     'xpos',
@@ -66,6 +63,16 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non
                 ],
             ),
         ]:
+            mock_vep.return_value = ht.annotate(
+                vep=MOCK_37_VEP_DATA
+                if reference_genome == ReferenceGenome.GRCh37
+                else MOCK_38_VEP_DATA,
+            )
+            ht = run_vep(
+                ht,
+                DatasetType.SNV_INDEL,
+                reference_genome,
+            )
             self.assertCountEqual(
                 list(
                     get_fields(
@@ -87,6 +94,15 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non
                             )
                             if rdc.requires_annotation
                         },
+                        **(
+                            {
+                                'gencode_ensembl_to_refseq_id_mapping': hl.dict(
+                                    {'a': 'b'},
+                                ),
+                            }
+                            if reference_genome == ReferenceGenome.GRCh38
+                            else {}
+                        ),
                         dataset_type=DatasetType.SNV_INDEL,
                         reference_genome=reference_genome,
                         liftover_ref_path=LIFTOVER,
 
@@ -0,0 +1,95 @@
+import hail as hl
+
+from v03_pipeline.lib.annotations.enums import (
+    BIOTYPES,
+    FIVEUTR_CONSEQUENCES,
+    LOF_FILTERS,
+    MITOTIP_PATHOGENICITIES,
+    MOTIF_CONSEQUENCE_TERMS,
+    REGULATORY_BIOTYPES,
+    REGULATORY_CONSEQUENCE_TERMS,
+    SV_CONSEQUENCE_RANKS,
+    SV_TYPE_DETAILS,
+    SV_TYPES,
+    TRANSCRIPT_CONSEQUENCE_TERMS,
+)
+from v03_pipeline.lib.model import DatasetType
+from v03_pipeline.lib.model.definitions import ReferenceGenome
+
+
+def annotate_enums(
+    ht: hl.Table,
+    reference_genome: ReferenceGenome,
+    dataset_type: DatasetType,
+) -> hl.Table:
+    """Extend the ``enums`` global of ``ht`` with enum lookup lists.
+
+    An entry is written only when the formatting annotation function it is
+    keyed on is returned by ``dataset_type.formatting_annotation_fns``.
+
+    Args:
+        ht: table to annotate; its existing ``enums`` global is read.
+        reference_genome: passed to ``formatting_annotation_fns`` and used to
+            choose the layout of the transcript consequence enums.
+        dataset_type: supplies the formatting annotation functions.
+
+    Returns:
+        ``ht`` with one ``enums`` field added per matching entry.
+    """
+    enabled_fn_names = {
+        fn.__name__ for fn in dataset_type.formatting_annotation_fns(reference_genome)
+    }
+
+    # GRCh38 SNV_INDEL nests ``lof_filter`` under ``loftee`` and adds
+    # ``utrannotator``; every other combination keeps a flat ``lof_filter``.
+    if (
+        reference_genome == ReferenceGenome.GRCh38
+        and dataset_type == DatasetType.SNV_INDEL
+    ):
+        transcript_extras = {
+            'loftee': hl.Struct(lof_filter=LOF_FILTERS),
+            'utrannotator': hl.Struct(fiveutr_consequence=FIVEUTR_CONSEQUENCES),
+        }
+    else:
+        transcript_extras = {'lof_filter': LOF_FILTERS}
+    # (formatting annotation fn name, ``enums`` field, value), in write order.
+    enum_specs = [
+        (
+            'sorted_motif_feature_consequences',
+            'sorted_motif_feature_consequences',
+            hl.Struct(consequence_term=MOTIF_CONSEQUENCE_TERMS),
+        ),
+        (
+            'sorted_regulatory_feature_consequences',
+            'sorted_regulatory_feature_consequences',
+            hl.Struct(
+                biotype=REGULATORY_BIOTYPES,
+                consequence_term=REGULATORY_CONSEQUENCE_TERMS,
+            ),
+        ),
+        (
+            'sorted_transcript_consequences',
+            'sorted_transcript_consequences',
+            hl.Struct(
+                biotype=BIOTYPES,
+                consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS,
+                **transcript_extras,
+            ),
+        ),
+        ('mitotip', 'mitotip', hl.Struct(trna_prediction=MITOTIP_PATHOGENICITIES)),
+        ('sv_type_id', 'sv_type', SV_TYPES),
+        ('sv_type_detail_id', 'sv_type_detail', SV_TYPE_DETAILS),
+        (
+            'sorted_gene_consequences',
+            'sorted_gene_consequences',
+            hl.Struct(major_consequence=SV_CONSEQUENCE_RANKS),
+        ),
+    ]
+
+    for fn_name, enum_field, enum_value in enum_specs:
+        if fn_name not in enabled_fn_names:
+            continue
+        ht = ht.annotate_globals(
+            enums=ht.enums.annotate(**{enum_field: enum_value}),
+        )
+    return ht