Skip to content

Commit c4050a7

Browse files
authored
VEP Config update (#883)
1 parent d821661 commit c4050a7

12 files changed

+17
-46
lines changed

.github/workflows/dev-release.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,5 +45,4 @@ jobs:
4545
run: |-
4646
gcloud storage rm -r gs://seqr-luigi/releases/dev/latest/ || echo 'No latest release'
4747
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/dev/latest/bin/
48-
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/dev/latest/var/vep_config
4948
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/dev/latest/pyscripts.zip

.github/workflows/prod-release.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,10 @@ jobs:
5353
run: |-
5454
gcloud storage rm -r gs://seqr-luigi/releases/prod/latest/ || echo 'No latest release'
5555
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/latest/bin/
56-
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/latest/var/vep_config
5756
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/latest/pyscripts.zip
5857
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/$TAG_NAME/bin/
59-
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/$TAG_NAME/var/vep_config
6058
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/$TAG_NAME/pyscripts.zip
59+
gcloud storage cp v03_pipeline/var/vep/* gs://seqr-reference-data/vep/
6160
6261
- name: Create tag
6362
uses: actions/github-script@v7

v03_pipeline/bin/dataproc_vep_init.bash

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@ set -x
1515

1616
export PROJECT="$(gcloud config get-value project)"
1717
export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)"
18-
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
1918
export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)"
2019

2120
# Install docker
@@ -37,8 +36,6 @@ apt-get install -y --allow-unauthenticated docker-ce
3736
sleep 60
3837
sudo service docker restart
3938

40-
gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/var/vep_config/vep-$REFERENCE_GENOME.json $VEP_CONFIG_PATH
41-
4239
cat >/vep.c <<EOF
4340
#include <unistd.h>
4441
#include <stdio.h>

v03_pipeline/bin/download_vep_data.bash

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,16 @@ case $REFERENCE_GENOME in
2626

2727
# Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
2828
'gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_GRCh38_PUBLIC.txt'
29+
30+
'gs://seqr-reference-data/vep/GRCh38/vep-GRCh38.json'
2931
)
3032
;;
3133
GRCh37)
3234
VEP_REFERENCE_DATA_FILES=(
3335
'gs://seqr-reference-data/vep_data/loftee-beta/GRCh37.tar.gz'
3436
'gs://seqr-reference-data/vep/GRCh37/homo_sapiens_vep_110_GRCh37.tar.gz'
3537
'gs://seqr-reference-data/vep/GRCh37/Homo_sapiens.GRCh37.dna.primary_assembly.fa.*'
38+
'gs://seqr-reference-data/vep/GRCh37/vep-GRCh37.json'
3639
)
3740
;;
3841
*)

v03_pipeline/deploy/Dockerfile

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,15 @@ WORKDIR /v03_pipeline
1010
COPY requirements.txt .
1111
RUN python3 -m pip install --no-cache-dir -r ./requirements.txt
1212

13-
# VEP
14-
COPY v03_pipeline/bin/vep /vep
15-
1613
# Application Code
17-
COPY v03_pipeline/ .
14+
COPY v03_pipeline/api api
15+
COPY v03_pipeline/bin bin
16+
COPY v03_pipeline/lib lib
17+
COPY v03_pipeline/migrations migrations
1818

19-
RUN cp ./var/spark_config/spark-defaults.conf /usr/local/lib/python3.10/dist-packages/pyspark/conf/spark-defaults.conf
19+
# Special paths
20+
COPY v03_pipeline/var/spark_config/spark-defaults.conf /usr/local/lib/python3.10/dist-packages/pyspark/conf/spark-defaults.conf
21+
COPY v03_pipeline/bin/vep /vep
2022

2123
WORKDIR /
2224
EXPOSE 5000

v03_pipeline/lib/annotations/fields_test.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,8 @@ def setUp(self) -> None:
3535
),
3636
)
3737

38-
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
3938
@patch('v03_pipeline.lib.vep.hl.vep')
40-
def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> None:
41-
mock_validate.return_value = None
39+
def test_get_formatting_fields(self, mock_vep: Mock) -> None:
4240
for reference_genome, ht, expected_fields in [
4341
(
4442
ReferenceGenome.GRCh38,

v03_pipeline/lib/annotations/snv_indel_test.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,10 @@ def test_allele_count_annotations(self) -> None:
8383
],
8484
)
8585

86-
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
8786
@patch('v03_pipeline.lib.vep.hl.vep')
8887
def test_sorted_transcript_consequences_37(
8988
self,
9089
mock_vep: Mock,
91-
mock_validate: Mock,
9290
) -> None:
9391
ht = hl.Table.parallelize(
9492
[
@@ -108,7 +106,6 @@ def test_sorted_transcript_consequences_37(
108106
key=['locus', 'alleles'],
109107
)
110108
mock_vep.return_value = ht.annotate(vep=MOCK_37_VEP_DATA)
111-
mock_validate.return_value = None
112109
ht = run_vep(
113110
ht,
114111
DatasetType.SNV_INDEL,
@@ -166,12 +163,10 @@ def test_sorted_transcript_consequences_37(
166163
],
167164
)
168165

169-
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
170166
@patch('v03_pipeline.lib.vep.hl.vep')
171167
def test_sorted_transcript_consequences_38(
172168
self,
173169
mock_vep: Mock,
174-
mock_validate: Mock,
175170
) -> None:
176171
ht = hl.Table.parallelize(
177172
[
@@ -191,7 +186,6 @@ def test_sorted_transcript_consequences_38(
191186
key=['locus', 'alleles'],
192187
)
193188
mock_vep.return_value = ht.annotate(vep=MOCK_38_VEP_DATA)
194-
mock_validate.return_value = None
195189
ht = run_vep(
196190
ht,
197191
DatasetType.SNV_INDEL,
@@ -322,12 +316,10 @@ def test_sorted_transcript_consequences_38(
322316
],
323317
)
324318

325-
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
326319
@patch('v03_pipeline.lib.vep.hl.vep')
327320
def test_sorted_other_feature_consequences(
328321
self,
329322
mock_vep: Mock,
330-
mock_validate: Mock,
331323
) -> None:
332324
ht = hl.Table.parallelize(
333325
[
@@ -347,7 +339,6 @@ def test_sorted_other_feature_consequences(
347339
key=['locus', 'alleles'],
348340
)
349341
mock_vep.return_value = ht.annotate(vep=MOCK_38_VEP_DATA)
350-
mock_validate.return_value = None
351342
ht = run_vep(
352343
ht,
353344
DatasetType.SNV_INDEL,

v03_pipeline/lib/model/environment.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,6 @@
2121
'REFERENCE_DATASETS',
2222
'/seqr-reference-data',
2323
)
24-
VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
25-
VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)
26-
2724
# Allele registry secrets :/
2825
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
2926
PROJECT_ID = os.environ.get('PROJECT_ID', None)
@@ -52,5 +49,3 @@ class Env:
5249
PROJECT_ID: str | None = PROJECT_ID
5350
REFERENCE_DATASETS: str = REFERENCE_DATASETS
5451
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
55-
VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH
56-
VEP_CONFIG_URI: str | None = VEP_CONFIG_URI

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -221,14 +221,12 @@ def test_missing_interval_reference(
221221
)
222222
@patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock)
223223
@patch('v03_pipeline.lib.vep.hl.vep')
224-
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
225224
@patch(
226225
'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id',
227226
)
228227
def test_multiple_update_vat(
229228
self,
230229
mock_load_gencode_ensembl_to_refseq_id: Mock,
231-
mock_vep_validate: Mock,
232230
mock_vep: Mock,
233231
mock_standard_contigs: Mock,
234232
mock_update_vat_with_rdc_task: Mock,
@@ -246,7 +244,6 @@ def test_multiple_update_vat(
246244
)
247245
)
248246
mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA)
249-
mock_vep_validate.return_value = None
250247
mock_load_gencode_ensembl_to_refseq_id.return_value = hl.dict(
251248
{'ENST00000327044': 'NM_015658.4'},
252249
)
@@ -677,10 +674,8 @@ def test_multiple_update_vat(
677674
'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset',
678675
)
679676
@patch('v03_pipeline.lib.vep.hl.vep')
680-
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
681677
def test_update_vat_grch37(
682678
self,
683-
mock_vep_validate: Mock,
684679
mock_vep: Mock,
685680
mock_update_vat_with_rdc_task: Mock,
686681
mock_register_alleles: Mock,
@@ -694,7 +689,6 @@ def test_update_vat_grch37(
694689
)
695690
)
696691
mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_37_VEP_DATA)
697-
mock_vep_validate.return_value = None
698692
mock_register_alleles.side_effect = None
699693
worker = luigi.worker.Worker()
700694
uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask(
@@ -841,14 +835,12 @@ def test_update_vat_grch37(
841835
)
842836
@patch('v03_pipeline.lib.model.reference_dataset_collection.Env')
843837
@patch('v03_pipeline.lib.vep.hl.vep')
844-
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
845838
@patch(
846839
'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id',
847840
)
848841
def test_update_vat_without_accessing_private_datasets(
849842
self,
850843
mock_load_gencode_ensembl_to_refseq_id: Mock,
851-
mock_vep_validate: Mock,
852844
mock_vep: Mock,
853845
mock_rdc_env: Mock,
854846
mock_update_vat_with_rdc_task: Mock,
@@ -874,7 +866,6 @@ def test_update_vat_without_accessing_private_datasets(
874866
)
875867
mock_rdc_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False
876868
mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA)
877-
mock_vep_validate.return_value = None
878869
mock_register_alleles.side_effect = None
879870
worker = luigi.worker.Worker()
880871
uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask(

v03_pipeline/lib/vep.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,10 @@
1-
import hail as hl
1+
from string import Template
22

3-
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome
3+
import hail as hl
44

5+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
56

6-
def validate_vep_config_reference_genome(reference_genome) -> None:
7-
with open(Env.VEP_CONFIG_PATH) as f:
8-
if reference_genome.value not in f.read():
9-
msg = f'Vep config does not match supplied reference genome {reference_genome.value}'
10-
raise ValueError(msg)
7+
VEP_CONFIG_URI = Template('file:///vep_data/vep-$reference_genome.json')
118

129

1310
def run_vep(
@@ -17,10 +14,9 @@ def run_vep(
1714
) -> hl.Table:
1815
if not dataset_type.veppable:
1916
return ht
20-
validate_vep_config_reference_genome(reference_genome)
2117
return hl.vep(
2218
ht,
23-
config=Env.VEP_CONFIG_URI,
19+
config=VEP_CONFIG_URI.substitute(reference_genome=reference_genome.value),
2420
name='vep',
2521
block_size=1000,
2622
tolerate_parse_error=True,

0 commit comments

Comments
 (0)