Skip to content

VEP Config update #883

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/dev-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,4 @@ jobs:
run: |-
gcloud storage rm -r gs://seqr-luigi/releases/dev/latest/ || echo 'No latest release'
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/dev/latest/bin/
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/dev/latest/var/vep_config
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/dev/latest/pyscripts.zip
3 changes: 1 addition & 2 deletions .github/workflows/prod-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,10 @@ jobs:
run: |-
gcloud storage rm -r gs://seqr-luigi/releases/prod/latest/ || echo 'No latest release'
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/latest/bin/
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/latest/var/vep_config
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/latest/pyscripts.zip
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/$TAG_NAME/bin/
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/$TAG_NAME/var/vep_config
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/$TAG_NAME/pyscripts.zip
gcloud storage cp v03_pipeline/var/vep/* gs://seqr-reference-data/vep/

- name: Create tag
uses: actions/github-script@v7
Expand Down
3 changes: 0 additions & 3 deletions v03_pipeline/bin/dataproc_vep_init.bash
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ set -x

export PROJECT="$(gcloud config get-value project)"
export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)"
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)"

# Install docker
Expand All @@ -37,8 +36,6 @@ apt-get install -y --allow-unauthenticated docker-ce
sleep 60
sudo service docker restart

gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/var/vep_config/vep-$REFERENCE_GENOME.json $VEP_CONFIG_PATH

cat >/vep.c <<EOF
#include <unistd.h>
#include <stdio.h>
Expand Down
3 changes: 3 additions & 0 deletions v03_pipeline/bin/download_vep_data.bash
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,16 @@ case $REFERENCE_GENOME in

# Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
'gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_GRCh38_PUBLIC.txt'

'gs://seqr-reference-data/vep/GRCh38/vep-GRCh38.json'
)
;;
GRCh37)
VEP_REFERENCE_DATA_FILES=(
'gs://seqr-reference-data/vep_data/loftee-beta/GRCh37.tar.gz'
'gs://seqr-reference-data/vep/GRCh37/homo_sapiens_vep_110_GRCh37.tar.gz'
'gs://seqr-reference-data/vep/GRCh37/Homo_sapiens.GRCh37.dna.primary_assembly.fa.*'
'gs://seqr-reference-data/vep/GRCh37/vep-GRCh37.json'
)
;;
*)
Expand Down
12 changes: 7 additions & 5 deletions v03_pipeline/deploy/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ WORKDIR /v03_pipeline
COPY requirements.txt .
RUN python3 -m pip install --no-cache-dir -r ./requirements.txt

# VEP
COPY v03_pipeline/bin/vep /vep

# Application Code
COPY v03_pipeline/ .
COPY v03_pipeline/api api
COPY v03_pipeline/bin bin
COPY v03_pipeline/lib lib
COPY v03_pipeline/migrations migrations

RUN cp ./var/spark_config/spark-defaults.conf /usr/local/lib/python3.10/dist-packages/pyspark/conf/spark-defaults.conf
# Special paths
COPY v03_pipeline/var/spark_config/spark-defaults.conf /usr/local/lib/python3.10/dist-packages/pyspark/conf/spark-defaults.conf
COPY v03_pipeline/bin/vep /vep

WORKDIR /
EXPOSE 5000
Expand Down
4 changes: 1 addition & 3 deletions v03_pipeline/lib/annotations/fields_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,8 @@ def setUp(self) -> None:
),
)

@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
@patch('v03_pipeline.lib.vep.hl.vep')
def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> None:
mock_validate.return_value = None
def test_get_formatting_fields(self, mock_vep: Mock) -> None:
for reference_genome, ht, expected_fields in [
(
ReferenceGenome.GRCh38,
Expand Down
9 changes: 0 additions & 9 deletions v03_pipeline/lib/annotations/snv_indel_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,10 @@ def test_allele_count_annotations(self) -> None:
],
)

@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
@patch('v03_pipeline.lib.vep.hl.vep')
def test_sorted_transcript_consequences_37(
self,
mock_vep: Mock,
mock_validate: Mock,
) -> None:
ht = hl.Table.parallelize(
[
Expand All @@ -108,7 +106,6 @@ def test_sorted_transcript_consequences_37(
key=['locus', 'alleles'],
)
mock_vep.return_value = ht.annotate(vep=MOCK_37_VEP_DATA)
mock_validate.return_value = None
ht = run_vep(
ht,
DatasetType.SNV_INDEL,
Expand Down Expand Up @@ -166,12 +163,10 @@ def test_sorted_transcript_consequences_37(
],
)

@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
@patch('v03_pipeline.lib.vep.hl.vep')
def test_sorted_transcript_consequences_38(
self,
mock_vep: Mock,
mock_validate: Mock,
) -> None:
ht = hl.Table.parallelize(
[
Expand All @@ -191,7 +186,6 @@ def test_sorted_transcript_consequences_38(
key=['locus', 'alleles'],
)
mock_vep.return_value = ht.annotate(vep=MOCK_38_VEP_DATA)
mock_validate.return_value = None
ht = run_vep(
ht,
DatasetType.SNV_INDEL,
Expand Down Expand Up @@ -322,12 +316,10 @@ def test_sorted_transcript_consequences_38(
],
)

@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
@patch('v03_pipeline.lib.vep.hl.vep')
def test_sorted_other_feature_consequences(
self,
mock_vep: Mock,
mock_validate: Mock,
) -> None:
ht = hl.Table.parallelize(
[
Expand All @@ -347,7 +339,6 @@ def test_sorted_other_feature_consequences(
key=['locus', 'alleles'],
)
mock_vep.return_value = ht.annotate(vep=MOCK_38_VEP_DATA)
mock_validate.return_value = None
ht = run_vep(
ht,
DatasetType.SNV_INDEL,
Expand Down
5 changes: 0 additions & 5 deletions v03_pipeline/lib/model/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,6 @@
'REFERENCE_DATASETS',
'/seqr-reference-data',
)
VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)

# Allele registry secrets :/
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
PROJECT_ID = os.environ.get('PROJECT_ID', None)
Expand Down Expand Up @@ -52,5 +49,3 @@ class Env:
PROJECT_ID: str | None = PROJECT_ID
REFERENCE_DATASETS: str = REFERENCE_DATASETS
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH
VEP_CONFIG_URI: str | None = VEP_CONFIG_URI
Original file line number Diff line number Diff line change
Expand Up @@ -221,14 +221,12 @@ def test_missing_interval_reference(
)
@patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock)
@patch('v03_pipeline.lib.vep.hl.vep')
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
@patch(
'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id',
)
def test_multiple_update_vat(
self,
mock_load_gencode_ensembl_to_refseq_id: Mock,
mock_vep_validate: Mock,
mock_vep: Mock,
mock_standard_contigs: Mock,
mock_update_vat_with_rdc_task: Mock,
Expand All @@ -246,7 +244,6 @@ def test_multiple_update_vat(
)
)
mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA)
mock_vep_validate.return_value = None
mock_load_gencode_ensembl_to_refseq_id.return_value = hl.dict(
{'ENST00000327044': 'NM_015658.4'},
)
Expand Down Expand Up @@ -677,10 +674,8 @@ def test_multiple_update_vat(
'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset',
)
@patch('v03_pipeline.lib.vep.hl.vep')
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
def test_update_vat_grch37(
self,
mock_vep_validate: Mock,
mock_vep: Mock,
mock_update_vat_with_rdc_task: Mock,
mock_register_alleles: Mock,
Expand All @@ -694,7 +689,6 @@ def test_update_vat_grch37(
)
)
mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_37_VEP_DATA)
mock_vep_validate.return_value = None
mock_register_alleles.side_effect = None
worker = luigi.worker.Worker()
uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask(
Expand Down Expand Up @@ -841,14 +835,12 @@ def test_update_vat_grch37(
)
@patch('v03_pipeline.lib.model.reference_dataset_collection.Env')
@patch('v03_pipeline.lib.vep.hl.vep')
@patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome')
@patch(
'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id',
)
def test_update_vat_without_accessing_private_datasets(
self,
mock_load_gencode_ensembl_to_refseq_id: Mock,
mock_vep_validate: Mock,
mock_vep: Mock,
mock_rdc_env: Mock,
mock_update_vat_with_rdc_task: Mock,
Expand All @@ -874,7 +866,6 @@ def test_update_vat_without_accessing_private_datasets(
)
mock_rdc_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False
mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA)
mock_vep_validate.return_value = None
mock_register_alleles.side_effect = None
worker = luigi.worker.Worker()
uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask(
Expand Down
14 changes: 5 additions & 9 deletions v03_pipeline/lib/vep.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
import hail as hl
from string import Template

from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome
import hail as hl

from v03_pipeline.lib.model import DatasetType, ReferenceGenome

def validate_vep_config_reference_genome(reference_genome) -> None:
with open(Env.VEP_CONFIG_PATH) as f:
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This validation is no longer necessary, since the reference genome is now embedded in the VEP config file's path (`vep-$reference_genome.json`).

if reference_genome.value not in f.read():
msg = f'Vep config does not match supplied reference genome {reference_genome.value}'
raise ValueError(msg)
VEP_CONFIG_URI = Template('file:///vep_data/vep-$reference_genome.json')


def run_vep(
Expand All @@ -17,10 +14,9 @@ def run_vep(
) -> hl.Table:
if not dataset_type.veppable:
return ht
validate_vep_config_reference_genome(reference_genome)
return hl.vep(
ht,
config=Env.VEP_CONFIG_URI,
config=VEP_CONFIG_URI.substitute(reference_genome=reference_genome.value),
name='vep',
block_size=1000,
tolerate_parse_error=True,
Expand Down
Loading