diff --git a/v03_pipeline/bin/dataproc_vep_init.bash b/v03_pipeline/bin/dataproc_vep_init.bash index 2be6c7b7e..fe880f260 100755 --- a/v03_pipeline/bin/dataproc_vep_init.bash +++ b/v03_pipeline/bin/dataproc_vep_init.bash @@ -52,9 +52,9 @@ EOF gcc -Wall -Werror -O2 /vep.c -o /vep chmod u+s /vep -gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_data.bash /download_vep_data.bash -chmod +x /download_vep_data.bash -./download_vep_data.bash $REFERENCE_GENOME +gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_reference_data.bash /download_vep_reference_data.bash +chmod +x /download_vep_reference_data.bash +./download_vep_reference_data.bash $REFERENCE_GENOME gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/vep /vep.bash chmod +x /vep.bash diff --git a/v03_pipeline/bin/download_vep_data.bash b/v03_pipeline/bin/download_vep_reference_data.bash similarity index 85% rename from v03_pipeline/bin/download_vep_data.bash rename to v03_pipeline/bin/download_vep_reference_data.bash index 36cc26b60..1535bb526 100755 --- a/v03_pipeline/bin/download_vep_data.bash +++ b/v03_pipeline/bin/download_vep_reference_data.bash @@ -3,7 +3,7 @@ set -eux REFERENCE_GENOME=$1 -VEP_DATA=/seqr/vep_data +VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data} case $REFERENCE_GENOME in GRCh38) @@ -43,20 +43,20 @@ case $REFERENCE_GENOME in exit 1 esac -if [ -f $VEP_DATA/$REFERENCE_GENOME/_SUCCESS ]; then +if [ -f $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS ]; then echo "Skipping download because already successful" exit 0; fi -mkdir -p $VEP_DATA/$REFERENCE_GENOME; +mkdir -p $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME; for vep_reference_data_file in ${VEP_REFERENCE_DATA_FILES[@]}; do if [[ $vep_reference_data_file == *.tar.gz ]]; then echo "Downloading and extracting" $vep_reference_data_file; - gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_DATA/$REFERENCE_GENOME/ & + gsutil cat $vep_reference_data_file | tar -xzf - -C $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ & else echo "Downloading" $vep_reference_data_file; - gsutil cat $vep_reference_data_file $VEP_DATA/$REFERENCE_GENOME/ & + gsutil cat $vep_reference_data_file $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/ & fi done; wait -touch $VEP_DATA/$REFERENCE_GENOME/_SUCCESS +touch $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME/_SUCCESS diff --git a/v03_pipeline/bin/vep b/v03_pipeline/bin/vep index b3558545e..33996bf27 100755 --- a/v03_pipeline/bin/vep +++ b/v03_pipeline/bin/vep @@ -3,7 +3,7 @@ set -eux REFERENCE_GENOME=$1 -VEP_DATA=/seqr/vep_data +VEP_REFERENCE_DATASETS_DIR=${VEP_REFERENCE_DATASETS_DIR:-/seqr/vep-reference-data} VEP_DOCKER_IMAGE="gcr.io/seqr-project/vep-docker-image" case $REFERENCE_GENOME in @@ -17,5 +17,5 @@ case $REFERENCE_GENOME in esac shift # Remove the REFERENCE_GENOME arg. -docker run --platform linux/amd64 -i -v $VEP_DATA/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \ +docker run --platform linux/amd64 -i -v $VEP_REFERENCE_DATASETS_DIR/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \ /opt/vep/src/ensembl-vep/vep $@ diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 0af69d91b..7b5a9792d 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -21,6 +21,10 @@ 'REFERENCE_DATASETS_DIR', '/seqr/seqr-reference-data', ) +VEP_REFERENCE_DATASETS_DIR = os.environ.get( + 'VEP_REFERENCE_DATASETS_DIR', + '/seqr/vep-reference-data', +) # Allele registry secrets :/ ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None) @@ -50,3 +54,4 @@ class Env: PROJECT_ID: str | None = PROJECT_ID REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES + VEP_REFERENCE_DATASETS_DIR: str = VEP_REFERENCE_DATASETS_DIR diff --git a/v03_pipeline/lib/vep.py b/v03_pipeline/lib/vep.py index 6f84d4646..bc9befd32 100644 --- a/v03_pipeline/lib/vep.py +++ b/v03_pipeline/lib/vep.py @@ -2,10 +2,10 @@ import hail as hl -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome VEP_CONFIG_URI = Template( - 'file:///seqr/vep_data/$reference_genome/vep-$reference_genome.json', + 'file://$vep_reference_datasets_dir/$reference_genome/vep-$reference_genome.json', ) @@ -18,7 +18,10 @@ def run_vep( return ht return hl.vep( ht, - config=VEP_CONFIG_URI.substitute(reference_genome=reference_genome.value), + config=VEP_CONFIG_URI.substitute( + vep_reference_datasets_dir=Env.VEP_REFERENCE_DATASETS_DIR, + reference_genome=reference_genome.value, + ), name='vep', block_size=1000, tolerate_parse_error=True,