Skip to content

Commit 16b2715

Browse files
authored
VEP 110 docker image and dataproc init script (#758)
* Add VEP docker image * simplify * bump version * Add cloudbuild * first pass * a bit of cleanup * ws * ws * A few tweaks * twiddle options * Bunch of config * working! * Update vep-GRCh38.json * Update vep-110-GRCh38.sh * missing slash * more VEP * some vep cleanup * Remove genesplicer
1 parent 97257c9 commit 16b2715

File tree

6 files changed

+128
-1
lines changed

6 files changed

+128
-1
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Run locally with:
2+
#
3+
# gcloud builds submit --quiet --substitutions='_VEP_VERSION=110' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/
4+
steps:
5+
- name: 'gcr.io/kaniko-project/executor:v1.3.0'
6+
args:
7+
- --destination=gcr.io/seqr-project/vep-docker-image:${_VEP_VERSION}
8+
- --dockerfile=deploy/Dockerfile.vep
9+
- --cache=true
10+
- --cache-ttl=168h
11+
12+
timeout: 1800s

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ version = {attr = "v03_pipeline.__version__"}
1111

1212
[tool.setuptools.packages.find]
1313
include = ["v03_pipeline*"]
14-
exclude = ["v03_pipeline.bin", "v03_pipeline*test*"]
14+
exclude = ["v03_pipeline.bin", "v03_pipeline.deploy", "v03_pipeline*test*"]
1515
namespaces = false
1616

1717
[tool.mypy]

v03_pipeline/bin/vep-110-GRCh38.sh

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#
2+
# VEP init action for dataproc
3+
#
4+
# adapted/copied from
5+
# https://github.com/broadinstitute/gnomad_methods/blob/main/init_scripts/vep105-init.sh
6+
# and gs://hail-common/hailctl/dataproc/0.2.128/vep-GRCh38.sh
7+
#
8+
9+
set -x
10+
11+
export PROJECT="$(gcloud config get-value project)"
12+
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
13+
export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)"
14+
export ASSEMBLY=GRCh38
15+
# Pin the image tag. The Cloud Build config publishes vep-docker-image:${_VEP_VERSION}
# (110 for this script); an untagged reference would resolve to ":latest", which
# that build never pushes. Both "docker pull" and the generated /vep.sh use this value.
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110
16+
17+
mkdir -p /vep_data
18+
19+
# Install docker
20+
apt-get update
21+
apt-get -y install \
22+
apt-transport-https \
23+
ca-certificates \
24+
curl \
25+
gnupg2 \
26+
software-properties-common \
27+
tabix
28+
curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add -
29+
# Register Docker's apt repository once (the call was previously issued twice in a row).
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable"
31+
apt-get update
32+
apt-get install -y --allow-unauthenticated docker-ce
33+
34+
# https://github.com/hail-is/hail/issues/12936
35+
sleep 60
36+
sudo service docker restart
37+
38+
# Copied from the repo at v03_pipeline/var/vep_config
39+
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH
40+
41+
# Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
42+
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ &
43+
44+
# Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false)
45+
# Some investigation led us to want to combine the canonical and non-canonical transcript tsvs (run inside the VEP docker container):
46+
# cat AlphaMissense_hg38.tsv.gz | gunzip | grep -v '#' | awk 'BEGIN { OFS = "\t" };{$6=""; print $0}' > AlphaMissense_combined_hg38.tsv
47+
# cat AlphaMissense_isoforms_hg38.tsv.gz | gunzip | grep -v '#' >> AlphaMissense_combined_hg38.tsv
48+
# cat AlphaMissense_combined_hg38.tsv | sort --parallel=12 --buffer-size=20G -k1,1 -k2,2n > AlphaMissense_combined_sorted_hg38.tsv
49+
# cat AlphaMissense_combined_sorted_hg38.tsv | sed '1i #CHROM\tPOS\tREF\tALT\tgenome\ttranscript_id\tprotein_variant\tam_pathogenicity\tam_class' > AlphaMissense_hg38.tsv
50+
# bgzip AlphaMissense_hg38.tsv
51+
# tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz
52+
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ &
53+
54+
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ &
55+
56+
# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz
57+
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &
58+
59+
# Generated with:
60+
# curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
61+
# gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
62+
# bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa
63+
# samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
64+
# Double quotes so ${ASSEMBLY} is expanded by the shell; the trailing '*' is still
# passed through literally for gcloud to match the bgzipped FASTA and its index files.
gcloud storage cp --billing-project "$PROJECT" "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
65+
docker pull ${VEP_DOCKER_IMAGE} &
66+
wait
67+
68+
cat >/vep.c <<EOF
69+
#include <unistd.h>
70+
#include <stdio.h>
71+
72+
/*
 * setuid wrapper: set the process uid to the effective uid (the compiled
 * binary is installed with the setuid bit), then replace this process with
 * /vep.sh, forwarding argv unchanged.
 *
 * Returns non-zero only if /vep.sh could not be executed.
 */
int
main(int argc, char *const argv[]) {
  /* Best effort: a setuid failure is reported but does not stop the launch. */
  if (setuid(geteuid()))
    perror( "setuid" );

  execv("/vep.sh", argv);
  /* execv only returns on failure: report it and exit non-zero. */
  perror( "execv" );
  return 1;
}
80+
EOF
81+
gcc -Wall -Werror -O2 /vep.c -o /vep
82+
chmod u+s /vep
83+
84+
cat >/vep.sh <<EOF
85+
#!/bin/bash
86+
87+
docker run -i -v /vep_data/:/opt/vep/.vep/:ro ${VEP_DOCKER_IMAGE} \
88+
/opt/vep/src/ensembl-vep/vep "\$@"
89+
EOF
90+
chmod +x /vep.sh

v03_pipeline/deploy/Dockerfile.vep

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
FROM ubuntu:18.04 as build
2+
3+
# Adapted from https://hub.docker.com/layers/konradjk/vep95_loftee/latest/images/sha256-d5f1a155293412acb5af4811142ba6907bad1cd708ca4000528f6317b784440e?context=explore
4+
# and https://github.com/broadinstitute/gnomad_methods/blob/main/docker_files/Dockerfile_VEP105
5+
RUN apt-get update && apt-get -y install wget libncurses5-dev libncursesw5-dev libbz2-dev liblzma-dev build-essential libz-dev git
6+
RUN wget https://github.com/samtools/samtools/releases/download/1.7/samtools-1.7.tar.bz2 && tar xjvf samtools-1.7.tar.bz2 && cd samtools-1.7 && make && make install
7+
RUN git clone -b grch38 https://github.com/konradjk/loftee.git
8+
9+
FROM ensemblorg/ensembl-vep:release_110.1 as runtime
10+
RUN cpanm DBD::SQLite
11+
COPY --from=build /usr/local/bin/samtools /usr/local/bin/samtools
12+
# COPY with a directory source copies the directory's contents (not the directory itself), unlike mv, so no trailing '*' is needed.
13+
COPY --from=build /loftee/ /plugins

v03_pipeline/lib/vep.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,5 @@ def run_vep(
2424
name='vep',
2525
block_size=1000,
2626
tolerate_parse_error=True,
27+
csq=False,
2728
)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
{
2+
"command": [
3+
"bash",
4+
"-c",
5+
"/vep --format vcf -json --hgvs --biotype --canonical --mane --numbers --minimal --allele_number --no_stats --cache --offline --assembly GRCh38 --fasta /opt/vep/.vep/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --plugin LoF,loftee_path:/plugins,gerp_bigwig:/opt/vep/.vep/gerp_conservation_scores.homo_sapiens.GRCh38.bw,human_ancestor_fa:/opt/vep/.vep/human_ancestor.fa.gz,conservation_file:/opt/vep/.vep/loftee.sql --plugin UTRAnnotator,file=/opt/vep/.vep/uORF_5UTR_GRCh38_PUBLIC.txt --plugin SpliceRegion,Extended --plugin AlphaMissense,file=/opt/vep/.vep/AlphaMissense_hg38.tsv.gz --dir_plugins /plugins -o STDOUT | sed s/5utr/fiveutr/g"
6+
],
7+
"env": {
8+
"PERL5LIB": "/plugins"
9+
},
10+
"vep_json_schema": "Struct{allele_string:String,end:Int32,id:String,input:String,intergenic_consequences:Array[Struct{allele_num:Int32,ancestral:String,consequence_terms:Array[String],context:String,impact:String,variant_allele:String}],minimised:Int32,most_severe_consequence:String,seq_region_name:String,start:Int32,strand:Int32,transcript_consequences:Array[Struct{allele_num:Int32,amino_acids:String,biotype:String,canonical:Int32,cdna_end:Int32,cdna_start:Int32,cds_end:Int32,cds_start:Int32,codons:String,consequence_terms:Array[String],distance:Int32,flags:String,gene_id:String,hgvsc:String,hgvsp:String,hgvs_offset:Int32,impact:String,mane_select:String,mane_plus_clinical:String,protein_end:Int32,protein_start:Int32,strand:Int32,transcript_id:String,variant_allele:String,lof:String,lof_flags:String,lof_filter:String,lof_info:String,existing_inframe_oorfs: Int32,existing_outofframe_oorfs: Int32,existing_uorfs: Int32,fiveutr_consequence: String,fiveutr_annotation: Dict[String, Dict[String, String]],spliceregion: Array[String],am_pathogenicity: Float32,am_class: String}]}"
11+
}

0 commit comments

Comments
 (0)