Skip to content

main into dev #880

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .cloudbuild/vep-docker.cloudbuild.yaml
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# Run locally with:
#
# gcloud builds submit --quiet --substitutions='_VEP_VERSION=110' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/deploy
# gcloud builds submit --quiet --substitutions='_REFERENCE_GENOME=GRCh38' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/deploy
steps:
- name: 'gcr.io/kaniko-project/executor:v1.3.0'
args:
- --destination=gcr.io/seqr-project/vep-docker-image:${_VEP_VERSION}
- --dockerfile=Dockerfile.vep
- --destination=gcr.io/seqr-project/vep-docker-image:${_REFERENCE_GENOME}
- --dockerfile=Dockerfile.vep_${_REFERENCE_GENOME}
- --cache=true
- --cache-ttl=168h

Expand Down
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
*test*
.git
.vscode
.idea
6 changes: 4 additions & 2 deletions .github/workflows/prod-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,11 @@ jobs:
shell: bash
run: |-
gcloud storage rm -r gs://seqr-luigi/releases/prod/latest/ || echo 'No latest release'
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/latest/
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/latest/bin/
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/latest/var/vep_config
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/latest/pyscripts.zip
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/$TAG_NAME/
gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/$TAG_NAME/bin/
gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/$TAG_NAME/var/vep_config
gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/$TAG_NAME/pyscripts.zip

- name: Create tag
Expand Down
2 changes: 1 addition & 1 deletion requirements.in
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
elasticsearch==7.9.1
google-api-python-client>=1.8.0
hail==0.2.130
hail==0.2.132
luigi>=3.4.0
gnomad==0.6.4
google-cloud-storage>=2.14.0
Expand Down
13 changes: 6 additions & 7 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ grpcio==1.63.0
# grpcio-status
grpcio-status==1.48.2
# via google-api-core
hail==0.2.130
hail==0.2.132
# via -r requirements.in
hdbscan==0.8.33
# via gnomad
Expand Down Expand Up @@ -221,7 +221,7 @@ numpy==1.26.2
# scipy
oauthlib==3.2.2
# via requests-oauthlib
orjson==3.9.10
orjson==3.10.6
# via hail
packaging==23.2
# via
Expand Down Expand Up @@ -254,12 +254,13 @@ protobuf==3.20.2
# googleapis-common-protos
# grpc-google-iam-v1
# grpcio-status
# hail
# proto-plus
ptyprocess==0.7.0
# via pexpect
pure-eval==0.2.2
# via stack-data
py4j==0.10.9.5
py4j==0.10.9.7
# via pyspark
pyasn1==0.5.1
# via
Expand All @@ -276,12 +277,10 @@ pygments==2.17.2
# ipython
# rich
pyjwt[crypto]==2.8.0
# via
# msal
# pyjwt
# via msal
pyparsing==3.1.1
# via httplib2
pyspark==3.3.3
pyspark==3.5.1
# via hail
python-daemon==3.0.1
# via luigi
Expand Down
Empty file added v03_pipeline/api/__init__.py
Empty file.
18 changes: 18 additions & 0 deletions v03_pipeline/api/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from aiohttp import web

from v03_pipeline.api.app import init_web_app
from v03_pipeline.lib.logger import get_logger


def run():
    """Serve the pipeline web application, blocking until the server stops.

    The server binds to every network interface on port 5000 and writes its
    access log through this module's logger.
    """
    web.run_app(
        init_web_app(),
        host='0.0.0.0',  # noqa: S104
        port=5000,
        access_log=get_logger(__name__),
    )


# Start the server only when this module is executed as a program (for example
# via `python -m v03_pipeline.api`), so that importing it has no side effects.
if __name__ == '__main__':
    run()
17 changes: 17 additions & 0 deletions v03_pipeline/api/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from aiohttp import web

from v03_pipeline.lib.tasks import * # noqa: F403


async def status(_: web.Request) -> web.Response:
    """Handle ``GET /status`` by replying with the JSON body ``{"success": true}``.

    The incoming request is not inspected.
    """
    payload = {'success': True}
    return web.json_response(payload)


async def init_web_app():
    """Create the aiohttp application and register its routes.

    Returns:
        The ``web.Application`` with the ``GET /status`` route attached.
    """
    application = web.Application()
    routes = [web.get('/status', status)]
    application.add_routes(routes)
    return application
64 changes: 64 additions & 0 deletions v03_pipeline/bin/dataproc_vep_init.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
#!/bin/bash

#
# VEP init action for dataproc
#
# adapted/copied from
# https://github.com/broadinstitute/gnomad_methods/blob/main/init_scripts/vep105-init.sh
# and gs://hail-common/hailctl/dataproc/0.2.128/vep-GRCh38.sh
#
# NB: This is code used for initializing a dataproc cluster and runs as an initialization
# action when the rest of our code is unavailable.
#
# Steps:
#   1. Read the cluster metadata attributes that parameterize the run.
#   2. Install docker (plus tabix) from the upstream docker apt repository.
#   3. Fetch the VEP config for the reference genome to $VEP_CONFIG_PATH.
#   4. Build /vep, a setuid wrapper binary that execs /vep.bash.
#   5. Download the VEP reference data and the /vep.bash script itself.
#

set -x

export PROJECT="$(gcloud config get-value project)"
export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)"
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)"

# Install docker
apt-get update
apt-get -y install \
    apt-transport-https \
    ca-certificates \
    curl \
    gnupg2 \
    software-properties-common \
    tabix
curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add -
# Register the docker apt repository (once is enough).
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable"
apt-get update
apt-get install -y --allow-unauthenticated docker-ce

# https://github.com/hail-is/hail/issues/12936
sleep 60
sudo service docker restart

gcloud storage cp "gs://seqr-luigi/releases/$ENVIRONMENT/latest/var/vep_config/vep-$REFERENCE_GENOME.json" "$VEP_CONFIG_PATH"

# /vep is a small setuid wrapper: it takes on the identity of the binary's owner
# and then replaces itself with /vep.bash, forwarding all arguments.
cat >/vep.c <<EOF
#include <unistd.h>
#include <stdio.h>

int
main(int argc, char *const argv[]) {
    if (setuid(geteuid()))
        perror( "setuid" );

    execv("/vep.bash", argv);
    /* execv only returns when it failed: report it instead of exiting 0. */
    perror( "execv" );
    return 1;
}
EOF
gcc -Wall -Werror -O2 /vep.c -o /vep
chmod u+s /vep

gcloud storage cp "gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_data.bash" /download_vep_data.bash
chmod +x /download_vep_data.bash
# Invoke by absolute path so this does not depend on the current working directory.
/download_vep_data.bash "$REFERENCE_GENOME"

gcloud storage cp "gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/vep" /vep.bash
chmod +x /vep.bash

19 changes: 19 additions & 0 deletions v03_pipeline/bin/download_reference_data.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash

# Copy the seqr v03 reference data for one reference genome from GCS into
# /seqr-reference-data/<reference genome>/.
#
# Usage: download_reference_data.bash <GRCh37|GRCh38>

set -eux

REFERENCE_GENOME=$1
SEQR_REFERENCE_DATA=/seqr-reference-data

# Only the two supported genome builds are accepted; anything else aborts.
case $REFERENCE_GENOME in
    GRCh38 | GRCh37) ;;
    *)
        echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh37 or GRCh38"
        exit 1
        ;;
esac

mkdir -p $SEQR_REFERENCE_DATA/$REFERENCE_GENOME
gcloud storage cp -r "gs://seqr-reference-data/v03/$REFERENCE_GENOME/*" $SEQR_REFERENCE_DATA/$REFERENCE_GENOME/
59 changes: 59 additions & 0 deletions v03_pipeline/bin/download_vep_data.bash
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env bash

# Download the VEP reference data for one reference genome into
# /vep_data/<reference genome>/.
#
# Usage: download_vep_data.bash <GRCh37|GRCh38>
#
# All files are fetched in parallel; *.tar.gz archives are streamed and
# extracted on the fly. A _SUCCESS marker is written once every transfer has
# succeeded, and a later run exits early when that marker is present.

# pipefail makes a failing `gcloud storage cat` fail the whole download|extract pipeline.
set -euxo pipefail

REFERENCE_GENOME=$1
VEP_DATA=/vep_data

case $REFERENCE_GENOME in
    GRCh38)
        VEP_REFERENCE_DATA_FILES=(
            'gs://seqr-reference-data/vep_data/loftee-beta/GRCh38.tar.gz'

            # Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false)
            # tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz
            'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*'

            # Generated with:
            # curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz > Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
            # gzip -d Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
            # bgzip Homo_sapiens.GRCh38.dna.primary_assembly.fa
            # samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz
            'gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.GRCh38.dna.primary_assembly.fa.*'

            # Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_GRCh38.tar.gz
            'gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_GRCh38.tar.gz'

            # Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
            'gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_GRCh38_PUBLIC.txt'
        )
        ;;
    GRCh37)
        VEP_REFERENCE_DATA_FILES=(
            'gs://seqr-reference-data/vep_data/loftee-beta/GRCh37.tar.gz'
            'gs://seqr-reference-data/vep/GRCh37/homo_sapiens_vep_110_GRCh37.tar.gz'
            'gs://seqr-reference-data/vep/GRCh37/Homo_sapiens.GRCh37.dna.primary_assembly.fa.*'
        )
        ;;
    *)
        echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh37 or GRCh38"
        exit 1
        ;;
esac

if [ -f "$VEP_DATA/$REFERENCE_GENOME/_SUCCESS" ]; then
    echo "Skipping download because already successful"
    exit 0
fi

mkdir -p "$VEP_DATA/$REFERENCE_GENOME"

# Start every transfer in the background and remember its PID so that each
# one's exit status can be checked individually afterwards.
download_pids=()
# Quote the expansion: entries contain `*` wildcards that gcloud must receive verbatim.
for vep_reference_data_file in "${VEP_REFERENCE_DATA_FILES[@]}"; do
    if [[ $vep_reference_data_file == *.tar.gz ]]; then
        echo "Downloading and extracting" "$vep_reference_data_file"
        gcloud storage cat "$vep_reference_data_file" | tar -xzf - -C "$VEP_DATA/$REFERENCE_GENOME/" &
    else
        echo "Downloading" "$vep_reference_data_file"
        gcloud storage cp "$vep_reference_data_file" "$VEP_DATA/$REFERENCE_GENOME/" &
    fi
    download_pids+=("$!")
done

# A bare `wait` always returns 0, which would mark a partial download as
# successful. Wait on each PID instead, let all transfers finish, and fail
# if any of them did.
download_failed=0
for download_pid in "${download_pids[@]}"; do
    wait "$download_pid" || download_failed=1
done
if [ "$download_failed" -ne 0 ]; then
    echo "One or more VEP reference data downloads failed" >&2
    exit 1
fi

touch "$VEP_DATA/$REFERENCE_GENOME/_SUCCESS"
21 changes: 21 additions & 0 deletions v03_pipeline/bin/vep
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/bash

# Run VEP inside the per-reference-genome docker image.
#
# Usage: vep <GRCh37|GRCh38> [vep arguments...]
#
# The first argument selects both the image tag and the /vep_data/<genome>
# directory mounted read-only at /opt/vep/.vep/; all remaining arguments are
# forwarded to the vep executable unchanged.

set -eux

REFERENCE_GENOME=$1
VEP_DATA=/vep_data
VEP_DOCKER_IMAGE="gcr.io/seqr-project/vep-docker-image"

case $REFERENCE_GENOME in
    GRCh38)
        ;;
    GRCh37)
        ;;
    *)
        echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh37 or GRCh38"
        exit 1
        ;;
esac

shift # Remove the REFERENCE_GENOME arg.
# "$@" (quoted) forwards each argument exactly as received; unquoted $@ would
# re-split arguments on whitespace and expand glob characters.
docker run --platform linux/amd64 -i -v "$VEP_DATA/$REFERENCE_GENOME:/opt/vep/.vep/:ro" "$VEP_DOCKER_IMAGE:$REFERENCE_GENOME" \
    /opt/vep/src/ensembl-vep/vep "$@"
85 changes: 0 additions & 85 deletions v03_pipeline/bin/vep-110-GRCh38.sh

This file was deleted.

Loading
Loading