diff --git a/.cloudbuild/vep-docker.cloudbuild.yaml b/.cloudbuild/vep-docker.cloudbuild.yaml index b1a645847..34f714069 100644 --- a/.cloudbuild/vep-docker.cloudbuild.yaml +++ b/.cloudbuild/vep-docker.cloudbuild.yaml @@ -1,11 +1,11 @@ # Run locally with: # -# gcloud builds submit --quiet --substitutions='_VEP_VERSION=110' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/deploy +# gcloud builds submit --quiet --substitutions='_REFERENCE_GENOME=GRCh38' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/deploy steps: - name: 'gcr.io/kaniko-project/executor:v1.3.0' args: - - --destination=gcr.io/seqr-project/vep-docker-image:${_VEP_VERSION} - - --dockerfile=Dockerfile.vep + - --destination=gcr.io/seqr-project/vep-docker-image:${_REFERENCE_GENOME} + - --dockerfile=Dockerfile.vep_${_REFERENCE_GENOME} - --cache=true - --cache-ttl=168h diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..8c63b691b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +*test* +.git +.vscode +.idea diff --git a/.github/workflows/prod-release.yml b/.github/workflows/prod-release.yml index f1d44901e..6fb051b4d 100644 --- a/.github/workflows/prod-release.yml +++ b/.github/workflows/prod-release.yml @@ -52,9 +52,11 @@ jobs: shell: bash run: |- gcloud storage rm -r gs://seqr-luigi/releases/prod/latest/ || echo 'No latest release' - gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/latest/ + gcloud storage cp v03_pipeline/bin gs://seqr-luigi/releases/prod/latest/bin/ + gcloud storage cp v03_pipeline/var/vep_config gs://seqr-luigi/releases/prod/latest/var/vep_config gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/latest/pyscripts.zip - gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/$TAG_NAME/ + gcloud storage cp v03_pipeline/bin gs://seqr-luigi/releases/prod/$TAG_NAME/bin/ + gcloud storage cp v03_pipeline/var/vep_config gs://seqr-luigi/releases/prod/$TAG_NAME/var/vep_config gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/$TAG_NAME/pyscripts.zip - name: Create tag diff --git a/requirements.in b/requirements.in index e1219fd47..4a07d13bf 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,6 @@ elasticsearch==7.9.1 google-api-python-client>=1.8.0 -hail==0.2.130 +hail==0.2.132 luigi>=3.4.0 gnomad==0.6.4 google-cloud-storage>=2.14.0 diff --git a/requirements.txt b/requirements.txt index ce05963d0..82c093ed2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -148,7 +148,7 @@ grpcio==1.63.0 # grpcio-status grpcio-status==1.48.2 # via google-api-core -hail==0.2.130 +hail==0.2.132 # via -r requirements.in hdbscan==0.8.33 # via gnomad @@ -221,7 +221,7 @@ numpy==1.26.2 # scipy oauthlib==3.2.2 # via requests-oauthlib -orjson==3.9.10 +orjson==3.10.6 # via hail packaging==23.2 # via @@ -254,12 +254,13 @@ protobuf==3.20.2 # googleapis-common-protos # grpc-google-iam-v1 # grpcio-status + # hail # proto-plus ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 # via stack-data -py4j==0.10.9.5 +py4j==0.10.9.7 # via pyspark pyasn1==0.5.1 # via @@ -276,12 +277,10 @@ pygments==2.17.2 # ipython # rich pyjwt[crypto]==2.8.0 - # via - # msal - # pyjwt + # via msal pyparsing==3.1.1 # via httplib2 -pyspark==3.3.3 +pyspark==3.5.1 # via hail python-daemon==3.0.1 # via luigi diff --git a/v03_pipeline/api/__init__.py b/v03_pipeline/api/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/api/__main__.py b/v03_pipeline/api/__main__.py new file mode 100644 index 000000000..de9668dd8 --- /dev/null +++ b/v03_pipeline/api/__main__.py @@ -0,0 +1,18 @@ +from aiohttp import web + +from v03_pipeline.api.app import init_web_app +from v03_pipeline.lib.logger import get_logger + + +def run(): + app = init_web_app() + logger = get_logger(__name__) + web.run_app( + app, + host='0.0.0.0', # noqa: S104 + port=5000, + access_log=logger, + ) + + +run() diff --git a/v03_pipeline/api/app.py b/v03_pipeline/api/app.py new file mode 100644 index 000000000..9cdb615a7 --- /dev/null +++ b/v03_pipeline/api/app.py @@ -0,0 +1,17 @@ +from aiohttp import web + +from v03_pipeline.lib.tasks import * # noqa: F403 + + +async def status(_: web.Request) -> web.Response: + return web.json_response({'success': True}) + + +async def init_web_app(): + app = web.Application() + app.add_routes( + [ + web.get('/status', status), + ], + ) + return app diff --git a/v03_pipeline/bin/dataproc_vep_init.bash b/v03_pipeline/bin/dataproc_vep_init.bash new file mode 100644 index 000000000..3499891c7 --- /dev/null +++ b/v03_pipeline/bin/dataproc_vep_init.bash @@ -0,0 +1,64 @@ +#!/bin/bash + +# +# VEP init action for dataproc +# +# adapted/copied from +# https://github.com/broadinstitute/gnomad_methods/blob/main/init_scripts/vep105-init.sh +# and gs://hail-common/hailctl/dataproc/0.2.128/vep-GRCh38.sh +# +# NB: This is code used for initializing a dataproc cluster and runs as an intialization +# action when the rest of our code is unavailable. +# + +set -x + +export PROJECT="$(gcloud config get-value project)" +export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)" +export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)" +export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)" + +# Install docker +apt-get update +apt-get -y install \ + apt-transport-https \ + ca-certificates \ + curl \ + gnupg2 \ + software-properties-common \ + tabix +curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add - +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" +sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" +apt-get update +apt-get install -y --allow-unauthenticated docker-ce + +# https://github.com/hail-is/hail/issues/12936 +sleep 60 +sudo service docker restart + +gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/var/vep_config/vep-$REFERENCE_GENOME.json $VEP_CONFIG_PATH + +cat >/vep.c < +#include + +int +main(int argc, char *const argv[]) { + if (setuid(geteuid())) + perror( "setuid" ); + + execv("/vep.bash", argv); + return 0; +} +EOF +gcc -Wall -Werror -O2 /vep.c -o /vep +chmod u+s /vep + +gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/download_vep_data.bash /download_vep_data.bash +chmod +x /download_vep_data.bash +./download_vep_data.bash $REFERENCE_GENOME + +gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/bin/vep /vep.bash +chmod +x /vep.bash + diff --git a/v03_pipeline/bin/download_reference_data.bash b/v03_pipeline/bin/download_reference_data.bash new file mode 100644 index 000000000..73ce43ba2 --- /dev/null +++ b/v03_pipeline/bin/download_reference_data.bash @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -eux + +REFERENCE_GENOME=$1 +SEQR_REFERENCE_DATA=/seqr-reference-data + +case $REFERENCE_GENOME in + GRCh38) + ;; + GRCh37) + ;; + *) + echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh37 or GRCh38" + exit 1 +esac + +mkdir -p $SEQR_REFERENCE_DATA/$REFERENCE_GENOME; +gcloud storage cp -r "gs://seqr-reference-data/v03/$REFERENCE_GENOME/*" $SEQR_REFERENCE_DATA/$REFERENCE_GENOME/ diff --git a/v03_pipeline/bin/download_vep_data.bash b/v03_pipeline/bin/download_vep_data.bash new file mode 100755 index 000000000..2176b052c --- /dev/null +++ b/v03_pipeline/bin/download_vep_data.bash @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +set -eux + +REFERENCE_GENOME=$1 +VEP_DATA=/vep_data + +case $REFERENCE_GENOME in + GRCh38) + VEP_REFERENCE_DATA_FILES=( + 'gs://seqr-reference-data/vep_data/loftee-beta/GRCh38.tar.gz' + + # Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false) + # tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz + 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' + + # Generated with: + # curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz > Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + # gzip -d Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + # bgzip Homo_sapiens.GRCh38.dna.primary_assembly.fa + # samtools faidx Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz + 'gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.GRCh38.dna.primary_assembly.fa.*' + + # Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_GRCh38.tar.gz + 'gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_GRCh38.tar.gz' + + # Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master) + 'gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_GRCh38_PUBLIC.txt' + ) + ;; + GRCh37) + VEP_REFERENCE_DATA_FILES=( + 'gs://seqr-reference-data/vep_data/loftee-beta/GRCh37.tar.gz' + 'gs://seqr-reference-data/vep/GRCh37/homo_sapiens_vep_110_GRCh37.tar.gz' + 'gs://seqr-reference-data/vep/GRCh37/Homo_sapiens.GRCh37.dna.primary_assembly.fa.*' + ) + ;; + *) + echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh37 or GRCh38" + exit 1 +esac + +if [ -f $VEP_DATA/$REFERENCE_GENOME/_SUCCESS ]; then + echo "Skipping download because already successful" + exit 0; +fi + +mkdir -p $VEP_DATA/$REFERENCE_GENOME; +for vep_reference_data_file in ${VEP_REFERENCE_DATA_FILES[@]}; do + if [[ $vep_reference_data_file == *.tar.gz ]]; then + echo "Downloading and extracting" $vep_reference_data_file; + gcloud storage cat $vep_reference_data_file | tar -xzf - -C $VEP_DATA/$REFERENCE_GENOME/ & + else + echo "Downloading" $vep_reference_data_file; + gcloud storage cp $vep_reference_data_file $VEP_DATA/$REFERENCE_GENOME/ & + fi +done; +wait +touch $VEP_DATA/$REFERENCE_GENOME/_SUCCESS diff --git a/v03_pipeline/bin/vep b/v03_pipeline/bin/vep new file mode 100755 index 000000000..58b26dfa6 --- /dev/null +++ b/v03_pipeline/bin/vep @@ -0,0 +1,21 @@ +#!/bin/bash + +set -eux + +REFERENCE_GENOME=$1 +VEP_DATA=/vep_data +VEP_DOCKER_IMAGE="gcr.io/seqr-project/vep-docker-image" + +case $REFERENCE_GENOME in + GRCh38) + ;; + GRCh37) + ;; + *) + echo "Invalid reference genome $REFERENCE_GENOME, should be GRCh37 or GRCh38" + exit 1 +esac + +shift # Remove the REFERENCE_GENOME arg. +docker run --platform linux/amd64 -i -v $VEP_DATA/$REFERENCE_GENOME:/opt/vep/.vep/:ro $VEP_DOCKER_IMAGE:$REFERENCE_GENOME \ + /opt/vep/src/ensembl-vep/vep $@ diff --git a/v03_pipeline/bin/vep-110-GRCh38.sh b/v03_pipeline/bin/vep-110-GRCh38.sh deleted file mode 100644 index dead36366..000000000 --- a/v03_pipeline/bin/vep-110-GRCh38.sh +++ /dev/null @@ -1,85 +0,0 @@ -# -# VEP init action for dataproc -# -# adapted/copied from -# https://github.com/broadinstitute/gnomad_methods/blob/main/init_scripts/vep105-init.sh -# and gs://hail-common/hailctl/dataproc/0.2.128/vep-GRCh38.sh -# - -set -x - -export PROJECT="$(gcloud config get-value project)" -export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)" -export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)" -export ASSEMBLY=GRCh38 -export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:GRCh38 - -mkdir -p /vep_data - -# Install docker -apt-get update -apt-get -y install \ - apt-transport-https \ - ca-certificates \ - curl \ - gnupg2 \ - software-properties-common \ - tabix -curl -fsSL https://download.docker.com/linux/debian/gpg | sudo apt-key add - -sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" -sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/debian $(lsb_release -cs) stable" -apt-get update -apt-get install -y --allow-unauthenticated docker-ce - -# https://github.com/hail-is/hail/issues/12936 -sleep 60 -sudo service docker restart - -# Copied from the repo at v03_pipeline/var/vep_config -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH - -# Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master) -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & - -# Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false) -# tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz -gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' /vep_data/ & - -gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ & - -# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz -gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ & - -# Generated with: -# curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz -# gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz -# bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa -# samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz -gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ & -docker pull ${VEP_DOCKER_IMAGE} & -wait - -cat >/vep.c < -#include - -int -main(int argc, char *const argv[]) { - if (setuid(geteuid())) - perror( "setuid" ); - - execv("/vep.sh", argv); - return 0; -} -EOF -gcc -Wall -Werror -O2 /vep.c -o /vep -chmod u+s /vep - -cat >/vep.sh < None: @patch('v03_pipeline.lib.vep.hl.vep') def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> None: mock_validate.return_value = None - ht = hl.read_table(TEST_COMBINED_1) - ht = ht.annotate(rsid='abcd') - for reference_genome, expected_fields in [ + for reference_genome, ht, expected_fields in [ ( ReferenceGenome.GRCh38, + hl.Table.parallelize( + [ + { + 'locus': hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + 'alleles': ['A', 'C'], + 'rsid': 'abcd', + }, + ], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + rsid=hl.tstr, + ), + key=['locus', 'alleles'], + ), [ 'check_ref', 'screen', @@ -55,7 +76,27 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non ), ( ReferenceGenome.GRCh37, + hl.Table.parallelize( + [ + { + 'locus': hl.Locus( + contig='1', + position=1, + reference_genome='GRCh37', + ), + 'alleles': ['A', 'C'], + 'rsid': 'abcd', + }, + ], + hl.tstruct( + locus=hl.tlocus('GRCh37'), + alleles=hl.tarray(hl.tstr), + rsid=hl.tstr, + ), + key=['locus', 'alleles'], + ), [ + 'rg38_locus', 'rsid', 'sorted_transcript_consequences', 'variant_id', @@ -68,7 +109,7 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non if reference_genome == ReferenceGenome.GRCh37 else MOCK_38_VEP_DATA, ) - ht = run_vep( + ht = run_vep( # noqa: PLW2901 ht, DatasetType.SNV_INDEL, reference_genome, @@ -99,13 +140,15 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non 'gencode_ensembl_to_refseq_id_mapping': hl.dict( {'a': 'b'}, ), + 'grch38_to_grch37_liftover_ref_path': GRCH38_TO_GRCH37_LIFTOVER_REF_PATH, } if reference_genome == ReferenceGenome.GRCh38 - else {} + else { + 'grch37_to_grch38_liftover_ref_path': GRCH37_TO_GRCH38_LIFTOVER_REF_PATH, + } ), dataset_type=DatasetType.SNV_INDEL, reference_genome=reference_genome, - liftover_ref_path=LIFTOVER, ).keys(), ), expected_fields, diff --git a/v03_pipeline/lib/annotations/gcnv.py b/v03_pipeline/lib/annotations/gcnv.py index 785da2f54..e0be33021 100644 --- a/v03_pipeline/lib/annotations/gcnv.py +++ b/v03_pipeline/lib/annotations/gcnv.py @@ -2,9 +2,8 @@ import hail as hl -from v03_pipeline.lib.annotations import expression_helpers +from v03_pipeline.lib.annotations import expression_helpers, liftover from v03_pipeline.lib.annotations.enums import SV_CONSEQUENCE_RANKS, SV_TYPES -from v03_pipeline.lib.annotations.shared import add_rg38_liftover from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes from v03_pipeline.lib.model.definitions import ReferenceGenome @@ -86,10 +85,10 @@ def QS(mt: hl.MatrixTable, **_: Any) -> hl.Expression: # noqa: N802 def rg37_locus( ht: hl.Table, - liftover_ref_path: str, + grch38_to_grch37_liftover_ref_path: str, **_: Any, ) -> hl.Expression | None: - add_rg38_liftover(liftover_ref_path) + liftover.add_rg38_liftover(grch38_to_grch37_liftover_ref_path) return hl.liftover( start_locus(ht, ReferenceGenome.GRCh38), ReferenceGenome.GRCh37.value, @@ -98,10 +97,10 @@ def rg37_locus( def rg37_locus_end( ht: hl.Table, - liftover_ref_path: str, + grch38_to_grch37_liftover_ref_path: str, **_: Any, ) -> hl.Expression | None: - add_rg38_liftover(liftover_ref_path) + liftover.add_rg38_liftover(grch38_to_grch37_liftover_ref_path) return hl.liftover( end_locus(ht, ReferenceGenome.GRCh38), ReferenceGenome.GRCh37.value, diff --git a/v03_pipeline/lib/annotations/liftover.py b/v03_pipeline/lib/annotations/liftover.py new file mode 100644 index 000000000..ec155efc2 --- /dev/null +++ b/v03_pipeline/lib/annotations/liftover.py @@ -0,0 +1,17 @@ +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome + + +def add_rg38_liftover(grch38_to_grch37_liftover_ref_path: str) -> None: + rg37 = hl.get_reference(ReferenceGenome.GRCh37.value) + rg38 = hl.get_reference(ReferenceGenome.GRCh38.value) + if not rg38.has_liftover(rg37): + rg38.add_liftover(grch38_to_grch37_liftover_ref_path, rg37) + + +def add_rg37_liftover(grch37_to_grch38_liftover_ref_path: str) -> None: + rg37 = hl.get_reference(ReferenceGenome.GRCh37.value) + rg38 = hl.get_reference(ReferenceGenome.GRCh38.value) + if not rg37.has_liftover(rg38): + rg37.add_liftover(grch37_to_grch38_liftover_ref_path, rg38) diff --git a/v03_pipeline/lib/annotations/shared.py b/v03_pipeline/lib/annotations/shared.py index 3c5d05c6c..a3a62d235 100644 --- a/v03_pipeline/lib/annotations/shared.py +++ b/v03_pipeline/lib/annotations/shared.py @@ -2,7 +2,7 @@ import hail as hl -from v03_pipeline.lib.annotations import expression_helpers +from v03_pipeline.lib.annotations import expression_helpers, liftover from v03_pipeline.lib.annotations.vep import ( transcript_consequences_sort, vep_85_transcript_consequences_select, @@ -10,13 +10,6 @@ from v03_pipeline.lib.model.definitions import ReferenceGenome -def add_rg38_liftover(liftover_ref_path: str) -> None: - rg37 = hl.get_reference(ReferenceGenome.GRCh37.value) - rg38 = hl.get_reference(ReferenceGenome.GRCh38.value) - if not rg38.has_liftover(rg37): - rg38.add_liftover(liftover_ref_path, rg37) - - def GT(mt: hl.MatrixTable, **_: Any) -> hl.Expression: # noqa: N802 return mt.GT @@ -32,10 +25,10 @@ def rsid(mt: hl.MatrixTable, **_: Any) -> hl.Expression: def rg37_locus( ht: hl.Table, - liftover_ref_path: str, + grch38_to_grch37_liftover_ref_path: str, **_: Any, ) -> hl.Expression | None: - add_rg38_liftover(liftover_ref_path) + liftover.add_rg38_liftover(grch38_to_grch37_liftover_ref_path) return hl.liftover(ht.locus, ReferenceGenome.GRCh37.value) diff --git a/v03_pipeline/lib/annotations/snv_indel.py b/v03_pipeline/lib/annotations/snv_indel.py index d24319457..44a26b044 100644 --- a/v03_pipeline/lib/annotations/snv_indel.py +++ b/v03_pipeline/lib/annotations/snv_indel.py @@ -3,6 +3,7 @@ import hail as hl +from v03_pipeline.lib.annotations import liftover from v03_pipeline.lib.annotations.enums import ( MOTIF_CONSEQUENCE_TERMS, REGULATORY_BIOTYPES, @@ -12,6 +13,7 @@ transcript_consequences_sort, vep_110_transcript_consequences_select, ) +from v03_pipeline.lib.model.definitions import ReferenceGenome MOTIF_CONSEQUENCE_TERMS_LOOKUP = hl.dict( hl.enumerate(MOTIF_CONSEQUENCE_TERMS, index_first=False), @@ -85,6 +87,15 @@ def gnomad_non_coding_constraint( ) +def rg38_locus( + ht: hl.Table, + grch37_to_grch38_liftover_ref_path: str, + **_: Any, +) -> hl.Expression | None: + liftover.add_rg37_liftover(grch37_to_grch38_liftover_ref_path) + return hl.liftover(ht.locus, ReferenceGenome.GRCh38.value) + + def screen( ht: hl.Table, interval_ht: hl.Table, diff --git a/v03_pipeline/lib/annotations/sv.py b/v03_pipeline/lib/annotations/sv.py index 5c75b0c94..de7aeb65c 100644 --- a/v03_pipeline/lib/annotations/sv.py +++ b/v03_pipeline/lib/annotations/sv.py @@ -2,12 +2,12 @@ import hail as hl +from v03_pipeline.lib.annotations import liftover from v03_pipeline.lib.annotations.enums import ( SV_CONSEQUENCE_RANKS, SV_TYPE_DETAILS, SV_TYPES, ) -from v03_pipeline.lib.annotations.shared import add_rg38_liftover from v03_pipeline.lib.model.definitions import ReferenceGenome CONSEQ_PREDICTED_PREFIX = 'info.PREDICTED_' @@ -194,10 +194,10 @@ def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression: def rg37_locus_end( ht: hl.Table, - liftover_ref_path: str, + grch38_to_grch37_liftover_ref_path: str, **_: Any, ) -> hl.Expression | None: - add_rg38_liftover(liftover_ref_path) + liftover.add_rg38_liftover(grch38_to_grch37_liftover_ref_path) end = end_locus(ht) return hl.or_missing( hl.is_defined(end), diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 51ed76b08..69a0f146c 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -217,6 +217,7 @@ def formatting_annotation_fns( shared.variant_id, shared.xpos, shared.sorted_transcript_consequences, + snv_indel.rg38_locus, ], DatasetType.MITO: [ mito.common_low_heteroplasmy, @@ -258,11 +259,9 @@ def formatting_annotation_fns( return GRCh37_fns[self] return { DatasetType.SNV_INDEL: [ - *[ - x - for x in GRCh37_fns[DatasetType.SNV_INDEL] - if x != shared.sorted_transcript_consequences - ], + shared.rsid, + shared.variant_id, + shared.xpos, snv_indel.gnomad_non_coding_constraint, snv_indel.screen, shared.rg37_locus, diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 4681939ca..e0fd29fc5 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -4,8 +4,12 @@ # NB: using os.environ.get inside the dataclass defaults gives a lint error. HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108 HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data') -LIFTOVER_REF_PATH = os.environ.get( - 'LIFTOVER_REF_PATH', +GRCH37_TO_GRCH38_LIFTOVER_REF_PATH = os.environ.get( + 'GRCH37_TO_GRCH38_LIFTOVER_REF_PATH', + 'gs://hail-common/references/grch37_to_grch38.over.chain.gz', +) +GRCH38_TO_GRCH37_LIFTOVER_REF_PATH = os.environ.get( + 'GRCH38_TO_GRCH37_LIFTOVER_REF_PATH', 'gs://hail-common/references/grch38_to_grch37.over.chain.gz', ) LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp') @@ -42,7 +46,8 @@ class Env: EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA - LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH + GRCH37_TO_GRCH38_LIFTOVER_REF_PATH: str = GRCH37_TO_GRCH38_LIFTOVER_REF_PATH + GRCH38_TO_GRCH37_LIFTOVER_REF_PATH: str = GRCH38_TO_GRCH37_LIFTOVER_REF_PATH LOADING_DATASETS: str = LOADING_DATASETS PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS PROJECT_ID: str | None = PROJECT_ID diff --git a/v03_pipeline/lib/tasks/base/base_migrate.py b/v03_pipeline/lib/tasks/base/base_migrate.py index 710a1158b..acb96ffcf 100644 --- a/v03_pipeline/lib/tasks/base/base_migrate.py +++ b/v03_pipeline/lib/tasks/base/base_migrate.py @@ -33,10 +33,10 @@ def complete(self) -> luigi.Target: self.dataset_type, ) not in migration.reference_genome_dataset_types: return True - mt = hl.read_table(self.output().path) - if not hasattr(mt, 'migrations'): + ht = hl.read_table(self.output().path) + if not hasattr(ht, 'migrations'): return False - return hl.eval(mt.globals.migrations.index(self.migration_name) >= 0) + return hl.eval(ht.globals.migrations.contains(self.migration_name)) return False def update_table(self, ht: hl.Table) -> hl.Table: diff --git a/v03_pipeline/lib/tasks/migrate_lookup_table.py b/v03_pipeline/lib/tasks/migrate_lookup_table.py index 4fe9d7ac4..a5f14c56e 100644 --- a/v03_pipeline/lib/tasks/migrate_lookup_table.py +++ b/v03_pipeline/lib/tasks/migrate_lookup_table.py @@ -11,7 +11,7 @@ class MigrateLookupTableTask(BaseMigrateTask): @property - def migrations_path(self): + def migrations_path(self) -> str: return v03_pipeline.migrations.lookup.__path__[0] def output(self) -> luigi.Target: diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index b8d53b983..aeec33654 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -729,6 +729,107 @@ def test_update_vat_grch37( ], ) self.assertFalse(hasattr(ht, 'rg37_locus')) + self.assertEqual( + ht.collect()[0], + hl.Struct( + locus=hl.Locus(contig=1, position=871269, reference_genome='GRCh37'), + alleles=['A', 'C'], + rsid=None, + variant_id='1-871269-A-C', + xpos=1000871269, + sorted_transcript_consequences=[ + hl.Struct( + amino_acids='S/L', + canonical=1, + codons='tCg/tTg', + gene_id='ENSG00000188976', + hgvsc='ENST00000327044.6:c.1667C>T', + hgvsp='ENSP00000317992.6:p.Ser556Leu', + transcript_id='ENST00000327044', + biotype_id=39, + consequence_term_ids=[9], + is_lof_nagnag=None, + lof_filter_ids=[0, 1], + ), + hl.Struct( + amino_acids=None, + canonical=None, + codons=None, + gene_id='ENSG00000188976', + hgvsc='ENST00000477976.1:n.3114C>T', + hgvsp=None, + transcript_id='ENST00000477976', + biotype_id=38, + consequence_term_ids=[23, 26], + is_lof_nagnag=None, + lof_filter_ids=None, + ), + hl.Struct( + amino_acids=None, + canonical=None, + codons=None, + gene_id='ENSG00000188976', + hgvsc='ENST00000483767.1:n.523C>T', + hgvsp=None, + transcript_id='ENST00000483767', + biotype_id=38, + consequence_term_ids=[23, 26], + is_lof_nagnag=None, + lof_filter_ids=None, + ), + ], + rg38_locus=hl.Locus( + contig='chr1', + position=935889, + reference_genome='GRCh38', + ), + cadd=hl.Struct(PHRED=9.699999809265137), + clinvar=hl.Struct( + alleleId=None, + conflictingPathogenicities=None, + goldStars=None, + pathogenicity_id=None, + assertion_ids=None, + submitters=None, + conditions=None, + ), + eigen=hl.Struct(Eigen_phred=1.5880000591278076), + exac=hl.Struct( + AF_POPMAX=0.0004100881633348763, + AF=0.0004633000062312931, + AC_Adj=51, + AC_Het=51, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=108288, + ), + gnomad_exomes=hl.Struct( + AF=0.00012876000255346298, + AN=240758, + AC=31, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0001119549197028391, + FAF_AF=9.315000352216884e-05, + Hemi=0, + ), + gnomad_genomes=None, + mpc=None, + primate_ai=None, + splice_ai=hl.Struct( + delta_score=0.029999999329447746, + splice_consequence_id=3, + ), + topmed=None, + dbnsfp=hl.Struct( + REVEL_score=0.0430000014603138, + SIFT_score=None, + Polyphen2_HVAR_score=None, + MutationTaster_pred_id=0, + ), + hgmd=None, + gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), + ), + ) @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index 903891cf9..ea62acece 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -110,6 +110,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: mt.locus.contig, ), ) + if not self.skip_validation and self.dataset_type.can_run_validation: validate_allele_type(mt) validate_no_duplicate_variants(mt) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 9f99d1549..b6bb937bd 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -63,7 +63,12 @@ def annotation_dependencies(self) -> dict[str, hl.Table]: deps['gencode_gene_symbol_to_gene_id_mapping'] = hl.literal( load_gencode_gene_symbol_to_gene_id(GENCODE_RELEASE, ''), ) - deps['liftover_ref_path'] = Env.LIFTOVER_REF_PATH + deps[ + 'grch37_to_grch38_liftover_ref_path' + ] = Env.GRCH37_TO_GRCH38_LIFTOVER_REF_PATH + deps[ + 'grch38_to_grch37_liftover_ref_path' + ] = Env.GRCH38_TO_GRCH37_LIFTOVER_REF_PATH return deps def output(self) -> luigi.Target: diff --git a/v03_pipeline/migrations/annotations/0003_add_rg38_locus.py b/v03_pipeline/migrations/annotations/0003_add_rg38_locus.py new file mode 100644 index 000000000..3ff6eeff6 --- /dev/null +++ b/v03_pipeline/migrations/annotations/0003_add_rg38_locus.py @@ -0,0 +1,19 @@ +import hail as hl + +from v03_pipeline.lib.annotations import snv_indel +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome + + +class AddRG38Locus(BaseMigration): + reference_genome_dataset_types: frozenset[ + tuple[ReferenceGenome, DatasetType] + ] = frozenset( + ((ReferenceGenome.GRCh37, DatasetType.SNV_INDEL),), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate( + rg38_locus=snv_indel.rg38_locus(ht, Env.GRCH37_TO_GRCH38_LIFTOVER_REF_PATH), + ) diff --git a/v03_pipeline/var/spark_config/spark-defaults.conf b/v03_pipeline/var/spark_config/spark-defaults.conf new file mode 100644 index 000000000..3837171fb --- /dev/null +++ b/v03_pipeline/var/spark_config/spark-defaults.conf @@ -0,0 +1,9 @@ +spark.driver.memory 5G +spark.master local[2] +spark.driver.extraJavaOptions -Xss4M +spark.executor.extraJavaOptions -Xss4M +spark.executor.memoryOverhead 5g +spark.driver.maxResultSize 30g +spark.kryoserializer.buffer.max 1g +spark.memory.fraction 0.1 +spark.default.parallelism 1 diff --git a/v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz b/v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz new file mode 100644 index 000000000..534942192 Binary files /dev/null and b/v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz differ diff --git a/v03_pipeline/var/vep_config/vep-GRCh37.json b/v03_pipeline/var/vep_config/vep-GRCh37.json new file mode 100644 index 000000000..d3461b965 --- /dev/null +++ b/v03_pipeline/var/vep_config/vep-GRCh37.json @@ -0,0 +1,21 @@ +{"command": [ + "/vep", + "GRCh37", + "--warning_file", "STDERR", + "--format", "vcf", + "__OUTPUT_FORMAT_FLAG__", + "--everything", + "--allele_number", + "--no_stats", + "--cache", "--offline", + "--minimal", + "--assembly", "GRCh37", + "--fasta", "/opt/vep/.vep/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz", + "--plugin", "LoF,human_ancestor_fa:/opt/vep/.vep/loftee_data/GRCh37/human_ancestor.fa.gz,filter_position:0.05,min_intron_size:15,conservation_file:/opt/vep/.vep/loftee_data/GRCh37/phylocsf_gerp.sql,gerp_file:/opt/vep/.vep/loftee_data/GRCh37/GERP_scores.final.sorted.txt.gz", + "-o", "STDOUT" +], + "env": { + "PERL5LIB": "" + }, + "vep_json_schema": "Struct{assembly_name:String,allele_string:String,ancestral:String,colocated_variants:Array[Struct{aa_allele:String,aa_maf:Float64,afr_allele:String,afr_maf:Float64,allele_string:String,amr_allele:String,amr_maf:Float64,clin_sig:Array[String],end:Int32,eas_allele:String,eas_maf:Float64,ea_allele:String,ea_maf:Float64,eur_allele:String,eur_maf:Float64,exac_adj_allele:String,exac_adj_maf:Float64,exac_allele:String,exac_afr_allele:String,exac_afr_maf:Float64,exac_amr_allele:String,exac_amr_maf:Float64,exac_eas_allele:String,exac_eas_maf:Float64,exac_fin_allele:String,exac_fin_maf:Float64,exac_maf:Float64,exac_nfe_allele:String,exac_nfe_maf:Float64,exac_oth_allele:String,exac_oth_maf:Float64,exac_sas_allele:String,exac_sas_maf:Float64,id:String,minor_allele:String,minor_allele_freq:Float64,phenotype_or_disease:Int32,pubmed:Array[Int32],sas_allele:String,sas_maf:Float64,somatic:Int32,start:Int32,strand:Int32}],context:String,end:Int32,id:String,input:String,intergenic_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],impact:String,minimised:Int32,variant_allele:String}],most_severe_consequence:String,motif_feature_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],high_inf_pos:String,impact:String,minimised:Int32,motif_feature_id:String,motif_name:String,motif_pos:Int32,motif_score_change:Float64,strand:Int32,variant_allele:String}],regulatory_feature_consequences:Array[Struct{allele_num:Int32,biotype:String,consequence_terms:Array[String],impact:String,minimised:Int32,regulatory_feature_id:String,variant_allele:String}],seq_region_name:String,start:Int32,strand:Int32,transcript_consequences:Array[Struct{allele_num:Int32,amino_acids:String,biotype:String,canonical:Int32,ccds:String,cdna_start:Int32,cdna_end:Int32,cds_end:Int32,cds_start:Int32,codons:String,consequence_terms:Array[String],distance:Int32,domains:Array[Struct{db:String,name:String}],exon:String,gene_id:String,gene_pheno:Int32,gene_symbol:String,gene_symbol_source:String,hgnc_id:String,hgvsc:String,hgvsp:String,hgvs_offset:Int32,impact:String,intron:String,lof:String,lof_flags:String,lof_filter:String,lof_info:String,minimised:Int32,polyphen_prediction:String,polyphen_score:Float64,protein_end:Int32,protein_start:Int32,protein_id:String,sift_prediction:String,sift_score:Float64,strand:Int32,swissprot:String,transcript_id:String,trembl:String,uniparc:String,variant_allele:String}],variant_class:String}" +} diff --git a/v03_pipeline/var/vep_config/vep-GRCh38.json b/v03_pipeline/var/vep_config/vep-GRCh38.json index 4c61b73e6..c454c29a2 100644 --- a/v03_pipeline/var/vep_config/vep-GRCh38.json +++ b/v03_pipeline/var/vep_config/vep-GRCh38.json @@ -2,7 +2,7 @@ "command": [ "bash", "-c", - "/vep --warning_file STDERR --format vcf -json --hgvs --biotype --canonical --mane --minimal --numbers --regulatory --allele_number --no_stats --cache --offline --assembly GRCh38 --fasta /opt/vep/.vep/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --check_ref --dont_skip --plugin LoF,loftee_path:/plugins,gerp_bigwig:/opt/vep/.vep/gerp_conservation_scores.homo_sapiens.GRCh38.bw,human_ancestor_fa:/opt/vep/.vep/human_ancestor.fa.gz,conservation_file:/opt/vep/.vep/loftee.sql --plugin UTRAnnotator,file=/opt/vep/.vep/uORF_5UTR_GRCh38_PUBLIC.txt --plugin SpliceRegion,Extended --plugin AlphaMissense,file=/opt/vep/.vep/AlphaMissense_hg38.tsv.gz --dir_plugins /plugins -o STDOUT | sed s/5utr/fiveutr/g" + "/vep GRCh38 --warning_file STDERR --format vcf -json --hgvs --biotype --canonical --mane --minimal --numbers --regulatory --allele_number --no_stats --cache --offline --assembly GRCh38 --fasta /opt/vep/.vep/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz --check_ref --dont_skip --plugin LoF,loftee_path:/plugins,gerp_bigwig:/opt/vep/.vep/gerp_conservation_scores.homo_sapiens.GRCh38.bw,human_ancestor_fa:/opt/vep/.vep/human_ancestor.fa.gz,conservation_file:/opt/vep/.vep/loftee.sql --plugin UTRAnnotator,file=/opt/vep/.vep/uORF_5UTR_GRCh38_PUBLIC.txt --plugin SpliceRegion,Extended --plugin AlphaMissense,file=/opt/vep/.vep/AlphaMissense_hg38.tsv.gz --dir_plugins /plugins -o STDOUT | sed s/5utr/fiveutr/g" ], "env": { "PERL5LIB": "/plugins"