diff --git a/.github/workflows/dev-release.yml b/.github/workflows/dev-release.yml index 9caa834ce..ce2971b8c 100644 --- a/.github/workflows/dev-release.yml +++ b/.github/workflows/dev-release.yml @@ -45,5 +45,4 @@ jobs: run: |- gcloud storage rm -r gs://seqr-luigi/releases/dev/latest/ || echo 'No latest release' gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/dev/latest/bin/ - gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/dev/latest/var/vep_config gcloud storage cp dist/*.whl gs://seqr-luigi/releases/dev/latest/pyscripts.zip diff --git a/.github/workflows/prod-release.yml b/.github/workflows/prod-release.yml index 3eed86663..5b836ddf7 100644 --- a/.github/workflows/prod-release.yml +++ b/.github/workflows/prod-release.yml @@ -53,11 +53,10 @@ jobs: run: |- gcloud storage rm -r gs://seqr-luigi/releases/prod/latest/ || echo 'No latest release' gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/latest/bin/ - gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/latest/var/vep_config gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/latest/pyscripts.zip gcloud storage cp v03_pipeline/bin/* gs://seqr-luigi/releases/prod/$TAG_NAME/bin/ - gcloud storage cp v03_pipeline/var/vep_config/* gs://seqr-luigi/releases/prod/$TAG_NAME/var/vep_config gcloud storage cp dist/*.whl gs://seqr-luigi/releases/prod/$TAG_NAME/pyscripts.zip + gcloud storage cp v03_pipeline/var/vep/* gs://seqr-reference-data/vep/ - name: Create tag uses: actions/github-script@v7 diff --git a/v03_pipeline/bin/dataproc_vep_init.bash b/v03_pipeline/bin/dataproc_vep_init.bash index 3499891c7..2be6c7b7e 100644 --- a/v03_pipeline/bin/dataproc_vep_init.bash +++ b/v03_pipeline/bin/dataproc_vep_init.bash @@ -15,7 +15,6 @@ set -x export PROJECT="$(gcloud config get-value project)" export ENVIRONMENT="$(/usr/share/google/get_metadata_value attributes/ENVIRONMENT)" -export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)" export REFERENCE_GENOME="$(/usr/share/google/get_metadata_value attributes/REFERENCE_GENOME)" # Install docker @@ -37,8 +36,6 @@ apt-get install -y --allow-unauthenticated docker-ce sleep 60 sudo service docker restart -gcloud storage cp gs://seqr-luigi/releases/$ENVIRONMENT/latest/var/vep_config/vep-$REFERENCE_GENOME.json $VEP_CONFIG_PATH - cat >/vep.c < #include diff --git a/v03_pipeline/bin/download_vep_data.bash b/v03_pipeline/bin/download_vep_data.bash index 2176b052c..53069a1b4 100755 --- a/v03_pipeline/bin/download_vep_data.bash +++ b/v03_pipeline/bin/download_vep_data.bash @@ -26,6 +26,8 @@ case $REFERENCE_GENOME in # Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master) 'gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_GRCh38_PUBLIC.txt' + + 'gs://seqr-reference-data/vep/GRCh38/vep-GRCh38.json' ) ;; GRCh37) @@ -33,6 +35,7 @@ case $REFERENCE_GENOME in 'gs://seqr-reference-data/vep_data/loftee-beta/GRCh37.tar.gz' 'gs://seqr-reference-data/vep/GRCh37/homo_sapiens_vep_110_GRCh37.tar.gz' 'gs://seqr-reference-data/vep/GRCh37/Homo_sapiens.GRCh37.dna.primary_assembly.fa.*' + 'gs://seqr-reference-data/vep/GRCh37/vep-GRCh37.json' ) ;; *) diff --git a/v03_pipeline/deploy/Dockerfile b/v03_pipeline/deploy/Dockerfile index a5f967321..fa70a08ce 100644 --- a/v03_pipeline/deploy/Dockerfile +++ b/v03_pipeline/deploy/Dockerfile @@ -10,13 +10,15 @@ WORKDIR /v03_pipeline COPY requirements.txt . RUN python3 -m pip install --no-cache-dir -r ./requirements.txt -# VEP -COPY v03_pipeline/bin/vep /vep - # Application Code -COPY v03_pipeline/ . +COPY v03_pipeline/api api +COPY v03_pipeline/bin bin +COPY v03_pipeline/lib lib +COPY v03_pipeline/migrations migrations -RUN cp ./var/spark_config/spark-defaults.conf /usr/local/lib/python3.10/dist-packages/pyspark/conf/spark-defaults.conf +# Special paths +COPY v03_pipeline/var/spark_config/spark-defaults.conf /usr/local/lib/python3.10/dist-packages/pyspark/conf/spark-defaults.conf +COPY v03_pipeline/bin/vep /vep WORKDIR / EXPOSE 5000 diff --git a/v03_pipeline/lib/annotations/fields_test.py b/v03_pipeline/lib/annotations/fields_test.py index 29212b209..904369e23 100644 --- a/v03_pipeline/lib/annotations/fields_test.py +++ b/v03_pipeline/lib/annotations/fields_test.py @@ -35,10 +35,8 @@ def setUp(self) -> None: ), ) - @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome') @patch('v03_pipeline.lib.vep.hl.vep') - def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> None: - mock_validate.return_value = None + def test_get_formatting_fields(self, mock_vep: Mock) -> None: for reference_genome, ht, expected_fields in [ ( ReferenceGenome.GRCh38, diff --git a/v03_pipeline/lib/annotations/snv_indel_test.py b/v03_pipeline/lib/annotations/snv_indel_test.py index eba8541a6..78c19ad5d 100644 --- a/v03_pipeline/lib/annotations/snv_indel_test.py +++ b/v03_pipeline/lib/annotations/snv_indel_test.py @@ -83,12 +83,10 @@ def test_allele_count_annotations(self) -> None: ], ) - @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome') @patch('v03_pipeline.lib.vep.hl.vep') def test_sorted_transcript_consequences_37( self, mock_vep: Mock, - mock_validate: Mock, ) -> None: ht = hl.Table.parallelize( [ @@ -108,7 +106,6 @@ def test_sorted_transcript_consequences_37( key=['locus', 'alleles'], ) mock_vep.return_value = ht.annotate(vep=MOCK_37_VEP_DATA) - mock_validate.return_value = None ht = run_vep( ht, DatasetType.SNV_INDEL, @@ -166,12 +163,10 @@ def test_sorted_transcript_consequences_37( ], ) - @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome') @patch('v03_pipeline.lib.vep.hl.vep') def test_sorted_transcript_consequences_38( self, mock_vep: Mock, - mock_validate: Mock, ) -> None: ht = hl.Table.parallelize( [ @@ -191,7 +186,6 @@ def test_sorted_transcript_consequences_38( key=['locus', 'alleles'], ) mock_vep.return_value = ht.annotate(vep=MOCK_38_VEP_DATA) - mock_validate.return_value = None ht = run_vep( ht, DatasetType.SNV_INDEL, @@ -322,12 +316,10 @@ def test_sorted_transcript_consequences_38( ], ) - @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome') @patch('v03_pipeline.lib.vep.hl.vep') def test_sorted_other_feature_consequences( self, mock_vep: Mock, - mock_validate: Mock, ) -> None: ht = hl.Table.parallelize( [ @@ -347,7 +339,6 @@ def test_sorted_other_feature_consequences( key=['locus', 'alleles'], ) mock_vep.return_value = ht.annotate(vep=MOCK_38_VEP_DATA) - mock_validate.return_value = None ht = run_vep( ht, DatasetType.SNV_INDEL, diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 0680412b5..2fa8f43ba 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -21,9 +21,6 @@ 'REFERENCE_DATASETS', '/seqr-reference-data', ) -VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None) -VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None) - # Allele registry secrets :/ ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None) PROJECT_ID = os.environ.get('PROJECT_ID', None) @@ -52,5 +49,3 @@ class Env: PROJECT_ID: str | None = PROJECT_ID REFERENCE_DATASETS: str = REFERENCE_DATASETS SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES - VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH - VEP_CONFIG_URI: str | None = VEP_CONFIG_URI diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 8ad4755a0..e1730e25d 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -221,14 +221,12 @@ def test_missing_interval_reference( ) @patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock) @patch('v03_pipeline.lib.vep.hl.vep') - @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id', ) def test_multiple_update_vat( self, mock_load_gencode_ensembl_to_refseq_id: Mock, - mock_vep_validate: Mock, mock_vep: Mock, mock_standard_contigs: Mock, mock_update_vat_with_rdc_task: Mock, @@ -246,7 +244,6 @@ def test_multiple_update_vat( ) ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA) - mock_vep_validate.return_value = None mock_load_gencode_ensembl_to_refseq_id.return_value = hl.dict( {'ENST00000327044': 'NM_015658.4'}, ) @@ -677,10 +674,8 @@ def test_multiple_update_vat( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) @patch('v03_pipeline.lib.vep.hl.vep') - @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome') def test_update_vat_grch37( self, - mock_vep_validate: Mock, mock_vep: Mock, mock_update_vat_with_rdc_task: Mock, mock_register_alleles: Mock, @@ -694,7 +689,6 @@ def test_update_vat_grch37( ) ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_37_VEP_DATA) - mock_vep_validate.return_value = None mock_register_alleles.side_effect = None worker = luigi.worker.Worker() uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( @@ -841,14 +835,12 @@ def test_update_vat_grch37( ) @patch('v03_pipeline.lib.model.reference_dataset_collection.Env') @patch('v03_pipeline.lib.vep.hl.vep') - @patch('v03_pipeline.lib.vep.validate_vep_config_reference_genome') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id', ) def test_update_vat_without_accessing_private_datasets( self, mock_load_gencode_ensembl_to_refseq_id: Mock, - mock_vep_validate: Mock, mock_vep: Mock, mock_rdc_env: Mock, mock_update_vat_with_rdc_task: Mock, @@ -874,7 +866,6 @@ def test_update_vat_without_accessing_private_datasets( ) mock_rdc_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA) - mock_vep_validate.return_value = None mock_register_alleles.side_effect = None worker = luigi.worker.Worker() uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( diff --git a/v03_pipeline/lib/vep.py b/v03_pipeline/lib/vep.py index 2abc3b68c..6545c23d8 100644 --- a/v03_pipeline/lib/vep.py +++ b/v03_pipeline/lib/vep.py @@ -1,13 +1,10 @@ -import hail as hl +from string import Template -from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome +import hail as hl +from v03_pipeline.lib.model import DatasetType, ReferenceGenome -def validate_vep_config_reference_genome(reference_genome) -> None: - with open(Env.VEP_CONFIG_PATH) as f: - if reference_genome.value not in f.read(): - msg = f'Vep config does not match supplied reference genome {reference_genome.value}' - raise ValueError(msg) +VEP_CONFIG_URI = Template('file:///vep_data/vep-$reference_genome.json') def run_vep( @@ -17,10 +14,9 @@ def run_vep( ) -> hl.Table: if not dataset_type.veppable: return ht - validate_vep_config_reference_genome(reference_genome) return hl.vep( ht, - config=Env.VEP_CONFIG_URI, + config=VEP_CONFIG_URI.substitute(reference_genome=reference_genome.value), name='vep', block_size=1000, tolerate_parse_error=True, diff --git a/v03_pipeline/var/vep_config/vep-GRCh37.json b/v03_pipeline/var/vep/GRCh37/vep-GRCh37.json similarity index 100% rename from v03_pipeline/var/vep_config/vep-GRCh37.json rename to v03_pipeline/var/vep/GRCh37/vep-GRCh37.json diff --git a/v03_pipeline/var/vep_config/vep-GRCh38.json b/v03_pipeline/var/vep/GRCh38/vep-GRCh38.json similarity index 100% rename from v03_pipeline/var/vep_config/vep-GRCh38.json rename to v03_pipeline/var/vep/GRCh38/vep-GRCh38.json