diff --git a/v03_pipeline/bin/vep-110-GRCh38.sh b/v03_pipeline/bin/vep-110-GRCh38.sh index cd649b2ad..dead36366 100644 --- a/v03_pipeline/bin/vep-110-GRCh38.sh +++ b/v03_pipeline/bin/vep-110-GRCh38.sh @@ -12,7 +12,7 @@ export PROJECT="$(gcloud config get-value project)" export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)" export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)" export ASSEMBLY=GRCh38 -export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110 +export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:GRCh38 mkdir -p /vep_data @@ -36,26 +36,26 @@ sleep 60 sudo service docker restart # Copied from the repo at v03_pipeline/var/vep_config -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH +gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH # Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master) -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & +gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & # Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false) # tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz -gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ & +gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' /vep_data/ & gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ & # Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz -gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ & +gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ & # Generated with: # curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz # gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz # bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa # samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz -gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ & +gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ & docker pull ${VEP_DOCKER_IMAGE} & wait diff --git a/v03_pipeline/lib/model/reference_dataset_collection.py b/v03_pipeline/lib/model/reference_dataset_collection.py index 7f87ff154..f784b105d 100644 --- a/v03_pipeline/lib/model/reference_dataset_collection.py +++ b/v03_pipeline/lib/model/reference_dataset_collection.py @@ -45,6 +45,7 @@ def datasets(self, dataset_type: DatasetType) -> list[str]: 'hmtvar', 'mitomap', 'mitimpact', + 'local_constraint_mito', ], (ReferenceDatasetCollection.HGMD, DatasetType.SNV_INDEL): ['hgmd'], (ReferenceDatasetCollection.INTERVAL, DatasetType.SNV_INDEL): [ diff --git a/v03_pipeline/lib/reference_data/config.py b/v03_pipeline/lib/reference_data/config.py index f38d2773f..714ee9a57 100644 --- a/v03_pipeline/lib/reference_data/config.py +++ b/v03_pipeline/lib/reference_data/config.py @@ -16,6 +16,9 @@ parsed_clnsig, ) from v03_pipeline.lib.reference_data.hgmd import download_and_import_hgmd_vcf +from v03_pipeline.lib.reference_data.mito import ( + download_and_import_local_constraint_tsv, +) def import_locus_intervals( @@ -533,4 +536,14 @@ def custom_mpc_select(ht): 'custom_import': import_locus_intervals, }, }, + 'local_constraint_mito': { + '38': { + 'version': '2024-07-24', + # Originally sourced from https://www.biorxiv.org/content/10.1101/2022.12.16.520778v2.supplementary-material + # Supplementary Table 7. + 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', + 'custom_import': download_and_import_local_constraint_tsv, + 'select': {'score': 'MLC_score'}, + }, + }, } diff --git a/v03_pipeline/lib/reference_data/mito.py b/v03_pipeline/lib/reference_data/mito.py new file mode 100644 index 000000000..7df647324 --- /dev/null +++ b/v03_pipeline/lib/reference_data/mito.py @@ -0,0 +1,16 @@ +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome + + +def download_and_import_local_constraint_tsv( + url: str, + reference_genome: ReferenceGenome, +) -> hl.Table: + ht = hl.import_table(url, types={'Position': hl.tint32, 'MLC_score': hl.tfloat32}) + ht = ht.select( + locus=hl.locus('chrM', ht.Position, reference_genome.value), + alleles=[ht.Reference, ht.Alternate], + MLC_score=ht.MLC_score, + ) + return ht.key_by('locus', 'alleles') diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index fe68eebe3..33ab87ed0 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -628,6 +628,21 @@ ), }, }, + 'local_constraint_mito': { + '38': { + **CONFIG['local_constraint_mito']['38'], + 'custom_import': lambda *_: hl.Table.parallelize( + [], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + MLC_score=hl.tfloat32, + ), + key=['locus', 'alleles'], + globals=hl.Struct(), + ), + }, + }, } @@ -960,6 +975,7 @@ def test_update_vat_with_updated_rdc_mito_38( clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', + local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', ), versions=hl.Struct( gnomad_mito='v3.1', @@ -970,6 +986,7 @@ def test_update_vat_with_updated_rdc_mito_38( clinvar_mito='2023-07-22', dbnsfp_mito='4.2', high_constraint_region_mito='Feb-15-2022', + local_constraint_mito='2024-07-24', ), enums=hl.Struct( gnomad_mito=hl.Struct(), @@ -985,6 +1002,7 @@ def test_update_vat_with_updated_rdc_mito_38( MutationTaster_pred=['D', 'A', 'N', 'P'], ), high_constraint_region_mito=hl.Struct(), + local_constraint_mito=hl.Struct(), sorted_transcript_consequences=hl.Struct( biotype=BIOTYPES, consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, @@ -1041,6 +1059,7 @@ def test_update_vat_with_updated_rdc_mito_38( mitomap=None, mitimpact=hl.Struct(score=0.5199999809265137), high_constraint_region_mito=True, + local_constraint_mito=hl.Struct(score=0.5), ), ], ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index d22dae749..3d23a1278 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -842,6 +842,7 @@ def test_mito_update_vat( hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht', + local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', ), versions=hl.Struct( high_constraint_region_mito='Feb-15-2022', @@ -852,9 +853,11 @@ def test_mito_update_vat( hmtvar='Jan. 10 2022', mitomap='Feb. 04 2022', mitimpact='3.0.7', + local_constraint_mito='2024-07-24', ), enums=hl.Struct( high_constraint_region_mito=hl.Struct(), + local_constraint_mito=hl.Struct(), clinvar_mito=hl.Struct( assertion=CLINVAR_ASSERTIONS, pathogenicity=CLINVAR_PATHOGENICITIES, @@ -920,6 +923,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -955,6 +959,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -990,6 +995,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -1025,6 +1031,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -1060,6 +1067,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), ], ) diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc index 436531ab2..e08d4d12b 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc index d1c46b25e..d328f484c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt index 0160eb2ca..704275b10 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.130-bea04d9c79b5 - Created at 2024/05/20 14:08:17 \ No newline at end of file + Created at 2024/07/24 14:11:11 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc index d5cf430bc..94dde309e 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz index 9e842012f..d7c36221c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc index 12c819f3c..7b3d99c48 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 index f4252dc82..2493dddf9 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc deleted file mode 100644 index 12ec58de2..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc new file mode 100644 index 000000000..9cce2f7ae Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index similarity index 61% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index index 7b548374b..1c70e02f0 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz index b41476cb2..95672cd45 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc index 30c202768..a927fa9da 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz index 29e15e9d8..213bdb7aa 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc deleted file mode 100644 index f8028adef..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc new file mode 100644 index 000000000..089436f1c Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 deleted file mode 100644 index f78553dbf..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 new file mode 100644 index 000000000..bf1bf60c2 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 differ