Skip to content

Dev #847

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 25, 2024
Merged

Dev #847

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions v03_pipeline/bin/vep-110-GRCh38.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ export PROJECT="$(gcloud config get-value project)"
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)"
export ASSEMBLY=GRCh38
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:GRCh38

mkdir -p /vep_data

Expand All @@ -36,26 +36,26 @@ sleep 60
sudo service docker restart

# Copied from the repo at v03_pipeline/var/vep_config
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH

# Copied from the UTRannotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ &
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ &

# Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false)
# tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ &
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' /vep_data/ &

gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ &

# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &

# Generated with:
# curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
# gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
# bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa
# samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
docker pull ${VEP_DOCKER_IMAGE} &
wait

Expand Down
1 change: 1 addition & 0 deletions v03_pipeline/lib/model/reference_dataset_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def datasets(self, dataset_type: DatasetType) -> list[str]:
'hmtvar',
'mitomap',
'mitimpact',
'local_constraint_mito',
],
(ReferenceDatasetCollection.HGMD, DatasetType.SNV_INDEL): ['hgmd'],
(ReferenceDatasetCollection.INTERVAL, DatasetType.SNV_INDEL): [
Expand Down
13 changes: 13 additions & 0 deletions v03_pipeline/lib/reference_data/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
parsed_clnsig,
)
from v03_pipeline.lib.reference_data.hgmd import download_and_import_hgmd_vcf
from v03_pipeline.lib.reference_data.mito import (
download_and_import_local_constraint_tsv,
)


def import_locus_intervals(
Expand Down Expand Up @@ -533,4 +536,14 @@ def custom_mpc_select(ht):
'custom_import': import_locus_intervals,
},
},
'local_constraint_mito': {
'38': {
'version': '2024-07-24',
# Originally sourced from https://www.biorxiv.org/content/10.1101/2022.12.16.520778v2.supplementary-material
# Supplementary Table 7.
'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv',
'custom_import': download_and_import_local_constraint_tsv,
'select': {'score': 'MLC_score'},
},
},
}
16 changes: 16 additions & 0 deletions v03_pipeline/lib/reference_data/mito.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import hail as hl

from v03_pipeline.lib.model.definitions import ReferenceGenome


def download_and_import_local_constraint_tsv(
    url: str,
    reference_genome: ReferenceGenome,
) -> hl.Table:
    """Import the mitochondrial local-constraint TSV as a variant-keyed table.

    Args:
        url: Location of the delimited file handed to ``hl.import_table``.
        reference_genome: Genome build; its ``.value`` is passed to
            ``hl.locus`` as the reference genome of each locus.

    Returns:
        A Hail table keyed by ``(locus, alleles)`` carrying a single
        ``MLC_score`` row field.
    """
    # Parse the two numeric columns up front; every other column stays a string.
    column_types = {'Position': hl.tint32, 'MLC_score': hl.tfloat32}
    raw_ht = hl.import_table(url, types=column_types)

    # Every row is placed on the 'chrM' contig at the file's Position, with
    # the allele pair ordered as [Reference, Alternate].
    selected_fields = {
        'locus': hl.locus('chrM', raw_ht.Position, reference_genome.value),
        'alleles': [raw_ht.Reference, raw_ht.Alternate],
        'MLC_score': raw_ht.MLC_score,
    }
    return raw_ht.select(**selected_fields).key_by('locus', 'alleles')
Original file line number Diff line number Diff line change
Expand Up @@ -628,6 +628,21 @@
),
},
},
'local_constraint_mito': {
'38': {
**CONFIG['local_constraint_mito']['38'],
'custom_import': lambda *_: hl.Table.parallelize(
[],
hl.tstruct(
locus=hl.tlocus('GRCh38'),
alleles=hl.tarray(hl.tstr),
MLC_score=hl.tfloat32,
),
key=['locus', 'alleles'],
globals=hl.Struct(),
),
},
},
}


Expand Down Expand Up @@ -960,6 +975,7 @@ def test_update_vat_with_updated_rdc_mito_38(
clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht',
high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv',
local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv',
),
versions=hl.Struct(
gnomad_mito='v3.1',
Expand All @@ -970,6 +986,7 @@ def test_update_vat_with_updated_rdc_mito_38(
clinvar_mito='2023-07-22',
dbnsfp_mito='4.2',
high_constraint_region_mito='Feb-15-2022',
local_constraint_mito='2024-07-24',
),
enums=hl.Struct(
gnomad_mito=hl.Struct(),
Expand All @@ -985,6 +1002,7 @@ def test_update_vat_with_updated_rdc_mito_38(
MutationTaster_pred=['D', 'A', 'N', 'P'],
),
high_constraint_region_mito=hl.Struct(),
local_constraint_mito=hl.Struct(),
sorted_transcript_consequences=hl.Struct(
biotype=BIOTYPES,
consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS,
Expand Down Expand Up @@ -1041,6 +1059,7 @@ def test_update_vat_with_updated_rdc_mito_38(
mitomap=None,
mitimpact=hl.Struct(score=0.5199999809265137),
high_constraint_region_mito=True,
local_constraint_mito=hl.Struct(score=0.5),
),
],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -842,6 +842,7 @@ def test_mito_update_vat(
hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht',
mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht',
mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht',
local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv',
),
versions=hl.Struct(
high_constraint_region_mito='Feb-15-2022',
Expand All @@ -852,9 +853,11 @@ def test_mito_update_vat(
hmtvar='Jan. 10 2022',
mitomap='Feb. 04 2022',
mitimpact='3.0.7',
local_constraint_mito='2024-07-24',
),
enums=hl.Struct(
high_constraint_region_mito=hl.Struct(),
local_constraint_mito=hl.Struct(),
clinvar_mito=hl.Struct(
assertion=CLINVAR_ASSERTIONS,
pathogenicity=CLINVAR_PATHOGENICITIES,
Expand Down Expand Up @@ -920,6 +923,7 @@ def test_mito_update_vat(
AF_hom=0.0,
AN=4,
),
local_constraint_mito=None,
),
hl.Struct(
locus=hl.Locus(
Expand Down Expand Up @@ -955,6 +959,7 @@ def test_mito_update_vat(
AF_hom=0.0,
AN=4,
),
local_constraint_mito=None,
),
hl.Struct(
locus=hl.Locus(
Expand Down Expand Up @@ -990,6 +995,7 @@ def test_mito_update_vat(
AF_hom=0.0,
AN=4,
),
local_constraint_mito=None,
),
hl.Struct(
locus=hl.Locus(
Expand Down Expand Up @@ -1025,6 +1031,7 @@ def test_mito_update_vat(
AF_hom=0.0,
AN=4,
),
local_constraint_mito=None,
),
hl.Struct(
locus=hl.Locus(
Expand Down Expand Up @@ -1060,6 +1067,7 @@ def test_mito_update_vat(
AF_hom=0.0,
AN=4,
),
local_constraint_mito=None,
),
],
)
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.130-bea04d9c79b5
Created at 2024/05/20 14:08:17
Created at 2024/07/24 14:11:11
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading