Skip to content

Commit 94fc293

Browse files
authored
Dev (#847)
* Move vep files (#844) * Add mito local constraint (#845) * Add mito local constraint * Fix tests * lint
1 parent c33e5e1 commit 94fc293

25 files changed

+64
-7
lines changed

v03_pipeline/bin/vep-110-GRCh38.sh

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ export PROJECT="$(gcloud config get-value project)"
1212
export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)"
1313
export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)"
1414
export ASSEMBLY=GRCh38
15-
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110
15+
export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:GRCh38
1616

1717
mkdir -p /vep_data
1818

@@ -36,26 +36,26 @@ sleep 60
3636
sudo service docker restart
3737

3838
# Copied from the repo at v03_pipeline/var/vep_config
39-
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH
39+
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH
4040

4141
# Copied from the UTRannotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master)
42-
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ &
42+
gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ &
4343

4444
# Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false)
4545
# tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz
46-
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ &
46+
gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' /vep_data/ &
4747

4848
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ &
4949

5050
# Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz
51-
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &
51+
gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ &
5252

5353
# Generated with:
5454
# curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
5555
# gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
5656
# bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa
5757
# samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz
58-
gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
58+
gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ &
5959
docker pull ${VEP_DOCKER_IMAGE} &
6060
wait
6161

v03_pipeline/lib/model/reference_dataset_collection.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def datasets(self, dataset_type: DatasetType) -> list[str]:
4545
'hmtvar',
4646
'mitomap',
4747
'mitimpact',
48+
'local_constraint_mito',
4849
],
4950
(ReferenceDatasetCollection.HGMD, DatasetType.SNV_INDEL): ['hgmd'],
5051
(ReferenceDatasetCollection.INTERVAL, DatasetType.SNV_INDEL): [

v03_pipeline/lib/reference_data/config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@
1616
parsed_clnsig,
1717
)
1818
from v03_pipeline.lib.reference_data.hgmd import download_and_import_hgmd_vcf
19+
from v03_pipeline.lib.reference_data.mito import (
20+
download_and_import_local_constraint_tsv,
21+
)
1922

2023

2124
def import_locus_intervals(
@@ -533,4 +536,14 @@ def custom_mpc_select(ht):
533536
'custom_import': import_locus_intervals,
534537
},
535538
},
539+
'local_constraint_mito': {
540+
'38': {
541+
'version': '2024-07-24',
542+
# Originally sourced from https://www.biorxiv.org/content/10.1101/2022.12.16.520778v2.supplementary-material
543+
# Supplementary Table 7.
544+
'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv',
545+
'custom_import': download_and_import_local_constraint_tsv,
546+
'select': {'score': 'MLC_score'},
547+
},
548+
},
536549
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.model.definitions import ReferenceGenome
4+
5+
6+
def download_and_import_local_constraint_tsv(
    url: str,
    reference_genome: ReferenceGenome,
) -> hl.Table:
    """Import the mitochondrial local constraint TSV as a keyed Hail table.

    The file at ``url`` is handed directly to ``hl.import_table``; no
    separate download step happens in this function.

    Args:
        url: Path of the TSV to import. The ``Position`` column is parsed
            as int32 and ``MLC_score`` as float32; ``Reference`` and
            ``Alternate`` keep the importer's default type.
        reference_genome: Genome build whose ``.value`` is passed to
            ``hl.locus`` as the reference genome name.

    Returns:
        A table with the fields ``locus``, ``alleles`` and ``MLC_score``,
        keyed by ``('locus', 'alleles')``.
    """
    column_types = {'Position': hl.tint32, 'MLC_score': hl.tfloat32}
    raw_ht = hl.import_table(url, types=column_types)

    # Every row is placed on the 'chrM' contig at its Position.
    locus_expr = hl.locus('chrM', raw_ht.Position, reference_genome.value)
    alleles_expr = [raw_ht.Reference, raw_ht.Alternate]

    selected_ht = raw_ht.select(
        locus=locus_expr,
        alleles=alleles_expr,
        MLC_score=raw_ht.MLC_score,
    )
    return selected_ht.key_by('locus', 'alleles')

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -628,6 +628,21 @@
628628
),
629629
},
630630
},
631+
'local_constraint_mito': {
632+
'38': {
633+
**CONFIG['local_constraint_mito']['38'],
634+
'custom_import': lambda *_: hl.Table.parallelize(
635+
[],
636+
hl.tstruct(
637+
locus=hl.tlocus('GRCh38'),
638+
alleles=hl.tarray(hl.tstr),
639+
MLC_score=hl.tfloat32,
640+
),
641+
key=['locus', 'alleles'],
642+
globals=hl.Struct(),
643+
),
644+
},
645+
},
631646
}
632647

633648

@@ -960,6 +975,7 @@ def test_update_vat_with_updated_rdc_mito_38(
960975
clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
961976
dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht',
962977
high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv',
978+
local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv',
963979
),
964980
versions=hl.Struct(
965981
gnomad_mito='v3.1',
@@ -970,6 +986,7 @@ def test_update_vat_with_updated_rdc_mito_38(
970986
clinvar_mito='2023-07-22',
971987
dbnsfp_mito='4.2',
972988
high_constraint_region_mito='Feb-15-2022',
989+
local_constraint_mito='2024-07-24',
973990
),
974991
enums=hl.Struct(
975992
gnomad_mito=hl.Struct(),
@@ -985,6 +1002,7 @@ def test_update_vat_with_updated_rdc_mito_38(
9851002
MutationTaster_pred=['D', 'A', 'N', 'P'],
9861003
),
9871004
high_constraint_region_mito=hl.Struct(),
1005+
local_constraint_mito=hl.Struct(),
9881006
sorted_transcript_consequences=hl.Struct(
9891007
biotype=BIOTYPES,
9901008
consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS,
@@ -1041,6 +1059,7 @@ def test_update_vat_with_updated_rdc_mito_38(
10411059
mitomap=None,
10421060
mitimpact=hl.Struct(score=0.5199999809265137),
10431061
high_constraint_region_mito=True,
1062+
local_constraint_mito=hl.Struct(score=0.5),
10441063
),
10451064
],
10461065
)

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -842,6 +842,7 @@ def test_mito_update_vat(
842842
hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht',
843843
mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht',
844844
mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht',
845+
local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv',
845846
),
846847
versions=hl.Struct(
847848
high_constraint_region_mito='Feb-15-2022',
@@ -852,9 +853,11 @@ def test_mito_update_vat(
852853
hmtvar='Jan. 10 2022',
853854
mitomap='Feb. 04 2022',
854855
mitimpact='3.0.7',
856+
local_constraint_mito='2024-07-24',
855857
),
856858
enums=hl.Struct(
857859
high_constraint_region_mito=hl.Struct(),
860+
local_constraint_mito=hl.Struct(),
858861
clinvar_mito=hl.Struct(
859862
assertion=CLINVAR_ASSERTIONS,
860863
pathogenicity=CLINVAR_PATHOGENICITIES,
@@ -920,6 +923,7 @@ def test_mito_update_vat(
920923
AF_hom=0.0,
921924
AN=4,
922925
),
926+
local_constraint_mito=None,
923927
),
924928
hl.Struct(
925929
locus=hl.Locus(
@@ -955,6 +959,7 @@ def test_mito_update_vat(
955959
AF_hom=0.0,
956960
AN=4,
957961
),
962+
local_constraint_mito=None,
958963
),
959964
hl.Struct(
960965
locus=hl.Locus(
@@ -990,6 +995,7 @@ def test_mito_update_vat(
990995
AF_hom=0.0,
991996
AN=4,
992997
),
998+
local_constraint_mito=None,
993999
),
9941000
hl.Struct(
9951001
locus=hl.Locus(
@@ -1025,6 +1031,7 @@ def test_mito_update_vat(
10251031
AF_hom=0.0,
10261032
AN=4,
10271033
),
1034+
local_constraint_mito=None,
10281035
),
10291036
hl.Struct(
10301037
locus=hl.Locus(
@@ -1060,6 +1067,7 @@ def test_mito_update_vat(
10601067
AF_hom=0.0,
10611068
AN=4,
10621069
),
1070+
local_constraint_mito=None,
10631071
),
10641072
],
10651073
)
Binary file not shown.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
22
Written with version 0.2.130-bea04d9c79b5
3-
Created at 2024/05/20 14:08:17
3+
Created at 2024/07/24 14:11:11
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)