From 9177de2e598774901af0e7c17a13a931141adb49 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 18:18:27 -0400 Subject: [PATCH] vep migration --- v03_pipeline/bin/vep_migration.py | 39 +++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 v03_pipeline/bin/vep_migration.py diff --git a/v03_pipeline/bin/vep_migration.py b/v03_pipeline/bin/vep_migration.py new file mode 100644 index 000000000..30e517f25 --- /dev/null +++ b/v03_pipeline/bin/vep_migration.py @@ -0,0 +1,39 @@ +import os +import hail as hl + +# Need to use the GCP bucket as temp storage for very large callset joins +hl.init(tmp_dir='gs://seqr-scratch-temp', idempotent=True) + +# Interval ref data join causes shuffle death, this prevents it +hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') +sc = hl.spark_context() +sc.addPyFile('gs://seqr-luigi/releases/dev/latest/pyscripts.zip') + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.misc.io import write +from v03_pipeline.lib.annotations import snv_indel, misc +from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import load_gencode_ensembl_to_refseq_id + + +gencode_ensembl_to_refseq_id_mapping = hl.literal(load_gencode_ensembl_to_refseq_id(44)) +ht = hl.read_table('gs://seqr-hail-search-data/v03/GRCh38/SNV_INDEL/annotations.ht') +ht = ht.repartition(2500) +ht = ht.write('gs://seqr-scratch-temp/annotations_repartitioned.ht') +ht = hl.vep( + ht, + config=os.environ['VEP_CONFIG_URI'], + name='vep', + block_size=1000, + tolerate_parse_error=True, + csq=False, +) +ht = ht.write('gs://seqr-scratch-temp/annotations_repartitioned_vep.ht') +ht = ht.annotate( + check_ref=snv_indel.check_ref(ht), + sorted_transcript_consequences=snv_indel.sorted_transcript_consequences(ht, gencode_ensembl_to_refseq_id_mapping=gencode_ensembl_to_refseq_id_mapping), + sorted_regulatory_feature_consequences=snv_indel.sorted_regulatory_feature_consequences(ht), + sorted_motif_feature_consequences=snv_indel.sorted_motif_feature_consequences(ht), +) +ht = ht.drop('vep', 'vep_proc_id') +ht = misc.annotate_enums(ht, ReferenceGenome.GRCh38, DatsetType.SNV_INDEL) +write(ht, 'gs://seqr-scratch-temp/annotations.ht') \ No newline at end of file