diff --git a/pyproject.toml b/pyproject.toml index adc5d947a..73826b005 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,9 @@ inline-quotes = "single" 'SLF001', # allow private access 'PLR0913', # allow high arity functions ] +'*migration*' = [ + 'N999', # allow invalid module names +] [tool.ruff.pylint] max-args = 6 diff --git a/v03_pipeline/bin/vep-110-GRCh38.sh b/v03_pipeline/bin/vep-110-GRCh38.sh index cd649b2ad..dead36366 100644 --- a/v03_pipeline/bin/vep-110-GRCh38.sh +++ b/v03_pipeline/bin/vep-110-GRCh38.sh @@ -12,7 +12,7 @@ export PROJECT="$(gcloud config get-value project)" export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)" export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)" export ASSEMBLY=GRCh38 -export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110 +export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:GRCh38 mkdir -p /vep_data @@ -36,26 +36,26 @@ sleep 60 sudo service docker restart # Copied from the repo at v03_pipeline/var/vep_config -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH +gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH # Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master) -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & +gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & # Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false) # tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz -gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ & +gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' /vep_data/ & gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ & # Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz -gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ & +gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ & # Generated with: # curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz # gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz # bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa # samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz -gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ & +gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ & docker pull ${VEP_DOCKER_IMAGE} & wait diff --git a/v03_pipeline/lib/annotations/sv.py b/v03_pipeline/lib/annotations/sv.py index 582474df4..5c75b0c94 100644 --- a/v03_pipeline/lib/annotations/sv.py +++ b/v03_pipeline/lib/annotations/sv.py @@ -81,6 +81,41 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression: return ht.alleles[1].replace('[<>]', '').split(':', 2) +def alleles(ht: hl.Table, **_: Any) -> hl.ArrayExpression: + return hl.array( + [ + 'N', + hl.if_else( + ( + hl.is_defined(ht.sv_type_detail_id) + & (hl.array(SV_TYPES)[ht.sv_type_id] != 'CPX') + ), + hl.format( + '<%s:%s>', + hl.array(SV_TYPES)[ht.sv_type_id], + hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id], + ), + hl.format('<%s>', hl.array(SV_TYPES)[ht.sv_type_id]), + ), + ], + ) + + +def info(ht: hl.Table, **_: Any) -> hl.StructExpression: + return hl.Struct( + ALGORITHMS=ht.algorithms, + END=ht.start_locus.position, + CHR2=ht.end_locus.contig, + END2=ht.end_locus.position, + SVTYPE=hl.array(SV_TYPES)[ht.sv_type_id], + SVLEN=ht.sv_len, + ) + + +def locus(ht: hl.Table, **_: Any) -> hl.LocusExpression: + return ht.start_locus + + def algorithms(ht: hl.Table, **_: Any) -> hl.Expression: return hl.str(',').join(ht['info.ALGORITHMS']) @@ -205,6 +240,10 @@ def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression: return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE'])) +def sv_len(ht: hl.Table, **_: Any) -> hl.Expression: + return ht['info.SVLEN'] + + def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression: return SV_TYPES_LOOKUP[_sv_types(ht)[0]] diff --git a/v03_pipeline/lib/annotations/sv_test.py b/v03_pipeline/lib/annotations/sv_test.py new file mode 100644 index 000000000..53e098069 --- /dev/null +++ b/v03_pipeline/lib/annotations/sv_test.py @@ -0,0 +1,171 @@ +import unittest + +import hail as hl + +from v03_pipeline.lib.annotations.fields import get_fields +from v03_pipeline.lib.model import DatasetType + + +class SVTest(unittest.TestCase): + def test_sv_export_annotations(self) -> None: + ht = hl.Table.parallelize( + [ + hl.Struct( + id=0, + algorithms='manta', + end_locus=hl.Locus( + contig='chr5', + position=20404, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=180928, + reference_genome='GRCh38', + ), + sv_len=123, + sv_type_id=2, + sv_type_detail_id=None, + ), + hl.Struct( + id=1, + algorithms='manta', + end_locus=hl.Locus( + contig='chr1', + position=789481, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=789481, + reference_genome='GRCh38', + ), + sv_len=245, + sv_type_id=2, + sv_type_detail_id=None, + ), + hl.Struct( + id=2, + algorithms='manta', + end_locus=hl.Locus( + contig='chr1', + position=6559723, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + sv_len=245, + sv_type_id=3, + sv_type_detail_id=2, + ), + hl.Struct( + id=3, + algorithms='manta', + end_locus=hl.Locus( + contig='chr1', + position=6559723, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + sv_len=245, + sv_type_id=7, + sv_type_detail_id=6, + ), + ], + hl.tstruct( + id=hl.tint32, + algorithms=hl.tstr, + end_locus=hl.tlocus('GRCh38'), + start_locus=hl.tlocus('GRCh38'), + sv_len=hl.tint32, + sv_type_id=hl.tint32, + sv_type_detail_id=hl.tint32, + ), + key='id', + ) + ht = ht.select( + **get_fields( + ht, + DatasetType.SV.export_vcf_annotation_fns, + ), + ) + self.assertEqual( + ht.collect(), + [ + hl.Struct( + id=0, + locus=hl.Locus( + contig='chr1', + position=180928, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=180928, + CHR2='chr5', + END2=20404, + SVTYPE='BND', + SVLEN=123, + ), + ), + hl.Struct( + id=1, + locus=hl.Locus( + contig='chr1', + position=789481, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=789481, + CHR2='chr1', + END2=789481, + SVTYPE='BND', + SVLEN=245, + ), + ), + hl.Struct( + id=2, + locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=6558902, + CHR2='chr1', + END2=6559723, + SVTYPE='CPX', + SVLEN=245, + ), + ), + hl.Struct( + id=3, + locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=6558902, + CHR2='chr1', + END2=6559723, + SVTYPE='INS', + SVLEN=245, + ), + ), + ], + ) diff --git a/v03_pipeline/lib/migration/__init__.py b/v03_pipeline/lib/migration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/lib/migration/base_migration.py b/v03_pipeline/lib/migration/base_migration.py new file mode 100644 index 000000000..0259a50f4 --- /dev/null +++ b/v03_pipeline/lib/migration/base_migration.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +import hail as hl + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class BaseMigration(ABC): + reference_genome_dataset_types: frozenset[ + tuple[ReferenceGenome, DatasetType] + ] = None + + @staticmethod + @abstractmethod + def migrate(ht: hl.Table) -> hl.Table: + pass diff --git a/v03_pipeline/lib/migration/misc.py b/v03_pipeline/lib/migration/misc.py new file mode 100644 index 000000000..5fe8ea911 --- /dev/null +++ b/v03_pipeline/lib/migration/misc.py @@ -0,0 +1,33 @@ +import inspect +import pkgutil +import re + +from v03_pipeline.lib.migration.base_migration import BaseMigration + +MIGRATION_NAME_PATTERN = r'(\d\d\d\d_.*)' + + +def list_migrations( + path: str, +) -> list[tuple[str, BaseMigration]]: + migrations = [] + for loader, name, _ in pkgutil.iter_modules([path]): + match = re.search(MIGRATION_NAME_PATTERN, name) + if match: + module = loader.find_module(name).load_module(name) + implemented_migration = next( + ( + m + for m in module.__dict__.values() + # Return objects that are + # classes, subclasses of the BaseMigration + # and also NOT the BaseMigration class. + if inspect.isclass(m) + and issubclass(m, BaseMigration) + and m != BaseMigration + ), + None, + ) + if implemented_migration: + migrations.append((match.group(1), implemented_migration)) + return sorted(migrations, key=lambda x: x[0]) diff --git a/v03_pipeline/lib/migration/misc_test.py b/v03_pipeline/lib/migration/misc_test.py new file mode 100644 index 000000000..8f8f16fdd --- /dev/null +++ b/v03_pipeline/lib/migration/misc_test.py @@ -0,0 +1,47 @@ +import os +import shutil +import tempfile +import unittest + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.migration.misc import list_migrations + + +class TestListMigrations(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + for migration in [ + '__init__.py', + '1111_a_migration.py', + '0000_migration.py', + '001_test.py', + 'abcd_test.py', + '0000_migration.txt', + ]: + with open(os.path.join(self.tmpdir.name, migration), 'w') as f: + f.write( + """ +from v03_pipeline.lib.migration.base_migration import BaseMigration +class ImplementedMigration(BaseMigration): + pass + """, + ) + + def tearDown(self): + if os.path.isdir(self.tmpdir.name): + shutil.rmtree(self.tmpdir.name) + + def test_list_migrations(self): + self.assertEqual( + [ + (x, issubclass(y, BaseMigration)) + for (x, y) in list_migrations(self.tmpdir.name) + ], + [ + ('0000_migration', True), + ('1111_a_migration', True), + ], + ) + self.assertTrue( + all(hasattr(x[1], 'migrate') for x in list_migrations(self.tmpdir.name)), + ) diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index c420c84e9..599e791b8 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -1,8 +1,10 @@ +import hashlib import math import os import uuid import hail as hl +import hailtop.fs as hfs from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes from v03_pipeline.lib.misc.nested_field import parse_nested_field @@ -200,6 +202,17 @@ def import_pedigree(pedigree_path: str) -> hl.Table: ) +def remap_pedigree_hash(remap_path: str, pedigree_path: str) -> hl.Int32Expression: + sha256 = hashlib.sha256() + if hfs.exists(remap_path): + with hfs.open(remap_path) as f1: + sha256.update(f1.read().encode('utf8')) + with hfs.open(pedigree_path) as f2: + sha256.update(f2.read().encode('utf8')) + # maximum 4 byte int + return hl.int32(int(sha256.hexdigest()[:8], 16)) + + def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable, str]: suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht' read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table diff --git a/v03_pipeline/lib/misc/io_test.py b/v03_pipeline/lib/misc/io_test.py index 2955557c1..ab0638d8c 100644 --- a/v03_pipeline/lib/misc/io_test.py +++ b/v03_pipeline/lib/misc/io_test.py @@ -6,13 +6,16 @@ compute_hail_n_partitions, file_size_bytes, import_imputed_sex, + remap_pedigree_hash, ) TEST_IMPUTED_SEX = 'v03_pipeline/var/test/sex_check/test_imputed_sex.tsv' TEST_IMPUTED_SEX_UNEXPECTED_VALUE = ( 'v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv' ) +TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' +TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' @@ -46,3 +49,14 @@ def test_import_imputed_sex_unexpected_value(self) -> None: 'Found unexpected value Unknown in imputed sex file', ht.collect, ) + + def test_remap_pedigree_hash(self) -> None: + self.assertEqual( + hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), + -560434714, + ) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index e7af6983f..51ed76b08 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -116,6 +116,7 @@ def row_fields( 'info.N_HET': hl.tint32, 'info.N_HOMALT': hl.tint32, 'info.StrVCTVRE': hl.tstr, + 'info.SVLEN': hl.tint32, **sv.CONSEQ_PREDICTED_GENE_COLS, }, DatasetType.GCNV: { @@ -239,6 +240,7 @@ def formatting_annotation_fns( sv.strvctvre, sv.sv_type_id, sv.sv_type_detail_id, + sv.sv_len, shared.xpos, ], DatasetType.GCNV: [ @@ -335,3 +337,17 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]: @property def should_send_to_allele_registry(self): return self == DatasetType.SNV_INDEL + + @property + def should_export_to_vcf(self): + return self == DatasetType.SV + + @property + def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]: + return { + DatasetType.SV: [ + sv.locus, + sv.alleles, + sv.info, + ], + }[self] diff --git a/v03_pipeline/lib/model/reference_dataset_collection.py b/v03_pipeline/lib/model/reference_dataset_collection.py index 7f87ff154..f784b105d 100644 --- a/v03_pipeline/lib/model/reference_dataset_collection.py +++ b/v03_pipeline/lib/model/reference_dataset_collection.py @@ -45,6 +45,7 @@ def datasets(self, dataset_type: DatasetType) -> list[str]: 'hmtvar', 'mitomap', 'mitimpact', + 'local_constraint_mito', ], (ReferenceDatasetCollection.HGMD, DatasetType.SNV_INDEL): ['hgmd'], (ReferenceDatasetCollection.INTERVAL, DatasetType.SNV_INDEL): [ diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 628470212..3dc0ae179 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -269,6 +269,20 @@ def variant_annotations_table_path( ) +def variant_annotations_vcf_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> str: + return os.path.join( + _v03_pipeline_prefix( + Env.HAIL_SEARCH_DATA, + reference_genome, + dataset_type, + ), + 'annotations.vcf.bgz', + ) + + def new_variants_table_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/reference_data/config.py b/v03_pipeline/lib/reference_data/config.py index f38d2773f..714ee9a57 100644 --- a/v03_pipeline/lib/reference_data/config.py +++ b/v03_pipeline/lib/reference_data/config.py @@ -16,6 +16,9 @@ parsed_clnsig, ) from v03_pipeline.lib.reference_data.hgmd import download_and_import_hgmd_vcf +from v03_pipeline.lib.reference_data.mito import ( + download_and_import_local_constraint_tsv, +) def import_locus_intervals( @@ -533,4 +536,14 @@ def custom_mpc_select(ht): 'custom_import': import_locus_intervals, }, }, + 'local_constraint_mito': { + '38': { + 'version': '2024-07-24', + # Originally sourced from https://www.biorxiv.org/content/10.1101/2022.12.16.520778v2.supplementary-material + # Supplementary Table 7. + 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', + 'custom_import': download_and_import_local_constraint_tsv, + 'select': {'score': 'MLC_score'}, + }, + }, } diff --git a/v03_pipeline/lib/reference_data/mito.py b/v03_pipeline/lib/reference_data/mito.py new file mode 100644 index 000000000..7df647324 --- /dev/null +++ b/v03_pipeline/lib/reference_data/mito.py @@ -0,0 +1,16 @@ +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome + + +def download_and_import_local_constraint_tsv( + url: str, + reference_genome: ReferenceGenome, +) -> hl.Table: + ht = hl.import_table(url, types={'Position': hl.tint32, 'MLC_score': hl.tfloat32}) + ht = ht.select( + locus=hl.locus('chrM', ht.Position, reference_genome.value), + alleles=[ht.Reference, ht.Alternate], + MLC_score=ht.MLC_score, + ) + return ht.key_by('locus', 'alleles') diff --git a/v03_pipeline/lib/tasks/__init__.py b/v03_pipeline/lib/tasks/__init__.py index 4223135cc..624173f72 100644 --- a/v03_pipeline/lib/tasks/__init__.py +++ b/v03_pipeline/lib/tasks/__init__.py @@ -4,6 +4,10 @@ DeleteProjectFamilyTablesTask, ) from v03_pipeline.lib.tasks.delete_project_table import DeleteProjectTableTask +from v03_pipeline.lib.tasks.migrate_all_lookup_tables import MigrateAllLookupTablesTask +from v03_pipeline.lib.tasks.migrate_all_variant_annotations_tables import ( + MigrateAllVariantAnnotationsTablesTask, +) from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import ( UpdateCachedReferenceDatasetQueries, ) @@ -39,6 +43,8 @@ 'DeleteFamilyTablesTask', 'DeleteProjectFamilyTablesTask', 'DeleteProjectTableTask', + 'MigrateAllLookupTablesTask', + 'MigrateAllVariantAnnotationsTablesTask', 'UpdateProjectTableTask', 'UpdateProjectTableWithDeletedFamiliesTask', 'UpdateLookupTableTask', diff --git a/v03_pipeline/lib/tasks/base/base_migrate.py b/v03_pipeline/lib/tasks/base/base_migrate.py new file mode 100644 index 000000000..4611d97c9 --- /dev/null +++ b/v03_pipeline/lib/tasks/base/base_migrate.py @@ -0,0 +1,49 @@ +import hail as hl +import luigi + +from v03_pipeline.lib.migration.misc import list_migrations +from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask + + +class BaseMigrateTask(BaseUpdateTask): + migration_name = luigi.Parameter() + + @property + def migrations_path(self): + raise NotImplementedError + + def requires(self) -> luigi.Task | None: + # Require the previous migration + defined_migrations = [x[0] for x in list_migrations(self.migrations_path)] + for i, migration in enumerate(defined_migrations): + if i > 0 and migration == self.migration_name: + return self.clone( + self.__class__, + migration_name=defined_migrations[i - 1], + ) + return None + + def complete(self) -> luigi.Target: + if super().complete(): + migration = dict( + list_migrations(self.migrations_path), + )[self.migration_name] + if ( + self.reference_genome, + self.dataset_type, + ) not in migration.reference_genome_dataset_types: + return True + mt = hl.read_table(self.output().path) + return hl.eval(mt.globals.migrations.index(self.migration_name) >= 0) + return False + + def update_table(self, ht: hl.Table) -> hl.Table: + migration = dict(list_migrations(self.migrations_path))[self.migration_name] + if ( + (self.reference_genome, self.dataset_type) + ) in migration.reference_genome_dataset_types: + ht = migration.migrate(ht) + return ht.annotate_globals( + migrations=ht.globals.migrations.append(self.migration_name), + ) + return ht diff --git a/v03_pipeline/lib/tasks/base/base_update_lookup_table.py b/v03_pipeline/lib/tasks/base/base_update_lookup_table.py index 83f13be38..2c32eb507 100644 --- a/v03_pipeline/lib/tasks/base/base_update_lookup_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_lookup_table.py @@ -36,6 +36,12 @@ def initialize_table(self) -> hl.Table: globals=hl.Struct( project_guids=hl.empty_array(hl.tstr), project_families=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), ), ) diff --git a/v03_pipeline/lib/tasks/base/base_update_project_table.py b/v03_pipeline/lib/tasks/base/base_update_project_table.py index 0040b6ad9..ad7ee0d1e 100644 --- a/v03_pipeline/lib/tasks/base/base_update_project_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_project_table.py @@ -32,6 +32,8 @@ def initialize_table(self) -> hl.Table: globals=hl.Struct( family_guids=hl.empty_array(hl.tstr), family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set(hl.tstr), + updates=hl.empty_set( + hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), + ), ), ) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 56dc00ea8..7505378d1 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -66,7 +66,14 @@ def initialize_table(self) -> hl.Table: paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), - updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), + migrations=hl.empty_array(hl.tstr), ), ) @@ -102,5 +109,6 @@ def annotate_globals( **rdc_globals.enums, ), updates=ht.globals.updates, + migrations=ht.globals.migrations, ) return annotate_enums(ht, self.reference_genome, self.dataset_type) diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py index 3cb56f1c8..75f11987a 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py @@ -115,7 +115,12 @@ def setUp(self) -> None: family_guids=['family_a', 'family_b'], family_samples={'family_a': ['1', '2', '3'], 'family_b': ['4']}, sample_type='WGS', - updates={'v03_pipeline/var/test/callsets/1kg_30variants.vcf'}, + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=123, + ), + }, ), ) ht.write( diff --git a/v03_pipeline/lib/tasks/migrate_all_lookup_tables.py b/v03_pipeline/lib/tasks/migrate_all_lookup_tables.py new file mode 100644 index 000000000..0f8978234 --- /dev/null +++ b/v03_pipeline/lib/tasks/migrate_all_lookup_tables.py @@ -0,0 +1,37 @@ +import luigi + +import v03_pipeline.migrations.lookup +from v03_pipeline.lib.migration.misc import list_migrations +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.tasks.migrate_lookup_table import ( + MigrateLookupTableTask, +) + + +class MigrateAllLookupTablesTask(luigi.Task): + reference_genome = luigi.EnumParameter(enum=ReferenceGenome) + dataset_type = luigi.EnumParameter(enum=DatasetType) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dynamic_migration_tasks = [] + + def complete(self) -> bool: + return len(self.dynamic_migration_tasks) >= 1 and all( + migration_task.complete() for migration_task in self.migration_task + ) + + def run(self): + for migration in list_migrations(v03_pipeline.migrations.lookup.__path__): + if ( + (self.reference_genome, self.dataset_type) + in migration.reference_genome_dataset_types + and self.dataset_type.has_lookup_table + ): + self.dynamic_migration_tasks.append( + MigrateLookupTableTask( + self.reference_genome, + self.dataset_type, + ), + ) + yield self.dynamic_migration_tasks diff --git a/v03_pipeline/lib/tasks/migrate_all_variant_annotations_tables.py b/v03_pipeline/lib/tasks/migrate_all_variant_annotations_tables.py new file mode 100644 index 000000000..dff1a251c --- /dev/null +++ b/v03_pipeline/lib/tasks/migrate_all_variant_annotations_tables.py @@ -0,0 +1,36 @@ +import luigi + +import v03_pipeline.migrations.annotations +from v03_pipeline.lib.migration.misc import list_migrations +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.tasks.migrate_variant_annotations_table import ( + MigrateVariantAnnotationsTableTask, +) + + +class MigrateAllVariantAnnotationsTablesTask(luigi.Task): + reference_genome = luigi.EnumParameter(enum=ReferenceGenome) + dataset_type = luigi.EnumParameter(enum=DatasetType) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.dynamic_migration_tasks = [] + + def complete(self) -> bool: + return len(self.dynamic_migration_tasks) >= 1 and all( + migration_task.complete() for migration_task in self.migration_task + ) + + def run(self): + for migration in list_migrations(v03_pipeline.migrations.annotations.__path__): + if ( + self.reference_genome, + self.dataset_type, + ) in migration.reference_genome_dataset_types: + self.dynamic_migration_tasks.append( + MigrateVariantAnnotationsTableTask( + self.reference_genome, + self.dataset_type, + ), + ) + yield self.dynamic_migration_tasks diff --git a/v03_pipeline/lib/tasks/migrate_lookup_table.py b/v03_pipeline/lib/tasks/migrate_lookup_table.py new file mode 100644 index 000000000..4fe9d7ac4 --- /dev/null +++ b/v03_pipeline/lib/tasks/migrate_lookup_table.py @@ -0,0 +1,54 @@ +import hail as hl +import luigi + +import v03_pipeline.migrations.lookup +from v03_pipeline.lib.paths import ( + lookup_table_path, +) +from v03_pipeline.lib.tasks.base.base_migrate import BaseMigrateTask +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +class MigrateLookupTableTask(BaseMigrateTask): + @property + def migrations_path(self): + return v03_pipeline.migrations.lookup.__path__[0] + + def output(self) -> luigi.Target: + return GCSorLocalTarget( + lookup_table_path( + self.reference_genome, + self.dataset_type, + ), + ) + + def initialize_table(self) -> hl.Table: + key_type = self.dataset_type.table_key_type(self.reference_genome) + return hl.Table.parallelize( + [], + hl.tstruct( + **key_type, + project_stats=hl.tarray( + hl.tarray( + hl.tstruct( + **{ + field: hl.tint32 + for field in self.dataset_type.lookup_table_fields_and_genotype_filter_fns + }, + ), + ), + ), + ), + key=key_type.fields, + globals=hl.Struct( + project_guids=hl.empty_array(hl.tstr), + project_families=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), + ), + ) diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py new file mode 100644 index 000000000..44e3debde --- /dev/null +++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py @@ -0,0 +1,44 @@ +import hail as hl +import luigi + +import v03_pipeline.migrations.annotations +from v03_pipeline.lib.paths import ( + variant_annotations_table_path, +) +from v03_pipeline.lib.tasks.base.base_migrate import BaseMigrateTask +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +class MigrateVariantAnnotationsTableTask(BaseMigrateTask): + @property + def migrations_path(self): + return v03_pipeline.migrations.annotations.__path__[0] + + def output(self) -> luigi.Target: + return GCSorLocalTarget( + variant_annotations_table_path( + self.reference_genome, + self.dataset_type, + ), + ) + + def initialize_table(self) -> hl.Table: + key_type = self.dataset_type.table_key_type(self.reference_genome) + return hl.Table.parallelize( + [], + key_type, + key=key_type.fields, + globals=hl.Struct( + paths=hl.Struct(), + versions=hl.Struct(), + enums=hl.Struct(), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), + migrations=hl.empty_array(hl.tstr), + ), + ) diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py new file mode 100644 index 000000000..550a8a330 --- /dev/null +++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py @@ -0,0 +1,154 @@ +from unittest import mock + +import hail as hl +import luigi.worker + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.tasks.migrate_variant_annotations_table import ( + MigrateVariantAnnotationsTableTask, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + + +class MockMigration(BaseMigration): + reference_genome_dataset_types: frozenset[ + tuple[ReferenceGenome, DatasetType] + ] = frozenset( + ((ReferenceGenome.GRCh38, DatasetType.GCNV),), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + ht = ht.annotate( + variant_id_id=hl.format('%s_id', ht.variant_id), + ) + return ht.annotate_globals(mock_migration='a mock migration') + + +class MockMigration2(BaseMigration): + reference_genome_dataset_types: frozenset[ + tuple[ReferenceGenome, DatasetType] + ] = frozenset( + ((ReferenceGenome.GRCh38, DatasetType.GCNV),), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate_globals(mock_migration2='a second mock migration') + + +class MigrateVariantAnnotationsTableTaskTest(MockedDatarootTestCase): + @mock.patch( + 'v03_pipeline.lib.tasks.base.base_migrate.list_migrations', + ) + def test_mock_migration( + self, + mock_list_migrations: mock.Mock, + ) -> None: + mock_list_migrations.return_value = [ + ('0012_mock_migration', MockMigration), + ] + worker = luigi.worker.Worker() + task = MigrateVariantAnnotationsTableTask( + dataset_type=DatasetType.GCNV, + reference_genome=ReferenceGenome.GRCh38, + migration_name='0012_mock_migration', + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + ht = hl.read_table(task.output().path) + self.assertEqual( + ht.globals.collect(), + [ + hl.Struct( + paths=hl.Struct(), + versions=hl.Struct(), + enums=hl.Struct(), + updates=set(), + migrations=['0012_mock_migration'], + mock_migration='a mock migration', + ), + ], + ) + self.assertEqual( + ht.collect(), + [], + ) + + @mock.patch( + 'v03_pipeline.lib.tasks.base.base_migrate.list_migrations', + ) + def test_migration_is_noop_for_other_dataset_types( + self, + mock_list_migrations: mock.Mock, + ) -> None: + mock_list_migrations.return_value = [ + ('0012_mock_migration', MockMigration), + ] + worker = luigi.worker.Worker() + task = MigrateVariantAnnotationsTableTask( + dataset_type=DatasetType.SV, + reference_genome=ReferenceGenome.GRCh38, + migration_name='0012_mock_migration', + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + ht = hl.read_table(task.output().path) + self.assertEqual( + ht.globals.collect(), + [ + hl.Struct( + paths=hl.Struct(), + versions=hl.Struct(), + enums=hl.Struct(), + updates=set(), + migrations=[], + ), + ], + ) + + @mock.patch( + 'v03_pipeline.lib.tasks.base.base_migrate.list_migrations', + ) + def test_migration_dependency( + self, + mock_list_migrations: mock.Mock, + ) -> None: + mock_list_migrations.return_value = [ + ('0012_mock_migration', MockMigration), + ('0013_mock_migration2', MockMigration2), + ] + worker = luigi.worker.Worker() + task = MigrateVariantAnnotationsTableTask( + dataset_type=DatasetType.GCNV, + reference_genome=ReferenceGenome.GRCh38, + migration_name='0013_mock_migration2', + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + ht = hl.read_table(task.output().path) + self.assertEqual( + ht.globals.collect(), + [ + hl.Struct( + paths=hl.Struct(), + versions=hl.Struct(), + enums=hl.Struct(), + updates=set(), + migrations=['0012_mock_migration', '0013_mock_migration2'], + mock_migration='a mock migration', + mock_migration2='a second mock migration', + ), + ], + ) + self.assertEqual( + ht.collect(), + [], + ) diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index fe68eebe3..147dfe9e0 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -628,6 +628,21 @@ ), }, }, + 'local_constraint_mito': { + '38': { + **CONFIG['local_constraint_mito']['38'], + 'custom_import': lambda *_: hl.Table.parallelize( + [], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + MLC_score=hl.tfloat32, + ), + key=['locus', 'alleles'], + globals=hl.Struct(), + ), + }, + }, } @@ -728,6 +743,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + migrations=hl.empty_array(hl.tstr), ), ) task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( @@ -898,6 +914,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( ), ), ), + migrations=[], updates=set(), ), ], @@ -934,6 +951,7 @@ def test_update_vat_with_updated_rdc_mito_38( versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + migrations=hl.empty_array(hl.tstr), ), ) task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( @@ -960,6 +978,7 @@ def test_update_vat_with_updated_rdc_mito_38( clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', + local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', ), versions=hl.Struct( gnomad_mito='v3.1', @@ -970,6 +989,7 @@ def test_update_vat_with_updated_rdc_mito_38( clinvar_mito='2023-07-22', dbnsfp_mito='4.2', high_constraint_region_mito='Feb-15-2022', + local_constraint_mito='2024-07-24', ), enums=hl.Struct( gnomad_mito=hl.Struct(), @@ -985,6 +1005,7 @@ def test_update_vat_with_updated_rdc_mito_38( MutationTaster_pred=['D', 'A', 'N', 'P'], ), high_constraint_region_mito=hl.Struct(), + local_constraint_mito=hl.Struct(), sorted_transcript_consequences=hl.Struct( biotype=BIOTYPES, consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, @@ -994,6 +1015,7 @@ def test_update_vat_with_updated_rdc_mito_38( trna_prediction=MITOTIP_PATHOGENICITIES, ), ), + migrations=[], updates=set(), ), ], @@ -1041,6 +1063,7 @@ def test_update_vat_with_updated_rdc_mito_38( mitomap=None, mitimpact=hl.Struct(score=0.5199999809265137), high_constraint_region_mito=True, + local_constraint_mito=hl.Struct(score=0.5), ), ], ) @@ -1076,6 +1099,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + migrations=hl.empty_array(hl.tstr), ), ) task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( @@ -1155,6 +1179,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( lof_filter=LOF_FILTERS, ), ), + migrations=[], updates=set(), ), ], diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index eb04d1d48..3f9fc4b46 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -2,6 +2,7 @@ import luigi import luigi.util +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.lookup import ( compute_callset_lookup_ht, join_lookup_hts, @@ -35,9 +36,13 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ) - for project_guid in self.project_guids + for i, project_guid in enumerate(self.project_guids) ], ), hl.read_table(self.output().path).updates, @@ -76,6 +81,10 @@ def update_table(self, ht: hl.Table) -> hl.Table: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ), ) @@ -102,6 +111,10 @@ def update_table(self, ht: hl.Table) -> hl.Table: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ), ) diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index 8551d873e..073627fa6 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -1,6 +1,7 @@ import hail as hl import luigi.worker +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.tasks.update_lookup_table import ( UpdateLookupTableTask, @@ -39,7 +40,13 @@ def test_skip_update_lookup_table_task(self) -> None: project_guids=[], project_families={}, updates={ - hl.Struct(callset=TEST_VCF, project_guid='R0555_seqr_demo'), + hl.Struct( + callset=TEST_VCF, + project_guid='R0555_seqr_demo', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), + ), + ), }, ), ], @@ -70,7 +77,13 @@ def test_update_lookup_table_task(self) -> None: project_guids=['R0113_test_project'], project_families={'R0113_test_project': ['abc_1']}, updates={ - hl.Struct(callset=TEST_VCF, project_guid='R0113_test_project'), + hl.Struct( + callset=TEST_VCF, + project_guid='R0113_test_project', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), + ), + ), }, ), ], diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py index 70915ef9d..5b103d453 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py @@ -123,8 +123,16 @@ def test_delete_project( project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(project_guid='project_a', callset='abc'), - hl.Struct(project_guid='project_b', callset='abc'), + hl.Struct( + project_guid='project_a', + callset='abc', + remap_pedigree_hash=123, + ), + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), }, ), ) @@ -147,8 +155,16 @@ def test_delete_project( project_guids=['project_a', 'project_b'], project_families={'project_a': ['2'], 'project_b': ['4']}, updates={ - hl.Struct(project_guid='project_a', callset='abc'), - hl.Struct(project_guid='project_b', callset='abc'), + hl.Struct( + project_guid='project_a', + callset='abc', + remap_pedigree_hash=123, + ), + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), }, ), ], diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py index e40e034ec..c1f112631 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py @@ -122,8 +122,16 @@ def test_delete_project( project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(project_guid='project_a', callset='abc'), - hl.Struct(project_guid='project_b', callset='abc'), + hl.Struct( + project_guid='project_a', + callset='abc', + remap_pedigree_hash=123, + ), + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), }, ), ) @@ -144,7 +152,13 @@ def test_delete_project( hl.Struct( project_guids=['project_b'], project_families={'project_b': ['4']}, - updates={hl.Struct(project_guid='project_b', callset='abc')}, + updates={ + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), + }, ), ], ) diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index c7ea539e4..a2ca742ba 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -8,6 +8,7 @@ join_family_entries_hts, remove_family_guids, ) +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_project_table import ( BaseUpdateProjectTableTask, @@ -28,7 +29,13 @@ def complete(self) -> bool: and super().complete() and hl.eval( hl.read_table(self.output().path).updates.contains( - self.callset_path, + hl.Struct( + callset=self.callset_path, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_path, + self.project_pedigree_path, + ), + ), ), ) ) @@ -62,5 +69,13 @@ def update_table(self, ht: hl.Table) -> hl.Table: family_guids=ht.family_guids, family_samples=ht.family_samples, sample_type=self.sample_type.value, - updates=ht.updates.add(self.callset_path), + updates=ht.updates.add( + hl.Struct( + callset=self.callset_path, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_path, + self.project_pedigree_path, + ), + ), + ), ) diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index c0a5a4e57..4dba40590 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -1,6 +1,7 @@ import hail as hl import luigi.worker +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase @@ -8,6 +9,9 @@ TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' +TEST_PEDIGREE_3_DIFFERENT_FAMILIES = ( + 'v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv' +) class UpdateProjectTableTaskTest(MockedDatarootTestCase): @@ -36,7 +40,17 @@ def test_update_project_table_task(self) -> None: 'abc_1': ['HG00731_1', 'HG00732_1', 'HG00733_1'], }, sample_type=SampleType.WGS.value, - updates={'v03_pipeline/var/test/callsets/1kg_30variants.vcf'}, + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), + ), + }, ), ], ) @@ -108,3 +122,95 @@ def test_update_project_table_task(self) -> None: ), ], ) + + def test_update_project_table_task_different_pedigree(self) -> None: + worker = luigi.worker.Worker() + upt_task = UpdateProjectTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_VCF, + project_guid='R0113_test_project', + project_remap_path=TEST_REMAP, + project_pedigree_path=TEST_PEDIGREE_3, + skip_validation=True, + ) + worker.add(upt_task) + worker.run() + upt_task = UpdateProjectTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_VCF, + project_guid='R0113_test_project', + project_remap_path=TEST_REMAP, + project_pedigree_path=TEST_PEDIGREE_3_DIFFERENT_FAMILIES, + skip_validation=True, + ) + worker.add(upt_task) + worker.run() + self.assertTrue(upt_task.complete()) + worker.add(upt_task) + worker.run() + ht = hl.read_table(upt_task.output().path) + self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + family_guids=['abc_1'], + family_samples={ + 'abc_1': ['HG00731_1', 'HG00733_1'], + }, + sample_type=SampleType.WGS.value, + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), + ), + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3_DIFFERENT_FAMILIES, + ), + ), + ), + }, + ), + ], + ) + + self.assertCountEqual( + ht.collect()[0], + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=876499, + reference_genome='GRCh38', + ), + alleles=['A', 'G'], + filters=set(), + family_entries=[ + [ + hl.Struct( + GQ=21, + AB=1.0, + DP=7, + GT=hl.Call(alleles=[1, 1], phased=False), + ), + hl.Struct( + GQ=12, + AB=1.0, + DP=4, + GT=hl.Call(alleles=[1, 1], phased=False), + ), + ], + ], + ), + ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py index 67410ef18..3d0c9df8b 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py @@ -85,8 +85,16 @@ def setUp(self) -> None: project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -123,8 +131,16 @@ def setUp(self) -> None: key='id', globals=hl.Struct( updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -151,8 +167,16 @@ def test_update_annotations_with_deleted_project(self) -> None: [ hl.Struct( updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ], diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py index 295a9577b..c144bffec 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py @@ -93,8 +93,16 @@ def setUp(self) -> None: project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -131,8 +139,16 @@ def setUp(self) -> None: key='id', globals=hl.Struct( updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -156,7 +172,15 @@ def test_update_annotations_with_deleted_project(self) -> None: self.assertEqual( ht.globals.collect(), [ - hl.Struct(updates={hl.Struct(callset='abc', project_guid='project_a')}), + hl.Struct( + updates={ + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + }, + ), ], ) self.assertEqual( diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index 5ba1448c9..ef6442f22 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -4,6 +4,7 @@ from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.misc.callsets import get_callset_ht +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.paths import ( lookup_table_path, new_variants_table_path, @@ -42,9 +43,13 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ) - for project_guid in self.project_guids + for i, project_guid in enumerate(self.project_guids) ], ), hl.read_table(self.output().path).updates, @@ -96,8 +101,15 @@ def update_table(self, ht: hl.Table) -> hl.Table: return ht.annotate_globals( updates=ht.updates.union( { - hl.Struct(callset=self.callset_path, project_guid=project_guid) - for project_guid in self.project_guids + hl.Struct( + callset=self.callset_path, + project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), + ) + for i, project_guid in enumerate(self.project_guids) }, ), ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index d22dae749..b8d53b983 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -19,6 +19,7 @@ SV_TYPES, TRANSCRIPT_CONSEQUENCE_TERMS, ) +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.validation import validate_expected_contig_frequency from v03_pipeline.lib.model import ( CachedReferenceDatasetQuery, @@ -400,6 +401,9 @@ def test_multiple_update_vat( hl.Struct( callset=TEST_SNV_INDEL_VCF, project_guid='R0113_test_project', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), + ), ), }, ], @@ -550,10 +554,22 @@ def test_multiple_update_vat( hl.Struct( callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', project_guid='R0113_test_project', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), ), hl.Struct( callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', project_guid='R0114_project4', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_4, + ), + ), ), }, paths=hl.Struct( @@ -588,6 +604,7 @@ def test_multiple_update_vat( screen=None, hgmd='HGMD_Pro_2023', ), + migrations=[], enums=hl.Struct( cadd=hl.Struct(), clinvar=hl.Struct( @@ -842,6 +859,7 @@ def test_mito_update_vat( hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht', + local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', ), versions=hl.Struct( high_constraint_region_mito='Feb-15-2022', @@ -852,9 +870,11 @@ def test_mito_update_vat( hmtvar='Jan. 10 2022', mitomap='Feb. 04 2022', mitimpact='3.0.7', + local_constraint_mito='2024-07-24', ), enums=hl.Struct( high_constraint_region_mito=hl.Struct(), + local_constraint_mito=hl.Struct(), clinvar_mito=hl.Struct( assertion=CLINVAR_ASSERTIONS, pathogenicity=CLINVAR_PATHOGENICITIES, @@ -874,10 +894,17 @@ def test_mito_update_vat( ), mitotip=hl.Struct(trna_prediction=MITOTIP_PATHOGENICITIES), ), + migrations=[], updates={ hl.Struct( callset='v03_pipeline/var/test/callsets/mito_1.mt', project_guid='R0115_test_project2', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + 'not_a_real_file', + TEST_PEDIGREE_5, + ), + ), ), }, ), @@ -920,6 +947,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -955,6 +983,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -990,6 +1019,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -1025,6 +1055,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -1060,6 +1091,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), ], ) @@ -1111,10 +1143,17 @@ def test_sv_update_vat( major_consequence=SV_CONSEQUENCE_RANKS, ), ), + migrations=[], updates={ hl.Struct( callset=TEST_SV_VCF, project_guid='R0115_test_project2', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + 'not_a_real_file', + TEST_PEDIGREE_5, + ), + ), ), }, ), @@ -1161,6 +1200,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=-1, sv_type_id=2, sv_type_detail_id=None, xpos=1000180928, @@ -1203,6 +1243,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=223225007, sv_type_id=2, sv_type_detail_id=None, xpos=1000789481, @@ -1255,6 +1296,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=821, sv_type_id=3, sv_type_detail_id=2, xpos=1006558902, @@ -1310,6 +1352,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=534718, sv_type_id=3, sv_type_detail_id=9, xpos=1180540234, @@ -1362,6 +1405,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=841, sv_type_id=3, sv_type_detail_id=12, xpos=1016088760, @@ -1419,6 +1463,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=52921, sv_type_id=3, sv_type_detail_id=13, xpos=1021427498, @@ -1460,6 +1505,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=14532, sv_type_id=5, sv_type_detail_id=None, xpos=1000413968, @@ -1497,6 +1543,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=6000, sv_type_id=6, sv_type_detail_id=None, xpos=1000257666, @@ -1538,6 +1585,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=955, sv_type_id=7, sv_type_detail_id=6, xpos=1017465707, @@ -1582,6 +1630,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=hl.eval(hl.float32(0.1255))), + sv_len=298, sv_type_id=7, sv_type_detail_id=4, xpos=1004228405, @@ -1623,6 +1672,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=5520, sv_type_id=7, sv_type_detail_id=5, xpos=1048963084, @@ -1671,10 +1721,17 @@ def test_gcnv_update_vat( major_consequence=SV_CONSEQUENCE_RANKS, ), ), + migrations=[], updates={ hl.Struct( callset=TEST_GCNV_BED_FILE, project_guid='R0115_test_project2', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + 'not_a_real_file', + TEST_PEDIGREE_5, + ), + ), ), }, ), diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index b70dc2a6f..9f99d1549 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -10,6 +10,7 @@ ) from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks from v03_pipeline.lib.misc.callsets import get_callset_ht +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.math import constrain from v03_pipeline.lib.model import Env, ReferenceDatasetCollection from v03_pipeline.lib.paths import ( @@ -128,9 +129,13 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ) - for project_guid in self.project_guids + for i, project_guid in enumerate(self.project_guids) ], ), hl.read_table(self.output().path).updates, @@ -218,7 +223,14 @@ def create_table(self) -> hl.Table: return new_variants_ht.select_globals( updates={ - hl.Struct(callset=self.callset_path, project_guid=project_guid) - for project_guid in self.project_guids + hl.Struct( + callset=self.callset_path, + project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), + ) + for i, project_guid in enumerate(self.project_guids) }, ) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 074d0a294..09ac0b030 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -87,22 +87,20 @@ def test_snv_write_project_family_tables_task(self) -> None: DatasetType.SNV_INDEL, 'R0113_test_project', ), - ).family_guids.collect(), + ).family_guids.collect()[0], [ - [ - '123_1', - '234_1', - '345_1', - '456_1', - '567_1', - '678_1', - '789_1', - '890_1', - '901_1', - 'bcd_1', - 'cde_1', - 'def_1', - 'efg_1', - ], + '123_1', + '234_1', + '345_1', + '456_1', + '567_1', + '678_1', + '789_1', + '890_1', + '901_1', + 'bcd_1', + 'cde_1', + 'def_1', + 'efg_1', ], ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 609c3b014..b1e4bc547 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -12,6 +12,7 @@ does_file_exist, import_pedigree, import_remap, + remap_pedigree_hash, ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples @@ -36,7 +37,17 @@ class WriteRemappedAndSubsettedCallsetTask(BaseWriteTask): project_pedigree_path = luigi.Parameter() def complete(self) -> luigi.Target: - return not self.force and super().complete() + return ( + not self.force + and super().complete() + and hl.eval( + hl.read_matrix_table(self.output().path).globals.remap_pedigree_hash + == remap_pedigree_hash( + self.project_remap_path, + self.project_pedigree_path, + ), + ) + ) def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -145,6 +156,10 @@ def create_table(self) -> hl.MatrixTable: if field not in self.dataset_type.row_fields: mt = mt.drop(field) return mt.select_globals( + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_path, + self.project_pedigree_path, + ), family_samples=( { f.family_guid: sorted(f.samples.keys()) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 48f2b481a..ad4cd3640 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -4,6 +4,7 @@ import hail as hl import luigi.worker +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import relatedness_check_table_path, sex_check_table_path from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( @@ -94,6 +95,12 @@ def test_write_remapped_and_subsetted_callset_task( mt.globals.collect(), [ hl.Struct( + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), failed_family_samples=hl.Struct( missing_samples={}, relatedness_check={}, @@ -131,6 +138,12 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( mt.globals.collect(), [ hl.Struct( + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_4, + ), + ), family_samples={ '123_1': ['NA19675_1'], '234_1': ['NA19678_1'], diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf.py new file mode 100644 index 000000000..8c5d0e4f4 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf.py @@ -0,0 +1,40 @@ +import hail as hl +import luigi + +from v03_pipeline.lib.annotations.fields import get_fields +from v03_pipeline.lib.paths import variant_annotations_vcf_path +from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams +from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( + BaseUpdateVariantAnnotationsTableTask, +) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +@luigi.util.inherits(BaseLoadingRunParams) +class WriteVariantAnnotationsVCF(BaseHailTableTask): + def output(self) -> luigi.Target: + return GCSorLocalTarget( + variant_annotations_vcf_path( + self.reference_genome, + self.dataset_type, + ), + ) + + def complete(self) -> bool: + return not self.dataset_type.should_export_to_vcf + + def requires(self) -> luigi.Task: + return self.clone(BaseUpdateVariantAnnotationsTableTask, force=False) + + def run(self) -> None: + ht = hl.read_table(self.input().path) + ht = ht.annotate( + **get_fields( + ht, + self.dataset_type.export_vcf_annotation_fns, + **self.param_kwargs, + ), + ) + ht = ht.key_by('locus', 'alleles') + hl.export_vcf(ht, self.output().path, tabix=True) diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py new file mode 100644 index 000000000..d96096be0 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py @@ -0,0 +1,76 @@ +from unittest.mock import Mock, patch + +import hailtop.fs as hfs +import luigi.worker + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( + UpdateVariantAnnotationsTableWithNewSamplesTask, +) +from v03_pipeline.lib.tasks.write_variant_annotations_vcf import ( + WriteVariantAnnotationsVCF, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' +TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv' + +GENE_ID_MAPPING = { + 'OR4F5': 'ENSG00000186092', + 'PLEKHG4B': 'ENSG00000153404', + 'OR4F16': 'ENSG00000186192', + 'OR4F29': 'ENSG00000284733', + 'FBXO28': 'ENSG00000143756', + 'SAMD11': 'ENSG00000187634', + 'C1orf174': 'ENSG00000198912', + 'TAS1R1': 'ENSG00000173662', + 'FAM131C': 'ENSG00000185519', + 'RCC2': 'ENSG00000179051', + 'NBPF3': 'ENSG00000142794', + 'AGBL4': 'ENSG00000186094', + 'KIAA1614': 'ENSG00000135835', + 'MR1': 'ENSG00000153029', + 'STX6': 'ENSG00000135823', + 'XPR1': 'ENSG00000143324', +} + + +class WriteVariantAnnotationsVCFTest(MockedDatarootTestCase): + @patch( + 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_gene_symbol_to_gene_id', + ) + def test_sv_export_vcf( + self, + mock_load_gencode: Mock, + ) -> None: + mock_load_gencode.return_value = GENE_ID_MAPPING + worker = luigi.worker.Worker() + update_variant_annotations_task = ( + UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SV, + sample_type=SampleType.WGS, + callset_path=TEST_SV_VCF, + project_guids=['R0115_test_project2'], + project_remap_paths=['not_a_real_file'], + project_pedigree_paths=[TEST_PEDIGREE_5], + skip_validation=True, + run_id='run_id1', + ) + ) + worker.add(update_variant_annotations_task) + worker.run() + self.assertTrue(update_variant_annotations_task.complete()) + write_variant_annotations_vcf_task = WriteVariantAnnotationsVCF( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SV, + sample_type=SampleType.WGS, + callset_path=TEST_SV_VCF, + ) + worker.add(write_variant_annotations_vcf_task) + worker.run() + self.assertTrue( + hfs.exists( + write_variant_annotations_vcf_task.output().path, + ), + ) diff --git a/v03_pipeline/migrations/__init__.py b/v03_pipeline/migrations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/migrations/annotations/0001_add_migrations_global.py b/v03_pipeline/migrations/annotations/0001_add_migrations_global.py new file mode 100644 index 000000000..c255ff215 --- /dev/null +++ b/v03_pipeline/migrations/annotations/0001_add_migrations_global.py @@ -0,0 +1,24 @@ +import hail as hl + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class AddMigrationsGlobals(BaseMigration): + @property + def reference_genome_dataset_types() -> ( + frozenset[tuple[ReferenceGenome, DatasetType]] + ): + return frozenset( + ( + (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.MITO), + (ReferenceGenome.GRCh38, DatasetType.GCNV), + (ReferenceGenome.GRCh38, DatasetType.SV), + ), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate_globals(migrations=hl.empty_list(hl.str)) diff --git a/v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py b/v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py new file mode 100644 index 000000000..89d900542 --- /dev/null +++ b/v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py @@ -0,0 +1,28 @@ +import hail as hl + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class AddRemapPedigreeHash(BaseMigration): + @property + def reference_genome_dataset_types() -> ( + frozenset[tuple[ReferenceGenome, DatasetType]] + ): + return frozenset( + ( + (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.MITO), + (ReferenceGenome.GRCh38, DatasetType.GCNV), + (ReferenceGenome.GRCh38, DatasetType.SV), + ), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate_globals( + updates=ht.globals.updates.map( + lambda u: u.annotate(remap_pedigree_hash=hl.missing(hl.tint32)), + ), + ) diff --git a/v03_pipeline/migrations/annotations/__init__.py b/v03_pipeline/migrations/annotations/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/migrations/lookup/0001_remove_null_families.py b/v03_pipeline/migrations/lookup/0001_remove_null_families.py new file mode 100644 index 000000000..59123ea70 --- /dev/null +++ b/v03_pipeline/migrations/lookup/0001_remove_null_families.py @@ -0,0 +1,27 @@ +import hail as hl + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class RemoveNullFamilies(BaseMigration): + @property + def reference_genome_dataset_types() -> ( + frozenset[tuple[ReferenceGenome, DatasetType]] + ): + return frozenset( + ( + (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.MITO), + ), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + ht = ht.annotate( + project_stats=ht.project_stats.map( + lambda ps: hl.or_missing(hl.all(ps.map(hl.is_defined)), ps), + ), + ) + return ht.annotate_globals(migrations=hl.empty_list(hl.str)) diff --git a/v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py b/v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py new file mode 100644 index 000000000..a0815249b --- /dev/null +++ b/v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py @@ -0,0 +1,26 @@ +import hail as hl + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class AddRemapPedigreeHash(BaseMigration): + @property + def reference_genome_dataset_types() -> ( + frozenset[tuple[ReferenceGenome, DatasetType]] + ): + return frozenset( + ( + (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.MITO), + ), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate_globals( + updates=ht.globals.updates.map( + lambda u: u.annotate(remap_pedigree_hash=hl.missing(hl.tint32)), + ), + ) diff --git a/v03_pipeline/migrations/lookup/__init__.py b/v03_pipeline/migrations/lookup/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv b/v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv new file mode 100644 index 000000000..00e73ff2e --- /dev/null +++ b/v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv @@ -0,0 +1,3 @@ +Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex +R0113_test_project abc_1 abc HG00731_1 F +R0113_test_project abc_1 abc HG00733_1 HG00732_1 HG00731_1 F diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc index 436531ab2..e08d4d12b 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc index d1c46b25e..d328f484c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt index 0160eb2ca..704275b10 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.130-bea04d9c79b5 - Created at 2024/05/20 14:08:17 \ No newline at end of file + Created at 2024/07/24 14:11:11 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc index d5cf430bc..94dde309e 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz index 9e842012f..d7c36221c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc index 12c819f3c..7b3d99c48 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 index f4252dc82..2493dddf9 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc deleted file mode 100644 index 12ec58de2..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc new file mode 100644 index 000000000..9cce2f7ae Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index similarity index 61% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index index 7b548374b..1c70e02f0 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz index b41476cb2..95672cd45 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc index 30c202768..a927fa9da 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz index 29e15e9d8..213bdb7aa 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc deleted file mode 100644 index f8028adef..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc new file mode 100644 index 000000000..089436f1c Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 deleted file mode 100644 index f78553dbf..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 new file mode 100644 index 000000000..bf1bf60c2 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 differ