diff --git a/.cloudbuild/vep-docker.cloudbuild.yaml b/.cloudbuild/vep-docker.cloudbuild.yaml index fa3d39f57..b1a645847 100644 --- a/.cloudbuild/vep-docker.cloudbuild.yaml +++ b/.cloudbuild/vep-docker.cloudbuild.yaml @@ -1,11 +1,11 @@ # Run locally with: # -# gcloud builds submit --quiet --substitutions='_VEP_VERSION=110' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/ +# gcloud builds submit --quiet --substitutions='_VEP_VERSION=110' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/deploy steps: - name: 'gcr.io/kaniko-project/executor:v1.3.0' args: - --destination=gcr.io/seqr-project/vep-docker-image:${_VEP_VERSION} - - --dockerfile=deploy/Dockerfile.vep + - --dockerfile=Dockerfile.vep - --cache=true - --cache-ttl=168h diff --git a/pyproject.toml b/pyproject.toml index 59bec645f..adc5d947a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,6 @@ ignore = [ "FBT", # flake-boolean-trap... disallows boolean args to functions... fixing this code will require refactors. "ANN", # flake8-annotations is for typed code "DJ", # django specific - "T20", # forbids print, we print quite a bit "PYI", # pyi is typing stub files "PT", # pytest specific "PTH", # pathlib is preferred, but we're not using it yet diff --git a/requirements.in b/requirements.in index 42b767ccb..e1219fd47 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,7 @@ elasticsearch==7.9.1 google-api-python-client>=1.8.0 -hail==0.2.128 +hail==0.2.130 luigi>=3.4.0 gnomad==0.6.4 google-cloud-storage>=2.14.0 +google-cloud-secret-manager>=2.20.0 diff --git a/requirements.txt b/requirements.txt index ccc808971..b9a3cc37b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -97,10 +97,11 @@ frozenlist==1.4.0 # hail gnomad==0.6.4 # via -r requirements.in -google-api-core==2.14.0 +google-api-core[grpc]==2.14.0 # via # google-api-python-client # google-cloud-core + # google-cloud-secret-manager # google-cloud-storage google-api-python-client==2.108.0 # via -r requirements.in @@ -111,6 +112,7 @@ google-auth==2.23.4 # google-auth-httplib2 # google-auth-oauthlib # google-cloud-core + # google-cloud-secret-manager # google-cloud-storage # hail google-auth-httplib2==0.1.1 @@ -119,6 +121,8 @@ google-auth-oauthlib==0.8.0 # via hail google-cloud-core==2.4.1 # via google-cloud-storage +google-cloud-secret-manager==2.20.0 + # via -r requirements.in google-cloud-storage==2.14.0 # via -r requirements.in google-crc32c==1.5.0 @@ -127,9 +131,22 @@ google-crc32c==1.5.0 # google-resumable-media google-resumable-media==2.7.0 # via google-cloud-storage -googleapis-common-protos==1.61.0 +googleapis-common-protos[grpc]==1.61.0 + # via + # google-api-core + # grpc-google-iam-v1 + # grpcio-status +grpc-google-iam-v1==0.13.0 + # via google-cloud-secret-manager +grpcio==1.63.0 + # via + # google-api-core + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status +grpcio-status==1.48.2 # via google-api-core -hail==0.2.128 +hail==0.2.130 # via -r requirements.in hdbscan==0.8.33 # via gnomad @@ -202,7 +219,7 @@ numpy==1.26.2 # scipy oauthlib==3.2.2 # via requests-oauthlib -orjson==3.9.11 +orjson==3.9.10 # via hail packaging==23.2 # via @@ -226,11 +243,16 @@ portalocker==2.8.2 # via msal-extensions prompt-toolkit==3.0.41 # via ipython +proto-plus==1.23.0 + # via google-cloud-secret-manager protobuf==3.20.2 # via # google-api-core + # google-cloud-secret-manager # googleapis-common-protos - # hail + # grpc-google-iam-v1 + # grpcio-status + # proto-plus ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 diff --git a/v03_pipeline/bin/vep-110-GRCh38.sh b/v03_pipeline/bin/vep-110-GRCh38.sh index dbb497beb..1156dce62 100644 --- a/v03_pipeline/bin/vep-110-GRCh38.sh +++ b/v03_pipeline/bin/vep-110-GRCh38.sh @@ -42,12 +42,6 @@ gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/ve gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & # Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false) -# Some investigation led us to want to combine the canonical and non-canonical transcript tsvs (run inside the VEP docker container): -# cat AlphaMissense_hg38.tsv.gz | gunzip | grep -v '#' | awk 'BEGIN { OFS = "\t" };{$6=""; print $0}' > AlphaMissense_combined_hg38.tsv -# cat AlphaMissense_isoforms_hg38.tsv.gz | gunzip | grep -v '#' >> AlphaMissense_combined_hg38.tsv -# cat AlphaMissense_combined_hg38.tsv | sort --parallel=12 --buffer-size=20G -k1,1 -k2,2n > AlphaMissense_combined_sorted_hg38.tsv -# cat AlphaMissense_combined_sorted_hg38.tsv | sed '1i #CHROM\tPOS\tREF\tALT\tgenome\ttranscript_id\tprotein_variant\tam_pathogenicity\tam_class' > AlphaMissense_hg38.tsv -# bgzip AlphaMissense_hg38.tsv # tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ & diff --git a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py b/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py deleted file mode 100755 index 5bf5c956f..000000000 --- a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 -import argparse - -import hail as hl - -from v03_pipeline.lib.misc.io import write -from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, -) -from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, - valid_reference_dataset_collection_path, -) -from v03_pipeline.lib.reference_data.config import CONFIG -from v03_pipeline.lib.reference_data.dataset_table_operations import ( - import_ht_from_config_path, -) - - -def get_ht( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - query: CachedReferenceDatasetQuery, -) -> hl.Table: - # If the query is defined over an uncombined reference dataset, use the combiner config. - if query.query_raw_dataset: - config = CONFIG[query.dataset(dataset_type)][reference_genome.v02_value] - return import_ht_from_config_path( - config, - query.dataset(dataset_type), - reference_genome, - ) - return hl.read_table( - valid_reference_dataset_collection_path( - reference_genome, - dataset_type, - ReferenceDatasetCollection.COMBINED, - ), - ) - - -def run( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - query: CachedReferenceDatasetQuery, -): - ht = get_ht(dataset_type, reference_genome, query) - ht = query.query(ht, dataset_type=dataset_type, reference_genome=reference_genome) - destination_path = valid_cached_reference_dataset_query_path( - reference_genome, - dataset_type, - query, - ) - print(f'Uploading ht to {destination_path}') - write(ht, destination_path) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--reference-genome', - type=ReferenceGenome, - choices=list(ReferenceGenome), - default=ReferenceGenome.GRCh38, - ) - parser.add_argument( - '--dataset-type', - type=DatasetType, - choices=list(DatasetType), - default=None, - help='When used, update the passed dataset, otherwise run all datasets.', - ) - parser.add_argument( - '--query', - type=CachedReferenceDatasetQuery, - choices=list(CachedReferenceDatasetQuery), - required=True, - ) - args, _ = parser.parse_known_args() - if ( - args.query - and args.query - not in CachedReferenceDatasetQuery.for_reference_genome_dataset_type( - args.reference_genome, - args.dataset_type, - ) - ): - msg = f'{args.query} is not a valid query for {DatasetType}' - raise ValueError(msg) - run(args.dataset_type, args.reference_genome, args.query) diff --git a/v03_pipeline/lib/misc/allele_registry.py b/v03_pipeline/lib/misc/allele_registry.py new file mode 100644 index 000000000..24abd7405 --- /dev/null +++ b/v03_pipeline/lib/misc/allele_registry.py @@ -0,0 +1,215 @@ +import dataclasses +import hashlib +import json +import math +import time +import uuid + +import hail as hl +import hailtop.fs as hfs +import requests +from google.cloud import secretmanager +from requests import HTTPError + +from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.model import Env, ReferenceGenome + +MAX_VARIANTS_PER_REQUEST = 1000000 +ALLELE_REGISTRY_URL = 'https://reg.genome.network/alleles?file=vcf&fields=none+@id+genomicAlleles+externalRecords.{}.id' +HTTP_REQUEST_TIMEOUT_S = 420 + +logger = get_logger(__name__) + + +@dataclasses.dataclass +class AlleleRegistryError: + base_url: str + error_type: str + description: str + message: str + input_line: str | None + + @classmethod + def from_api_response(cls, response: dict, base_url: str): + return cls( + base_url=base_url, + error_type=response['errorType'], + description=response['description'], + message=response['message'], + input_line=response.get('inputLine'), + ) + + def __str__(self) -> str: + msg = ( + f'\nAPI URL: {self.base_url}\nTYPE: {self.error_type}' + f'\nDESCRIPTION: {self.description}\nMESSAGE: {self.message}' + ) + return ( + msg if self.input_line is None else f'{msg}\nINPUT_LINE: {self.input_line}' + ) + + +def register_alleles_in_chunks( + ht: hl.Table, + reference_genome: ReferenceGenome, + base_url: str = ALLELE_REGISTRY_URL, + chunk_size: int = MAX_VARIANTS_PER_REQUEST, +): + num_rows = ht.count() + num_chunks = math.ceil(num_rows / chunk_size) + logger.info( + f'Registering {num_rows} allele(s) in chunks of {chunk_size} in {num_chunks} request(s).', + ) + for start_idx in range(0, num_rows, chunk_size): + end_idx = start_idx + chunk_size + if end_idx == chunk_size: + chunk_ht = ht.head(chunk_size) + elif end_idx <= num_rows: + chunk_ht = ht.head(end_idx).tail(chunk_size) + else: + chunk_ht = ht.tail(end_idx - num_rows) + yield register_alleles(chunk_ht, reference_genome, base_url) + + +def register_alleles( + ht: hl.Table, + reference_genome: ReferenceGenome, + base_url: str, +) -> hl.Table: + uuid4 = uuid.uuid4() + raw_vcf_file_name = f'{Env.HAIL_TMPDIR}/r_{uuid4}.vcf' + formatted_vcf_file_name = f'{Env.HAIL_TMPDIR}/f_{uuid4}.vcf' + + # Export the variants to a VCF + hl.export_vcf(ht, raw_vcf_file_name) + + # Reformat the VCF created by hail's 'export_vcf' function to be compatible with the Allele Registry + with hfs.open(raw_vcf_file_name, 'r') as vcf_in, hfs.open( + formatted_vcf_file_name, + 'w', + ) as vcf_out: + vcf_out.writelines(reference_genome.allele_registry_vcf_header) + for line in vcf_in: + if not line.startswith('#'): + # NB: The Allele Registry does not accept contigs prefixed with 'chr', even for GRCh38 + vcf_out.write(line.replace('chr', '')) + + logger.info('Calling the ClinGen Allele Registry') + with hfs.open(formatted_vcf_file_name, 'r') as vcf_in: + data = vcf_in.read() + res = requests.put( + url=build_url(base_url, reference_genome), + data=data, + timeout=HTTP_REQUEST_TIMEOUT_S, + ) + return handle_api_response(res, base_url, reference_genome) + + +def build_url(base_url: str, reference_genome: ReferenceGenome) -> str: + login, password = get_ar_credentials_from_secret_manager() + + # Request a gnomad ID for the correct reference genome + base_url = base_url.format(reference_genome.allele_registry_gnomad_id) + + # adapted from https://reg.clinicalgenome.org/doc/scripts/request_with_payload.py + identity = hashlib.sha1((login + password).encode('utf-8')).hexdigest() # noqa: S324 + gb_time = str(int(time.time())) + token = hashlib.sha1((base_url + identity + gb_time).encode('utf-8')).hexdigest() # noqa: S324 + return base_url + '&gbLogin=' + login + '&gbTime=' + gb_time + '&gbToken=' + token + + +def get_ar_credentials_from_secret_manager() -> tuple[str, str]: + if Env.ALLELE_REGISTRY_SECRET_NAME is None: + msg = ( + 'SHOULD_REGISTER_ALLELES is True but cannot get allele registry credentials ' + 'because ALLELE_REGISTRY_SECRET_NAME is not set' + ) + raise ValueError(msg) + + client = secretmanager.SecretManagerServiceClient() + name = client.secret_version_path( + Env.PROJECT_ID, + Env.ALLELE_REGISTRY_SECRET_NAME, + 'latest', + ) + response = client.access_secret_version(request={'name': name}) + payload_dict = json.loads(response.payload.data.decode('UTF-8')) + return payload_dict['login'], payload_dict['password'] + + +def handle_api_response( + res: requests.Response, + base_url: str, + reference_genome: ReferenceGenome, +) -> hl.Table: + response = res.json() + if not res.ok or 'errorType' in response: + error = AlleleRegistryError.from_api_response(response, base_url) + logger.error(error) + raise HTTPError(error.message) + + parsed_structs = [] + errors = [] + unmappable_variants = [] + for allele_response in response: + if 'errorType' in allele_response: + errors.append( + AlleleRegistryError.from_api_response(allele_response, base_url), + ) + continue + + # Extract CAID and allele info + caid = allele_response['@id'].split('/')[-1] + allele_info = next( + record + for record in allele_response['genomicAlleles'] + if record['referenceGenome'] == reference_genome.value + ) + chrom = allele_info['chromosome'] + pos = allele_info['coordinates'][0]['end'] + ref = allele_info['coordinates'][0]['referenceAllele'] + alt = allele_info['coordinates'][0]['allele'] + + if ref == '' or alt == '': + # AR will turn alleles like ["A","ATT"] to ["", "TT"] so try using gnomad IDs instead + if 'externalRecords' in allele_response: + gnomad_id = allele_response['externalRecords'][ + reference_genome.allele_registry_gnomad_id + ][0]['id'] + chrom, pos, ref, alt = gnomad_id.split('-') + else: + unmappable_variants.append(allele_response) + continue + + struct = hl.Struct( + locus=hl.Locus( + f'chr{chrom}' if reference_genome == ReferenceGenome.GRCh38 else chrom, + int(pos), + reference_genome=reference_genome.value, + ), + alleles=[ref, alt], + CAID=caid, + ) + parsed_structs.append(struct) + + logger.info( + f'{len(response) - len(errors)} out of {len(response)} variants returned CAID(s)', + ) + if unmappable_variants: + logger.info( + f'{len(unmappable_variants)} registered variant(s) cannot be mapped back to ours. ' + f'\nFirst unmappable variant:\n{unmappable_variants[0]}', + ) + if errors: + logger.warning( + f'{len(errors)} failed. First error: {errors[0]}', + ) + return hl.Table.parallelize( + parsed_structs, + hl.tstruct( + locus=hl.tlocus(reference_genome.value), + alleles=hl.tarray(hl.tstr), + CAID=hl.tstr, + ), + key=('locus', 'alleles'), + ) diff --git a/v03_pipeline/lib/misc/allele_registry_test.py b/v03_pipeline/lib/misc/allele_registry_test.py new file mode 100644 index 000000000..83a9ceda6 --- /dev/null +++ b/v03_pipeline/lib/misc/allele_registry_test.py @@ -0,0 +1,233 @@ +import shutil +import tempfile +from unittest.mock import ANY, Mock, patch + +import hail as hl +import requests + +from v03_pipeline.lib.misc.allele_registry import ( + HTTP_REQUEST_TIMEOUT_S as ALLELE_REGISTRY_TIMEOUT, +) +from v03_pipeline.lib.misc.allele_registry import ( + register_alleles, + register_alleles_in_chunks, +) +from v03_pipeline.lib.model import ReferenceGenome +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +TEST_SERVER_URL = 'http://reg.test.genome.network/alleles?file=vcf&fields=none+@id' + + +class AlleleRegistryTest(MockedDatarootTestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + shutil.rmtree(self.temp_dir.name) + + @patch.object(requests, 'put') + @patch( + 'v03_pipeline.lib.misc.allele_registry.get_ar_credentials_from_secret_manager', + ) + @patch('v03_pipeline.lib.misc.allele_registry.Env') + @patch('v03_pipeline.lib.misc.allele_registry.logger') + def test_register_alleles_38( + self, + mock_logger: Mock, + mock_env: Mock, + mock_get_credentials: Mock, + mock_put_request: Mock, + ): + mock_get_credentials.return_value = ('', '') + mock_env.HAIL_TMPDIR = self.temp_dir.name + + new_variants_ht = hl.Table.parallelize( + [ + { + 'locus': hl.Locus( + contig='chr1', + position=10126, + reference_genome='GRCh38', + ), + 'alleles': ['TA', 'T'], + 'rsid': 'rs370233999', + }, + { + 'locus': hl.Locus( + contig='chr1', + position=10129, + reference_genome='GRCh38', + ), + 'alleles': ['T', 'TC'], + 'rsid': 'rs370233997', + }, + { + 'locus': hl.Locus( + contig='chr1', + position=10128, + reference_genome='GRCh38', + ), + 'alleles': ['A', 'G'], + 'rsid': 'rs370234000', + }, + { + 'locus': hl.Locus( + contig='chr1', + position=10469, + reference_genome='GRCh38', + ), + 'alleles': ['C', 'G'], + 'rsid': 'rs370233998', + }, + ], + hl.tstruct( + locus=hl.tlocus(ReferenceGenome.GRCh38.value), + alleles=hl.tarray(hl.tstr), + rsid=hl.tstr, + ), + key=('locus', 'alleles'), + ) + + mock_response = Mock() + mock_put_request.return_value = mock_response + mock_response.ok = True + mock_response.json.return_value = [ + { + '@id': 'http://reg.genome.network/allele/CA997563840', + 'genomicAlleles': [ + { + 'chromosome': '1', + 'coordinates': [ + { + 'allele': '', # alt allele is '' + 'end': 10128, + 'referenceAllele': 'A', + 'start': 10127, + }, + ], + 'referenceGenome': 'GRCh38', + }, + ], + 'externalRecords': { + 'gnomAD_4': [{'id': '1-10126-TA-T'}], + }, # has gnomad ID + }, + { + '@id': 'http://reg.genome.network/allele/CA16716503', + 'genomicAlleles': [ + { + 'chromosome': '1', + 'coordinates': [ + { + 'allele': 'C', + 'end': 10131, + 'referenceAllele': '', # ref allele is '' and does not have a gnomad ID + 'start': 10131, + }, + ], + 'referenceGenome': 'GRCh38', + }, + ], + }, + { + '@id': 'http://reg.genome.network/allele/CA997563845', + 'genomicAlleles': [ + { + 'chromosome': '1', + 'coordinates': [ + { + 'allele': 'G', + 'end': 10128, + 'referenceAllele': 'A', + 'start': 10127, + }, + ], + 'referenceGenome': 'GRCh38', + }, + ], + 'externalRecords': {'gnomAD_4': [{'id': '1-10128-A-G'}]}, + }, + { + 'description': 'Given allele cannot be mapped in consistent way to reference genome.', + 'errorType': 'InternalServerError', + 'inputLine': 'Cannot align NC_000001.10 [10468,10469).', + 'message': '1 10469 rs370233998 C G . . .', + }, + ] + + ar_ht = register_alleles( + new_variants_ht, + ReferenceGenome.GRCh38, + TEST_SERVER_URL, + ) + self.assertEqual( + ar_ht.collect(), + [ + hl.Struct( + locus=hl.Locus('chr1', 10126, 'GRCh38'), + alleles=['TA', 'T'], + CAID='CA997563840', + ), + hl.Struct( + locus=hl.Locus('chr1', 10128, 'GRCh38'), + alleles=['A', 'G'], + CAID='CA997563845', + ), + ], + ) + mock_put_request.assert_called_once_with( + url=ANY, + data=f'{"".join(ReferenceGenome.GRCh38.allele_registry_vcf_header)}' + f'1\t10126\trs370233999\tTA\tT\t.\t.\t.\n' + f'1\t10128\trs370234000\tA\tG\t.\t.\t.\n' + f'1\t10129\trs370233997\tT\tTC\t.\t.\t.\n' + f'1\t10469\trs370233998\tC\tG\t.\t.\t.\n', + timeout=ALLELE_REGISTRY_TIMEOUT, + ) + mock_logger.warning.assert_called_once_with( + '1 failed. First error: \n' + 'API URL: http://reg.test.genome.network/alleles?file=vcf&fields=none+@id\n' + 'TYPE: InternalServerError\n' + 'DESCRIPTION: Given allele cannot be mapped in consistent way to reference genome.\n' + 'MESSAGE: 1\t10469\trs370233998\tC\tG\t.\t.\t.\n' + 'INPUT_LINE: Cannot align NC_000001.10 [10468,10469).', + ) + + @patch('v03_pipeline.lib.misc.allele_registry.register_alleles') + def test_register_alleles_in_chunks(self, mock_register_alleles): + chunk_size = 10 + ht = hl.Table.parallelize( + [{'x': x} for x in range(chunk_size * 3 + 5)], # 35 rows, expect 4 chunks + hl.tstruct(x=hl.tint32), + key='x', + ) + + # Instead of actually calling register_alleles, capture and assert on + # the value of 'x' in the first row of each chunk and number of rows in each chunk + def _side_effect(chunk_ht: hl.Table, *_): + value_in_first_row = hl.eval(chunk_ht.take(1)[0].x) + num_rows_in_chunk = chunk_ht.count() + return value_in_first_row, num_rows_in_chunk + + mock_register_alleles.side_effect = _side_effect + generator = register_alleles_in_chunks( + ht=ht, + reference_genome=ReferenceGenome.GRCh38, + base_url=TEST_SERVER_URL, + chunk_size=chunk_size, + ) + self.assertEqual(list(generator), [(0, 10), (10, 10), (20, 10), (30, 5)]) + + def test_register_alleles_in_chunks_no_new_variants(self): + ht = hl.Table.parallelize( + [], + hl.tstruct(x=hl.tint32), + key='x', + ) + empty_generator = register_alleles_in_chunks( + ht=ht, + reference_genome=ReferenceGenome.GRCh38, + base_url=TEST_SERVER_URL, + ) + with self.assertRaises(StopIteration): + next(empty_generator) diff --git a/v03_pipeline/lib/misc/sample_ids.py b/v03_pipeline/lib/misc/sample_ids.py index d2174fe50..ca5407b5e 100644 --- a/v03_pipeline/lib/misc/sample_ids.py +++ b/v03_pipeline/lib/misc/sample_ids.py @@ -2,6 +2,10 @@ import hail as hl +from v03_pipeline.lib.logger import get_logger + +logger = get_logger(__name__) + class MatrixTableSampleSetError(Exception): def __init__(self, message, missing_samples): @@ -42,7 +46,7 @@ def remap_sample_ids( f'All callset sample IDs:{mt.s.collect()}' ) if ignore_missing_samples_when_remapping: - print(message) + logger.info(message) else: raise MatrixTableSampleSetError(message, missing_samples) @@ -50,7 +54,7 @@ def remap_sample_ids( remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id) mt = mt.annotate_cols(seqr_id=remap_expr, vcf_id=mt.s) mt = mt.key_cols_by(s=mt.seqr_id) - print(f'Remapped {remap_count} sample ids...') + logger.info(f'Remapped {remap_count} sample ids...') return mt @@ -77,9 +81,9 @@ def subset_samples( if ( subset_count > anti_join_ht_count ) and ignore_missing_samples_when_subsetting: - print(message) + logger.info(message) else: raise MatrixTableSampleSetError(message, missing_samples) - print(f'Subsetted to {subset_count} sample ids') + logger.info(f'Subsetted to {subset_count} sample ids') mt = mt.semi_join_cols(sample_subset_ht) return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT))) diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 84699a25b..86b672ddc 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -11,6 +11,19 @@ class SeqrValidationError(Exception): pass +def validate_allele_type( + mt: hl.MatrixTable, +) -> None: + ht = mt.rows() + ht = ht.filter( + hl.numeric_allele_type(ht.alleles[0], ht.alleles[1]) + == hl.genetics.allele_type.AlleleType.UNKNOWN, + ) + if ht.count() > 0: + msg = f'Alleles with Unknown AlleleType are present in the callset: {ht.alleles.collect()}' + raise SeqrValidationError(msg) + + def validate_no_duplicate_variants( mt: hl.MatrixTable, ) -> None: diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py index 2ce4b3422..0512d9284 100644 --- a/v03_pipeline/lib/misc/validation_test.py +++ b/v03_pipeline/lib/misc/validation_test.py @@ -4,6 +4,7 @@ from v03_pipeline.lib.misc.validation import ( SeqrValidationError, + validate_allele_type, validate_expected_contig_frequency, validate_imputed_sex_ploidy, validate_no_duplicate_variants, @@ -32,6 +33,44 @@ def _mt_from_contigs(contigs): class ValidationTest(unittest.TestCase): + def test_validate_allele_type(self) -> None: + mt = hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + hl.Locus( + contig='chr1', + position=2, + reference_genome='GRCh38', + ), + hl.Locus( + contig='chr1', + position=3, + reference_genome='GRCh38', + ), + ], + 'alleles': [ + ['A', 'T'], + # NB: star alleles should pass through this validation just fine, + # but are eventually filtered out upstream. + ['A', '*'], + ['A', '-'], + ], + }, + cols={'s': ['sample_1']}, + entries={'HL': [[0.0], [0.0], [0.0]]}, + ).key_rows_by('locus', 'alleles') + self.assertRaisesRegex( + SeqrValidationError, + "Alleles with Unknown AlleleType are present in the callset: \\[\\['A', '-'\\]\\]", + validate_allele_type, + mt, + ) + def test_validate_imputed_sex_ploidy(self) -> None: sex_check_ht = hl.read_table(TEST_SEX_CHECK_1) mt = hl.MatrixTable.from_parts( diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 107d651ce..cc9123848 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -281,3 +281,7 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]: mito.gt_stats, ], }[self] + + @property + def should_send_to_allele_registry(self): + return self == DatasetType.SNV_INDEL diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index 15939e36a..d4966d558 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -66,6 +66,76 @@ def contig_recoding(self, include_mt: bool = False) -> dict[str, str]: return recode + @property + def allele_registry_vcf_header(self) -> list[str]: + return { + ReferenceGenome.GRCh37: [ + '##fileformat=VCFv4.2\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '#CHROM POS ID REF ALT QUAL FILTER INFO\n', + ], + ReferenceGenome.GRCh38: [ + '##fileformat=VCFv4.2\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '#CHROM POS ID REF ALT QUAL FILTER INFO\n', + ], + }[self] + + @property + def allele_registry_gnomad_id(self) -> str: + return { + ReferenceGenome.GRCh37: 'gnomAD_2', + ReferenceGenome.GRCh38: 'gnomAD_4', + }[self] + class SampleType(Enum): WES = 'WES' diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 81bf1f6b3..d89567d8b 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -19,16 +19,22 @@ ) VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None) VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None) +SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' +ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None) +PROJECT_ID = os.environ.get('PROJECT_ID', None) @dataclass class Env: ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS + ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA LOADING_DATASETS: str = LOADING_DATASETS PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS + PROJECT_ID: str | None = PROJECT_ID REFERENCE_DATASETS: str = REFERENCE_DATASETS + SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH VEP_CONFIG_URI: str | None = VEP_CONFIG_URI diff --git a/v03_pipeline/lib/reference_data/compare_globals.py b/v03_pipeline/lib/reference_data/compare_globals.py index 0b5c2993d..1feb0ac12 100644 --- a/v03_pipeline/lib/reference_data/compare_globals.py +++ b/v03_pipeline/lib/reference_data/compare_globals.py @@ -21,7 +21,7 @@ class Globals: paths: dict[str, str] versions: dict[str, str] enums: dict[str, dict[str, list[str]]] - selects: dict[str, set[str]] + selects: dict[str, dict[str, hl.dtype]] def __getitem__(self, name: str): return getattr(self, name) @@ -50,7 +50,11 @@ def from_dataset_configs( dataset_ht = dataset_ht.transmute( **get_enum_select_fields(dataset_ht, dataset_config), ) - selects[dataset] = set(dataset_ht.row) - set(dataset_ht.key) + selects[dataset] = { + k: v.dtype + for k, v in dict(dataset_ht.row).items() + if k not in set(dataset_ht.key) + } return cls(paths, versions, enums, selects) @classmethod @@ -69,32 +73,52 @@ def from_ht( if dataset in ht.row: # NB: handle an edge case (mito high constraint) where we annotate a bool from the reference dataset collection selects[dataset] = ( - set(ht[dataset]) + {k: v.dtype for k, v in dict(ht[dataset]).items()} if isinstance(ht[dataset], hl.StructExpression) - else set() + else {} ) return cls(paths, versions, enums, selects) +def validate_selects_types( + ht1_globals: Globals, + ht2_globals: Globals, + dataset: str, +) -> None: + # Assert that all shared annotations have identical types + shared_selects = ( + ht1_globals['selects'][dataset].keys() + & ht2_globals['selects'].get(dataset).keys() + ) + mismatched_select_types = [ + (select, ht2_globals['selects'][dataset][select]) + for select in shared_selects + if ( + ht1_globals['selects'][dataset][select] + != ht2_globals['selects'][dataset][select] + ) + ] + if mismatched_select_types: + msg = f'Unexpected field types detected in {dataset}: {mismatched_select_types}' + raise ValueError(msg) + + def get_datasets_to_update( ht1_globals: Globals, ht2_globals: Globals, validate_selects: bool = True, ) -> list[str]: datasets_to_update = set() - for field in dataclasses.fields(Globals): if field.name == 'selects' and not validate_selects: continue - datasets_to_update.update( ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(), ) for dataset in ht1_globals[field.name].keys() & ht2_globals[field.name].keys(): - if ht1_globals[field.name].get(dataset) != ht2_globals[field.name].get( - dataset, - ): + if field.name == 'selects': + validate_selects_types(ht1_globals, ht2_globals, dataset) + if ht1_globals[field.name][dataset] != ht2_globals[field.name][dataset]: logger.info(f'{field.name} mismatch for {dataset}') datasets_to_update.add(dataset) - return sorted(datasets_to_update) diff --git a/v03_pipeline/lib/reference_data/compare_globals_test.py b/v03_pipeline/lib/reference_data/compare_globals_test.py index 0d290489b..786964fcb 100644 --- a/v03_pipeline/lib/reference_data/compare_globals_test.py +++ b/v03_pipeline/lib/reference_data/compare_globals_test.py @@ -103,8 +103,15 @@ def test_create_globals_from_dataset_configs( self.assertTrue( dataset_config_globals.selects == { - 'a': {'test_select', 'test_enum_id'}, - 'b': {'test_select', 'field2', 'test_enum_id'}, + 'a': { + 'test_select': hl.tint32, + 'test_enum_id': hl.tint32, + }, + 'b': { + 'test_select': hl.tint32, + 'field2': hl.tint32, + 'test_enum_id': hl.tint32, + }, }, ) @@ -134,7 +141,11 @@ def test_create_globals_from_dataset_configs_single_dataset(self, mock_read_tabl self.assertTrue( dataset_config_globals.selects == { - 'b': {'test_select', 'field2', 'test_enum_id'}, + 'b': { + 'test_select': hl.tint32, + 'field2': hl.tint32, + 'test_enum_id': hl.tint32, + }, }, ) @@ -186,8 +197,8 @@ def test_from_rdc_or_annotations_ht(self): self.assertTrue( rdc_globals.selects == { - 'gnomad_non_coding_constraint': {'z_score'}, - 'screen': {'region_type_ids'}, + 'gnomad_non_coding_constraint': {'z_score': hl.tfloat32}, + 'screen': {'region_type_ids': hl.tarray(hl.tint32)}, }, ) @@ -198,13 +209,13 @@ def test_get_datasets_to_update_version_different(self): # 'a' has a different version, 'c' is missing version in ht2_globals versions={'a': 'v2', 'b': 'v2', 'c': 'v1'}, enums={'a': {}, 'b': {}, 'c': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ) self.assertTrue(result == ['a', 'c']) @@ -216,13 +227,13 @@ def test_get_datasets_to_update_path_different(self): paths={'a': 'a_path', 'b': 'old_b_path', 'c': 'extra_c_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ) self.assertTrue(result == ['b', 'c']) @@ -238,13 +249,13 @@ def test_get_datasets_to_update_enum_different(self): 'b': {'enum_key_1': []}, 'c': {}, }, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {'test_enum': ['C', 'D']}, 'b': {'enum_key_2': []}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ) self.assertTrue(result == ['a', 'b', 'c']) @@ -257,16 +268,54 @@ def test_get_datasets_to_update_select_different(self): enums={'a': {}, 'b': {}}, # 'a' has extra select, 'b' has different select, 'c' is missing select in ht2_globals selects={ - 'a': {'field1', 'field2'}, - 'b': {'test_select'}, - 'c': set('test_select'), + 'a': {'field1': hl.tint32, 'field2': hl.tint32}, + 'b': {'test_select': hl.tint32}, + 'c': {'test_select': hl.tint32}, }, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': {'field1'}, 'b': {'test_select_2'}}, + selects={'a': {'field1': hl.tint32}, 'b': {'test_select_2': hl.tint32}}, ), ) self.assertTrue(result == ['a', 'b', 'c']) + + def test_get_datasets_to_update_select_type_validation(self): + self.assertRaisesRegex( + ValueError, + "Unexpected field types detected in a: \\[\\('field1', dtype\\('int32'\\)\\)\\]", + get_datasets_to_update, + ht1_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={ + 'a': {'field1': hl.tarray(hl.tint32)}, + }, + ), + ht2_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={'a': {'field1': hl.tint32, 'field2': hl.tint32}}, + ), + ) + result = get_datasets_to_update( + ht1_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={ + 'a': {'field1': hl.tarray(hl.tint32)}, + }, + ), + ht2_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={'a': {'field1': hl.tarray(hl.tint32), 'field2': hl.tint32}}, + ), + ) + self.assertTrue(result == ['a']) diff --git a/v03_pipeline/lib/reference_data/config.py b/v03_pipeline/lib/reference_data/config.py index 8f7953576..54a9f4603 100644 --- a/v03_pipeline/lib/reference_data/config.py +++ b/v03_pipeline/lib/reference_data/config.py @@ -198,7 +198,7 @@ def custom_mpc_select(ht): 'pathogenicity': CLINVAR_PATHOGENICITIES, 'assertion': CLINVAR_ASSERTIONS, }, - 'filter': lambda ht: ~(ht.locus.contig == 'MT'), + 'filter': lambda ht: ht.locus.contig != 'MT', }, '38': { 'custom_import': download_and_import_latest_clinvar_vcf, @@ -209,7 +209,7 @@ def custom_mpc_select(ht): 'pathogenicity': CLINVAR_PATHOGENICITIES, 'assertion': CLINVAR_ASSERTIONS, }, - 'filter': lambda ht: ~(ht.locus.contig == 'chrM'), + 'filter': lambda ht: ht.locus.contig != 'chrM', }, }, 'dbnsfp': { diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index 842686c5c..67a5492bf 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -45,7 +45,7 @@ hl.tstruct( locus=hl.tlocus('GRCh38'), alleles=hl.tarray(hl.tstr), - PHRED=hl.tint32, + PHRED=hl.tfloat32, ), key=['locus', 'alleles'], globals=hl.Struct( @@ -760,7 +760,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( conditions=None, ), dbnsfp=hl.Struct( - REVEL_score=0.043, + REVEL_score=0.0430000014603138, SIFT_score=None, Polyphen2_HVAR_score=None, MutationTaster_pred_id=0, @@ -1168,7 +1168,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( conditions=None, ), dbnsfp=hl.Struct( - REVEL_score=0.043, + REVEL_score=0.0430000014603138, SIFT_score=None, Polyphen2_HVAR_score=None, MutationTaster_pred_id=0, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index b05fae398..21fe5f532 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -202,6 +202,8 @@ def test_missing_interval_reference( worker.run() self.assertFalse(uvatwns_task.complete()) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') + @patch('v03_pipeline.lib.tasks.write_new_variants_table.Env') @patch( 'v03_pipeline.lib.tasks.write_imported_callset.UpdatedCachedReferenceDatasetQuery', ) @@ -221,8 +223,10 @@ def test_multiple_update_vat( mock_vep: Mock, mock_standard_contigs: Mock, mock_update_vat_with_rdc_task: Mock, - mock_update_rdc_task: Mock, mock_updated_cached_reference_dataset_query, + mock_env: Mock, + mock_register_alleles: Mock, + mock_update_rdc_task: Mock, ) -> None: mock_updated_cached_reference_dataset_query.return_value = MockCompleteTask() mock_update_rdc_task.return_value = MockCompleteTask() @@ -235,6 +239,64 @@ def test_multiple_update_vat( ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) mock_vep_validate.return_value = None + # make register_alleles return CAIDs for 4 of 30 variants + mock_env.SHOULD_REGISTER_ALLELES = True + mock_register_alleles.side_effect = [ + iter( + [ + hl.Table.parallelize( + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=871269, + reference_genome='GRCh38', + ), + alleles=['A', 'C'], + CAID='CA1', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=874734, + reference_genome='GRCh38', + ), + alleles=['C', 'T'], + CAID='CA2', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=876499, + reference_genome='GRCh38', + ), + alleles=['A', 'G'], + CAID='CA3', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=878314, + reference_genome='GRCh38', + ), + alleles=['G', 'C'], + CAID='CA4', + ), + ], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + CAID=hl.tstr, + ), + key=('locus', 'alleles'), + ), + ], + ), + iter( + [], + ), # for the second call, there are no new variants, return empty iterator + ] + mock_standard_contigs.return_value = {'chr1'} # This creates a mock validation table with 1 coding and 1 non-coding variant # explicitly chosen from the VCF. @@ -308,6 +370,7 @@ def test_multiple_update_vat( x for x in ht.select( 'gt_stats', + 'CAID', ).collect() if x.locus.position <= 871269 # noqa: PLR2004 ], @@ -320,6 +383,7 @@ def test_multiple_update_vat( ), alleles=['A', 'C'], gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), + CAID='CA1', ), ], ) @@ -363,6 +427,7 @@ def test_multiple_update_vat( 'xpos', 'gt_stats', 'screen', + 'CAID', ).collect() if x.locus.position <= 878809 # noqa: PLR2004 ], @@ -392,6 +457,7 @@ def test_multiple_update_vat( xpos=1000871269, gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), screen=hl.Struct(region_type_ids=[1]), + CAID='CA1', ), hl.Struct( locus=hl.Locus( @@ -407,6 +473,7 @@ def test_multiple_update_vat( xpos=1000874734, gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), screen=hl.Struct(region_type_ids=[]), + CAID='CA2', ), hl.Struct( locus=hl.Locus( @@ -422,6 +489,7 @@ def test_multiple_update_vat( xpos=1000876499, gt_stats=hl.Struct(AC=31, AN=32, AF=0.96875, hom=15), screen=hl.Struct(region_type_ids=[]), + CAID='CA3', ), hl.Struct( locus=hl.Locus( @@ -437,6 +505,7 @@ def test_multiple_update_vat( xpos=1000878314, gt_stats=hl.Struct(AC=3, AN=32, AF=0.09375, hom=0), screen=hl.Struct(region_type_ids=[]), + CAID='CA4', ), hl.Struct( locus=hl.Locus( @@ -452,6 +521,7 @@ def test_multiple_update_vat( xpos=1000878809, gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), screen=hl.Struct(region_type_ids=[]), + CAID=None, ), ], ) @@ -564,6 +634,7 @@ def test_multiple_update_vat( ], ) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) @@ -574,6 +645,7 @@ def test_update_vat_grch37( mock_vep_validate: Mock, mock_vep: Mock, mock_update_vat_with_rdc_task: Mock, + mock_register_alleles: Mock, mock_update_rdc_task: Mock, ) -> None: mock_update_rdc_task.return_value = MockCompleteTask() @@ -586,6 +658,7 @@ def test_update_vat_grch37( ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) mock_vep_validate.return_value = None + mock_register_alleles.side_effect = None worker = luigi.worker.Worker() uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( reference_genome=ReferenceGenome.GRCh37, @@ -625,6 +698,7 @@ def test_update_vat_grch37( ) self.assertFalse(hasattr(ht, 'rg37_locus')) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) @@ -637,6 +711,7 @@ def test_update_vat_without_accessing_private_datasets( mock_vep: Mock, mock_rdc_env: Mock, mock_update_vat_with_rdc_task: Mock, + mock_register_alleles: Mock, mock_update_rdc_task: Mock, ) -> None: mock_update_rdc_task.return_value = MockCompleteTask() @@ -657,6 +732,7 @@ def test_update_vat_without_accessing_private_datasets( mock_rdc_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) mock_vep_validate.return_value = None + mock_register_alleles.side_effect = None worker = luigi.worker.Worker() uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( reference_genome=ReferenceGenome.GRCh38, @@ -696,12 +772,14 @@ def test_update_vat_without_accessing_private_datasets( ], ) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) def test_mito_update_vat( self, mock_update_vat_with_rdc_task: Mock, + mock_register_alleles: Mock, mock_update_rdc_task: Mock, ) -> None: mock_update_rdc_task.return_value = MockCompleteTask() @@ -712,6 +790,7 @@ def test_mito_update_vat( sample_type=SampleType.WGS, ) ) + mock_register_alleles.side_effect = None worker = luigi.worker.Worker() update_variant_annotations_task = ( UpdateVariantAnnotationsTableWithNewSamplesTask( diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index d443b1854..c8b795821 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -7,6 +7,7 @@ split_multi_hts, ) from v03_pipeline.lib.misc.validation import ( + validate_allele_type, validate_expected_contig_frequency, validate_imputed_sex_ploidy, validate_no_duplicate_variants, @@ -134,6 +135,7 @@ def create_table(self) -> hl.MatrixTable: ), ) if self.validate and self.dataset_type.can_run_validation: + validate_allele_type(mt) validate_no_duplicate_variants(mt) validate_expected_contig_frequency(mt, self.reference_genome) coding_and_noncoding_ht = hl.read_table( diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index cdb65af7e..34ea5ee6a 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -7,6 +7,7 @@ from v03_pipeline.lib.annotations.rdc_dependencies import ( get_rdc_annotation_dependencies, ) +from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks from v03_pipeline.lib.misc.callsets import callset_project_pairs, get_callset_ht from v03_pipeline.lib.misc.math import constrain from v03_pipeline.lib.model import Env, ReferenceDatasetCollection @@ -242,6 +243,28 @@ def create_table(self) -> hl.Table: rdc_ht = self.annotation_dependencies[f'{rdc.value}_ht'] new_variants_ht = new_variants_ht.join(rdc_ht, 'left') + # Register the new variant alleles to the Clingen Allele Registry + # and annotate new_variants table with CAID. + if ( + Env.SHOULD_REGISTER_ALLELES + and self.dataset_type.should_send_to_allele_registry + ): + ar_ht = hl.Table.parallelize( + [], + hl.tstruct( + locus=hl.tlocus(self.reference_genome.value), + alleles=hl.tarray(hl.tstr), + CAID=hl.tstr, + ), + key=('locus', 'alleles'), + ) + for ar_ht_chunk in register_alleles_in_chunks( + new_variants_ht, + self.reference_genome, + ): + ar_ht = ar_ht.union(ar_ht_chunk) + new_variants_ht = new_variants_ht.join(ar_ht, 'left') + return new_variants_ht.annotate_globals( updates={ hl.Struct(callset=callset_path, project_guid=project_guid) diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc index a98fcdead..1c47b9a3c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc index b2ace029a..db7a7824c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt index 856600d4e..e46de4296 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.128-eead8100a1c1 - Created at 2024/05/09 20:02:21 \ No newline at end of file + Written with version 0.2.130-bea04d9c79b5 + Created at 2024/05/20 13:48:16 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/index b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz index 2ad9c32f8..d00565756 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc index e8f7e30d1..ddb5e7f25 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz index 2ad21d8dc..19968eb85 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc new file mode 100644 index 000000000..dd555f553 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.crc deleted file mode 100644 index 8d1ea48d7..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad new file mode 100644 index 000000000..446fb5491 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090 deleted file mode 100644 index a75be974c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc index 80764f419..1b96b5393 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc index d542606ef..82e0d4035 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt index e4ab633c1..e38d73d71 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.128-eead8100a1c1 - Created at 2024/03/21 11:28:13 \ No newline at end of file + Written with version 0.2.130-bea04d9c79b5 + Created at 2024/05/20 15:38:26 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.index.crc new file mode 100644 index 000000000..26d303267 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..0e401dc36 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/index b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/index new file mode 100644 index 000000000..df93a68fe Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/index differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/metadata.json.gz new file mode 100644 index 000000000..9152e863f Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.index.crc deleted file mode 100644 index b2a8ab886..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.metadata.json.gz.crc deleted file mode 100644 index 86fea9936..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/index b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/index deleted file mode 100644 index cf394c105..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/index and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/metadata.json.gz deleted file mode 100644 index aaacf6d64..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz index 3b0ac2334..00685b5ad 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc index 45e676f0d..8474f90b1 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz index c49423eac..b83d7239a 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc new file mode 100644 index 000000000..b4cce3d8b Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-da748236-79a9-461e-a62a-a9280e863d48.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-da748236-79a9-461e-a62a-a9280e863d48.crc deleted file mode 100644 index 333fa639b..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-da748236-79a9-461e-a62a-a9280e863d48.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff new file mode 100644 index 000000000..07ea95686 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-da748236-79a9-461e-a62a-a9280e863d48 b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-da748236-79a9-461e-a62a-a9280e863d48 deleted file mode 100644 index df40d6fa1..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-da748236-79a9-461e-a62a-a9280e863d48 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc index c59177797..436531ab2 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc index 4dbe2f2d1..d1c46b25e 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt index 9c642f4ea..0160eb2ca 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.128-eead8100a1c1 - Created at 2024/03/21 11:35:30 \ No newline at end of file + Written with version 0.2.130-bea04d9c79b5 + Created at 2024/05/20 14:08:17 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/.index.crc deleted file mode 100644 index da7dad46d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc new file mode 100644 index 000000000..12ec58de2 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/index b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index similarity index 61% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index index 967495b9e..7b548374b 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/index and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/index differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz index fb41286a1..b41476cb2 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc index d1369ed08..30c202768 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz index c39674e25..29e15e9d8 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.crc deleted file mode 100644 index bfecb511a..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc new file mode 100644 index 000000000..f8028adef Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4 deleted file mode 100644 index 7acd67edc..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 new file mode 100644 index 000000000..f78553dbf Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc index 2985d33a7..7132fdfe3 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc index 8eabeebfe..2cacfc95c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt index 2c105134d..3d9a5ac98 100644 --- a/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.114-cc8d36408b36 - Created at 2023/07/13 19:51:12 \ No newline at end of file + Written with version 0.2.130-bea04d9c79b5 + Created at 2024/05/20 13:22:32 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc index d23992107..d68bbbd08 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz index 380be3d03..c16ad768c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/.part-0.crc index e076b9bd0..c56f0f37b 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/.part-0.crc and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/part-0 index 506199b89..7fc519095 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/part-0 and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc new file mode 100644 index 000000000..23324f542 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..576d4ffd2 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index new file mode 100644 index 000000000..952d782a3 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/metadata.json.gz new file mode 100644 index 000000000..9b05326c4 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.index.crc deleted file mode 100644 index 241cc8228..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.metadata.json.gz.crc deleted file mode 100644 index d24bbd1ab..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/index b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/index deleted file mode 100644 index 10e426a2e..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/index and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/metadata.json.gz deleted file mode 100644 index 417a94fc4..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz index 55f91078d..2654291a9 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/.metadata.json.gz.crc index 2d5b49e82..017eef36d 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/metadata.json.gz index 2de7cf083..668d507fd 100644 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc new file mode 100644 index 000000000..a4b13f78f Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.crc deleted file mode 100644 index 95a7bb76c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 new file mode 100644 index 000000000..1d5c39801 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98 b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98 deleted file mode 100644 index 2fb6ca9cb..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98 and /dev/null differ