From 8c94cc95433cd0a0408d7918f093d3c1899c47dc Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 13 May 2024 15:14:54 -0400 Subject: [PATCH 1/7] Miscellaneous VEP tweaks --- .cloudbuild/vep-docker.cloudbuild.yaml | 4 ++-- v03_pipeline/bin/vep-110-GRCh38.sh | 6 ------ 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/.cloudbuild/vep-docker.cloudbuild.yaml b/.cloudbuild/vep-docker.cloudbuild.yaml index fa3d39f57..b1a645847 100644 --- a/.cloudbuild/vep-docker.cloudbuild.yaml +++ b/.cloudbuild/vep-docker.cloudbuild.yaml @@ -1,11 +1,11 @@ # Run locally with: # -# gcloud builds submit --quiet --substitutions='_VEP_VERSION=110' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/ +# gcloud builds submit --quiet --substitutions='_VEP_VERSION=110' --config .cloudbuild/vep-docker.cloudbuild.yaml v03_pipeline/deploy steps: - name: 'gcr.io/kaniko-project/executor:v1.3.0' args: - --destination=gcr.io/seqr-project/vep-docker-image:${_VEP_VERSION} - - --dockerfile=deploy/Dockerfile.vep + - --dockerfile=Dockerfile.vep - --cache=true - --cache-ttl=168h diff --git a/v03_pipeline/bin/vep-110-GRCh38.sh b/v03_pipeline/bin/vep-110-GRCh38.sh index dbb497beb..1156dce62 100644 --- a/v03_pipeline/bin/vep-110-GRCh38.sh +++ b/v03_pipeline/bin/vep-110-GRCh38.sh @@ -42,12 +42,6 @@ gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/ve gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & # Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false) -# Some investigation led us to want to combine the canonical and non-canonical transcript tsvs (run inside the VEP docker container): -# cat AlphaMissense_hg38.tsv.gz | gunzip | grep -v '#' | awk 'BEGIN { OFS = "\t" };{$6=""; print $0}' > AlphaMissense_combined_hg38.tsv -# cat AlphaMissense_isoforms_hg38.tsv.gz | gunzip | grep -v '#' >> AlphaMissense_combined_hg38.tsv -# cat AlphaMissense_combined_hg38.tsv | sort --parallel=12 --buffer-size=20G -k1,1 -k2,2n > AlphaMissense_combined_sorted_hg38.tsv -# cat AlphaMissense_combined_sorted_hg38.tsv | sed '1i #CHROM\tPOS\tREF\tALT\tgenome\ttranscript_id\tprotein_variant\tam_pathogenicity\tam_class' > AlphaMissense_hg38.tsv -# bgzip AlphaMissense_hg38.tsv # tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ & From 0600039626c9ca6b007a1bf068e051872f6b0657 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 16 May 2024 12:07:17 -0400 Subject: [PATCH 2/7] Benb/validate with allele type (#785) * Bump requirements * add validation * format --- requirements.in | 2 +- requirements.txt | 9 ++--- v03_pipeline/lib/misc/validation.py | 13 +++++++ v03_pipeline/lib/misc/validation_test.py | 39 +++++++++++++++++++ .../lib/tasks/write_imported_callset.py | 2 + 5 files changed, 58 insertions(+), 7 deletions(-) diff --git a/requirements.in b/requirements.in index 42b767ccb..0d598be5e 100644 --- a/requirements.in +++ b/requirements.in @@ -1,6 +1,6 @@ elasticsearch==7.9.1 google-api-python-client>=1.8.0 -hail==0.2.128 +hail==0.2.130 luigi>=3.4.0 gnomad==0.6.4 google-cloud-storage>=2.14.0 diff --git a/requirements.txt b/requirements.txt index ccc808971..c2138f1cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -129,7 +129,7 @@ google-resumable-media==2.7.0 # via google-cloud-storage googleapis-common-protos==1.61.0 # via google-api-core -hail==0.2.128 +hail==0.2.130 # via -r requirements.in hdbscan==0.8.33 # via gnomad @@ -202,7 +202,7 @@ numpy==1.26.2 # scipy oauthlib==3.2.2 # via requests-oauthlib -orjson==3.9.11 +orjson==3.9.10 # via hail packaging==23.2 # via @@ -230,7 +230,6 @@ protobuf==3.20.2 # via # google-api-core # googleapis-common-protos - # hail ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 @@ -252,9 +251,7 @@ pygments==2.17.2 # ipython # rich pyjwt[crypto]==2.8.0 - # via - # msal - # pyjwt + # via msal pyparsing==3.1.1 # via httplib2 pyspark==3.3.3 diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 84699a25b..86b672ddc 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -11,6 +11,19 @@ class SeqrValidationError(Exception): pass +def validate_allele_type( + mt: hl.MatrixTable, +) -> None: + ht = mt.rows() + ht = ht.filter( + hl.numeric_allele_type(ht.alleles[0], ht.alleles[1]) + == hl.genetics.allele_type.AlleleType.UNKNOWN, + ) + if ht.count() > 0: + msg = f'Alleles with Unknown AlleleType are present in the callset: {ht.alleles.collect()}' + raise SeqrValidationError(msg) + + def validate_no_duplicate_variants( mt: hl.MatrixTable, ) -> None: diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py index 2ce4b3422..0512d9284 100644 --- a/v03_pipeline/lib/misc/validation_test.py +++ b/v03_pipeline/lib/misc/validation_test.py @@ -4,6 +4,7 @@ from v03_pipeline.lib.misc.validation import ( SeqrValidationError, + validate_allele_type, validate_expected_contig_frequency, validate_imputed_sex_ploidy, validate_no_duplicate_variants, @@ -32,6 +33,44 @@ def _mt_from_contigs(contigs): class ValidationTest(unittest.TestCase): + def test_validate_allele_type(self) -> None: + mt = hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + hl.Locus( + contig='chr1', + position=2, + reference_genome='GRCh38', + ), + hl.Locus( + contig='chr1', + position=3, + reference_genome='GRCh38', + ), + ], + 'alleles': [ + ['A', 'T'], + # NB: star alleles should pass through this validation just fine, + # but are eventually filtered out upstream. + ['A', '*'], + ['A', '-'], + ], + }, + cols={'s': ['sample_1']}, + entries={'HL': [[0.0], [0.0], [0.0]]}, + ).key_rows_by('locus', 'alleles') + self.assertRaisesRegex( + SeqrValidationError, + "Alleles with Unknown AlleleType are present in the callset: \\[\\['A', '-'\\]\\]", + validate_allele_type, + mt, + ) + def test_validate_imputed_sex_ploidy(self) -> None: sex_check_ht = hl.read_table(TEST_SEX_CHECK_1) mt = hl.MatrixTable.from_parts( diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index d443b1854..c8b795821 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -7,6 +7,7 @@ split_multi_hts, ) from v03_pipeline.lib.misc.validation import ( + validate_allele_type, validate_expected_contig_frequency, validate_imputed_sex_ploidy, validate_no_duplicate_variants, @@ -134,6 +135,7 @@ def create_table(self) -> hl.MatrixTable: ), ) if self.validate and self.dataset_type.can_run_validation: + validate_allele_type(mt) validate_no_duplicate_variants(mt) validate_expected_contig_frequency(mt, self.reference_genome) coding_and_noncoding_ht = hl.read_table( From d77a4e614e98e1776776685b049986b3e7c1ba9d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 16 May 2024 12:07:26 -0400 Subject: [PATCH 3/7] Fix syntax (#787) * Bump requirements * add validation * format * Fix syntax --- v03_pipeline/lib/reference_data/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/reference_data/config.py b/v03_pipeline/lib/reference_data/config.py index 8f7953576..54a9f4603 100644 --- a/v03_pipeline/lib/reference_data/config.py +++ b/v03_pipeline/lib/reference_data/config.py @@ -198,7 +198,7 @@ def custom_mpc_select(ht): 'pathogenicity': CLINVAR_PATHOGENICITIES, 'assertion': CLINVAR_ASSERTIONS, }, - 'filter': lambda ht: ~(ht.locus.contig == 'MT'), + 'filter': lambda ht: ht.locus.contig != 'MT', }, '38': { 'custom_import': download_and_import_latest_clinvar_vcf, @@ -209,7 +209,7 @@ def custom_mpc_select(ht): 'pathogenicity': CLINVAR_PATHOGENICITIES, 'assertion': CLINVAR_ASSERTIONS, }, - 'filter': lambda ht: ~(ht.locus.contig == 'chrM'), + 'filter': lambda ht: ht.locus.contig != 'chrM', }, }, 'dbnsfp': { From 82140dbb9e8c746189604a0a3e235581f17dac1a Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Fri, 17 May 2024 14:06:35 -0400 Subject: [PATCH 4/7] allele registry (#759) * add allele registry step in update vat with samples task * shh * existing tests pass * fix test deps * test * annotation_dependencies * ruff * take out the zero check * fix requirements new task name * move vep into new variants task * only annotate lookup from callset_ht * clean up mocks * r * working * working? * not that * minor changes and test cases * most recent script * working version * fix the test * implement ht chunking * fix patches * fix patches * register now yields id map of returned caids * r * fix some tests * return a hail table instead * use __str__ * log to track variants we can't map back * move to gcs with flag * union ar_ht instead of a bunch of left joins to prevent CAID, CAID_1, CAID_2... * cleaner * it is all coming together now ' * gnomad ids for 37' * use genomicalleles and gnomad ids * secrets * secret * move stuff out of environment file * add more logging * fix test * fix the other test * ruff * test * comments * o --- requirements.in | 1 + requirements.txt | 31 ++- v03_pipeline/lib/misc/allele_registry.py | 214 ++++++++++++++++ v03_pipeline/lib/misc/allele_registry_test.py | 233 ++++++++++++++++++ v03_pipeline/lib/model/dataset_type.py | 4 + v03_pipeline/lib/model/definitions.py | 70 ++++++ v03_pipeline/lib/model/environment.py | 6 + ...annotations_table_with_new_samples_test.py | 81 +++++- .../lib/tasks/write_new_variants_table.py | 23 ++ 9 files changed, 659 insertions(+), 4 deletions(-) create mode 100644 v03_pipeline/lib/misc/allele_registry.py create mode 100644 v03_pipeline/lib/misc/allele_registry_test.py diff --git a/requirements.in b/requirements.in index 0d598be5e..e1219fd47 100644 --- a/requirements.in +++ b/requirements.in @@ -4,3 +4,4 @@ hail==0.2.130 luigi>=3.4.0 gnomad==0.6.4 google-cloud-storage>=2.14.0 +google-cloud-secret-manager>=2.20.0 diff --git a/requirements.txt b/requirements.txt index c2138f1cd..b9a3cc37b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -97,10 +97,11 @@ frozenlist==1.4.0 # hail gnomad==0.6.4 # via -r requirements.in -google-api-core==2.14.0 +google-api-core[grpc]==2.14.0 # via # google-api-python-client # google-cloud-core + # google-cloud-secret-manager # google-cloud-storage google-api-python-client==2.108.0 # via -r requirements.in @@ -111,6 +112,7 @@ google-auth==2.23.4 # google-auth-httplib2 # google-auth-oauthlib # google-cloud-core + # google-cloud-secret-manager # google-cloud-storage # hail google-auth-httplib2==0.1.1 @@ -119,6 +121,8 @@ google-auth-oauthlib==0.8.0 # via hail google-cloud-core==2.4.1 # via google-cloud-storage +google-cloud-secret-manager==2.20.0 + # via -r requirements.in google-cloud-storage==2.14.0 # via -r requirements.in google-crc32c==1.5.0 @@ -127,7 +131,20 @@ google-crc32c==1.5.0 # google-resumable-media google-resumable-media==2.7.0 # via google-cloud-storage -googleapis-common-protos==1.61.0 +googleapis-common-protos[grpc]==1.61.0 + # via + # google-api-core + # grpc-google-iam-v1 + # grpcio-status +grpc-google-iam-v1==0.13.0 + # via google-cloud-secret-manager +grpcio==1.63.0 + # via + # google-api-core + # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status +grpcio-status==1.48.2 # via google-api-core hail==0.2.130 # via -r requirements.in @@ -226,10 +243,16 @@ portalocker==2.8.2 # via msal-extensions prompt-toolkit==3.0.41 # via ipython +proto-plus==1.23.0 + # via google-cloud-secret-manager protobuf==3.20.2 # via # google-api-core + # google-cloud-secret-manager # googleapis-common-protos + # grpc-google-iam-v1 + # grpcio-status + # proto-plus ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 @@ -251,7 +274,9 @@ pygments==2.17.2 # ipython # rich pyjwt[crypto]==2.8.0 - # via msal + # via + # msal + # pyjwt pyparsing==3.1.1 # via httplib2 pyspark==3.3.3 diff --git a/v03_pipeline/lib/misc/allele_registry.py b/v03_pipeline/lib/misc/allele_registry.py new file mode 100644 index 000000000..5d480a5bb --- /dev/null +++ b/v03_pipeline/lib/misc/allele_registry.py @@ -0,0 +1,214 @@ +import dataclasses +import hashlib +import json +import math +import time +import uuid + +import hail as hl +import hailtop.fs as hfs +import requests +from google.cloud import secretmanager +from requests import HTTPError + +from v03_pipeline.lib.logger import get_logger +from v03_pipeline.lib.model import Env, ReferenceGenome + +MAX_VARIANTS_PER_REQUEST = 1000000 +ALLELE_REGISTRY_URL = 'https://reg.genome.network/alleles?file=vcf&fields=none+@id+genomicAlleles+externalRecords.{}.id' +HTTP_REQUEST_TIMEOUT_S = 420 + +logger = get_logger(__name__) + + +@dataclasses.dataclass +class AlleleRegistryError: + base_url: str + error_type: str + description: str + message: str + input_line: str | None + + @classmethod + def from_api_response(cls, response: dict, base_url: str): + return cls( + base_url=base_url, + error_type=response['errorType'], + description=response['description'], + message=response['message'], + input_line=response.get('inputLine'), + ) + + def __str__(self) -> str: + msg = ( + f'\nAPI URL: {self.base_url}\nTYPE: {self.error_type}' + f'\nDESCRIPTION: {self.description}\nMESSAGE: {self.message}' + ) + return ( + msg if self.input_line is None else f'{msg}\nINPUT_LINE: {self.input_line}' + ) + + +def register_alleles_in_chunks( + ht: hl.Table, + reference_genome: ReferenceGenome, + base_url: str = ALLELE_REGISTRY_URL, + chunk_size: int = MAX_VARIANTS_PER_REQUEST, +): + num_rows = ht.count() + num_chunks = math.ceil(num_rows / chunk_size) + logger.info( + f'Registering {num_rows} allele(s) in chunks of {chunk_size} in {num_chunks} request(s).', + ) + for start_idx in range(0, num_rows, chunk_size): + end_idx = start_idx + chunk_size + if end_idx == chunk_size: + chunk_ht = ht.head(chunk_size) + elif end_idx <= num_rows: + chunk_ht = ht.head(end_idx).tail(chunk_size) + else: + chunk_ht = ht.tail(end_idx - num_rows) + yield register_alleles(chunk_ht, reference_genome, base_url) + + +def register_alleles( + ht: hl.Table, + reference_genome: ReferenceGenome, + base_url: str, +) -> hl.Table: + uuid4 = uuid.uuid4() + raw_vcf_file_name = f'{Env.HAIL_TMPDIR}/r_{uuid4}.vcf' + formatted_vcf_file_name = f'{Env.HAIL_TMPDIR}/f_{uuid4}.vcf' + + # Export the variants to a VCF + hl.export_vcf(ht, raw_vcf_file_name) + + # Reformat the VCF created by hail's 'export_vcf' function to be compatible with the Allele Registry + with hfs.open(raw_vcf_file_name, 'r') as vcf_in, hfs.open( + formatted_vcf_file_name, + 'w', + ) as vcf_out: + vcf_out.writelines(reference_genome.allele_registry_vcf_header) + for line in vcf_in: + if not line.startswith('#'): + # NB: The Allele Registry does not accept contigs prefixed with 'chr', even for GRCh38 + vcf_out.write(line.replace('chr', '')) + + logger.info('Calling the ClinGen Allele Registry') + with hfs.open(formatted_vcf_file_name, 'r') as vcf_in: + data = vcf_in.read() + res = requests.put( + url=build_url(base_url, reference_genome), + data=data, + timeout=HTTP_REQUEST_TIMEOUT_S, + ) + return handle_api_response(res, base_url, reference_genome) + + +def build_url(base_url: str, reference_genome: ReferenceGenome) -> str: + login, password = get_ar_credentials_from_secret_manager() + + # Request a gnomad ID for the correct reference genome + base_url = base_url.format(reference_genome.allele_registry_gnomad_id) + + # adapted from https://reg.clinicalgenome.org/doc/scripts/request_with_payload.py + identity = hashlib.sha1((login + password).encode('utf-8')).hexdigest() # noqa: S324 + gb_time = str(int(time.time())) + token = hashlib.sha1((base_url + identity + gb_time).encode('utf-8')).hexdigest() # noqa: S324 + return base_url + '&gbLogin=' + login + '&gbTime=' + gb_time + '&gbToken=' + token + + +def get_ar_credentials_from_secret_manager() -> tuple[str, str]: + if Env.ALLELE_REGISTRY_SECRET_NAME is None: + msg = ( + 'SHOULD_REGISTER_ALLELES is True but cannot get allele registry credentials ' + 'because ALLELE_REGISTRY_SECRET_NAME is not set' + ) + raise ValueError(msg) + + client = secretmanager.SecretManagerServiceClient() + name = client.secret_version_path( + Env.PROJECT_ID, + Env.ALLELE_REGISTRY_SECRET_NAME, + 'latest', + ) + response = client.access_secret_version(request={'name': name}) + payload_dict = json.loads(response.payload.data.decode('UTF-8')) + return payload_dict['login'], payload_dict['password'] + + +def handle_api_response( + res: requests.Response, + base_url: str, + reference_genome: ReferenceGenome, +) -> hl.Table: + response = res.json() + if not res.ok or 'errorType' in response: + error = AlleleRegistryError.from_api_response(response, base_url) + logger.error(error) + raise HTTPError(error.message) + + parsed_structs = [] + errors = [] + unmappable_variants = [] + for allele_response in response: + if 'errorType' in allele_response: + errors.append( + AlleleRegistryError.from_api_response(allele_response, base_url), + ) + continue + + # Extract CAID and allele info + caid = allele_response['@id'].split('/')[-1] + allele_info = next( + record + for record in allele_response['genomicAlleles'] + if record['referenceGenome'] == reference_genome.value + ) + chrom = allele_info['chromosome'] + pos = allele_info['coordinates'][0]['end'] + ref = allele_info['coordinates'][0]['referenceAllele'] + alt = allele_info['coordinates'][0]['allele'] + + if ref == '' or alt == '': + # AR will turn alleles like ["A","ATT"] to ["", "TT"] so try using gnomad IDs instead + if 'externalRecords' in allele_response: + gnomad_id = allele_response['externalRecords'][ + reference_genome.allele_registry_gnomad_id + ][0]['id'] + chrom, pos, ref, alt = gnomad_id.split('-') + else: + unmappable_variants.append(allele_response) + continue + + struct = hl.Struct( + locus=hl.Locus( + f'chr{chrom}' if reference_genome == ReferenceGenome.GRCh38 else chrom, + int(pos), + reference_genome=reference_genome.value, + ), + alleles=[ref, alt], + CAID=caid, + ) + parsed_structs.append(struct) + + logger.info( + f'{len(response) - len(errors)} out of {len(response)} variants returned CAID(s)', + ) + logger.info( + f'{len(unmappable_variants)} registered variant(s) cannot be mapped back to ours. ' + f'\nFirst unmappable variant:\n{unmappable_variants[0]}', + ) + if errors: + logger.warning( + f'{len(errors)} failed. First error: {errors[0]}', + ) + return hl.Table.parallelize( + parsed_structs, + hl.tstruct( + locus=hl.tlocus(reference_genome.value), + alleles=hl.tarray(hl.tstr), + CAID=hl.tstr, + ), + key=('locus', 'alleles'), + ) diff --git a/v03_pipeline/lib/misc/allele_registry_test.py b/v03_pipeline/lib/misc/allele_registry_test.py new file mode 100644 index 000000000..83a9ceda6 --- /dev/null +++ b/v03_pipeline/lib/misc/allele_registry_test.py @@ -0,0 +1,233 @@ +import shutil +import tempfile +from unittest.mock import ANY, Mock, patch + +import hail as hl +import requests + +from v03_pipeline.lib.misc.allele_registry import ( + HTTP_REQUEST_TIMEOUT_S as ALLELE_REGISTRY_TIMEOUT, +) +from v03_pipeline.lib.misc.allele_registry import ( + register_alleles, + register_alleles_in_chunks, +) +from v03_pipeline.lib.model import ReferenceGenome +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +TEST_SERVER_URL = 'http://reg.test.genome.network/alleles?file=vcf&fields=none+@id' + + +class AlleleRegistryTest(MockedDatarootTestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + shutil.rmtree(self.temp_dir.name) + + @patch.object(requests, 'put') + @patch( + 'v03_pipeline.lib.misc.allele_registry.get_ar_credentials_from_secret_manager', + ) + @patch('v03_pipeline.lib.misc.allele_registry.Env') + @patch('v03_pipeline.lib.misc.allele_registry.logger') + def test_register_alleles_38( + self, + mock_logger: Mock, + mock_env: Mock, + mock_get_credentials: Mock, + mock_put_request: Mock, + ): + mock_get_credentials.return_value = ('', '') + mock_env.HAIL_TMPDIR = self.temp_dir.name + + new_variants_ht = hl.Table.parallelize( + [ + { + 'locus': hl.Locus( + contig='chr1', + position=10126, + reference_genome='GRCh38', + ), + 'alleles': ['TA', 'T'], + 'rsid': 'rs370233999', + }, + { + 'locus': hl.Locus( + contig='chr1', + position=10129, + reference_genome='GRCh38', + ), + 'alleles': ['T', 'TC'], + 'rsid': 'rs370233997', + }, + { + 'locus': hl.Locus( + contig='chr1', + position=10128, + reference_genome='GRCh38', + ), + 'alleles': ['A', 'G'], + 'rsid': 'rs370234000', + }, + { + 'locus': hl.Locus( + contig='chr1', + position=10469, + reference_genome='GRCh38', + ), + 'alleles': ['C', 'G'], + 'rsid': 'rs370233998', + }, + ], + hl.tstruct( + locus=hl.tlocus(ReferenceGenome.GRCh38.value), + alleles=hl.tarray(hl.tstr), + rsid=hl.tstr, + ), + key=('locus', 'alleles'), + ) + + mock_response = Mock() + mock_put_request.return_value = mock_response + mock_response.ok = True + mock_response.json.return_value = [ + { + '@id': 'http://reg.genome.network/allele/CA997563840', + 'genomicAlleles': [ + { + 'chromosome': '1', + 'coordinates': [ + { + 'allele': '', # alt allele is '' + 'end': 10128, + 'referenceAllele': 'A', + 'start': 10127, + }, + ], + 'referenceGenome': 'GRCh38', + }, + ], + 'externalRecords': { + 'gnomAD_4': [{'id': '1-10126-TA-T'}], + }, # has gnomad ID + }, + { + '@id': 'http://reg.genome.network/allele/CA16716503', + 'genomicAlleles': [ + { + 'chromosome': '1', + 'coordinates': [ + { + 'allele': 'C', + 'end': 10131, + 'referenceAllele': '', # ref allele is '' and does not have a gnomad ID + 'start': 10131, + }, + ], + 'referenceGenome': 'GRCh38', + }, + ], + }, + { + '@id': 'http://reg.genome.network/allele/CA997563845', + 'genomicAlleles': [ + { + 'chromosome': '1', + 'coordinates': [ + { + 'allele': 'G', + 'end': 10128, + 'referenceAllele': 'A', + 'start': 10127, + }, + ], + 'referenceGenome': 'GRCh38', + }, + ], + 'externalRecords': {'gnomAD_4': [{'id': '1-10128-A-G'}]}, + }, + { + 'description': 'Given allele cannot be mapped in consistent way to reference genome.', + 'errorType': 'InternalServerError', + 'inputLine': 'Cannot align NC_000001.10 [10468,10469).', + 'message': '1 10469 rs370233998 C G . . .', + }, + ] + + ar_ht = register_alleles( + new_variants_ht, + ReferenceGenome.GRCh38, + TEST_SERVER_URL, + ) + self.assertEqual( + ar_ht.collect(), + [ + hl.Struct( + locus=hl.Locus('chr1', 10126, 'GRCh38'), + alleles=['TA', 'T'], + CAID='CA997563840', + ), + hl.Struct( + locus=hl.Locus('chr1', 10128, 'GRCh38'), + alleles=['A', 'G'], + CAID='CA997563845', + ), + ], + ) + mock_put_request.assert_called_once_with( + url=ANY, + data=f'{"".join(ReferenceGenome.GRCh38.allele_registry_vcf_header)}' + f'1\t10126\trs370233999\tTA\tT\t.\t.\t.\n' + f'1\t10128\trs370234000\tA\tG\t.\t.\t.\n' + f'1\t10129\trs370233997\tT\tTC\t.\t.\t.\n' + f'1\t10469\trs370233998\tC\tG\t.\t.\t.\n', + timeout=ALLELE_REGISTRY_TIMEOUT, + ) + mock_logger.warning.assert_called_once_with( + '1 failed. First error: \n' + 'API URL: http://reg.test.genome.network/alleles?file=vcf&fields=none+@id\n' + 'TYPE: InternalServerError\n' + 'DESCRIPTION: Given allele cannot be mapped in consistent way to reference genome.\n' + 'MESSAGE: 1\t10469\trs370233998\tC\tG\t.\t.\t.\n' + 'INPUT_LINE: Cannot align NC_000001.10 [10468,10469).', + ) + + @patch('v03_pipeline.lib.misc.allele_registry.register_alleles') + def test_register_alleles_in_chunks(self, mock_register_alleles): + chunk_size = 10 + ht = hl.Table.parallelize( + [{'x': x} for x in range(chunk_size * 3 + 5)], # 35 rows, expect 4 chunks + hl.tstruct(x=hl.tint32), + key='x', + ) + + # Instead of actually calling register_alleles, capture and assert on + # the value of 'x' in the first row of each chunk and number of rows in each chunk + def _side_effect(chunk_ht: hl.Table, *_): + value_in_first_row = hl.eval(chunk_ht.take(1)[0].x) + num_rows_in_chunk = chunk_ht.count() + return value_in_first_row, num_rows_in_chunk + + mock_register_alleles.side_effect = _side_effect + generator = register_alleles_in_chunks( + ht=ht, + reference_genome=ReferenceGenome.GRCh38, + base_url=TEST_SERVER_URL, + chunk_size=chunk_size, + ) + self.assertEqual(list(generator), [(0, 10), (10, 10), (20, 10), (30, 5)]) + + def test_register_alleles_in_chunks_no_new_variants(self): + ht = hl.Table.parallelize( + [], + hl.tstruct(x=hl.tint32), + key='x', + ) + empty_generator = register_alleles_in_chunks( + ht=ht, + reference_genome=ReferenceGenome.GRCh38, + base_url=TEST_SERVER_URL, + ) + with self.assertRaises(StopIteration): + next(empty_generator) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 107d651ce..cc9123848 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -281,3 +281,7 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]: mito.gt_stats, ], }[self] + + @property + def should_send_to_allele_registry(self): + return self == DatasetType.SNV_INDEL diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index 15939e36a..d4966d558 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -66,6 +66,76 @@ def contig_recoding(self, include_mt: bool = False) -> dict[str, str]: return recode + @property + def allele_registry_vcf_header(self) -> list[str]: + return { + ReferenceGenome.GRCh37: [ + '##fileformat=VCFv4.2\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '#CHROM POS ID REF ALT QUAL FILTER INFO\n', + ], + ReferenceGenome.GRCh38: [ + '##fileformat=VCFv4.2\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '##contig=\n', + '#CHROM POS ID REF ALT QUAL FILTER INFO\n', + ], + }[self] + + @property + def allele_registry_gnomad_id(self) -> str: + return { + ReferenceGenome.GRCh37: 'gnomAD_2', + ReferenceGenome.GRCh38: 'gnomAD_4', + }[self] + class SampleType(Enum): WES = 'WES' diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 81bf1f6b3..d89567d8b 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -19,16 +19,22 @@ ) VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None) VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None) +SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' +ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None) +PROJECT_ID = os.environ.get('PROJECT_ID', None) @dataclass class Env: ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS + ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA LOADING_DATASETS: str = LOADING_DATASETS PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS + PROJECT_ID: str | None = PROJECT_ID REFERENCE_DATASETS: str = REFERENCE_DATASETS + SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH VEP_CONFIG_URI: str | None = VEP_CONFIG_URI diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index b05fae398..21fe5f532 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -202,6 +202,8 @@ def test_missing_interval_reference( worker.run() self.assertFalse(uvatwns_task.complete()) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') + @patch('v03_pipeline.lib.tasks.write_new_variants_table.Env') @patch( 'v03_pipeline.lib.tasks.write_imported_callset.UpdatedCachedReferenceDatasetQuery', ) @@ -221,8 +223,10 @@ def test_multiple_update_vat( mock_vep: Mock, mock_standard_contigs: Mock, mock_update_vat_with_rdc_task: Mock, - mock_update_rdc_task: Mock, mock_updated_cached_reference_dataset_query, + mock_env: Mock, + mock_register_alleles: Mock, + mock_update_rdc_task: Mock, ) -> None: mock_updated_cached_reference_dataset_query.return_value = MockCompleteTask() mock_update_rdc_task.return_value = MockCompleteTask() @@ -235,6 +239,64 @@ def test_multiple_update_vat( ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) mock_vep_validate.return_value = None + # make register_alleles return CAIDs for 4 of 30 variants + mock_env.SHOULD_REGISTER_ALLELES = True + mock_register_alleles.side_effect = [ + iter( + [ + hl.Table.parallelize( + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=871269, + reference_genome='GRCh38', + ), + alleles=['A', 'C'], + CAID='CA1', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=874734, + reference_genome='GRCh38', + ), + alleles=['C', 'T'], + CAID='CA2', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=876499, + reference_genome='GRCh38', + ), + alleles=['A', 'G'], + CAID='CA3', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=878314, + reference_genome='GRCh38', + ), + alleles=['G', 'C'], + CAID='CA4', + ), + ], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + CAID=hl.tstr, + ), + key=('locus', 'alleles'), + ), + ], + ), + iter( + [], + ), # for the second call, there are no new variants, return empty iterator + ] + mock_standard_contigs.return_value = {'chr1'} # This creates a mock validation table with 1 coding and 1 non-coding variant # explicitly chosen from the VCF. @@ -308,6 +370,7 @@ def test_multiple_update_vat( x for x in ht.select( 'gt_stats', + 'CAID', ).collect() if x.locus.position <= 871269 # noqa: PLR2004 ], @@ -320,6 +383,7 @@ def test_multiple_update_vat( ), alleles=['A', 'C'], gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), + CAID='CA1', ), ], ) @@ -363,6 +427,7 @@ def test_multiple_update_vat( 'xpos', 'gt_stats', 'screen', + 'CAID', ).collect() if x.locus.position <= 878809 # noqa: PLR2004 ], @@ -392,6 +457,7 @@ def test_multiple_update_vat( xpos=1000871269, gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), screen=hl.Struct(region_type_ids=[1]), + CAID='CA1', ), hl.Struct( locus=hl.Locus( @@ -407,6 +473,7 @@ def test_multiple_update_vat( xpos=1000874734, gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), screen=hl.Struct(region_type_ids=[]), + CAID='CA2', ), hl.Struct( locus=hl.Locus( @@ -422,6 +489,7 @@ def test_multiple_update_vat( xpos=1000876499, gt_stats=hl.Struct(AC=31, AN=32, AF=0.96875, hom=15), screen=hl.Struct(region_type_ids=[]), + CAID='CA3', ), hl.Struct( locus=hl.Locus( @@ -437,6 +505,7 @@ def test_multiple_update_vat( xpos=1000878314, gt_stats=hl.Struct(AC=3, AN=32, AF=0.09375, hom=0), screen=hl.Struct(region_type_ids=[]), + CAID='CA4', ), hl.Struct( locus=hl.Locus( @@ -452,6 +521,7 @@ def test_multiple_update_vat( xpos=1000878809, gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), screen=hl.Struct(region_type_ids=[]), + CAID=None, ), ], ) @@ -564,6 +634,7 @@ def test_multiple_update_vat( ], ) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) @@ -574,6 +645,7 @@ def test_update_vat_grch37( mock_vep_validate: Mock, mock_vep: Mock, mock_update_vat_with_rdc_task: Mock, + mock_register_alleles: Mock, mock_update_rdc_task: Mock, ) -> None: mock_update_rdc_task.return_value = MockCompleteTask() @@ -586,6 +658,7 @@ def test_update_vat_grch37( ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) mock_vep_validate.return_value = None + mock_register_alleles.side_effect = None worker = luigi.worker.Worker() uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( reference_genome=ReferenceGenome.GRCh37, @@ -625,6 +698,7 @@ def test_update_vat_grch37( ) self.assertFalse(hasattr(ht, 'rg37_locus')) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) @@ -637,6 +711,7 @@ def test_update_vat_without_accessing_private_datasets( mock_vep: Mock, mock_rdc_env: Mock, mock_update_vat_with_rdc_task: Mock, + mock_register_alleles: Mock, mock_update_rdc_task: Mock, ) -> None: mock_update_rdc_task.return_value = MockCompleteTask() @@ -657,6 +732,7 @@ def test_update_vat_without_accessing_private_datasets( mock_rdc_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) mock_vep_validate.return_value = None + mock_register_alleles.side_effect = None worker = luigi.worker.Worker() uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( reference_genome=ReferenceGenome.GRCh38, @@ -696,12 +772,14 @@ def test_update_vat_without_accessing_private_datasets( ], ) + @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) def test_mito_update_vat( self, mock_update_vat_with_rdc_task: Mock, + mock_register_alleles: Mock, mock_update_rdc_task: Mock, ) -> None: mock_update_rdc_task.return_value = MockCompleteTask() @@ -712,6 +790,7 @@ def test_mito_update_vat( sample_type=SampleType.WGS, ) ) + mock_register_alleles.side_effect = None worker = luigi.worker.Worker() update_variant_annotations_task = ( UpdateVariantAnnotationsTableWithNewSamplesTask( diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index cdb65af7e..34ea5ee6a 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -7,6 +7,7 @@ from v03_pipeline.lib.annotations.rdc_dependencies import ( get_rdc_annotation_dependencies, ) +from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks from v03_pipeline.lib.misc.callsets import callset_project_pairs, get_callset_ht from v03_pipeline.lib.misc.math import constrain from v03_pipeline.lib.model import Env, ReferenceDatasetCollection @@ -242,6 +243,28 @@ def create_table(self) -> hl.Table: rdc_ht = self.annotation_dependencies[f'{rdc.value}_ht'] new_variants_ht = new_variants_ht.join(rdc_ht, 'left') + # Register the new variant alleles to the Clingen Allele Registry + # and annotate new_variants table with CAID. + if ( + Env.SHOULD_REGISTER_ALLELES + and self.dataset_type.should_send_to_allele_registry + ): + ar_ht = hl.Table.parallelize( + [], + hl.tstruct( + locus=hl.tlocus(self.reference_genome.value), + alleles=hl.tarray(hl.tstr), + CAID=hl.tstr, + ), + key=('locus', 'alleles'), + ) + for ar_ht_chunk in register_alleles_in_chunks( + new_variants_ht, + self.reference_genome, + ): + ar_ht = ar_ht.union(ar_ht_chunk) + new_variants_ht = new_variants_ht.join(ar_ht, 'left') + return new_variants_ht.annotate_globals( updates={ hl.Struct(callset=callset_path, project_guid=project_guid) From 474d0caed098b84ab2ba05e4ec23eadef5339cae Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 23 May 2024 08:51:19 -0600 Subject: [PATCH 5/7] Reference Data Update Type Equality Check (#789) * Finish validity check test * ruff * update dbnsfp field * More types * more types * ugh * twiddle it back * update type * more tweaks * lint * fix floats * decompose * ruff formatg * Update compare_globals_test.py --- .../lib/reference_data/compare_globals.py | 44 +++++++--- .../reference_data/compare_globals_test.py | 79 ++++++++++++++---- ...ble_with_updated_reference_dataset_test.py | 6 +- .../test_combined_1.ht/.README.txt.crc | Bin 12 -> 12 bytes .../test_combined_1.ht/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_combined_1.ht/README.txt | 4 +- .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../test_combined_1.ht/metadata.json.gz | Bin 725 -> 725 bytes .../rows/.metadata.json.gz.crc | Bin 20 -> 20 bytes .../test_combined_1.ht/rows/metadata.json.gz | Bin 1064 -> 1063 bytes ...0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc | Bin 0 -> 12 bytes ...0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.crc | Bin 12 -> 0 bytes ...art-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad | Bin 0 -> 106 bytes ...art-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090 | Bin 107 -> 0 bytes .../test_combined_37.ht/.README.txt.crc | Bin 12 -> 12 bytes .../test_combined_37.ht/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_combined_37.ht/README.txt | 4 +- .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 140 bytes .../metadata.json.gz | Bin 0 -> 187 bytes .../.index.crc | Bin 12 -> 0 bytes .../.metadata.json.gz.crc | Bin 12 -> 0 bytes .../index | Bin 137 -> 0 bytes .../metadata.json.gz | Bin 186 -> 0 bytes .../test_combined_37.ht/metadata.json.gz | Bin 703 -> 703 bytes .../rows/.metadata.json.gz.crc | Bin 20 -> 20 bytes .../test_combined_37.ht/rows/metadata.json.gz | Bin 1029 -> 1027 bytes ...0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc | Bin 0 -> 12 bytes ...0-da748236-79a9-461e-a62a-a9280e863d48.crc | Bin 12 -> 0 bytes ...art-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff | Bin 0 -> 210 bytes ...art-0-da748236-79a9-461e-a62a-a9280e863d48 | Bin 212 -> 0 bytes .../test_combined_mito_1.ht/.README.txt.crc | Bin 12 -> 12 bytes .../.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_combined_mito_1.ht/README.txt | 4 +- .../.index.crc | Bin 12 -> 0 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin .../index | Bin 130 -> 130 bytes .../metadata.json.gz | Bin .../test_combined_mito_1.ht/metadata.json.gz | Bin 560 -> 564 bytes .../rows/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../rows/metadata.json.gz | Bin 851 -> 855 bytes ...0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.crc | Bin 12 -> 0 bytes ...0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc | Bin 0 -> 12 bytes ...art-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4 | Bin 219 -> 0 bytes ...art-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 | Bin 0 -> 224 bytes .../test_interval_1.ht/.README.txt.crc | Bin 12 -> 12 bytes .../test_interval_1.ht/.metadata.json.gz.crc | Bin 12 -> 12 bytes .../test_interval_1.ht/README.txt | 4 +- .../globals/.metadata.json.gz.crc | Bin 12 -> 12 bytes .../globals/metadata.json.gz | Bin 359 -> 357 bytes .../globals/parts/.part-0.crc | Bin 12 -> 12 bytes .../test_interval_1.ht/globals/parts/part-0 | Bin 221 -> 217 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 79 bytes .../metadata.json.gz | Bin 0 -> 176 bytes .../.index.crc | Bin 12 -> 0 bytes .../.metadata.json.gz.crc | Bin 12 -> 0 bytes .../index | Bin 65 -> 0 bytes .../metadata.json.gz | Bin 176 -> 0 bytes .../test_interval_1.ht/metadata.json.gz | Bin 372 -> 372 bytes .../rows/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_interval_1.ht/rows/metadata.json.gz | Bin 647 -> 643 bytes ...0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc | Bin 0 -> 12 bytes ...0-2d30884d-a682-4d9e-9214-4bf4b5156c98.crc | Bin 12 -> 0 bytes ...art-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 | Bin 0 -> 60 bytes ...art-0-2d30884d-a682-4d9e-9214-4bf4b5156c98 | Bin 47 -> 0 bytes 72 files changed, 109 insertions(+), 36 deletions(-) rename v03_pipeline/var/test/reference_data/test_combined_1.ht/index/{part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx => part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx}/.index.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_1.ht/index/{part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx => part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_1.ht/index/{part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx => part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx}/index (100%) rename v03_pipeline/var/test/reference_data/test_combined_1.ht/index/{part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx => part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx}/metadata.json.gz (100%) create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090 create mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.index.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/index create mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/metadata.json.gz delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.index.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/.metadata.json.gz.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/index delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/metadata.json.gz create mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-da748236-79a9-461e-a62a-a9280e863d48.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-da748236-79a9-461e-a62a-a9280e863d48 delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx/.index.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx => part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx => part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx}/index (61%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.idx => part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx}/metadata.json.gz (100%) delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4 create mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 create mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc create mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index create mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/metadata.json.gz delete mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.index.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/.metadata.json.gz.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/index delete mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.idx/metadata.json.gz create mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.crc create mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 delete mode 100644 v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98 diff --git a/v03_pipeline/lib/reference_data/compare_globals.py b/v03_pipeline/lib/reference_data/compare_globals.py index 0b5c2993d..1feb0ac12 100644 --- a/v03_pipeline/lib/reference_data/compare_globals.py +++ b/v03_pipeline/lib/reference_data/compare_globals.py @@ -21,7 +21,7 @@ class Globals: paths: dict[str, str] versions: dict[str, str] enums: dict[str, dict[str, list[str]]] - selects: dict[str, set[str]] + selects: dict[str, dict[str, hl.dtype]] def __getitem__(self, name: str): return getattr(self, name) @@ -50,7 +50,11 @@ def from_dataset_configs( dataset_ht = dataset_ht.transmute( **get_enum_select_fields(dataset_ht, dataset_config), ) - selects[dataset] = set(dataset_ht.row) - set(dataset_ht.key) + selects[dataset] = { + k: v.dtype + for k, v in dict(dataset_ht.row).items() + if k not in set(dataset_ht.key) + } return cls(paths, versions, enums, selects) @classmethod @@ -69,32 +73,52 @@ def from_ht( if dataset in ht.row: # NB: handle an edge case (mito high constraint) where we annotate a bool from the reference dataset collection selects[dataset] = ( - set(ht[dataset]) + {k: v.dtype for k, v in dict(ht[dataset]).items()} if isinstance(ht[dataset], hl.StructExpression) - else set() + else {} ) return cls(paths, versions, enums, selects) +def validate_selects_types( + ht1_globals: Globals, + ht2_globals: Globals, + dataset: str, +) -> None: + # Assert that all shared annotations have identical types + shared_selects = ( + ht1_globals['selects'][dataset].keys() + & ht2_globals['selects'].get(dataset).keys() + ) + mismatched_select_types = [ + (select, ht2_globals['selects'][dataset][select]) + for select in shared_selects + if ( + ht1_globals['selects'][dataset][select] + != ht2_globals['selects'][dataset][select] + ) + ] + if mismatched_select_types: + msg = f'Unexpected field types detected in {dataset}: {mismatched_select_types}' + raise ValueError(msg) + + def get_datasets_to_update( ht1_globals: Globals, ht2_globals: Globals, validate_selects: bool = True, ) -> list[str]: datasets_to_update = set() - for field in dataclasses.fields(Globals): if field.name == 'selects' and not validate_selects: continue - datasets_to_update.update( ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(), ) for dataset in ht1_globals[field.name].keys() & ht2_globals[field.name].keys(): - if ht1_globals[field.name].get(dataset) != ht2_globals[field.name].get( - dataset, - ): + if field.name == 'selects': + validate_selects_types(ht1_globals, ht2_globals, dataset) + if ht1_globals[field.name][dataset] != ht2_globals[field.name][dataset]: logger.info(f'{field.name} mismatch for {dataset}') datasets_to_update.add(dataset) - return sorted(datasets_to_update) diff --git a/v03_pipeline/lib/reference_data/compare_globals_test.py b/v03_pipeline/lib/reference_data/compare_globals_test.py index 0d290489b..786964fcb 100644 --- a/v03_pipeline/lib/reference_data/compare_globals_test.py +++ b/v03_pipeline/lib/reference_data/compare_globals_test.py @@ -103,8 +103,15 @@ def test_create_globals_from_dataset_configs( self.assertTrue( dataset_config_globals.selects == { - 'a': {'test_select', 'test_enum_id'}, - 'b': {'test_select', 'field2', 'test_enum_id'}, + 'a': { + 'test_select': hl.tint32, + 'test_enum_id': hl.tint32, + }, + 'b': { + 'test_select': hl.tint32, + 'field2': hl.tint32, + 'test_enum_id': hl.tint32, + }, }, ) @@ -134,7 +141,11 @@ def test_create_globals_from_dataset_configs_single_dataset(self, mock_read_tabl self.assertTrue( dataset_config_globals.selects == { - 'b': {'test_select', 'field2', 'test_enum_id'}, + 'b': { + 'test_select': hl.tint32, + 'field2': hl.tint32, + 'test_enum_id': hl.tint32, + }, }, ) @@ -186,8 +197,8 @@ def test_from_rdc_or_annotations_ht(self): self.assertTrue( rdc_globals.selects == { - 'gnomad_non_coding_constraint': {'z_score'}, - 'screen': {'region_type_ids'}, + 'gnomad_non_coding_constraint': {'z_score': hl.tfloat32}, + 'screen': {'region_type_ids': hl.tarray(hl.tint32)}, }, ) @@ -198,13 +209,13 @@ def test_get_datasets_to_update_version_different(self): # 'a' has a different version, 'c' is missing version in ht2_globals versions={'a': 'v2', 'b': 'v2', 'c': 'v1'}, enums={'a': {}, 'b': {}, 'c': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ) self.assertTrue(result == ['a', 'c']) @@ -216,13 +227,13 @@ def test_get_datasets_to_update_path_different(self): paths={'a': 'a_path', 'b': 'old_b_path', 'c': 'extra_c_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ) self.assertTrue(result == ['b', 'c']) @@ -238,13 +249,13 @@ def test_get_datasets_to_update_enum_different(self): 'b': {'enum_key_1': []}, 'c': {}, }, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {'test_enum': ['C', 'D']}, 'b': {'enum_key_2': []}}, - selects={'a': set(), 'b': set()}, + selects={'a': {}, 'b': {}}, ), ) self.assertTrue(result == ['a', 'b', 'c']) @@ -257,16 +268,54 @@ def test_get_datasets_to_update_select_different(self): enums={'a': {}, 'b': {}}, # 'a' has extra select, 'b' has different select, 'c' is missing select in ht2_globals selects={ - 'a': {'field1', 'field2'}, - 'b': {'test_select'}, - 'c': set('test_select'), + 'a': {'field1': hl.tint32, 'field2': hl.tint32}, + 'b': {'test_select': hl.tint32}, + 'c': {'test_select': hl.tint32}, }, ), ht2_globals=Globals( paths={'a': 'a_path', 'b': 'b_path'}, versions={'a': 'v1', 'b': 'v2'}, enums={'a': {}, 'b': {}}, - selects={'a': {'field1'}, 'b': {'test_select_2'}}, + selects={'a': {'field1': hl.tint32}, 'b': {'test_select_2': hl.tint32}}, ), ) self.assertTrue(result == ['a', 'b', 'c']) + + def test_get_datasets_to_update_select_type_validation(self): + self.assertRaisesRegex( + ValueError, + "Unexpected field types detected in a: \\[\\('field1', dtype\\('int32'\\)\\)\\]", + get_datasets_to_update, + ht1_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={ + 'a': {'field1': hl.tarray(hl.tint32)}, + }, + ), + ht2_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={'a': {'field1': hl.tint32, 'field2': hl.tint32}}, + ), + ) + result = get_datasets_to_update( + ht1_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={ + 'a': {'field1': hl.tarray(hl.tint32)}, + }, + ), + ht2_globals=Globals( + paths={'a': 'a_path'}, + versions={'a': 'v1'}, + enums={'a': {}}, + selects={'a': {'field1': hl.tarray(hl.tint32), 'field2': hl.tint32}}, + ), + ) + self.assertTrue(result == ['a']) diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index 842686c5c..67a5492bf 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -45,7 +45,7 @@ hl.tstruct( locus=hl.tlocus('GRCh38'), alleles=hl.tarray(hl.tstr), - PHRED=hl.tint32, + PHRED=hl.tfloat32, ), key=['locus', 'alleles'], globals=hl.Struct( @@ -760,7 +760,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( conditions=None, ), dbnsfp=hl.Struct( - REVEL_score=0.043, + REVEL_score=0.0430000014603138, SIFT_score=None, Polyphen2_HVAR_score=None, MutationTaster_pred_id=0, @@ -1168,7 +1168,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( conditions=None, ), dbnsfp=hl.Struct( - REVEL_score=0.043, + REVEL_score=0.0430000014603138, SIFT_score=None, Polyphen2_HVAR_score=None, MutationTaster_pred_id=0, diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc index a98fcdeadc0fcaf79fa2103c4c51f29df3150945..1c47b9a3c9eae1262a59c31f8c62067028259bc0 100644 GIT binary patch literal 12 TcmYc;N@ieSU}A9Nc8>-C526At literal 12 TcmYc;N@ieSU}D(#Q0D~z6ek1W diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc index b2ace029a7d4822ca27d6492cfef89141512034c..db7a7824cd79229b8fa039920ffc63383e227bdd 100644 GIT binary patch literal 16 XcmYc;N@ieSU}E_5UO7Zq$#*IMB`gHA literal 16 XcmYc;N@ieSU}6x_xbB~3c>F&AA8`cw diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt index 856600d4e..e46de4296 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.128-eead8100a1c1 - Created at 2024/05/09 20:02:21 \ No newline at end of file + Written with version 0.2.130-bea04d9c79b5 + Created at 2024/05/20 13:48:16 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/index b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz index 2ad9c32f8c2ad53e857f2b08f4041690baa24bd4..d005657568a8c488be1e47dce7e7e6e22410a1ad 100644 GIT binary patch literal 725 zcmV;`0xJCr{x5mjq^xS8Tid)b3T;(rBS2^pLY7Gm@aEW= zZQ3G~|2{j36DMhs(rnqR5R;*Nz z1S>$8SVX3jL@uCsnt)9%^UL1+hnWLAB%QR4oYlFNYMEW3b@FS#PKE_u=2$4rEDNlW zr-4c!(p`n$2QxsyKnQ$X6`8Lyr;;Ls(`Y+NTqi;zqK-|jkEH~m^cQQ#tiO&4F+C`3 zycSq05GWP;xF#XX{9kYkR+L8v>0?TPf?WH8zuFkNC777sPj|E7zJLAM`n8gByC2@q zMz_J3a;*BjPH6@IAr2mxHv76qTiqh5~VhzqD4I=dPJRTwqA{O82=I57|7Yc=A@&0O0l{3NmY@G8$f?;Mk|gdn Hj0gY#qKa;d literal 725 zcmV;`0xJCr{$KL6Nm*5B)uFs)6xyoLMu5 zX9JQ}h;fpbGhur_ri8s>VZ~(2E(G-ZP%!1i^2fg6t?Y#MP!*jmiW` zu>ypN)yRyJ*i|T=rC`bBe$|`*kU6kp+M{jGdBd#|D|V&S2*u@u@YOI?en zQB{gqcNKme%>V_V5cu>`GT-D*C1nVw(RP%$CPHGOPHn7@r39h$7wg7szK#hoKPqgz zR#++!C=)te8<6MzPdEiHl*dQuV@iR7-1vgO*%*Z-n3>;CcZ>1iYV_Iqbx670kM0+f z+u(tzt6{IO68(^rW?y$`r(30TjL0dOxbb`aK%{{mnW{2v{5_}1Qeh!0>j$8^e_CbJ z6qb!IkK2?e-L77FoFzmRTheKIYtSY}K-t0cdP+GiI^fP;I7QKOG!MU9vmZ{I;~0g> zkBX%h?m;;MPhCq&tn}1?KA3Db#Zw)mIyiAHH}t$fceCej;jG3@wb2RF*47xZe7xQ^)_YJU$VQTi^+O2bJO2k`^$&v zs>3Ma^pqWd4blBNTy%`@EP6BS>&bF;-3LU*sjc1VZAnu!f1K&c6FuB*yS^>-8yNF6 z;|!RRUU2rx=dU<2oSJ8GvA!K!mt_WV@v_#>srApbmcRWyu5i|%4C}ouHn&)OH$Y@f zfaj!6n@DGt7T%FPWe zksou(c1p%L5l1(PQzlu=Q8MSU*Uj9MZxoD#AjglT>c;g6rM9)@^=kSKMYAv_`DQ<4 z{0kAbR$*~YhlL9lV+ca#OU8FgBKPqQIP)z|de}xxN)To+WdUMN^J+EMG8Y)S8k&JmmBdT-0?@TC5j+y# zQX9>@<=ygT^g%1RqnU9%@kaDh89UxxgEa5r$Xbh-zP&XzX6}y15uqMD#*r$t6@cOG zwz5`UnJ&s$y}On7esMP*EvmgUTs3VO!D#nbk^jekl-O0jSx$_Hk_Z+N2_9jVQtHP7 z47D`DzHMkx2qp1s-BJ}>CWXHa!0}plVCKp=iBwmZRhU#sw$;O^Le&A8OSe1?5_{lQ z(}(He6p$WHcRioo*P~p<;W60*?|Rn{qg8$N<{*h`_j{7!KzIo@vP$AKa_+-)ecx}{ zvs8LDae%WJMc&OH3k5Ws{F*|RDT;kLQOx4kLjC{pel^+mE`QZ{D&`Lr(bfq~7L*!D zs>A3-7HJQy5HcCmB+xSeYU5Agrb(Y#f)#qKg_1ih$gzpjfQ`YL8fq-kSfH^vV_~M1 zGSz?^{jA5k>g(r+yEfa_Xt$H)Kk=z(Vt}NLBNjqh2+{(NZ?nTjM@0<{jYhszn)*p< zV6>p31r*=p#0!{cLZY1!FCoGr#P$#bAZo{<#dLhuy&P9duH)}nNDY-Svg9+`w3N0= zxMp8bVQ_xl8#K55nM#^(edRu^E|93cGgCiIf~f;wEr)q2%g=)XLWLugzj^a}uCAmZ zkRU(x{iCCx`9=tsNF_d$#{%AizKbVL>x@_F;>z#`9wRe14uTY^8=UvbG^_|+5$}@@ljLnNh`_5xxUW2MSww26L#uFms zjHz#L{2W&|owuC3{{s1lr%|F#Ytq9-27|(=M z?SFDoJPb;eSn*+x7ex=FSAQ?gFr2c7o#ExhS$A-iRUiKtzz8)a4nA@RIQ>`bFz$y3 h3hN>;%pA0As$3{Zrg9-t3gs!w{swU;>2|mf0092>{x|>t literal 1064 zcmV+@1lRi?iwFP!000000Ns~gZ`w!@z`u*1daCIqkkEuTpwOU_1VI#?u0qzbJBDrT zwQs#tp^?A)&Dz-BwVmctU0bPni0s+f`L{Da){l-LAvgnPCKx;bTi(B0#X!4`>v*5x z;0!)GK}7cnzLOw;01{`7OEh9YwW2e4EG*m|VqoKP0kuGjaOL%6L=spjxu@zeGdHkA zG!lw$wTufQ4{jVKT=7VtWF}Ovo4QBeC>Tmf4j&8EwMz+Owzb98a`LX)N&_yw5NYe= z_GWC5Yq=PKA8@L;*e;0LMOzS@VhrfA@>p$;4V{C>4KsV8O4n<58wG5oh)fEFr#6x# z0jX0)S?MxgTf51A6LO_Mn!yz5M}lRwYObX&aC9{=1D|M#7w;sX>uC^?tVHlYsHHZX zdW+k|_3*D!$sJCO=dm|rpNiP=?Fy9n7KPSY$jjFoV`J)WMHCX|!9x`4LR$eC-fk*u z#gplxh}FAYdhh4Aqv5>TJH=JgmJv+p9xL)|{D-k!_3OphIFwlOkSOp7x0DhZNifus z82h%N#UN0`vvmtqY?%c9*@M96vIA3BMRBOR!mPr$O0umUMir_KNL{uOi67eox18Kh z=Es0u=cf8+?rJu@t4Fzrfh7#% z&qC|f#6Heq7<$(q<{D=>`89U4;yFNAeAw)LA7t$+G@p!4yBDKs$#v+C2h30zAxS=?O-pH` zg)9CQ75eAry?%4s--%+`))(%>>H-PtJ5&9KNpOAntK~2&WZ7L%LZESk@;7h(%+$3s z0ODsizW;O$G=C3#E)#`^@{pq&4;gQp*8Q&1#g)Miyh5gKb+*>^}Op@zr0i1Qh6eg-?4eIXy18B%xX|o$F`C>+IUQ) znsWW!ji2KBrE}7`?DxR$Z=gfjx!*ZE>yggo+Y{Qk*ql?pyP@pl@+A9A|Fk4x3nRP} zGQI!NN$@hLbYl5OJwD_;j9%$Zo?NZ{=hzqyFtb~JyTK>V5PtHNMSJ973kHV%Ltkx}<&P>D*fKEi0(F7_*kA@m1_l5U CmK@0d literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-963ef6be-cd5f-443a-970c-8b5cf2bcd090 deleted file mode 100644 index a75be974cc14cab741093a616c5b441e7cd0b4d2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 107 zcmWGzU|{eBVvVi(e-%7EnHU&%uq0;`89tq6%Eai%=*(2lFwOb2y@C|ubxYyrpDc}y z85+*9F&toKxBPa4Po5$C>4*s$9b11|V#n W?JW%QVWB>pF diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-da748236-79a9-461e-a62a-a9280e863d48.idx/metadata.json.gz deleted file mode 100644 index aaacf6d649dcba0682cabdbd0990b089b058efdf..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 186 zcmV;r07d^FiwFP!0000009B5`3c@fDME_+^3OTe|s^%tw9uyQ6FXADs+a?$iNw#1p z{dc$CybKF7`(}D*jKv#6+dM z5-t?FntJQ&F3Q8Glihiv=ZmRO)}qBI*CD8M=O7aw0|-#gc~Zoc|B%shdrT2zo(nMU oiJy}q&57U5cJQ8#O%suKB;2e zYKvX#!SU;Wys=hsuFx$w+p&E;?;Z*r=uzx z2{iyQuEMXZyg)%HCC;B}<(tAeN!`LR+Fm7Lgiu0t-sk#QDUjNHv0=~d>y#~SGmXot z2dp$mluEN*pI{i~B(0l2ArGFIAer(pW57Udd?|k0epMz|mcQ?=RwtR`!v%`&qucPiP5a@Z zTh34z|7cX2;2N|Oh{Vk#)JAt5>5Y5tZgJN(X%zl>H0Ez_5(*y7cx_%5-gXAk?-{~z1^Ntabq=Ez2$_tKO{e)FZj+63fTRmRRY{@zW zQoWAtlW6M;gNomO&#Q_JDaB^ms(Gu%)ol>j)U3)3o7wuOv$IXvNQ$(aTMkB@!v(c2LU=mmr%oO!|d#o6@I_YRoQF6f<}Oiw1$ z%P#;pxtyGy;z>O5d{4j*1h5z+5agrFDu)QBs!d|ejZn9-^mst=H##5Xic;hL^K~O{7XqvUCOpp{S zK$zHw%qWT7fZ|ySmR#vK+Wd#if#vClwz12OTjMtDYH!4^7j_ud@rGmV^jX%i9#0c$ z0smAEECVxo#Z*2hwUQ2NA%J-gE}E$(uKP1Pfo z3Ixi8Uak+&2)&b5&0kOi&y>fx^f9GCL2i7(Uv0lC6D-a5)Aee0IJ@|4>jnhflZb9s z^Q&M%IaX)W(Kc)RZ4%rvo%Xd}J1v~jT_Q&S;;t}K2ayh*GgaH=#@}A(olM?IQcc2gMxx2)D+oUm^zLpz$SfT6X!?$qR;ubpTwCUFWXomjn zn$CRQMYB%o@uhaL;u+8BWd9wtBjn*^+ez zarHX3Pok|ab(FvUJ+3O&pbYD2tLCj5SGPfAQ?n{FY-a19CzFk7BzR^f7kVw)U<2w` lB}D3YY?RAn)wFd!9$*XGf3-9M72x22a3NkMGwB&@;z%{LgT7 zypK*8(}Tn_2}1}$Ivc#AF$1a<9q+mH@Ni6kT~`;V1zLnRZ>AF>VXe{wRnNIQ&y>(u zD8BPhiwlw--6WP=@mQc_F4XWWb5EhY={QZv@#9Hx<5I$yDlpzt_4#l>z()IQ#U;1n~c16t;Zp>y=P;pQMzmGj2k#UWcOlFCxy zB|{=5q`F{~)ivgK8&@8-5myRQGnumdSg^cP?Yqncj-jmPOtr*M_9-xmAT{T29$Y>`+i~GrVQQOY&*7Rj0Q-#Nl{6F!d#P0g-^4fSQNy;Olz$e^Nk~B`iP?HJH zZA*(msEBXtmZ~^168=7bz~^cLGgrk)1U8iFdb|5!bfKPr%w=04gTw~hYWg@`oDzCd zxmwR>56vu>ad=GjAkO@o$MLFpdwWnskNX3W2navHLDof-kjpgZd^vHB#WoB5|J#Sv z^?rEyd&MRNWKa=-f8r90Mp`V=Eh^$BPWjr?EXrZy24E~dtng9&!cR}Am zeFx|}@cwnU+i0(WUg5k|Z(U~fCM%40pt}RxUxoHJkZs_4dk6wZ8!N|RIvJc@P3kMx zp?e-OLuG;t@EOP4Qrc?an*W0e7o*YeqHXqRddX_kFfV2K z&Xq!_Nd)C@-~N%SYiS51$TzF+TDMH<4R|UQw(_Z9$%}Irw>pW3j>QIVyq~qIWo{e< z5>%B25h5<$xg9ghHeEbk*I`s)(W}n>wWMEV)yk=n%`Y;2h%G_FI~{ z@^SS89)jO^+3hJTBtLiJiWn&%)3EN0UO9wS`PSgI%Wf9kEcx|g+EvO6h5U+>i%t8% zwJz^L-5uLY=4j^$N!5(&FI)WB)rVfcN9ffkpxb_L0GG6PJ{ktSEnE$H?7BQ9uoV6ZO2V|`G`rrMd7WIAJ_RH}Vu*}v%0)A;-k005XO38er4 literal 1029 zcmV+g1p50QiwFP!000000Nqz@Z<{(4{x5vG)5w;WE=~ECbYzW`woy`5)14yZ;zQyM z48zXU+=~Bxk0F2!bXzCwUTOId;p6k}^E|SjEKVY@9BaWTJOW+czuzR_4=vmBzQECP zK3QSx9|W#R7(xib+TxZUQ}C6dZmwY<>`GkeG5h)@q6;|N3}0WI21?XCRM zLzKCCw`=di@^&^|*0yup)qQEnRN=8B|4;lVF}r@fy3`IONm)cBc!U{B!jDtX)I@@F z+tQ*CO5&Nir7Dh$fUgJO_+3q4X3IEK_i9sVR7HRUw59y?=qxd`?8*ntx*i8oiNlZlm8uq)O zZ=k*d^c{HrKHN>T*Fdju-m14QvpUHN;~nVk!1i~c{R3ocxPCYU0i=y9$I_h*hLc&n zWn2E9g;Z0SAp?BDF*lSRlyJlTL51~A4yZu&wy>6hcKJD1YcJL((G`!$}_| z;}JcZ!xMJKKdg*~Pp6qOF)%tZIXc!ea9vlnVP-z6U|`GO zyx2WN`2eFcgR!wRP-dF*X?q1J#_N{C&p%lj8#6SVV`DhL%x?MZ2A@1b_{mom?U9Eq z7#R8wef_}b&M@!2;D-=~o99HC9Gw|~x>=YVomm{6S>1sa*g`E}5Mgj?ZFs;Y%D3Q~ zfX<$Z1cSv4C%D-57p@U1I=Jt#A?ty(1uQJ=PHZ6#1(lUnjJ!Y(fB?t?3JgG>000xF BKBfQw literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-da748236-79a9-461e-a62a-a9280e863d48 b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-da748236-79a9-461e-a62a-a9280e863d48 deleted file mode 100644 index df40d6fa16824219422825070637a5072157f50c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 212 zcmdnbz`(GCk%2*DtN!1FB@Df+3|H70|FAL|KAmRD#K7ptz;#{OhM75QPswBn zmSy$~&Wqhcln*dEGZ-6717)Wvv!gSMqcf{J&=y-}1~G<&s~X7% zgj`LVH4Hzj<2zVT)_9EVfk@K6eYt1DPsFi2p!(XF^zofqf@5CC~W IfdS|j0Mh6}+5i9m diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc index c591777978c115f329ab4f2a6d06078660dade37..436531ab22aa29ccf30ce13b907c450e9fba8bf5 100644 GIT binary patch literal 12 TcmYc;N@ieSU}9j@e%}lL5M~0| literal 12 TcmYc;N@ieSU}89?~b#KgqF7{chdkqB`UY9wT%+3bq6B99J;a1Cn=(8Jib%ZecFYxG7E0-CI zdH()-cCbQHjgjLl3?4v}0xV;Lfs$#KyOXNrSas>bU2o1v)=9G+Nz(KT3_YJ%}f)G{~fokj85%adoKe9nH-?!ZnHu za7w7Ih6M;K1=4Abgp5JIp^6VokWwMc7%)&rB*jZm^_lp)`@Z}7xXG5Q_klNBW6D#_ z+@*WFw!Ox5mZTSy9x3sTyvz3H)@EjqkfN6n_?i0p=4OFp`*Phy8B*lniKSqC?sZ zPqpTHlVs4G99nOE;I-~QqV>-P&+%)Lu@1L2BJb6Nz@Ui-@o!f!#?CDw@>|3uJd^SC5Z)n literal 16 XcmYc;N@ieSU}EUo@aaUh?};`5Dd`3| diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz index c39674e252f50b64acab11dbcc3b67ed4ee2d2c7..29e15e9d80d58b8e943112439dbdfeaf3e22c342 100644 GIT binary patch literal 855 zcmV-d1E~BTiwFP!000000Ns~SZ`v>r$G=OSHZ5$00%Cas1`|?SO$F^?6^e}Q5U+_H z*`|V3`R+R>ki<#4mTG-zA5e35zPtbB7dv@$IEty`ISWqco|4<2-Evk6jkspSsq$1I&!wP3+D z+jSlXM7r9_9jW8m7w5#tX?`Z3eOv#R7P7R1X_xKJaA z7}kt$e;^%-E#(YDl`^TdJgLMSd{#-brF)5!F`LJHLRiKN8Lq`0pkjo7{=*LdoOw3xFyo*yqydB-<} zFGhPmjLMQ`sfewm*<~SM%V8emKNcPKpDmi2r{UrFG+58UmHGYf{dO8&jy@Kg=QkPx z0oNMkb-tg)yU5kbbrtI-)K4(ohIC%033N?LNweiBP`0RRK-}cFh2HiQRc@QaJ_BqM zSDP7&mo93JBo$n=oFU-x-sZqu&b_&z|xjR+SRimd`H}8Z#*djQJZUCCs(UjGbl(+Mf;pzu0qJHgWHz~|5 zZ%bTWLsHl^yPAJaA|PY8LBYP+mhjCxRWXo0cCbP$Nv|eHjQ*^esnEwNtR5J+G=W|J*EOrSco6QP)9f`etob+4Pchtl3;C?(q69|D&yju?ML(vO)Io>e7ZF(Y1{!i5?^ z#IR<3{R8PxtSM(0s+38s; z!`s3aqpcrCUg?6RBDR*6m#MqvFb(oQOQz;Ycsx7_)-!Nzem{J_8;8Bij|JoTO@=@~ zwMKZI?Pt+0a?Fb*(R;ez}iIB zCdT5WLW{*p4HXNN7Bb(YLI&b$69iP&cJAxR=)8M5DqXIFLl%)r$Ow>qfV*V{t&DNb z{y@ORV9>j0F8if4$(EkD4hBF`Twj@*KM=vpSyj_8JKD0NOVY?Z3t&Gx`<#UvE6@^T z2h@vBt?KR&FsU@qu+1N9m@^!X3NBjC4)Azyb>gM22m*E7r=ShQxyZ4#Yo_V4ZR06* zR8&~Igz5g&U8?A+(NnFPcS0X*4;)3e0L|-Y%IZnV+xf|G^#c}Bzjm{m6y}z56NawPRGuYIwpC7q@FVKfdJnO z=A!eZ)4k~8e(&jVDD35KdTlsC}MkzmI=Ps@)Js67Pd*nzw4P)=T$qHm`dgL9FkU>rvqLg#rq dpqCc5iOFs(sZoHmmZnIW{sqO{-u6=s008OnpH=_> diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4.crc deleted file mode 100644 index bfecb511a09eb9ee5317ef47608e86d2b070266c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}C8Kb7?*R6srU! diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc new file mode 100644 index 0000000000000000000000000000000000000000..f8028adef6aefd664500ebcaa873e0cc197a2ee4 GIT binary patch literal 12 TcmYc;N@ieSU}E^YY(_r-6(s}( literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-262c4389-1ff9-432a-ab5c-4c9cfa547dc4 deleted file mode 100644 index 7acd67edc2e1c969198c7b12555a85b22d0845d2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 219 zcmV<103`p$0000%0ssIgwJ-f(K>@`D0J4Z7DsXK&d=c=~EqAnz3PC(Rplw{wVf1@T zSpKZhiS1P1RQyfkKQe~WqD(1fa7fk=n9OE&1JTHtDXgYa;4wI>-o@QZRa63Q!ViL` zJUDE}jMA|Sd+;;4*a)lRqk5*>yL;ExuaIVT+7~G~Vi;yBZR>RL_c4(892XXaDfjK( zLn;#qRXq~`Fu3Hx#=tmNSO~?#W+@QZRBvMI60#}>@ht?mU;scF+(Q%vCUM0rndn7; VJPiN<0000004TLD{U87V000;tT}1!@ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 new file mode 100644 index 0000000000000000000000000000000000000000..f78553dbff70cac10c26a3a847bcbbc696081582 GIT binary patch literal 224 zcmX@jz`&r(#K54jRsU~-F5_EPh9w+~94yHhMZSzo42+J9&h-rPt=jesJLFjH8JQRv z`hIgWxYsin8ynj*yb0W47m~E#xF#bv0~1hn2%}>?LrlMuJwyK?Q+wy6{RbJD_!t{JmO*44JJ7`bLtkwf&Ys~3Nm?{(rlcLi z^~DAZA`BNS9!j|#*bx@NaelL!NQ{nFUYNJQx+?Z#yexYdm=6f=a28=y%RHKUX4?S| Tu>vMupgkY}@{R%nBLf2fOqV>) literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc index 2985d33a727d77c8feffc21c93a074438d31d566..7132fdfe3cc37620d0e1e1ae3d399aaf4189568d 100644 GIT binary patch literal 12 TcmYc;N@ieSU}BiO_6rLD6Tbs< literal 12 TcmYc;N@ieSU}AWYyT}p%6XXMH diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc index 8eabeebfec2ca55da91ecac106845a179b9f8060..2cacfc95c330dc4a8ec4d55314704d9cbaf0a608 100644 GIT binary patch literal 12 TcmYc;N@ieSU}C8K_{k6e6kG$$ literal 12 TcmYc;N@ieSU}Dhv-roZN61M|) diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt index 2c105134d..3d9a5ac98 100644 --- a/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.114-cc8d36408b36 - Created at 2023/07/13 19:51:12 \ No newline at end of file + Written with version 0.2.130-bea04d9c79b5 + Created at 2024/05/20 13:22:32 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc index d23992107450550c243e584968da657db5d685d6..d68bbbd080a8b2d387bbb0849ac99b19979e41c7 100644 GIT binary patch literal 12 TcmYc;N@ieSU}89MpW_e!6Ym40 literal 12 TcmYc;N@ieSU}6y8?EeM;5q|?f diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz index 380be3d033640e983bae3dc117df74e24092ebcf..c16ad768c557bd7860fd640812bfb5b00bc126e3 100644 GIT binary patch literal 357 zcmV-r0h<0FiwFP!000000G(68Zh}A*{Fl9!CasO8;Z|CEYSKiT9%_tPcb~Ww9%Oeh zp@e_$d!iuLgO&@s^Ja$G30n!2DTq+KTMZWUdQ4#RbA_jUD( zx+D#82sxx^c1QL6-=eAILVd4_GbJ0Qa)?+;h%OPu7ZQo&pQmn~`8ZPCuS!IfwxZ*m4Uc3+D@5iA;hZoQ2g< z@LuL1N6i3PKoCx@#u2wLbEXik%DLOvumi<9OSxR?Op~$FF}}2QhAZuesWdlbV1)tD z&E5w)FjgV)0<)f1V+FcMFAUW({YkVIH4tOCrNatv@MH`x=f~^|?*qvlw1=U(c89t; zMV*rdIfxun)Vq^<{%=v&a;Cmh#UCZq71Y*E0f#Tq@mIxq4W{=~O~%pfU>LQdfQgMt z2x{eE*kNn^Vf49{&pOKM=XHzUJcm zmdWK=k_D``B4TR-^4#Cw6tpe(Gl@MN_)1ZMlflS!Dwm@D^U|HInb%#zxefn098F%~ RtHBHO0|z`*byh`*{c#-FXD#78dCirKY78rRF84>ZT-?BwMpmZAdS<4E#ztnk1}0Vp L1`Mn~|9}AiMHoFs diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc new file mode 100644 index 0000000000000000000000000000000000000000..23324f542252c6a23a4e82c7ff3ae91d7e155c49 GIT binary patch literal 12 TcmYc;N@ieSU}9ME-1rm#6ZHeb literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..576d4ffd2f53ecd87bc049cfc67827d78f6adf26 GIT binary patch literal 12 TcmYc;N@ieSU}9*~ZOjA!5sCuX literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index b/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index new file mode 100644 index 0000000000000000000000000000000000000000..952d782a33fc4e7095e476ee391088b2cd8ec83b GIT binary patch literal 79 zcmY#pU|3c@fHM&IR53K^;`rOhfRqM)d_h(j80n_#(-xyWGj}^ ze>aL3Zv$^0(?Mx#yr2)(5y*5l&n5zLuSgq9)kkbzn_h^*anbe0hS7U9Tb}=d^xV( eIo_9Lz*tXWUNL){H;nPp_5B;Mk&JZ40001?{7q2+ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz index 55f91078dda7c95904f86864cadeaf2b0e58569d..2654291a9fd4c6eafa6adafa6135c775e0d69833 100644 GIT binary patch literal 372 zcmV-)0gL`0iwFP!000000HspVPQxG+{TH4z!DVaJZM>NojnN0Q8DE;Fq4cgJQfPo~ zsZ0M|%F3obm?b{JJ@=f$fy*APxqxg7+HfTixt+$*45JO@;sm1cC>l-T@l_6Vd|k}h zbe`WJj5Mr4gJfVb4I-~!|c8nnNXBcRkX;Y zk{MG4m!(^zv6^yedyYe}VH!XVRfmU=|FHnsRA=jZPWR2BuQik|g1;rX)0(!)!8+|a zTu=+glRls&i8BJdr6R>z?Se%B$afXny$MWC!|VEzvr2Os#IL)GqlohLg{ zO||8g`x`bxhQZ)9v=+Ye+br;tN!t5pRfFTh4~HW-XnwK^ZY-C~iuM^=C=hW^2JTUC S(EGpZrQH`FU6-CT0{{Th8?w{@ literal 372 zcmV-)0gL`0iwFP!000000HspVYQrED{TH7$aHVZZT6`NF8)JJI?bu5Q!J6AtW;Cd2 z$wK~p#>RHt!zk=Y@44q3k6v4}cencXqA$fI~R zUTmTmVWeRT8YBagX%JaO?aqJTgfNS=pH@~T*EZk|@lU9`AO@in==GQ*Sx@+cIw@$8S7 z8PClS)g2g3UDD1?i>b0w5~MYVwCFe}zm>G?2e|rpSO&_v0_Oj~Y4CyHIaD34*LkuN zRa9GUxxZmOWEc$ILTl+ezs&+anWVjsRuwotTsWM-L37Ev{V#snw9xUH(!7n1PDs@>m1z${D01u@@EUAn zo2nG$zxO(UkOaEIp4RdL9N&9=?(w;aH$>6}kbsONhezOdzrK!_z}mzk;XSNLFd;%( z&<`{rQ^c6!z+40gHoMfH>m~5(VgaMTLAdg1IG_f`R{!Z zRIk0IGQtWa5gAF_X{R1Yer}48ELm|dp6AEr{@r%}UJ=qeRrl}L|VJlo+veEX( zORAjpHxa^>9%BP<20Qf zbNODVl1#ZsC3#L9iGs`ll^Th6;%XE@(&Rgu@!wpU?WeP+)e&E2V-Ut7fv|SQhwjdi zH}2%T2@UwfB;s#*1-$>-{VA}R_DD#v)1>bRFhNUYA(?` zH+WXYU5DJCMzhgsx4Wzpbs9A4MUCzizijlm&l`TT-(@U{nBVVYH>N*#RL)?aR+78? z4auzU`YA3}K?%$)jSSdaw~*S-(j@3#w%UFZ`QRA59(m;=w=N&&#jzzQu%Rmw)5Ji! dFZmxadM(f)sneHQt5H~V@iTJbe$31Y001twM6v(? literal 647 zcmV;20(ku&iwFP!000000NqwmZ`v>r{x5#ow9r9FX~^5?=!R5HQkw1YikZWN zq(I|J}gQWXy(nNx~DvrO1hSXoZ*)= zR#WVvzV%aVn9_jYRjR1%1(|1<29WhI4Kw5uTNEZ1+~DFTt?BA^LvWI1PPm7Sa81cZ zn;*B=iY?8%FE^L-ZuoxkcbW}?O?uA|D`-_oO_xamx^d{&!$HRn2fHPYu(?QiC1j8) zc88p5K~o2g#S;c}{fl0>pLYE#_oD`RFUV%nJCHZQE8BPE4D@zBu&4e8~@Fw`F=WoUL7!IJ_m6kQivOe zyl?LUdF56vn$UpHOn$iK6Wv|vTXXcju1!^&njY={LmfQB|4ohrigns+H1B<OZnX|-EE@3zA*;4Rt<{Z_#H(CYi0pcPD$V0zv;@3DTEUz&d3P&tEv zT1xKn*ChA8>!-Y1MHR5{G%{dw-9c(wPm?IT==i-h^1(THIkN8}H#Q#^#jzzQv7sxH h(9}S-EBPNWdMVH$nbVhAt5I0A@gFI@7D~?w004o|L7M;o diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc new file mode 100644 index 0000000000000000000000000000000000000000..a4b13f78f85ea891e2fa7aecc6b2add89427b83c GIT binary patch literal 12 TcmYc;N@ieSU}D(e=gW#P literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98.crc deleted file mode 100644 index 95a7bb76c80838d69905252270d9189e2834e14c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}Bi#cfbPx5~2ew diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 new file mode 100644 index 0000000000000000000000000000000000000000..1d5c3980168f154018f031c1c8b8b1b40a775acd GIT binary patch literal 60 zcmY#qU|^5}VvVi(e-)%IGB7Z*Bxe*E-kD|!rhiQ{WdzDP*fTOR@B$ei09MKX6b1m5 C%?w}w literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98 b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-2d30884d-a682-4d9e-9214-4bf4b5156c98 deleted file mode 100644 index 2fb6ca9cbf23ad4487a8a16ef41dbedd791e8324..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 47 tcmY#lU|^5~;)#qb$r(k4ccz(gFnnPAHO-U}C

C>=_vuSRrf%1_0fS2iX7s From 677d4e20eaeb8ef69fb71f19ff0acc964c99a53d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 23 May 2024 10:04:25 -0600 Subject: [PATCH 6/7] print lint rule (#791) --- pyproject.toml | 1 - ...write_cached_reference_dataset_query_ht.py | 93 ------------------- v03_pipeline/lib/misc/sample_ids.py | 12 ++- 3 files changed, 8 insertions(+), 98 deletions(-) delete mode 100755 v03_pipeline/bin/write_cached_reference_dataset_query_ht.py diff --git a/pyproject.toml b/pyproject.toml index 59bec645f..adc5d947a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,7 +41,6 @@ ignore = [ "FBT", # flake-boolean-trap... disallows boolean args to functions... fixing this code will require refactors. "ANN", # flake8-annotations is for typed code "DJ", # django specific - "T20", # forbids print, we print quite a bit "PYI", # pyi is typing stub files "PT", # pytest specific "PTH", # pathlib is preferred, but we're not using it yet diff --git a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py b/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py deleted file mode 100755 index 5bf5c956f..000000000 --- a/v03_pipeline/bin/write_cached_reference_dataset_query_ht.py +++ /dev/null @@ -1,93 +0,0 @@ -#!/usr/bin/env python3 -import argparse - -import hail as hl - -from v03_pipeline.lib.misc.io import write -from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, -) -from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, - valid_reference_dataset_collection_path, -) -from v03_pipeline.lib.reference_data.config import CONFIG -from v03_pipeline.lib.reference_data.dataset_table_operations import ( - import_ht_from_config_path, -) - - -def get_ht( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - query: CachedReferenceDatasetQuery, -) -> hl.Table: - # If the query is defined over an uncombined reference dataset, use the combiner config. - if query.query_raw_dataset: - config = CONFIG[query.dataset(dataset_type)][reference_genome.v02_value] - return import_ht_from_config_path( - config, - query.dataset(dataset_type), - reference_genome, - ) - return hl.read_table( - valid_reference_dataset_collection_path( - reference_genome, - dataset_type, - ReferenceDatasetCollection.COMBINED, - ), - ) - - -def run( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - query: CachedReferenceDatasetQuery, -): - ht = get_ht(dataset_type, reference_genome, query) - ht = query.query(ht, dataset_type=dataset_type, reference_genome=reference_genome) - destination_path = valid_cached_reference_dataset_query_path( - reference_genome, - dataset_type, - query, - ) - print(f'Uploading ht to {destination_path}') - write(ht, destination_path) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--reference-genome', - type=ReferenceGenome, - choices=list(ReferenceGenome), - default=ReferenceGenome.GRCh38, - ) - parser.add_argument( - '--dataset-type', - type=DatasetType, - choices=list(DatasetType), - default=None, - help='When used, update the passed dataset, otherwise run all datasets.', - ) - parser.add_argument( - '--query', - type=CachedReferenceDatasetQuery, - choices=list(CachedReferenceDatasetQuery), - required=True, - ) - args, _ = parser.parse_known_args() - if ( - args.query - and args.query - not in CachedReferenceDatasetQuery.for_reference_genome_dataset_type( - args.reference_genome, - args.dataset_type, - ) - ): - msg = f'{args.query} is not a valid query for {DatasetType}' - raise ValueError(msg) - run(args.dataset_type, args.reference_genome, args.query) diff --git a/v03_pipeline/lib/misc/sample_ids.py b/v03_pipeline/lib/misc/sample_ids.py index d2174fe50..ca5407b5e 100644 --- a/v03_pipeline/lib/misc/sample_ids.py +++ b/v03_pipeline/lib/misc/sample_ids.py @@ -2,6 +2,10 @@ import hail as hl +from v03_pipeline.lib.logger import get_logger + +logger = get_logger(__name__) + class MatrixTableSampleSetError(Exception): def __init__(self, message, missing_samples): @@ -42,7 +46,7 @@ def remap_sample_ids( f'All callset sample IDs:{mt.s.collect()}' ) if ignore_missing_samples_when_remapping: - print(message) + logger.info(message) else: raise MatrixTableSampleSetError(message, missing_samples) @@ -50,7 +54,7 @@ def remap_sample_ids( remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id) mt = mt.annotate_cols(seqr_id=remap_expr, vcf_id=mt.s) mt = mt.key_cols_by(s=mt.seqr_id) - print(f'Remapped {remap_count} sample ids...') + logger.info(f'Remapped {remap_count} sample ids...') return mt @@ -77,9 +81,9 @@ def subset_samples( if ( subset_count > anti_join_ht_count ) and ignore_missing_samples_when_subsetting: - print(message) + logger.info(message) else: raise MatrixTableSampleSetError(message, missing_samples) - print(f'Subsetted to {subset_count} sample ids') + logger.info(f'Subsetted to {subset_count} sample ids') mt = mt.semi_join_cols(sample_subset_ht) return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT))) From a41935a54d40cd71043fc737953205834d7f1215 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Thu, 23 May 2024 16:59:33 -0400 Subject: [PATCH 7/7] tiny ar bug (#792) --- v03_pipeline/lib/misc/allele_registry.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/v03_pipeline/lib/misc/allele_registry.py b/v03_pipeline/lib/misc/allele_registry.py index 5d480a5bb..24abd7405 100644 --- a/v03_pipeline/lib/misc/allele_registry.py +++ b/v03_pipeline/lib/misc/allele_registry.py @@ -195,10 +195,11 @@ def handle_api_response( logger.info( f'{len(response) - len(errors)} out of {len(response)} variants returned CAID(s)', ) - logger.info( - f'{len(unmappable_variants)} registered variant(s) cannot be mapped back to ours. ' - f'\nFirst unmappable variant:\n{unmappable_variants[0]}', - ) + if unmappable_variants: + logger.info( + f'{len(unmappable_variants)} registered variant(s) cannot be mapped back to ours. ' + f'\nFirst unmappable variant:\n{unmappable_variants[0]}', + ) if errors: logger.warning( f'{len(errors)} failed. First error: {errors[0]}',