From 799ff8fa62da3fb6fea435940df2f614352b1d20 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Tue, 22 Oct 2024 22:23:52 -0400 Subject: [PATCH 01/10] add task to write relatedness check to tsv (#930) * add task to write relatedness check to tsv * fix requirements * relatedness_check_table_path --- v03_pipeline/lib/paths.py | 16 ++++++ .../lib/tasks/write_relatedness_check_tsv.py | 29 ++++++++++ .../tasks/write_relatedness_check_tsv_test.py | 53 +++++++++++++++++++ .../write_remapped_and_subsetted_callset.py | 19 +++++-- 4 files changed, 112 insertions(+), 5 deletions(-) create mode 100644 v03_pipeline/lib/tasks/write_relatedness_check_tsv.py create mode 100644 v03_pipeline/lib/tasks/write_relatedness_check_tsv_test.py diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 2295951f3..0ae158866 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -187,6 +187,22 @@ def relatedness_check_table_path( ) +def relatedness_check_tsv_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + callset_path: str, +) -> str: + return os.path.join( + _pipeline_prefix( + Env.LOADING_DATASETS_DIR, + reference_genome, + dataset_type, + ), + 'relatedness_check', + f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv', + ) + + def remapped_and_subsetted_callset_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_tsv.py b/v03_pipeline/lib/tasks/write_relatedness_check_tsv.py new file mode 100644 index 000000000..bfe303a4a --- /dev/null +++ b/v03_pipeline/lib/tasks/write_relatedness_check_tsv.py @@ -0,0 +1,29 @@ +import hail as hl +import luigi +import luigi.util + +from v03_pipeline.lib.paths import relatedness_check_tsv_path +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams +from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.write_relatedness_check_table import ( + WriteRelatednessCheckTableTask, +) + + +@luigi.util.inherits(BaseLoadingRunParams) +class WriteRelatednessCheckTsvTask(luigi.Task): + def output(self) -> luigi.Target: + return GCSorLocalTarget( + relatedness_check_tsv_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) + + def requires(self): + return [self.clone(WriteRelatednessCheckTableTask)] + + def run(self): + ht = hl.read_table(self.input()[0].path) + ht.export(self.output().path) diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_tsv_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_tsv_test.py new file mode 100644 index 000000000..49f174340 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_relatedness_check_tsv_test.py @@ -0,0 +1,53 @@ +import shutil + +import luigi.worker + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.paths import relatedness_check_table_path +from v03_pipeline.lib.tasks.write_relatedness_check_tsv import ( + WriteRelatednessCheckTsvTask, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +TEST_RELATEDNESS_CHECK_1 = ( + 'v03_pipeline/var/test/relatedness_check/test_relatedness_check_1.ht' +) +TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' +TEST_RUN_ID = 'manual__2024-04-03' + + +class WriteRelatednessCheckTsvTaskTest(MockedDatarootTestCase): + def setUp(self) -> None: + super().setUp() + shutil.copytree( + TEST_RELATEDNESS_CHECK_1, + relatedness_check_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_VCF, + ), + ) + + def test_write_relatedness_check_tsv_task( + self, + ) -> None: + worker = luigi.worker.Worker() + task = WriteRelatednessCheckTsvTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + callset_path=TEST_VCF, + run_id=TEST_RUN_ID, + sample_type=SampleType.WES, + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + with task.output().open('r') as f: + lines = f.readlines() + expected_lines = [ + 'i\tj\tibd0\tibd1\tibd2\tpi_hat\n', + 'HG00731_1\tHG00733_1\t0\t1\t0\t5.0000e-01\n', + 'HG00732_1\tHG00733_1\t0\t1\t0\t5.0000e-01\n', + ] + for expected_line, actual_line in zip(expected_lines, lines, strict=False): + self.assertEqual(expected_line, actual_line) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index e3e0a0e4f..bca068b66 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -17,13 +17,16 @@ from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples from v03_pipeline.lib.model.environment import Env -from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path +from v03_pipeline.lib.paths import ( + relatedness_check_table_path, + remapped_and_subsetted_callset_path, +) from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask from v03_pipeline.lib.tasks.validate_callset import ValidateCallsetTask -from v03_pipeline.lib.tasks.write_relatedness_check_table import ( - WriteRelatednessCheckTableTask, +from v03_pipeline.lib.tasks.write_relatedness_check_tsv import ( + WriteRelatednessCheckTsvTask, ) from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask @@ -67,7 +70,7 @@ def requires(self) -> list[luigi.Task]: ): requirements = [ *requirements, - self.clone(WriteRelatednessCheckTableTask), + self.clone(WriteRelatednessCheckTsvTask), self.clone(WriteSexCheckTableTask), ] return requirements @@ -101,7 +104,13 @@ def create_table(self) -> hl.MatrixTable: and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): - relatedness_check_ht = hl.read_table(self.input()[2].path) + relatedness_check_ht = hl.read_table( + relatedness_check_table_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) sex_check_ht = hl.read_table(self.input()[3].path) families_failed_relatedness_check = get_families_failed_relatedness_check( families - families_failed_missing_samples.keys(), From b899cc01e9f8fa88afb7349c8c3c0eb62a7a97f1 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Fri, 25 Oct 2024 13:54:33 -0400 Subject: [PATCH 02/10] add relatedness check file path to metadata.json --- v03_pipeline/lib/tasks/write_metadata_for_run.py | 10 +++++++++- v03_pipeline/lib/tasks/write_metadata_for_run_test.py | 6 ++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index 3432dd891..a41daeaba 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -4,7 +4,10 @@ import luigi import luigi.util -from v03_pipeline.lib.paths import metadata_for_run_path +from v03_pipeline.lib.paths import ( + metadata_for_run_path, + relatedness_check_tsv_path, +) from v03_pipeline.lib.tasks.base.base_project_info_params import ( BaseLoadingRunWithProjectInfoParams, ) @@ -52,6 +55,11 @@ def run(self) -> None: 'relatedness_check': {}, 'sex_check': {}, }, + 'relatedness_check_file_path': relatedness_check_tsv_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), } for remapped_and_subsetted_callset in self.input(): callset_mt = hl.read_matrix_table(remapped_and_subsetted_callset.path) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index f5d733a79..cb226de8d 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -3,6 +3,7 @@ import luigi.worker from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.paths import relatedness_check_tsv_path from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase @@ -70,5 +71,10 @@ def test_write_metadata_for_run_task(self) -> None: }, 'run_id': 'run_123456', 'sample_type': SampleType.WGS.value, + 'relatedness_check_file_path': relatedness_check_tsv_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_VCF, + ), }, ) From ed4364fa9a79589962e977b368ba249908242ac5 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 28 Oct 2024 10:50:27 -0400 Subject: [PATCH 03/10] Benb/use metadata as source of family table load (#936) * use run metadata as source of family table load * ruff --- .../lib/tasks/write_project_family_tables.py | 44 ++++++------ .../tasks/write_project_family_tables_test.py | 68 +++++++++++++++++-- .../test/pedigrees/test_pedigree_4_subset.tsv | 2 + 3 files changed, 86 insertions(+), 28 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables.py b/v03_pipeline/lib/tasks/write_project_family_tables.py index f9b7df74f..4813c738b 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables.py @@ -2,12 +2,13 @@ import luigi import luigi.util -from v03_pipeline.lib.misc.io import import_pedigree -from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families +from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams -from v03_pipeline.lib.tasks.files import RawFileTask from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask +from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( + WriteRemappedAndSubsettedCallsetTask, +) @luigi.util.inherits(BaseLoadingRunParams) @@ -26,27 +27,26 @@ def complete(self) -> bool: for write_family_table_task in self.dynamic_write_family_table_tasks ) - def run(self): - # https://luigi.readthedocs.io/en/stable/tasks.html#dynamic-dependencies - # Fetch family guids from project table - update_project_table_task: luigi.Target = yield self.clone( - UpdateProjectTableTask, - ) - project_ht = hl.read_table(update_project_table_task.path) - family_guids_in_project_table = set(hl.eval(project_ht.globals.family_guids)) + def requires(self) -> list[luigi.Task]: + return [ + self.clone( + WriteRemappedAndSubsettedCallsetTask, + ), + self.clone( + UpdateProjectTableTask, + ), + ] - # Fetch family guids from pedigree - pedigree_ht_task: luigi.Target = yield RawFileTask(self.project_pedigree_path) - pedigree_ht = import_pedigree(pedigree_ht_task.path) - families_guids_in_pedigree = { - f.family_guid for f in parse_pedigree_ht_to_families(pedigree_ht) - } - - # Intersect them - family_guids_to_load = ( - family_guids_in_project_table & families_guids_in_pedigree + def run(self): + ht = hl.read_matrix_table( + remapped_and_subsetted_callset_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + self.project_guid, + ), ) - for family_guid in family_guids_to_load: + for family_guid in set(hl.eval(ht.globals.family_samples).keys()): self.dynamic_write_family_table_tasks.add( self.clone(WriteFamilyTableTask, family_guid=family_guid), ) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 3d23e9b60..5d0194b43 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -2,7 +2,10 @@ import luigi.worker from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType -from v03_pipeline.lib.paths import project_table_path +from v03_pipeline.lib.paths import ( + project_table_path, + remapped_and_subsetted_callset_path, +) from v03_pipeline.lib.tasks.write_project_family_tables import ( WriteProjectFamilyTablesTask, ) @@ -38,6 +41,33 @@ def test_snv_write_project_family_tables_task(self) -> None: hl.read_table(write_family_table_task.output().path) for write_family_table_task in write_project_family_tables.dynamic_write_family_table_tasks ] + # Validate remapped and subsetted callset families + remapped_and_subsetted_callset = hl.read_matrix_table( + remapped_and_subsetted_callset_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_SNV_INDEL_VCF, + 'R0113_test_project', + ), + ) + self.assertCountEqual( + hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()), + { + '123_1', + '234_1', + '345_1', + '456_1', + '567_1', + '678_1', + '789_1', + '890_1', + '901_1', + 'bcd_1', + 'cde_1', + 'def_1', + 'efg_1', + }, + ) self.assertCountEqual( [ht.globals.sample_ids.collect() for ht in hts], [ @@ -73,13 +103,39 @@ def test_snv_write_project_family_tables_task(self) -> None: worker.run() self.assertTrue(write_project_family_tables_subset.complete()) hts = [ - hl.read_table(write_family_table_task.output().path) + write_family_table_task.output().path for write_family_table_task in write_project_family_tables_subset.dynamic_write_family_table_tasks ] - # Only one family table written - self.assertEqual( - len(hts), - 1, + self.assertTrue(len(hts)) + self.assertTrue( + '123_1' in hts[0], + ) + # Validate remapped and subsetted callset families + # (and that it was re-written) + remapped_and_subsetted_callset = hl.read_matrix_table( + remapped_and_subsetted_callset_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_SNV_INDEL_VCF, + 'R0113_test_project', + ), + ) + self.assertCountEqual( + hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()), + {'123_1'}, + ) + self.assertCountEqual( + hl.eval(remapped_and_subsetted_callset.globals.failed_family_samples), + hl.Struct( + missing_samples={ + '234_1': { + 'reasons': ["Missing samples: {'NA19678_999'}"], + 'samples': ['NA19678_1', 'NA19678_999'], + }, + }, + relatedness_check={}, + sex_check={}, + ), ) # Project table still contains all family guids self.assertCountEqual( diff --git a/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv b/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv index 63e2addd8..dc022f159 100644 --- a/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv +++ b/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv @@ -1,2 +1,4 @@ Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex R0114_project4 123_1 123 NA19675_1 F +R0114_project4 234_1 234 NA19678_1 M +R0114_project4 234_1 234 NA19678_999 F From 5930d1ca6a2fc90e011213bdbc8d1ad5d3cba723 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 31 Oct 2024 16:10:21 -0400 Subject: [PATCH 04/10] Support gcs dirs in rsync (#932) * Support gcs dirs in rsync * ws --- v03_pipeline/bin/rsync_reference_data.bash | 32 ++++++++++++++++++---- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/bin/rsync_reference_data.bash b/v03_pipeline/bin/rsync_reference_data.bash index 9dfc91d74..825c583e5 100755 --- a/v03_pipeline/bin/rsync_reference_data.bash +++ b/v03_pipeline/bin/rsync_reference_data.bash @@ -16,12 +16,34 @@ case $REFERENCE_GENOME in exit 1 esac -mkdir -p $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME; +case $REFERENCE_DATASETS_DIR in + "gs://seqr-reference-data") + echo "Cannot rsync to the authoritative source" + exit 1 + ;; + *) + ;; +esac -if [ -f "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS ]; then - echo "Skipping rsync because already successful" - exit 0; +if ! [[ "$REFERENCE_DATASETS_DIR" =~ gs://* ]]; then + mkdir -p $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME; + if [ -f "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS ]; then + echo "Skipping rsync because already successful" + exit 0; + fi +else + result=$(gsutil -q stat "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS || echo 1) + if [[ $result != 1 ]]; then + echo "Skipping rsync because already successful" + exit 0; + fi fi gsutil -m rsync -rd "gs://seqr-reference-data/v03/$REFERENCE_GENOME" $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME -touch "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS +if ! [[ $REFERENCE_DATASETS_DIR =~ gs://* ]]; then + touch "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS +else + touch _SUCCESS + gsutil cp _SUCCESS "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS + rm -rf _SUCCESS +fi From a515c2cf4eb4668dd65442be80ed72541b6cb13e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 31 Oct 2024 16:30:21 -0400 Subject: [PATCH 05/10] Gencode refactor to remove gcs (#934) * Gencode refactor to remove gcs * Fix --- pyproject.toml | 1 - requirements.in | 1 - requirements.txt | 193 +++++++++--------- .../reference_data/gencode/download_utils.py | 112 ---------- .../gencode/download_utils_test.py | 130 ------------ .../gencode/mapping_gene_ids.py | 125 ++---------- .../gencode/mapping_gene_ids_tests.py | 181 ++++------------ .../lib/tasks/write_new_variants_table.py | 2 +- 8 files changed, 149 insertions(+), 596 deletions(-) delete mode 100644 v03_pipeline/lib/reference_data/gencode/download_utils.py delete mode 100644 v03_pipeline/lib/reference_data/gencode/download_utils_test.py diff --git a/pyproject.toml b/pyproject.toml index 73826b005..2ffbdfe04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ extend-exclude = [ 'luigi_pipeline/lib/*', 'luigi_pipeline/seqr*.py', 'luigi_pipeline/tests/data/*', - 'v03_pipeline/lib/reference_data/gencode/*', ] ignore = [ # Individual Rules diff --git a/requirements.in b/requirements.in index 3cff99f79..af19fd655 100644 --- a/requirements.in +++ b/requirements.in @@ -3,6 +3,5 @@ google-api-python-client>=1.8.0 hail==0.2.132 luigi>=3.4.0 gnomad==0.6.4 -google-cloud-storage>=2.14.0 aiofiles==24.1.0 pydantic==2.8.2 diff --git a/requirements.txt b/requirements.txt index 4083931f8..a565a7c41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,9 +8,9 @@ aiodns==2.0.0 # via hail aiofiles==24.1.0 # via -r requirements.in -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.4.3 # via aiohttp -aiohttp==3.10.2 +aiohttp==3.10.10 # via # hail # slackclient @@ -22,122 +22,105 @@ asttokens==2.4.1 # via stack-data async-timeout==4.0.3 # via aiohttp -attrs==23.1.0 +attrs==24.2.0 # via aiohttp avro==1.11.3 # via hail azure-common==1.1.28 # via azure-mgmt-storage -azure-core==1.29.5 +azure-core==1.31.0 # via # azure-identity # azure-mgmt-core # azure-storage-blob # msrest -azure-identity==1.16.1 +azure-identity==1.19.0 # via hail azure-mgmt-core==1.4.0 # via azure-mgmt-storage azure-mgmt-storage==20.1.0 # via hail -azure-storage-blob==12.19.0 +azure-storage-blob==12.23.1 # via hail -bokeh==3.3.1 +bokeh==3.3.4 # via hail -boto3==1.33.1 +boto3==1.35.48 # via hail -botocore==1.33.1 +botocore==1.35.48 # via # boto3 # hail # s3transfer -cachetools==5.3.2 +cachetools==5.5.0 # via google-auth -certifi==2023.11.17 +certifi==2024.8.30 # via # elasticsearch # msrest # requests -cffi==1.16.0 +cffi==1.17.1 # via # cryptography # pycares -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via requests click==8.1.7 # via typer -comm==0.2.0 +comm==0.2.2 # via ipywidgets commonmark==0.9.1 # via rich -contourpy==1.2.0 +contourpy==1.3.0 # via bokeh -cryptography==43.0.1 +cryptography==43.0.3 # via # azure-identity # azure-storage-blob # msal # pyjwt -cython==0.29.36 - # via hdbscan decorator==4.4.2 # via # hail # ipython deprecated==1.2.14 # via hail -dill==0.3.7 +dill==0.3.9 # via hail -docutils==0.20.1 +docutils==0.21.2 # via python-daemon elasticsearch==7.9.1 # via -r requirements.in -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 # via ipython -executing==2.0.1 +executing==2.1.0 # via stack-data -frozenlist==1.4.0 +frozenlist==1.5.0 # via # aiohttp # aiosignal # hail gnomad==0.6.4 # via -r requirements.in -google-api-core==2.14.0 - # via - # google-api-python-client - # google-cloud-core - # google-cloud-storage -google-api-python-client==2.108.0 +google-api-core==2.21.0 + # via google-api-python-client +google-api-python-client==2.149.0 # via -r requirements.in -google-auth==2.23.4 +google-auth==2.35.0 # via # google-api-core # google-api-python-client # google-auth-httplib2 # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage # hail -google-auth-httplib2==0.1.1 +google-auth-httplib2==0.2.0 # via google-api-python-client google-auth-oauthlib==0.8.0 # via hail -google-cloud-core==2.4.1 - # via google-cloud-storage -google-cloud-storage==2.14.0 - # via -r requirements.in -google-crc32c==1.5.0 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.0 - # via google-cloud-storage -googleapis-common-protos==1.61.0 +googleapis-common-protos==1.65.0 # via google-api-core hail==0.2.132 # via -r requirements.in -hdbscan==0.8.33 +hdbscan==0.8.39 # via gnomad httplib2==0.22.0 # via @@ -145,15 +128,15 @@ httplib2==0.22.0 # google-auth-httplib2 humanize==1.1.0 # via hail -idna==3.6 +idna==3.10 # via # requests # yarl -ipython==8.18.1 +ipython==8.28.0 # via ipywidgets -ipywidgets==8.1.1 +ipywidgets==8.1.5 # via gnomad -isodate==0.6.1 +isodate==0.7.2 # via # azure-storage-blob # msrest @@ -161,43 +144,43 @@ janus==1.0.0 # via hail jedi==0.19.1 # via ipython -jinja2==3.1.3 +jinja2==3.1.4 # via bokeh jmespath==1.0.1 # via # boto3 # botocore -joblib==1.3.2 +joblib==1.4.2 # via # hdbscan # scikit-learn -jproperties==2.1.1 +jproperties==2.1.2 # via hail -jupyterlab-widgets==3.0.9 +jupyterlab-widgets==3.0.13 # via ipywidgets lockfile==0.12.2 # via python-daemon -luigi==3.4.0 +luigi==3.5.2 # via -r requirements.in -markupsafe==2.1.3 +markupsafe==3.0.2 # via jinja2 -matplotlib-inline==0.1.6 +matplotlib-inline==0.1.7 # via ipython -msal==1.28.0 +msal==1.31.0 # via # azure-identity # msal-extensions -msal-extensions==1.0.0 +msal-extensions==1.2.0 # via azure-identity msrest==0.7.1 # via azure-mgmt-storage -multidict==6.0.4 +multidict==6.1.0 # via # aiohttp # yarl -nest-asyncio==1.5.8 +nest-asyncio==1.6.0 # via hail -numpy==1.26.2 +numpy==1.26.4 # via # bokeh # contourpy @@ -208,102 +191,108 @@ numpy==1.26.2 # scipy oauthlib==3.2.2 # via requests-oauthlib -orjson==3.10.6 +orjson==3.10.10 # via hail -packaging==23.2 +packaging==24.1 # via # bokeh # plotly -pandas==2.1.3 +pandas==2.2.3 # via # bokeh # hail parsimonious==0.10.0 # via hail -parso==0.8.3 +parso==0.8.4 # via jedi pexpect==4.9.0 # via ipython -pillow==10.3.0 +pillow==11.0.0 # via bokeh -plotly==5.18.0 +plotly==5.24.1 # via hail -portalocker==2.8.2 +portalocker==2.10.1 # via msal-extensions -prompt-toolkit==3.0.41 +prompt-toolkit==3.0.48 # via ipython +propcache==0.2.0 + # via yarl +proto-plus==1.25.0 + # via google-api-core protobuf==3.20.2 # via # google-api-core # googleapis-common-protos # hail + # proto-plus ptyprocess==0.7.0 # via pexpect -pure-eval==0.2.2 +pure-eval==0.2.3 # via stack-data py4j==0.10.9.7 # via pyspark -pyasn1==0.5.1 +pyasn1==0.6.1 # via # pyasn1-modules # rsa -pyasn1-modules==0.3.0 +pyasn1-modules==0.4.1 # via google-auth pycares==4.4.0 # via aiodns -pycparser==2.21 +pycparser==2.22 # via cffi pydantic==2.8.2 # via -r requirements.in pydantic-core==2.20.1 # via pydantic -pygments==2.17.2 +pygments==2.18.0 # via # ipython # rich -pyjwt[crypto]==2.8.0 +pyjwt[crypto]==2.9.0 # via msal -pyparsing==3.1.1 +pyparsing==3.2.0 # via httplib2 -pyspark==3.5.1 +pyspark==3.5.3 # via hail python-daemon==3.0.1 # via luigi -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # botocore # luigi # pandas python-json-logger==2.0.7 # via hail -pytz==2023.3.post1 +pytz==2024.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # bokeh # hail -regex==2023.10.3 +regex==2024.9.11 # via parsimonious -requests==2.31.0 +requests==2.32.3 # via # azure-core # google-api-core - # google-cloud-storage # hail # msal # msrest # requests-oauthlib -requests-oauthlib==1.3.1 +requests-oauthlib==2.0.0 # via # google-auth-oauthlib # msrest rich==12.6.0 - # via hail + # via + # hail + # typer rsa==4.9 # via google-auth -s3transfer==0.8.0 +s3transfer==0.10.3 # via boto3 -scikit-learn==1.5.0 +scikit-learn==1.5.2 # via # gnomad # hdbscan @@ -312,11 +301,12 @@ scipy==1.11.4 # hail # hdbscan # scikit-learn +shellingham==1.5.4 + # via typer six==1.16.0 # via # asttokens # azure-core - # isodate # jproperties # python-dateutil slackclient==2.5.0 @@ -327,52 +317,55 @@ stack-data==0.6.3 # via ipython tabulate==0.9.0 # via hail -tenacity==8.2.3 +tenacity==8.5.0 # via # luigi # plotly -threadpoolctl==3.2.0 +threadpoolctl==3.5.0 # via scikit-learn tornado==6.4.1 # via # bokeh # luigi -traitlets==5.14.0 +traitlets==5.14.3 # via # comm # ipython # ipywidgets # matplotlib-inline -typer==0.9.0 +typer==0.12.5 # via hail -typing-extensions==4.8.0 +typing-extensions==4.12.2 # via # azure-core + # azure-identity # azure-storage-blob + # ipython # janus + # multidict # pydantic # pydantic-core # typer -tzdata==2023.3 +tzdata==2024.2 # via pandas uritemplate==4.1.1 # via google-api-python-client -urllib3==2.0.7 +urllib3==2.2.3 # via # botocore # elasticsearch # requests -uvloop==0.19.0 +uvloop==0.21.0 # via hail -wcwidth==0.2.12 +wcwidth==0.2.13 # via prompt-toolkit -widgetsnbextension==4.0.9 +widgetsnbextension==4.0.13 # via ipywidgets wrapt==1.16.0 # via deprecated -xyzservices==2023.10.1 +xyzservices==2024.9.0 # via bokeh -yarl==1.9.3 +yarl==1.16.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/v03_pipeline/lib/reference_data/gencode/download_utils.py b/v03_pipeline/lib/reference_data/gencode/download_utils.py deleted file mode 100644 index 420e860b9..000000000 --- a/v03_pipeline/lib/reference_data/gencode/download_utils.py +++ /dev/null @@ -1,112 +0,0 @@ -import logging -import os -import tempfile -from contextlib import contextmanager - -import hail as hl -import requests -from google.cloud import storage - -logger = logging.getLogger(__name__) - - -def parse_gs_path_to_bucket(gs_path): - bucket_name = gs_path.replace('gs://', '').split('/')[0] - file_name = gs_path.split(bucket_name)[-1].lstrip('/') - - storage_client = storage.Client() - bucket = storage_client.bucket(bucket_name) - - return bucket, file_name - - -def stream_gs_file(gs_path, raw_download=False): - logger.info(f'Stream from GCS: {gs_path}') - bucket, file_name = parse_gs_path_to_bucket(gs_path) - - blob = bucket.get_blob(file_name) - - return blob and blob.download_as_string(raw_download=raw_download) - - -@contextmanager -def file_writer(file_path, get_existing_size=False): - bucket = None - size = None - if is_gs_path(file_path): - local_file_path = os.path.join( - tempfile.gettempdir(), - os.path.basename(file_path), - ) - bucket, file_name = parse_gs_path_to_bucket(file_path) - if get_existing_size: - blob = bucket.get_blob(file_name) - size = blob and blob.size - else: - local_file_path = file_path - if get_existing_size: - size = os.path.isfile(local_file_path) and os.path.getsize(local_file_path) - - local_file = open(local_file_path, 'wb') - - yield local_file, size - - local_file.close() - - if bucket: - blob = bucket.blob(file_name) - blob.upload_from_filename(local_file_path) - - -def is_gs_path(path): - return path.startswith('gs://') - - -def path_exists(path): - is_gs = is_gs_path(path) - return (is_gs and hl.hadoop_exists(path)) or (not is_gs and os.path.exists(path)) - - -DEFAULT_TO_DIR = tempfile.gettempdir() - - -def download_file(url, to_dir=None, verbose=True): - """Download the given file and returns its local path. - Args: - url (string): HTTP or FTP url - to_dir: optional save to directory - verbose: display verbose information - Returns: - string: local file path - """ - if to_dir is None: - to_dir = DEFAULT_TO_DIR - - if not (url and url.startswith(('http://', 'https://'))): - msg = f'Invalid url: {url}' - raise ValueError(msg) - remote_file_size = _get_remote_file_size(url) - - file_path = os.path.join(to_dir, os.path.basename(url)) - with file_writer(file_path, get_existing_size=True) as fw: - f, file_size = fw - if file_size and file_size == remote_file_size: - logger.info( - f'Re-using {file_path} previously downloaded from {url}', - ) - return file_path - - is_gz = url.endswith('.gz') - response = requests.get(url, stream=is_gz) - input_iter = response if is_gz else response.iter_content() - if verbose: - logger.info(f'Downloading {url} to {file_path}') - - f.writelines(input_iter) - input_iter.close() - - return file_path - - -def _get_remote_file_size(url): - return int(requests.head(url).headers.get('Content-Length', '0')) diff --git a/v03_pipeline/lib/reference_data/gencode/download_utils_test.py b/v03_pipeline/lib/reference_data/gencode/download_utils_test.py deleted file mode 100644 index d990bf6f6..000000000 --- a/v03_pipeline/lib/reference_data/gencode/download_utils_test.py +++ /dev/null @@ -1,130 +0,0 @@ -import unittest -from unittest import mock - -import responses - -from v03_pipeline.lib.reference_data.gencode.download_utils import download_file - -DEFAULT_TEST_DIR = 'default_test/dir' -TEST_DIR = 'test/dir' -GS_TEST_DIR = 'gs://test-bucket/test/dir' -TEST_TXT_FILE = 'test_file.txt' -TEST_GZ_FILE = 'test_file.gz' -TXT_DATA_URL = 'https://mock_url/test_file.txt' -GZ_DATA_URL = 'https://mock_url/test_file.gz' -GZ_DATA = b'test data\nanother line\n' - - -class DownloadUtilsTest(unittest.TestCase): - @responses.activate - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.DEFAULT_TO_DIR', - DEFAULT_TEST_DIR, - ) - @mock.patch('v03_pipeline.lib.reference_data.gencode.download_utils.logger') - @mock.patch('v03_pipeline.lib.reference_data.gencode.download_utils.os.path.isfile') - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.os.path.getsize', - ) - @mock.patch('v03_pipeline.lib.reference_data.gencode.download_utils.open') - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.tempfile.gettempdir', - ) - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.parse_gs_path_to_bucket', - ) - def test_download_file( - self, - mock_get_bucket, - mock_gettempdir, - mock_open, - mock_getsize, - mock_isfile, - mock_logger, - ): - responses.add( - responses.HEAD, - GZ_DATA_URL, - headers={'Content-Length': '1024'}, - status=200, - body=b' ' * 1024, - ) - responses.add(responses.GET, GZ_DATA_URL, body=GZ_DATA) - responses.add( - responses.HEAD, - TXT_DATA_URL, - headers={'Content-Length': '1024'}, - status=200, - body=b' ' * 1024, - ) - responses.add(responses.GET, TXT_DATA_URL, body='test data\nanother line\n') - - # Test bad url - with self.assertRaises(ValueError) as ve: - download_file('bad_url') - self.assertEqual(str(ve.exception), 'Invalid url: bad_url') - - # Test already downloaded - mock_isfile.return_value = True - mock_getsize.return_value = 1024 - result = download_file(GZ_DATA_URL) - self.assertEqual(result, 'default_test/dir/test_file.gz') - mock_open.assert_called_with('default_test/dir/test_file.gz', 'wb') - mock_isfile.assert_called_with('default_test/dir/test_file.gz') - mock_getsize.assert_called_with('default_test/dir/test_file.gz') - mock_logger.info.assert_called_with( - f'Re-using default_test/dir/test_file.gz previously downloaded from {GZ_DATA_URL}', - ) - - # Test download, .gz file format, verbose - mock_isfile.reset_mock() - mock_getsize.reset_mock() - mock_logger.reset_mock() - mock_open.reset_mock() - mock_isfile.return_value = False - result = download_file(GZ_DATA_URL, TEST_DIR) - self.assertEqual(result, 'test/dir/test_file.gz') - mock_isfile.assert_called_with('test/dir/test_file.gz') - mock_getsize.assert_not_called() - mock_open.assert_called_with('test/dir/test_file.gz', 'wb') - mock_logger.info.assert_called_with( - f'Downloading {GZ_DATA_URL} to test/dir/test_file.gz', - ) - - # Test download, non-.gz file format, non-verbose - mock_isfile.reset_mock() - mock_logger.reset_mock() - mock_open.reset_mock() - mock_isfile.return_value = False - result = download_file(TXT_DATA_URL, TEST_DIR, verbose=False) - self.assertEqual(result, 'test/dir/test_file.txt') - mock_isfile.assert_called_with('test/dir/test_file.txt') - mock_getsize.assert_not_called() - mock_open.assert_called_with('test/dir/test_file.txt', 'wb') - mock_open.return_value.writelines.assert_called_once() - mock_logger.info.assert_not_called() - - mock_gettempdir.assert_not_called() - mock_get_bucket.assert_not_called() - - # Test using Google Storage - mock_isfile.reset_mock() - mock_logger.reset_mock() - mock_open.reset_mock() - mock_gettempdir.return_value = TEST_DIR - mock_bucket = mock.MagicMock() - mock_get_bucket.return_value = mock_bucket, 'test/dir/test_file.gz' - result = download_file(GZ_DATA_URL, GS_TEST_DIR) - self.assertEqual(result, 'gs://test-bucket/test/dir/test_file.gz') - mock_gettempdir.assert_called_once() - mock_isfile.assert_not_called() - mock_getsize.assert_not_called() - mock_open.assert_called_with('test/dir/test_file.gz', 'wb') - mock_logger.info.assert_called_with( - f'Downloading {GZ_DATA_URL} to gs://test-bucket/test/dir/test_file.gz', - ) - mock_bucket.get_blob.assert_called_with('test/dir/test_file.gz') - mock_bucket.blob.assert_called_with('test/dir/test_file.gz') - mock_bucket.blob.return_value.upload_from_filename.assert_called_with( - 'test/dir/test_file.gz', - ) diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py index fed40301f..96597f815 100644 --- a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py +++ b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py @@ -1,21 +1,8 @@ import gzip import logging -import os -import pickle import requests -from v03_pipeline.lib.reference_data.gencode.download_utils import ( - download_file, - file_writer, - is_gs_path, - path_exists, - stream_gs_file, -) - -GENOME_VERSION_GRCh37 = '37' -GENOME_VERSION_GRCh38 = '38' - logger = logging.getLogger(__name__) GENCODE_GTF_URL = 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/gencode.v{gencode_release}.annotation.gtf.gz' @@ -33,115 +20,43 @@ 'phase', 'info', ] +EXPECTED_ENSEMBLE_TO_REFSEQ_FIELDS = 3 -def _get_pickle_file(path): - root, ext = os.path.splitext(path) - return root + '.pickle' - - -def _load_parsed_data_or_download(gencode_release, download_path): - gene_id_mapping = {} +def load_gencode_gene_symbol_to_gene_id(gencode_release: int) -> dict[str, str]: url = GENCODE_GTF_URL.format(gencode_release=gencode_release) - gencode_gtf_path = os.path.join(download_path, os.path.basename(url)) - pickle_file = _get_pickle_file(gencode_gtf_path) - if path_exists(pickle_file): - logger.info( - 'Use the existing pickle file {}.\nIf you want to reload the data, please delete it and re-run the data loading.'.format( - pickle_file, - ), - ) - if is_gs_path(pickle_file): - p = pickle.loads(stream_gs_file(pickle_file)) - else: - with open(pickle_file, 'rb') as handle: - p = pickle.load(handle) - gene_id_mapping.update(p) - elif not path_exists(gencode_gtf_path): - gencode_gtf_path = download_file(url, to_dir=download_path) - logger.info(f'Downloaded to {gencode_gtf_path}') - else: - logger.info( - 'Use the existing downloaded file {}.\nIf you want to re-download it, please delete the file and re-run the pipeline.'.format( - gencode_gtf_path, - ), - ) - - return gene_id_mapping, gencode_gtf_path - - -def _parse_gtf_data(gencode_gtf_path): - gene_id_mapping = {} - logger.info(f'Loading {gencode_gtf_path}') - is_gs = is_gs_path(gencode_gtf_path) - gencode_file = ( - gzip.decompress(stream_gs_file(gencode_gtf_path, raw_download=True)) - .decode() - .split('\n') - if is_gs - else gzip.open(gencode_gtf_path, 'rt') - ) - for i, line in enumerate(gencode_file): - line = line.rstrip('\r\n') + response = requests.get(url, stream=True, timeout=10) + gene_symbol_to_gene_id = {} + for line in gzip.GzipFile(fileobj=response.raw): + line = line.decode('ascii') # noqa: PLW2901 if not line or line.startswith('#'): continue - fields = line.split('\t') - + fields = line.strip().split('\t') if len(fields) != len(GENCODE_FILE_HEADER): + msg = f'Unexpected number of fields: {fields}' raise ValueError( - 'Unexpected number of fields on line #%s: %s' % (i, fields), + msg, ) - - record = dict(zip(GENCODE_FILE_HEADER, fields)) - + record = dict(zip(GENCODE_FILE_HEADER, fields, strict=False)) if record['feature_type'] != 'gene': continue - # parse info field info_fields = [x.strip().split() for x in record['info'].split(';') if x != ''] info_fields = {k: v.strip('"') for k, v in info_fields} + gene_symbol_to_gene_id[info_fields['gene_name']] = info_fields['gene_id'].split( + '.', + )[0] + return gene_symbol_to_gene_id - gene_id_mapping[info_fields['gene_name']] = info_fields['gene_id'].split('.')[0] - - if not is_gs: - gencode_file.close() - - pickle_file = _get_pickle_file(gencode_gtf_path) - logger.info(f'Saving to pickle {pickle_file}') - with file_writer(pickle_file) as fw: - f, _ = fw - pickle.dump(gene_id_mapping, f, protocol=pickle.HIGHEST_PROTOCOL) - - return gene_id_mapping - - -def load_gencode_gene_symbol_to_gene_id(gencode_release, download_path=''): - """Load Gencode to create a gene symbols to gene ids mapping table. - - Args: - gencode_release (int): the gencode release to load (eg. 25) - download_path (str): The path for downloaded data - """ - gene_id_mapping, gencode_gtf_path = _load_parsed_data_or_download( - gencode_release, - download_path, - ) - - if not gene_id_mapping: - gene_id_mapping = _parse_gtf_data(gencode_gtf_path) - - logger.info(f'Got {len(gene_id_mapping)} gene id mapping records') - return gene_id_mapping def load_gencode_ensembl_to_refseq_id(gencode_release: int): url = GENCODE_ENSEMBL_TO_REFSEQ_URL.format(gencode_release=gencode_release) - response = requests.get(url, stream=True) + response = requests.get(url, stream=True, timeout=10) ensembl_to_refseq_ids = {} for line in gzip.GzipFile(fileobj=response.raw): - line = line.decode('ascii').strip().split('\t') - if len(line) > 3: - raise ValueError( - 'Unexpected number of fields on line in ensemble_to_refseq mapping', - ) - ensembl_to_refseq_ids[line[0].split('.')[0]] = line[1] + fields = line.decode('ascii').strip().split('\t') + if len(fields) > EXPECTED_ENSEMBLE_TO_REFSEQ_FIELDS: + msg = 'Unexpected number of fields on line in ensemble_to_refseq mapping' + raise ValueError(msg) + ensembl_to_refseq_ids[fields[0].split('.')[0]] = fields[1] return ensembl_to_refseq_ids diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py index 585278a7b..58c037048 100644 --- a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py +++ b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py @@ -1,162 +1,52 @@ import gzip import unittest -from unittest import mock import responses -from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import load_gencode_ensembl_to_refseq_id, load_gencode_gene_symbol_to_gene_id, GENCODE_ENSEMBL_TO_REFSEQ_URL +from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import ( + GENCODE_ENSEMBL_TO_REFSEQ_URL, + GENCODE_GTF_URL, + load_gencode_ensembl_to_refseq_id, + load_gencode_gene_symbol_to_gene_id, +) -DOWNLOAD_PATH = 'test/path' -GS_DOWNLOAD_PATH ='gs://test-bucket/test/path' -DOWNLOAD_FILE = 'test/path/gencode.v29.annotation.gtf.gz' -PICKLE_FILE = 'test/path/gencode.v29.annotation.gtf.pickle' -PICKLE_FILE_HANDLE = 'handle' GTF_DATA = [ - '#description: evidence-based annotation of the human genome, version 31 (Ensembl 97), mapped to GRCh37 with gencode-backmap\n', - 'chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5_2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n', - 'chr1 HAVANA gene 621059 622053 . - . gene_id "ENSG00000284662.1_2"; gene_type "protein_coding"; gene_name "OR4F16"; level 2; hgnc_id "HGNC:15079"; havana_gene "OTTHUMG00000002581.3_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n', - 'GL000193.1 HAVANA gene 77815 78162 . + . gene_id "ENSG00000279783.1_5"; gene_type "processed_pseudogene"; gene_name "AC018692.2"; level 2; havana_gene "OTTHUMG00000189459.1_5"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "new";\n', + '#description: evidence-based annotation of the human genome, version 31 (Ensembl 97), mapped to GRCh37 with gencode-backmap', + 'chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5_2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";', + 'chr1 HAVANA gene 621059 622053 . - . gene_id "ENSG00000284662.1_2"; gene_type "protein_coding"; gene_name "OR4F16"; level 2; hgnc_id "HGNC:15079"; havana_gene "OTTHUMG00000002581.3_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";', + 'GL000193.1 HAVANA gene 77815 78162 . + . gene_id "ENSG00000279783.1_5"; gene_type "processed_pseudogene"; gene_name "AC018692.2"; level 2; havana_gene "OTTHUMG00000189459.1_5"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "new";', ] -GENE_ID_MAPPING = {"DDX11L1": "ENSG00000223972", "OR4F16": "ENSG00000284662", "AC018692.2": "ENSG00000279783"} +GENE_ID_MAPPING = { + 'DDX11L1': 'ENSG00000223972', + 'OR4F16': 'ENSG00000284662', + 'AC018692.2': 'ENSG00000279783', +} -ENSEMBL_TO_REFSEQ_DATA = b'''ENST00000424215.1\tNR_121638.1 +ENSEMBL_TO_REFSEQ_DATA = b"""ENST00000424215.1\tNR_121638.1 ENST00000378391.6\tNM_199454.3\tNP_955533.2 ENST00000270722.10\tNM_022114.4\tNP_071397.3 -ENST00000288774.8\tNM_001374425.1\tNP_001361354.1''' +ENST00000288774.8\tNM_001374425.1\tNP_001361354.1""" -class LoadGencodeTestCase(unittest.TestCase): - - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.open') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip.open') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.download_file') - def test_load_gencode_local(self, mock_download_file, mock_file_writer, mock_gopen, mock_open, mock_pickle, - mock_path_exists, mock_logger): - # test using saved file - mock_path_exists.side_effect = [True] - mock_pickle.load.return_value = GENE_ID_MAPPING - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(23, download_path=DOWNLOAD_PATH) - mock_file_writer.assert_not_called() - mock_download_file.assert_not_called() - mock_gopen.assert_not_called() - mock_open.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle', 'rb') - mock_pickle.load.assert_called_with(mock_open.return_value.__enter__.return_value) - mock_path_exists.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle') - mock_logger.info.assert_has_calls([ - mock.call('Use the existing pickle file test/path/gencode.v23.annotation.gtf.pickle.\nIf you want to reload the data, please delete it and re-run the data loading.'), - mock.call('Got 3 gene id mapping records'), - ]) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - # test downloading and parsing gtf data - mock_path_exists.reset_mock() - mock_logger.reset_mock() - mock_pickle.reset_mock() - mock_open.reset_mock() - mock_path_exists.side_effect = [False, False] - mock_download_file.return_value = 'test/path/gencode.v24.annotation.gtf.gz' - mock_gopen.return_value.__iter__.return_value = GTF_DATA - mock_f = mock.MagicMock() - mock_file_writer.return_value.__enter__.return_value = mock_f, None - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(24, download_path=DOWNLOAD_PATH) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_has_calls([ - mock.call('test/path/gencode.v24.annotation.gtf.pickle'), - mock.call('test/path/gencode.v24.annotation.gtf.gz'), - ]) - mock_download_file.assert_called_with( - 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_24/gencode.v24.annotation.gtf.gz', - to_dir='test/path', +class LoadGencodeTestCase(unittest.TestCase): + @responses.activate + def test_load_gencode_gene_symbol_to_gene_id(self): + url = GENCODE_GTF_URL.format(gencode_release=12) + responses.add( + responses.GET, + url, + body=gzip.compress(('\n'.join(GTF_DATA)).encode()), + ) + mapping = load_gencode_gene_symbol_to_gene_id(12) + self.assertDictEqual( + mapping, + { + 'AC018692.2': 'ENSG00000279783', + 'DDX11L1': 'ENSG00000223972', + 'OR4F16': 'ENSG00000284662', + }, ) - mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle') - mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) - mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt') - mock_open.assert_not_called() - mock_logger.info.assert_has_calls([ - mock.call('Downloaded to test/path/gencode.v24.annotation.gtf.gz'), - mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'), - mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'), - mock.call('Got 3 gene id mapping records') - ]) - mock_pickle.load.assert_not_called() - - # test using downloaded file - mock_path_exists.reset_mock() - mock_logger.reset_mock() - mock_download_file.reset_mock() - mock_pickle.reset_mock() - mock_path_exists.side_effect = [False, True] - mock_gopen.return_value.__iter__.return_value = GTF_DATA - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(24, download_path=DOWNLOAD_PATH) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_has_calls([ - mock.call('test/path/gencode.v24.annotation.gtf.pickle'), - mock.call('test/path/gencode.v24.annotation.gtf.gz'), - ]) - mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt') - mock_download_file.assert_not_called() - mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle') - mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) - mock_open.assert_not_called() - mock_logger.info.assert_has_calls([ - mock.call('Use the existing downloaded file test/path/gencode.v24.annotation.gtf.gz.\nIf you want to re-download it, please delete the file and re-run the pipeline.'), - mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'), - mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'), - mock.call('Got 3 gene id mapping records') - ]) - mock_pickle.load.assert_not_called() - - # bad gtf data test - mock_path_exists.side_effect = [False, False] - mock_gopen.return_value.__iter__.return_value = ['bad data'] - with self.assertRaises(ValueError) as ve: - _ = load_gencode_gene_symbol_to_gene_id(24, download_path=DOWNLOAD_PATH) - self.assertEqual(str(ve.exception), "Unexpected number of fields on line #0: ['bad data']") - - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.stream_gs_file') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer') - def test_load_gencode_using_gs(self, mock_file_writer, mock_stream_gs_file, mock_pickle, mock_path_exists, - mock_logger, mock_gzip): - - # test using saved file. - mock_path_exists.side_effect = [True] - mock_pickle.loads.return_value = GENE_ID_MAPPING - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(25, download_path=GS_DOWNLOAD_PATH) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') - mock_logger.info.assert_has_calls([ - mock.call('Use the existing pickle file gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle.\n' - 'If you want to reload the data, please delete it and re-run the data loading.'), - mock.call('Got 3 gene id mapping records') - ]) - mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') - mock_pickle.dump.assert_not_called() - mock_file_writer.assert_not_called() - - # test using downloaded file. - mock_path_exists.side_effect = [False, True] - mock_gzip.decompress.return_value = ''.join(GTF_DATA).encode() - mock_f = mock.MagicMock() - mock_file_writer.return_value.__enter__.return_value = mock_f, None - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(25, download_path=GS_DOWNLOAD_PATH) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_has_calls([ - mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle'), - mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz'), - ]) - mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz', raw_download=True) - mock_gzip.decompress.assert_called_with(mock_stream_gs_file.return_value) - mock_file_writer.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') - mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) - @responses.activate def test_load_gencode_ensembl_to_refseq_id(self): @@ -170,6 +60,5 @@ def test_load_gencode_ensembl_to_refseq_id(self): 'ENST00000378391': 'NM_199454.3', 'ENST00000270722': 'NM_022114.4', 'ENST00000288774': 'NM_001374425.1', - } + }, ) - diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index f8b1c570b..92f65680f 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -58,7 +58,7 @@ def annotation_dependencies(self) -> dict[str, hl.Table]: ) if self.dataset_type.has_gencode_gene_symbol_to_gene_id_mapping: deps['gencode_gene_symbol_to_gene_id_mapping'] = hl.literal( - load_gencode_gene_symbol_to_gene_id(GENCODE_RELEASE, ''), + load_gencode_gene_symbol_to_gene_id(GENCODE_RELEASE), ) deps[ 'grch37_to_grch38_liftover_ref_path' From f13df939176a32a3dec4da55a68428901eefc4f3 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 1 Nov 2024 11:26:11 -0400 Subject: [PATCH 06/10] additional semi join (#947) --- ...date_variant_annotations_table_with_new_samples.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index 96ded8491..aacef8e67 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -74,6 +74,12 @@ def update_table(self, ht: hl.Table) -> hl.Table: # and either present or not present in the existing annotations table. callset_variants_ht = ht.semi_join(callset_ht) ht = ht.anti_join(callset_ht) + lookup_ht = hl.read_table( + lookup_table_path( + self.reference_genome, + self.dataset_type, + ), + ) callset_variants_ht = callset_variants_ht.annotate( **get_fields( callset_variants_ht, @@ -89,6 +95,11 @@ def update_table(self, ht: hl.Table) -> hl.Table: ) ht = ht.union(callset_variants_ht, unify=True) + # Variants may have fallen out of the callset and + # have been removed from the lookup table during modification. + # Ensure we don't proceed with those variants. + ht = ht.semi_join(lookup_ht) + # Fix up the globals and mark the table as updated with these callset/project pairs. ht = self.annotate_globals(ht) return ht.annotate_globals( From 7a1966d85ddb67a2882a3b40e3d9bee06666f832 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 1 Nov 2024 15:37:45 -0400 Subject: [PATCH 07/10] metadata parameters refactor (#946) * metadata parameters refactor * fix missing param * tweak * missed one * last one * fix test * last few bugfixes * fix * bump * missed one * change parameter type due to confusing bug * push * enum --- .../lib/tasks/base/base_loading_run_params.py | 3 ++ .../tasks/base/base_project_info_params.py | 11 ----- .../tasks/base/base_update_project_table.py | 42 ---------------- .../lib/tasks/trigger_hail_backend_reload.py | 6 +-- v03_pipeline/lib/tasks/update_lookup_table.py | 6 +-- .../lib/tasks/update_project_table.py | 49 +++++++++++++++---- .../lib/tasks/update_project_table_test.py | 21 ++++---- ...ate_project_table_with_deleted_families.py | 41 ++++++++++++++-- ...iant_annotations_table_with_new_samples.py | 6 +-- .../lib/tasks/validate_callset_test.py | 3 ++ v03_pipeline/lib/tasks/write_family_table.py | 4 +- .../lib/tasks/write_family_table_test.py | 21 ++++---- .../lib/tasks/write_metadata_for_run.py | 18 +++---- .../lib/tasks/write_metadata_for_run_test.py | 1 + .../lib/tasks/write_new_variants_table.py | 6 +-- .../lib/tasks/write_project_family_tables.py | 6 +-- .../tasks/write_project_family_tables_test.py | 14 +++--- .../write_remapped_and_subsetted_callset.py | 20 ++++---- ...ite_remapped_and_subsetted_callset_test.py | 14 +++--- v03_pipeline/lib/tasks/write_success_file.py | 10 ++-- .../tasks/write_validation_errors_for_run.py | 2 + 21 files changed, 160 insertions(+), 144 deletions(-) delete mode 100644 v03_pipeline/lib/tasks/base/base_project_info_params.py delete mode 100644 v03_pipeline/lib/tasks/base/base_update_project_table.py diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index cde621c4f..7c79b00d6 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -19,6 +19,9 @@ class BaseLoadingRunParams(luigi.Task): run_id = luigi.Parameter() sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() + project_guids = luigi.ListParameter(default=[]) + project_remap_paths = luigi.ListParameter(default=[]) + project_pedigree_paths = luigi.ListParameter(default=[]) ignore_missing_samples_when_remapping = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/base/base_project_info_params.py b/v03_pipeline/lib/tasks/base/base_project_info_params.py deleted file mode 100644 index 3bb5f5873..000000000 --- a/v03_pipeline/lib/tasks/base/base_project_info_params.py +++ /dev/null @@ -1,11 +0,0 @@ -import luigi -import luigi.util - -from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams - - -@luigi.util.inherits(BaseLoadingRunParams) -class BaseLoadingRunWithProjectInfoParams(luigi.Task): - project_guids = luigi.ListParameter() - project_remap_paths = luigi.ListParameter() - project_pedigree_paths = luigi.ListParameter() diff --git a/v03_pipeline/lib/tasks/base/base_update_project_table.py b/v03_pipeline/lib/tasks/base/base_update_project_table.py deleted file mode 100644 index 473a31bc2..000000000 --- a/v03_pipeline/lib/tasks/base/base_update_project_table.py +++ /dev/null @@ -1,42 +0,0 @@ -import hail as hl -import luigi - -from v03_pipeline.lib.model import SampleType -from v03_pipeline.lib.paths import project_table_path -from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask -from v03_pipeline.lib.tasks.files import GCSorLocalTarget - - -class BaseUpdateProjectTableTask(BaseUpdateTask): - sample_type = luigi.EnumParameter(enum=SampleType) - project_guid = luigi.Parameter() - - def output(self) -> luigi.Target: - return GCSorLocalTarget( - project_table_path( - self.reference_genome, - self.dataset_type, - self.sample_type, - self.project_guid, - ), - ) - - def initialize_table(self) -> hl.Table: - key_type = self.dataset_type.table_key_type(self.reference_genome) - return hl.Table.parallelize( - [], - hl.tstruct( - **key_type, - filters=hl.tset(hl.tstr), - # NB: entries is missing here because it is untyped - # until we read the type off of the first callset aggregation. - ), - key=key_type.fields, - globals=hl.Struct( - family_guids=hl.empty_array(hl.tstr), - family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set( - hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), - ), - ), - ) diff --git a/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py b/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py index 427ba23fd..f4e8d36fb 100644 --- a/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py +++ b/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py @@ -4,15 +4,15 @@ from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.model import Env -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.write_success_file import WriteSuccessFileTask logger = get_logger(__name__) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class TriggerHailBackendReload(luigi.Task): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index 1dd0b746a..fad438f57 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -10,8 +10,8 @@ ) from v03_pipeline.lib.model.constants import PROJECTS_EXCLUDED_FROM_LOOKUP from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.base.base_update_lookup_table import ( BaseUpdateLookupTableTask, @@ -21,7 +21,7 @@ ) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateLookupTableTask(BaseUpdateLookupTableTask): def complete(self) -> bool: return super().complete() and hl.eval( diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index cd582009f..6c723ffde 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -9,19 +9,30 @@ remove_family_guids, ) from v03_pipeline.lib.misc.io import remap_pedigree_hash +from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams -from v03_pipeline.lib.tasks.base.base_update_project_table import ( - BaseUpdateProjectTableTask, +from v03_pipeline.lib.tasks.base.base_update import ( + BaseUpdateTask, ) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( WriteRemappedAndSubsettedCallsetTask, ) @luigi.util.inherits(BaseLoadingRunParams) -class UpdateProjectTableTask(BaseUpdateProjectTableTask): - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() +class UpdateProjectTableTask(BaseUpdateTask): + project_i = luigi.IntParameter() + + def output(self) -> luigi.Target: + return GCSorLocalTarget( + project_table_path( + self.reference_genome, + self.dataset_type, + self.sample_type, + self.project_guids[self.project_i], + ), + ) def complete(self) -> bool: return super().complete() and hl.eval( @@ -29,8 +40,8 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, remap_pedigree_hash=remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), ), ), @@ -39,6 +50,26 @@ def complete(self) -> bool: def requires(self) -> luigi.Task: return self.clone(WriteRemappedAndSubsettedCallsetTask) + def initialize_table(self) -> hl.Table: + key_type = self.dataset_type.table_key_type(self.reference_genome) + return hl.Table.parallelize( + [], + hl.tstruct( + **key_type, + filters=hl.tset(hl.tstr), + # NB: entries is missing here because it is untyped + # until we read the type off of the first callset aggregation. + ), + key=key_type.fields, + globals=hl.Struct( + family_guids=hl.empty_array(hl.tstr), + family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), + updates=hl.empty_set( + hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), + ), + ), + ) + def update_table(self, ht: hl.Table) -> hl.Table: callset_mt = hl.read_matrix_table(self.input().path) callset_ht = compute_callset_family_entries_ht( @@ -69,8 +100,8 @@ def update_table(self, ht: hl.Table) -> hl.Table: hl.Struct( callset=self.callset_path, remap_pedigree_hash=remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), ), ), diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 7e6ab67f9..0daad72e0 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -25,9 +25,10 @@ def test_update_project_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, skip_validation=True, ) worker.add(upt_task) @@ -134,9 +135,10 @@ def test_update_project_table_task_different_pedigree(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, skip_validation=True, ) worker.add(upt_task) @@ -147,9 +149,10 @@ def test_update_project_table_task_different_pedigree(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3_DIFFERENT_FAMILIES, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3_DIFFERENT_FAMILIES], + project_i=0, skip_validation=True, ) worker.add(upt_task) diff --git a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py index 90f1937dc..56277f34b 100644 --- a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py +++ b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py @@ -2,14 +2,27 @@ import luigi from v03_pipeline.lib.misc.family_entries import remove_family_guids -from v03_pipeline.lib.tasks.base.base_update_project_table import ( - BaseUpdateProjectTableTask, -) +from v03_pipeline.lib.model import SampleType +from v03_pipeline.lib.paths import project_table_path +from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask +from v03_pipeline.lib.tasks.files import GCSorLocalTarget -class UpdateProjectTableWithDeletedFamiliesTask(BaseUpdateProjectTableTask): +class UpdateProjectTableWithDeletedFamiliesTask(BaseUpdateTask): + sample_type = luigi.EnumParameter(enum=SampleType) + project_guid = luigi.Parameter() family_guids = luigi.ListParameter() + def output(self) -> luigi.Target: + return GCSorLocalTarget( + project_table_path( + self.reference_genome, + self.dataset_type, + self.sample_type, + self.project_guid, + ), + ) + def complete(self) -> bool: return super().complete() and hl.eval( hl.bind( @@ -26,6 +39,26 @@ def complete(self) -> bool: ), ) + def initialize_table(self) -> hl.Table: + key_type = self.dataset_type.table_key_type(self.reference_genome) + return hl.Table.parallelize( + [], + hl.tstruct( + **key_type, + filters=hl.tset(hl.tstr), + # NB: entries is missing here because it is untyped + # until we read the type off of the first callset aggregation. + ), + key=key_type.fields, + globals=hl.Struct( + family_guids=hl.empty_array(hl.tstr), + family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), + updates=hl.empty_set( + hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), + ), + ), + ) + def update_table(self, ht: hl.Table) -> hl.Table: return remove_family_guids( ht, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index aacef8e67..739247770 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -9,8 +9,8 @@ lookup_table_path, new_variants_table_path, ) -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, @@ -18,7 +18,7 @@ from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateVariantAnnotationsTableWithNewSamplesTask( BaseUpdateVariantAnnotationsTableTask, ): diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py index f00e5f125..991412824 100644 --- a/v03_pipeline/lib/tasks/validate_callset_test.py +++ b/v03_pipeline/lib/tasks/validate_callset_test.py @@ -62,6 +62,7 @@ def test_validate_callset_multiple_exceptions( # a NON_REF allele type at position chr1: 902024, missing # all contigs but chr1, and contains non-coding variants. callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, + project_guids=['project_a'], skip_validation=False, run_id=TEST_RUN_ID, ) @@ -74,6 +75,7 @@ def test_validate_callset_multiple_exceptions( dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WES, callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, + project_guids=['project_a'], skip_validation=False, run_id=TEST_RUN_ID, ) @@ -82,6 +84,7 @@ def test_validate_callset_multiple_exceptions( self.assertDictEqual( json.load(f), { + 'project_guids': ['project_a'], 'error_messages': [ 'Alleles with invalid allele are present in the callset. This appears to be a GVCF containing records for sites with no variants.', "Variants are present multiple times in the callset: ['1-902088-G-A']", diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index 42715aff9..9ffbc5482 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -13,9 +13,7 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteFamilyTableTask(BaseWriteTask): - project_guid = luigi.Parameter() - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() + project_i = luigi.IntParameter() family_guid = luigi.Parameter() def output(self) -> luigi.Target: diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 5c6995146..60d6f0e41 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -24,9 +24,10 @@ def test_snv_write_family_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, family_guid='abc_1', skip_validation=True, ) @@ -162,9 +163,10 @@ def test_sv_write_family_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SV_VCF, - project_guid='R0115_test_project2', - project_remap_path='not_a_real_file', - project_pedigree_path=TEST_PEDIGREE_5, + project_guids=['R0115_test_project2'], + project_remap_paths=['not_a_real_file'], + project_pedigree_paths=[TEST_PEDIGREE_5], + project_i=0, family_guid='family_2_1', skip_validation=True, ) @@ -415,9 +417,10 @@ def test_gcnv_write_family_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WES, callset_path=TEST_GCNV_BED_FILE, - project_guid='R0115_test_project2', - project_remap_path='not_a_real_file', - project_pedigree_path=TEST_PEDIGREE_5, + project_guids=['R0115_test_project2'], + project_remap_paths=['not_a_real_file'], + project_pedigree_paths=[TEST_PEDIGREE_5], + project_i=0, family_guid='family_2_1', skip_validation=True, ) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index a41daeaba..cc012a926 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -8,8 +8,8 @@ metadata_for_run_path, relatedness_check_tsv_path, ) -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( @@ -17,7 +17,7 @@ ) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class WriteMetadataForRunTask(luigi.Task): def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -32,16 +32,9 @@ def requires(self) -> list[luigi.Task]: return [ self.clone( WriteRemappedAndSubsettedCallsetTask, - project_guid=project_guid, - project_remap_path=project_remap_path, - project_pedigree_path=project_pedigree_path, - ) - for (project_guid, project_remap_path, project_pedigree_path) in zip( - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - strict=True, + project_i=i, ) + for i in range(len(self.project_guids)) ] def run(self) -> None: @@ -49,6 +42,7 @@ def run(self) -> None: 'callsets': [self.callset_path], 'run_id': self.run_id, 'sample_type': self.sample_type.value, + 'project_guids': self.project_guids, 'family_samples': {}, 'failed_family_samples': { 'missing_samples': {}, diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index cb226de8d..dc007fbcb 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -38,6 +38,7 @@ def test_write_metadata_for_run_task(self) -> None: json.load(f), { 'callsets': [TEST_VCF], + 'project_guids': ['R0113_test_project', 'R0114_project4'], 'failed_family_samples': { 'missing_samples': { 'efg_1': { diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 92f65680f..ea3bf396a 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -24,8 +24,8 @@ load_gencode_ensembl_to_refseq_id, load_gencode_gene_symbol_to_gene_id, ) -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget @@ -45,7 +45,7 @@ GENCODE_FOR_VEP_RELEASE = 44 -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class WriteNewVariantsTableTask(BaseWriteTask): @property def annotation_dependencies(self) -> dict[str, hl.Table]: diff --git a/v03_pipeline/lib/tasks/write_project_family_tables.py b/v03_pipeline/lib/tasks/write_project_family_tables.py index 4813c738b..7085a3aa1 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables.py @@ -13,9 +13,7 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteProjectFamilyTablesTask(luigi.Task): - project_guid = luigi.Parameter() - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() + project_i = luigi.IntParameter() def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -43,7 +41,7 @@ def run(self): self.reference_genome, self.dataset_type, self.callset_path, - self.project_guid, + self.project_guids[self.project_i], ), ) for family_guid in set(hl.eval(ht.globals.family_samples).keys()): diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 5d0194b43..dd535f988 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -28,9 +28,10 @@ def test_snv_write_project_family_tables_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_4, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_4], + project_i=0, skip_validation=True, skip_check_sex_and_relatedness=True, ) @@ -93,9 +94,10 @@ def test_snv_write_project_family_tables_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_4_SUBSET, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_4_SUBSET], + project_i=0, skip_validation=True, skip_check_sex_and_relatedness=True, ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index bca068b66..f4c934662 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -35,16 +35,14 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteRemappedAndSubsettedCallsetTask(BaseWriteTask): - project_guid = luigi.Parameter() - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() + project_i = luigi.IntParameter() def complete(self) -> luigi.Target: return super().complete() and hl.eval( hl.read_matrix_table(self.output().path).globals.remap_pedigree_hash == remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), ) @@ -54,14 +52,14 @@ def output(self) -> luigi.Target: self.reference_genome, self.dataset_type, self.callset_path, - self.project_guid, + self.project_guids[self.project_i], ), ) def requires(self) -> list[luigi.Task]: requirements = [ self.clone(ValidateCallsetTask), - RawFileTask(self.project_pedigree_path), + RawFileTask(self.project_pedigree_paths[self.project_i]), ] if ( Env.CHECK_SEX_AND_RELATEDNESS @@ -81,8 +79,8 @@ def create_table(self) -> hl.MatrixTable: # Remap, but only if the remap file is present! remap_lookup = hl.empty_dict(hl.tstr, hl.tstr) - if does_file_exist(self.project_remap_path): - project_remap_ht = import_remap(self.project_remap_path) + if does_file_exist(self.project_remap_paths[self.project_i]): + project_remap_ht = import_remap(self.project_remap_paths[self.project_i]) callset_mt = remap_sample_ids( callset_mt, project_remap_ht, @@ -162,8 +160,8 @@ def create_table(self) -> hl.MatrixTable: mt = mt.drop(field) return mt.select_globals( remap_pedigree_hash=remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), family_samples=( { diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 1ed7550a6..4a0c84660 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -84,9 +84,10 @@ def test_write_remapped_and_subsetted_callset_task( run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, skip_validation=True, ) worker.add(wrsc_task) @@ -127,9 +128,10 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0114_project4', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_4, + project_guids=['R0114_project4'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_4], + project_i=0, skip_validation=True, ) worker.add(wrsc_task) diff --git a/v03_pipeline/lib/tasks/write_success_file.py b/v03_pipeline/lib/tasks/write_success_file.py index 3576a8d33..3dc471063 100644 --- a/v03_pipeline/lib/tasks/write_success_file.py +++ b/v03_pipeline/lib/tasks/write_success_file.py @@ -3,8 +3,8 @@ from v03_pipeline.lib.paths import pipeline_run_success_file_path from v03_pipeline.lib.tasks import WriteProjectFamilyTablesTask -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( @@ -12,7 +12,7 @@ ) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class WriteSuccessFileTask(luigi.Task): def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -32,9 +32,7 @@ def requires(self): *[ self.clone( WriteProjectFamilyTablesTask, - project_guid=self.project_guids[i], - project_remap_path=self.project_remap_paths[i], - project_pedigree_path=self.project_pedigree_paths[i], + project_i=i, ) for i in range(len(self.project_guids)) ], diff --git a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py index eaefb0e8c..9149f6158 100644 --- a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py +++ b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py @@ -10,6 +10,7 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteValidationErrorsForRunTask(luigi.Task): + project_guids = luigi.ListParameter() error_messages = luigi.ListParameter(default=[]) def to_single_error_message(self) -> str: @@ -30,6 +31,7 @@ def output(self) -> luigi.Target: def run(self) -> None: validation_errors_json = { + 'project_guids': self.project_guids, 'error_messages': self.error_messages, } with self.output().open('w') as f: From 2debf885ad7f7a2eb5d53164d6106bebaee16afe Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 4 Nov 2024 15:02:55 -0500 Subject: [PATCH 08/10] Parse clinvar version from header (#949) * Parse clinvar version from header * responses activate * fix test --- v03_pipeline/lib/reference_data/clinvar.py | 30 ++++++++----------- .../lib/reference_data/clinvar_test.py | 26 ++++++++++++++++ 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/v03_pipeline/lib/reference_data/clinvar.py b/v03_pipeline/lib/reference_data/clinvar.py index fd5d47561..3e482e0b6 100644 --- a/v03_pipeline/lib/reference_data/clinvar.py +++ b/v03_pipeline/lib/reference_data/clinvar.py @@ -120,6 +120,7 @@ def download_and_import_latest_clinvar_vcf( clinvar_url: str, reference_genome: ReferenceGenome, ) -> hl.Table: + version = parse_clinvar_release_date(clinvar_url) with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file: urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310 cached_tmp_file_name = os.path.join( @@ -139,27 +140,20 @@ def download_and_import_latest_clinvar_vcf( min_partitions=MIN_HT_PARTITIONS, force_bgz=True, ) - mt = mt.annotate_globals(version=_parse_clinvar_release_date(tmp_file.name)) + mt = mt.annotate_globals(version=version) return join_to_submission_summary_ht(mt.rows()) -def _parse_clinvar_release_date(local_vcf_path: str) -> str: - """Parse clinvar release date from the VCF header. - - Args: - local_vcf_path (str): clinvar vcf path on the local file system. - - Returns: - str: return VCF release date as string, or None if release date not found in header. - """ - with gzip.open(local_vcf_path, 'rt') as f: - for line in f: - if line.startswith('##fileDate='): - return line.split('=')[-1].strip() - - if not line.startswith('#'): - return None - +def parse_clinvar_release_date(clinvar_url: str) -> str: + response = requests.get(clinvar_url, stream=True, timeout=10) + for byte_line in gzip.GzipFile(fileobj=response.raw): + line = byte_line.decode('ascii').strip() + if not line: + continue + if line.startswith('##fileDate='): + return line.split('=')[-1].strip() + if not line.startswith('#'): + return None return None diff --git a/v03_pipeline/lib/reference_data/clinvar_test.py b/v03_pipeline/lib/reference_data/clinvar_test.py index 8e1b509ff..fd8d4e832 100644 --- a/v03_pipeline/lib/reference_data/clinvar_test.py +++ b/v03_pipeline/lib/reference_data/clinvar_test.py @@ -1,17 +1,43 @@ +import gzip import unittest from unittest import mock import hail as hl +import responses from v03_pipeline.lib.reference_data.clinvar import ( import_submission_table, join_to_submission_summary_ht, + parse_clinvar_release_date, parsed_and_mapped_clnsigconf, parsed_clnsig, ) +CLINVAR_VCF_DATA = b""" +##fileformat=VCFv4.1 +##fileDate=2024-10-27 +##source=ClinVar +##reference=GRCh37 +##ID= +##INFO= +""" + class ClinvarTest(unittest.TestCase): + @responses.activate + def test_parse_clinvar_release_date(self): + clinvar_url = ( + 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz' + ) + responses.get( + clinvar_url, + body=gzip.compress(CLINVAR_VCF_DATA), + ) + self.assertEqual( + parse_clinvar_release_date(clinvar_url), + '2024-10-27', + ) + def test_parsed_clnsig(self): ht = hl.Table.parallelize( [ From 1fea1f7dc61a2c8d6ec36b9fb083c54a6ca9b625 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 5 Nov 2024 09:11:46 -0500 Subject: [PATCH 09/10] Dependency reordering so that `ValidateCallsetTask` runs before updating the reference data. (#950) * Parse clinvar version from header * Dependency reordering for reference data updates and validation * ruff * missed one * Revert relatedness changes * push * Fix import issue * Fix sample type * ruff * Fix import mocking * imports * responses activate * fix test * Tweaks * comment --- .../base_update_variant_annotations_table.py | 12 +- ...update_cached_reference_dataset_queries.py | 6 +- ...e_cached_reference_dataset_queries_test.py | 110 +++++++++--------- ...ns_table_with_updated_reference_dataset.py | 5 + ...ble_with_updated_reference_dataset_test.py | 24 ++++ .../updated_cached_reference_dataset_query.py | 17 ++- ...ted_cached_reference_dataset_query_test.py | 22 +++- .../updated_reference_dataset_collection.py | 16 +++ ...dated_reference_dataset_collection_test.py | 16 +++ .../lib/tasks/write_new_variants_table.py | 5 +- 10 files changed, 159 insertions(+), 74 deletions(-) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 21b0253b3..31c718034 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -36,16 +36,12 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [ - UpdateCachedReferenceDatasetQueries( - reference_genome=self.reference_genome, - dataset_type=self.dataset_type, - ), + self.clone(UpdateCachedReferenceDatasetQueries), ] requirements.extend( - UpdatedReferenceDatasetCollectionTask( - self.reference_genome, - self.dataset_type, - rdc, + self.clone( + UpdatedReferenceDatasetCollectionTask, + reference_dataset_collection=rdc, ) for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type( self.reference_genome, diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py index 58e2cae18..dc9c2a17e 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py +++ b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py @@ -4,15 +4,15 @@ from v03_pipeline.lib.model import ( CachedReferenceDatasetQuery, ) -from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( - BaseLoadingPipelineParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( UpdatedCachedReferenceDatasetQuery, ) -@luigi.util.inherits(BaseLoadingPipelineParams) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateCachedReferenceDatasetQueries(luigi.Task): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py index 794a77897..d6bf33d36 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py @@ -7,6 +7,7 @@ CachedReferenceDatasetQuery, DatasetType, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import ( UpdateCachedReferenceDatasetQueries, @@ -21,99 +22,100 @@ class UpdateCachedReferenceDatasetQueriesTest(unittest.TestCase): def test_37_snv_indel(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '1', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, + **kwargs, ) worker.add(task) worker.run() self.assertTrue(task.complete()) - mock_crdq_task.assert_has_calls( - [ - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_QC, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS, - ), - ], + call_args_list = mock_crdq_task.call_args_list + self.assertEqual(len(call_args_list), 4) + self.assertEqual( + [x.kwargs['crdq'] for x in call_args_list], + list(CachedReferenceDatasetQuery), ) def test_38_snv_indel(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '2', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, + **kwargs, ) worker.add(task) worker.run() self.assertTrue(task.complete()) - mock_crdq_task.assert_has_calls( - [ - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_QC, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS, - ), - ], + call_args_list = mock_crdq_task.call_args_list + self.assertEqual(len(call_args_list), 4) + self.assertEqual( + [x.kwargs['crdq'] for x in call_args_list], + list(CachedReferenceDatasetQuery), ) def test_38_mito(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '3', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, + **kwargs, ) worker.add(task) worker.run() self.assertTrue(task.complete()) - mock_crdq_task.assert_has_calls( - [ - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.MITO, - crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - ], + call_args_list = mock_crdq_task.call_args_list + self.assertEqual(len(call_args_list), 1) + self.assertEqual( + next(x.kwargs['crdq'] for x in call_args_list), + CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ) def test_38_sv(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '4', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SV, + **kwargs, ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py index f03526c50..87db93313 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py @@ -1,4 +1,5 @@ import hail as hl +import luigi from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.logger import get_logger @@ -8,6 +9,9 @@ get_datasets_to_update, ) from v03_pipeline.lib.reference_data.config import CONFIG +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, ) @@ -15,6 +19,7 @@ logger = get_logger(__name__) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( BaseUpdateVariantAnnotationsTableTask, ): diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index b5a5ced2f..38ac936c6 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -19,6 +19,7 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -37,6 +38,8 @@ TEST_INTERVAL_MITO_1 = 'v03_pipeline/var/test/reference_data/test_interval_mito_1.ht' TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht' TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht' +TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' +TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' MOCK_CADD_CONFIG = { @@ -754,6 +757,13 @@ def test_update_vat_with_updated_rdc_snv_indel_38( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='3', ) worker = luigi.worker.Worker() worker.add(task) @@ -964,6 +974,13 @@ def test_update_vat_with_updated_rdc_mito_38( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, + sample_type=SampleType.WGS, + callset_path=TEST_MITO_MT, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='1', ) worker = luigi.worker.Worker() worker.add(task) @@ -1114,6 +1131,13 @@ def test_update_vat_with_updated_rdc_snv_indel_37( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker = luigi.worker.Worker() worker.add(task) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py index 9aa4a3a74..4e6a0cfaf 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py @@ -19,15 +19,16 @@ get_ht_path, import_ht_from_config_path, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask -from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import ( - UpdatedReferenceDatasetCollectionTask, -) logger = get_logger(__name__) +@luigi.util.inherits(BaseLoadingRunParams) class UpdatedCachedReferenceDatasetQuery(BaseWriteTask): crdq = luigi.EnumParameter(enum=CachedReferenceDatasetQuery) @@ -71,6 +72,16 @@ def requires(self) -> luigi.Task: ], ), ) + # Special nested import to avoid a circular dependency issue + # (ValidateCallset -> this file -> UpdatedReferenceDatasetCollection -> ValidateCallset) + # The specific CRDQ referenced in ValidateCallset will never reach + # this line due to it being a "query_raw_dataset". In theory this + # would be fixed by splitting the CRDQ into raw_dataset and non-raw_dataset + # queries. + from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import ( + UpdatedReferenceDatasetCollectionTask, + ) + return UpdatedReferenceDatasetCollectionTask( self.reference_genome, self.dataset_type, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py index 210a8cc8a..60009daf8 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py @@ -5,12 +5,14 @@ import hail as hl import luigi +import v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES from v03_pipeline.lib.model import ( CachedReferenceDatasetQuery, DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, @@ -28,6 +30,7 @@ CLINVAR_CRDQ_PATH = ( 'v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht' ) +TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' MOCK_CONFIG = { 'gnomad_qc': { @@ -109,6 +112,13 @@ def test_gnomad_qc( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, crdq=CachedReferenceDatasetQuery.GNOMAD_QC, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='1', ) worker.add(task) worker.run() @@ -143,8 +153,9 @@ def test_gnomad_qc( 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG, ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.UpdatedReferenceDatasetCollectionTask', + @mock.patch.object( + v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection, + 'UpdatedReferenceDatasetCollectionTask', ) @mock.patch( 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CachedReferenceDatasetQuery.query', @@ -198,6 +209,13 @@ def _clinvar_path_variants(table, **_: Any): reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py index 253e2f526..e09b04f09 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py @@ -11,12 +11,17 @@ from v03_pipeline.lib.reference_data.dataset_table_operations import ( update_or_create_joined_ht, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.validate_callset import ValidateCallsetTask logger = get_logger(__name__) +@luigi.util.inherits(BaseLoadingRunParams) class UpdatedReferenceDatasetCollectionTask(BaseUpdateTask): reference_dataset_collection = luigi.EnumParameter(enum=ReferenceDatasetCollection) @@ -24,6 +29,17 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._datasets_to_update = [] + def requires(self) -> luigi.Task: + # Though there is no explicit functional dependency between + # validing the callset and updating the reference data, it's + # a more user-friendly experience for the callset validation + # to fail/succeed prior to attempting any + # compute intensive work. + # + # Note that, if validation is disabled or skipped the task + # still runs but is a no-op. + return self.clone(ValidateCallsetTask) + def complete(self) -> bool: self._datasets_to_update = [] datasets = self.reference_dataset_collection.datasets(self.dataset_type) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py index b3fdde4bb..72b0dc180 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py @@ -10,6 +10,7 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -20,6 +21,7 @@ from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase COMBINED_2_PATH = 'v03_pipeline/var/test/reference_data/test_combined_2.ht' +TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' MOCK_PRIMATE_AI_DATASET_HT = hl.Table.parallelize( [ @@ -170,6 +172,13 @@ def test_update_task_with_empty_reference_data_table( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, reference_dataset_collection=ReferenceDatasetCollection.COMBINED, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker.add(task) worker.run() @@ -279,6 +288,13 @@ def test_update_task_with_existing_reference_dataset_collection_table( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, reference_dataset_collection=ReferenceDatasetCollection.COMBINED, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index ea3bf396a..a312084b4 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -79,10 +79,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [ - UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( - self.reference_genome, - self.dataset_type, - ), + self.clone(UpdateVariantAnnotationsTableWithUpdatedReferenceDataset), ] if self.dataset_type.has_lookup_table: # NB: the lookup table task has remapped and subsetted callset tasks as dependencies. From d81cc10e355aa8be780184bebd332c7b7edb976b Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 5 Nov 2024 10:23:19 -0500 Subject: [PATCH 10/10] Benb/check parsed clinvar version in complete (#951) * Parse clinvar version from header * First pass * Bump hail tables to https * correct dataset/dataset types * Fix clinvar mito * Fix combined * Dependency reordering for reference data updates and validation * ruff * missed one * Revert relatedness changes * push * Fix import issue * Fix sample type * ruff * Fix import mocking * imports * Missed one * First mocking pass * Finish mocks in reference data * responses activate * ruff * commas * fix test * Update compare_globals.py * import --- .../lib/reference_data/compare_globals.py | 13 +++++++++ ...ns_table_with_updated_reference_dataset.py | 12 +++++++++ ...ble_with_updated_reference_dataset_test.py | 25 ++++++++++++++---- .../updated_cached_reference_dataset_query.py | 14 +++++++--- ...ted_cached_reference_dataset_query_test.py | 8 +++++- .../updated_reference_dataset_collection.py | 9 +++++++ ...dated_reference_dataset_collection_test.py | 5 ++++ ...annotations_table_with_new_samples_test.py | 6 ++--- .../.README.txt.crc | Bin 12 -> 12 bytes .../.metadata.json.gz.crc | Bin 12 -> 12 bytes .../README.txt | 4 +-- .../globals/parts/.part-0.crc | Bin 12 -> 12 bytes .../globals/parts/part-0 | Bin 400 -> 400 bytes .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../metadata.json.gz | Bin 344 -> 345 bytes .../rows/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../rows/metadata.json.gz | Bin 586 -> 586 bytes ...-9e75273d-7113-40e4-a327-453f3451dc8c.crc} | Bin ...rt-0-9e75273d-7113-40e4-a327-453f3451dc8c} | Bin .../test_combined_1.ht.ht/.README.txt.crc | Bin 0 -> 12 bytes .../test_combined_1.ht.ht/._SUCCESS.crc | Bin 0 -> 8 bytes .../.metadata.json.gz.crc | Bin 0 -> 16 bytes .../test_combined_1.ht.ht/README.txt | 3 +++ .../test_combined_1.ht.ht/_SUCCESS | 0 .../globals/.metadata.json.gz.crc | Bin 0 -> 16 bytes .../globals/metadata.json.gz | Bin 0 -> 546 bytes .../globals/parts/.part-0.crc | Bin 0 -> 16 bytes .../globals/parts/part-0 | Bin 0 -> 774 bytes .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../test_combined_1.ht.ht/metadata.json.gz | Bin 0 -> 725 bytes .../rows/.metadata.json.gz.crc | Bin 0 -> 20 bytes .../rows/metadata.json.gz | Bin 0 -> 1064 bytes ...-3569201c-d630-43c4-9056-cbace806fe8d.crc} | Bin ...rt-0-3569201c-d630-43c4-9056-cbace806fe8d} | Bin .../test_combined_1.ht/.README.txt.crc | Bin 12 -> 12 bytes .../test_combined_1.ht/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_combined_1.ht/README.txt | 4 +-- .../globals/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../globals/metadata.json.gz | Bin 546 -> 546 bytes .../globals/parts/.part-0.crc | Bin 16 -> 16 bytes .../test_combined_1.ht/globals/parts/part-0 | Bin 776 -> 774 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin 0 -> 12 bytes .../index | Bin 0 -> 69 bytes .../metadata.json.gz | Bin 0 -> 185 bytes .../test_combined_1.ht/metadata.json.gz | Bin 725 -> 725 bytes .../rows/.metadata.json.gz.crc | Bin 20 -> 20 bytes .../test_combined_1.ht/rows/metadata.json.gz | Bin 1063 -> 1062 bytes ...0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc | Bin 0 -> 12 bytes ...art-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 | Bin 0 -> 106 bytes .../test_combined_37.ht/.README.txt.crc | Bin 12 -> 12 bytes .../test_combined_37.ht/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_combined_37.ht/README.txt | 4 +-- .../globals/parts/.part-0.crc | Bin 16 -> 16 bytes .../test_combined_37.ht/globals/parts/part-0 | Bin 798 -> 798 bytes .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../test_combined_37.ht/metadata.json.gz | Bin 703 -> 702 bytes .../rows/.metadata.json.gz.crc | Bin 20 -> 20 bytes .../test_combined_37.ht/rows/metadata.json.gz | Bin 1027 -> 1027 bytes ...-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc} | Bin ...rt-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2} | Bin .../test_combined_mito_1.ht/.README.txt.crc | Bin 12 -> 12 bytes .../.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_combined_mito_1.ht/README.txt | 4 +-- .../globals/parts/.part-0.crc | Bin 16 -> 16 bytes .../globals/parts/part-0 | Bin 712 -> 710 bytes .../.index.crc | Bin .../.metadata.json.gz.crc | Bin .../index | Bin .../metadata.json.gz | Bin .../test_combined_mito_1.ht/metadata.json.gz | Bin 586 -> 585 bytes .../rows/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../rows/metadata.json.gz | Bin 877 -> 877 bytes ...-3c042736-0e6c-4911-9b80-b9356af9df25.crc} | Bin ...rt-0-3c042736-0e6c-4911-9b80-b9356af9df25} | Bin 84 files changed, 91 insertions(+), 20 deletions(-) rename v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/{part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx => part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx}/.index.crc (100%) rename v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/{part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx => part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/{part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx => part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx}/index (100%) rename v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/{part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx => part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx}/metadata.json.gz (100%) rename v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/{.part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.crc => .part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc} (100%) rename v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/{part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a => part-0-9e75273d-7113-40e4-a327-453f3451dc8c} (100%) create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/_SUCCESS create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/metadata.json.gz create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 rename v03_pipeline/var/test/reference_data/{test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx => test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx}/.index.crc (100%) rename v03_pipeline/var/test/reference_data/{test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx => test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/reference_data/{test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx => test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx}/index (100%) rename v03_pipeline/var/test/reference_data/{test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx => test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx}/metadata.json.gz (100%) create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/metadata.json.gz create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz rename v03_pipeline/var/test/reference_data/{test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc => test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc} (100%) rename v03_pipeline/var/test/reference_data/{test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad => test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d} (100%) create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.index.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.metadata.json.gz.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/index create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 rename v03_pipeline/var/test/reference_data/test_combined_37.ht/index/{part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx => part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx}/.index.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_37.ht/index/{part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx => part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_37.ht/index/{part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx => part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx}/index (100%) rename v03_pipeline/var/test/reference_data/test_combined_37.ht/index/{part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx => part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx}/metadata.json.gz (100%) rename v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/{.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc => .part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc} (100%) rename v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/{part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff => part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2} (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx => part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx}/.index.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx => part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx => part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx}/index (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx => part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx}/metadata.json.gz (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/{.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc => .part-0-3c042736-0e6c-4911-9b80-b9356af9df25.crc} (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/{part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 => part-0-3c042736-0e6c-4911-9b80-b9356af9df25} (100%) diff --git a/v03_pipeline/lib/reference_data/compare_globals.py b/v03_pipeline/lib/reference_data/compare_globals.py index 1feb0ac12..c295b3a35 100644 --- a/v03_pipeline/lib/reference_data/compare_globals.py +++ b/v03_pipeline/lib/reference_data/compare_globals.py @@ -4,8 +4,10 @@ from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.model import ( + DatasetType, ReferenceGenome, ) +from v03_pipeline.lib.reference_data.clinvar import parse_clinvar_release_date from v03_pipeline.lib.reference_data.config import CONFIG from v03_pipeline.lib.reference_data.dataset_table_operations import ( get_all_select_fields, @@ -16,6 +18,17 @@ logger = get_logger(__name__) +def clinvar_versions_equal( + ht: hl.Table, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> bool: + dataset = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar' + return hl.eval(ht.globals.versions[dataset]) == parse_clinvar_release_date( + CONFIG[dataset][reference_genome.v02_value], + ) + + @dataclasses.dataclass class Globals: paths: dict[str, str] diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py index 87db93313..9a0aeca2d 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py @@ -6,6 +6,7 @@ from v03_pipeline.lib.model import ReferenceDatasetCollection from v03_pipeline.lib.reference_data.compare_globals import ( Globals, + clinvar_versions_equal, get_datasets_to_update, ) from v03_pipeline.lib.reference_data.config import CONFIG @@ -54,6 +55,17 @@ def complete(self) -> bool: for rdc in self.reference_dataset_collections for dataset in rdc.datasets(self.dataset_type) ] + + if any( + 'clinvar' in d for d in datasets_to_check + ) and not clinvar_versions_equal( + hl.read_table(self.output().path), + self.reference_genome, + self.dataset_type, + ): + datasets_to_check.remove('clinvar') + self._datasets_to_update.add('clinvar') + annotations_ht_globals = Globals.from_ht( hl.read_table(self.output().path), datasets_to_check, diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index 38ac936c6..5c30630e9 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -61,7 +61,7 @@ } MOCK_CLINVAR_CONFIG = { **CONFIG['clinvar']['38'], - 'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', 'custom_import': lambda *_: hl.Table.parallelize( [], hl.tstruct( @@ -486,7 +486,7 @@ 'clinvar_mito': { '38': { **CONFIG['clinvar_mito']['38'], - 'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', + 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', 'custom_import': lambda *_: hl.Table.parallelize( [], hl.tstruct( @@ -722,12 +722,17 @@ def setUp(self) -> None: 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG, ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', + ) def test_update_vat_with_updated_rdc_snv_indel_38( self, + mock_clinvar_versions_equal, mock_initialize_table, mock_update_crdqs_task, mock_update_rdc_task, ): + mock_clinvar_versions_equal.return_value = True mock_update_rdc_task.return_value = MockCompleteTask() mock_update_crdqs_task.return_value = MockCompleteTask() mock_initialize_table.return_value = hl.Table.parallelize( @@ -840,7 +845,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( hl.Struct( paths=hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', @@ -939,12 +944,17 @@ def test_update_vat_with_updated_rdc_snv_indel_38( 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG_MITO, ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', + ) def test_update_vat_with_updated_rdc_mito_38( self, + mock_clinvar_versions_equal, mock_initialize_table, mock_update_crdqs_task, mock_update_rdc_task, ): + mock_clinvar_versions_equal.return_value = (True,) mock_update_rdc_task.return_value = MockCompleteTask() mock_update_crdqs_task.return_value = MockCompleteTask() mock_initialize_table.return_value = hl.Table.parallelize( @@ -999,7 +1009,7 @@ def test_update_vat_with_updated_rdc_mito_38( hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht', - clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', + clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', @@ -1096,12 +1106,17 @@ def test_update_vat_with_updated_rdc_mito_38( 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG, ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', + ) def test_update_vat_with_updated_rdc_snv_indel_37( self, + mock_clinvar_versions_equal, mock_initialize_table, mock_update_crdqs_task, mock_update_rdc_task, ): + mock_clinvar_versions_equal.return_value = True mock_update_rdc_task.return_value = MockCompleteTask() mock_update_crdqs_task.return_value = MockCompleteTask() mock_initialize_table.return_value = hl.Table.parallelize( @@ -1152,7 +1167,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( hl.Struct( paths=hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py index 4e6a0cfaf..57d163146 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py @@ -12,6 +12,7 @@ ) from v03_pipeline.lib.reference_data.compare_globals import ( Globals, + clinvar_versions_equal, get_datasets_to_update, ) from v03_pipeline.lib.reference_data.config import CONFIG @@ -39,14 +40,21 @@ def complete(self) -> bool: ) return False - datasets_to_check = [self.crdq.dataset(self.dataset_type)] + dataset = self.crdq.dataset(self.dataset_type) + if 'clinvar' in dataset and not clinvar_versions_equal( + hl.read_table(self.output().path), + self.reference_genome, + self.dataset_type, + ): + return False + crdq_globals = Globals.from_ht( hl.read_table(self.output().path), - datasets_to_check, + [dataset], ) dataset_config_globals = Globals.from_dataset_configs( self.reference_genome, - datasets_to_check, + [dataset], ) return not get_datasets_to_update( crdq_globals, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py index 60009daf8..566337f2e 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py @@ -60,7 +60,7 @@ 'clinvar': { '38': { **CONFIG['clinvar']['38'], - 'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', 'custom_import': lambda *_: hl.Table.parallelize( [], hl.tstruct( @@ -160,8 +160,12 @@ def test_gnomad_qc( @mock.patch( 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CachedReferenceDatasetQuery.query', ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.clinvar_versions_equal', + ) def test_clinvar( self, + mock_clinvar_versions_equal, mock_crdq_query, mock_updated_rdc_task, ) -> None: @@ -169,6 +173,8 @@ def test_clinvar( Given a crdq task where there exists a clinvar crdq table and a clinvar rdc table, expect task to replace the clinvar crdq table with new version. """ + mock_clinvar_versions_equal.return_value = True + # rdc dependency exists mock_updated_rdc_task.return_value = MockCompleteTask() diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py index e09b04f09..af2144839 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py @@ -6,6 +6,7 @@ from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.compare_globals import ( Globals, + clinvar_versions_equal, get_datasets_to_update, ) from v03_pipeline.lib.reference_data.dataset_table_operations import ( @@ -53,6 +54,14 @@ def complete(self) -> bool: ) return False + if any('clinvar' in d for d in datasets) and not clinvar_versions_equal( + hl.read_table(self.output().path), + self.reference_genome, + self.dataset_type, + ): + datasets.remove('clinvar') + self._datasets_to_update.add('clinvar') + joined_ht_globals = Globals.from_ht( hl.read_table(self.output().path), datasets, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py index 72b0dc180..bc19d39d5 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py @@ -158,14 +158,19 @@ class UpdatedReferenceDatasetCollectionTaskTest(MockedDatarootTestCase): MOCK_CONFIG, ) @mock.patch.object(ReferenceDatasetCollection, 'datasets') + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection.clinvar_versions_equal', + ) def test_update_task_with_empty_reference_data_table( self, + mock_clinvar_versions_equal, mock_rdc_datasets, ) -> None: """ Given a new task with no existing reference dataset collection table, expect the task to create a new reference dataset collection table for all datasets in the collection. """ + mock_clinvar_versions_equal.return_value = True mock_rdc_datasets.return_value = ['cadd', 'primate_ai', 'clinvar'] worker = luigi.worker.Worker() task = UpdatedReferenceDatasetCollectionTask( diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 0bc94f473..b5290e88c 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -585,7 +585,7 @@ def test_multiple_update_vat( }, paths=hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', @@ -724,7 +724,7 @@ def test_update_vat_grch37( [ hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', @@ -965,7 +965,7 @@ def test_mito_update_vat( hl.Struct( paths=hl.Struct( high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', - clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', + clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', gnomad_mito='gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht', helix_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht', diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc index 22d3757fcca4e05493004732c98c006bb661bfed..add5a194299fbb1ee96e0513a6c3530cf9b37be1 100644 GIT binary patch literal 12 TcmYc;N@ieSU}EsjXZj2P5by%( literal 12 TcmYc;N@ieSU}Bgw=~y5D6I%mz diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc index 800ce4b09b50e4cbc823d0765bc944120b05aa7f..3a7d8101c69ad7710090a2decea7405f78301c8d 100644 GIT binary patch literal 12 TcmYc;N@ieSU}6w>FCYp45W)gq literal 12 TcmYc;N@ieSU}BiIrD*{G6PN?1 diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt index 7ed0c7ae4..9aea8fa4b 100644 --- a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.120-f00f916faf78 - Created at 2024/03/15 15:45:48 \ No newline at end of file + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 13:12:12 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc index 7e41d08ed5c41fe94209249e27e052d99136ba96..c96ad70c97f6bd659ec821360a30dd7aedb4b323 100644 GIT binary patch literal 12 TcmYc;N@ieSU}BiCm%|AF5=jEm literal 12 TcmYc;N@ieSU}EUt`^pIb5xoM> diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 index 67904bd84c8ff836fbd0edc563c3f2f99d374ba1..bb1d539437156e9e5b2b058172ef9e107c500919 100644 GIT binary patch delta 298 zcmV+_0oDGH1CRrMdjS9d00RI3D77#BU;qMb3jp?&Ln07Y+5*;=XvoBfY$Dk*ji?ep z`~vQw4ST*hTwvCZb>trbJpeZVH~{MXBV{SskKuLr~{-t`8aJ zJqMZJXn&p6(Vs5vTzDAM3%?l~Hrrhl_@}PLL0~Y@!N6=;or4zu7Z!^Pi^b()xm5gl woywSMi!peS{~ZXGr_i^+qt6DDBLP?yl@SUG4FCWD00000D77#BAOHaX0N|K^Y5)KL delta 299 zcmV+`0o4AG1CRrMdjS9d{sI61D77#BVEzGZ3jo%YLm>cJ+CgngL}aE#dWkf-bmpd^qSf#u9c2t2q#lhv05||R0L@YFASuJ4O4E}SNf5cLtf5yVmV_17tc)5B4CtdI z1Xo7ssff2_axyMO%^GOAL22BqliavpDQ&o49036bPu&%N_Q-=n(qv}q6};-7CYv;< z{H{jl-dDM6-Bn9uyr0_^_eX`oLy|~{NZPbDg(tobABB7a&qn>6!Fy$On9>bFo!hxS zWSRFIsQgCz>#UCcbaCgx!f)X3OdX02ddFi;D#YbHP~r xd7a9bYKt=gh>`vs2bQPMx4@$TlOq9G6^#)J3Jm}N0000004TLD{U87V002l+f_DG_ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/index b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/index rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz index 213fc997c4fa60ba0486aac8e537457dbccc6f75..5aed747bc2f7f8e2b95dd92177c014ed65ebc41e 100644 GIT binary patch literal 345 zcmV-f0jB;RiwFP!000000F6@5Yr-%T{$Fz1AQNkcN^YBj9k#>Jv5QEF>8l1~Qj%0D zCI9=9*eHsFT|(dYl8>esZ7CNQ5Df>TL6+xF&ul!b%5t4e zGKw(LP=N-?z!VxpQB%A1;tT2}mact!+t%Qj?@)9bq1IH8h1CtST`-wcFu~=4>QHjI zaTsK?zegsc+JNMY+m^g&P21(kx5CsI(ALj>jXQ_?fBNf2hj-ADrBJLfm&u~7x&u80w`j~V#GjLN}MkKFF37|5dg6}}IcSz@aU`*|6B|%z)NQX{Y#ZO81 zA;bC4+bl^IHL$pbbM{H^JgScW_eHTc)zrrpf-9J6`WgBLUx}JU=C)bVFSAwXqg5R? r09PH(VDGtBmE2e^nHB8^v``}A69?y4lKk=cM63G+kd-Tmw*mkFEvu=% literal 344 zcmV-e0jK^SiwFP!000000F6>jYXUJ4{VzGSkjko{Hn-M75AC67=|yCTaaMydS(2<0 zmi+f7aib^}dI>x4V>0_DEvmSH@&MX!B@r2ohl2@5Tgt@+L`84V>ko>jswk@G{-~nW zc!Ds}P=N-?z?2$9SyQ_W;tLuimaam3>uLzhcP!eCP%A3P-0FteHkwQ>nBek2bu78u zcnq@H-y^e8Z9sCyts^g5Q@1$zmY5m?+J@P$aqn^WPk-I$@eUkW2*nzMsSqH*EU{L{ ztWOO2jOWAqY`Psz9<%Od25!pBh~!l%0aQj*@Er(uhx9%K#?-!65~MYVwCI#o`jm7Z zGo1arO|xWP150Z-XP@;xpz8R4UlIpXO+#!kxRR-+o1ts)m8ofBZkuKOqQ8uNw5sC< q;HtwJ>^#@1H8++^W`+9z2Wvz^;^4zdvOhhaWc9yzW}0@l0ssK;KBII1 diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc index d581eec458db9adf73244765a54d62a958ad01eb..682fea6e74bb55e9b2648b1e3a7d2bfe53a4718a 100644 GIT binary patch literal 16 XcmYc;N@ieSU}7j(_(}fD{NlR+C<+EZ literal 16 XcmYc;N@ieSU}A{$dH((BrR%EzDdz_E diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz index 0b6e1772c16fe82265c981a8325800999060124c..d37774da987b28605255ea68e1ed47f81f458942 100644 GIT binary patch literal 586 zcmV-Q0=4}giwFP!000000Ns|~Z`v>r$Nv{UZCcVb3BQ`R!q|jVO%qIeScQ;lU%taL7{&&}rlfEErF3?$)gRLf^N@+`Xr;jw8 zo1R1LVCBe2#n#@yr94(soy^pj=9K_w!6+&}BR9_js?al_&LdP1JIjQGS>t7&f>=gg z9J*4*0Mmw_AL>F5s}0oHJ9Xal-E@69spqWx!N_Zb7nkL9B!>Gy@Cc<&{43o6ijpELYJN!q$0dBEkK=73lQ2f925sJw*N3@H<&^c7G Yp_lLx=eO0hX0FCt(t6sEMK`TlvYTqs)DMAt`IWz1YEo} za!j=p`M-CZKp-Kl%Bp)=4v5T*=bd-*NM=tIeFPGdvEuLqeDdwXbOWqK0y1C2j>Hc{ zYPL=BnMw%~QZmC7(;S#DB=P>(;p=V#d{`u~3VaAN??wZf!ql2Av-_-FTnaI*tXTS( zP}J;#jZTFXT47|YY}9hWK^p^i#?alvv2+%QbiSc>rI(-t^DVTNK$VY6u?D%TO*=o{ z&%=17!S4O{n@^MBD(by0baPcK6lz)3{Hx-*-eQ|1U@BX?9}SwV-r$r92wMrs&q@Y3 z*&2mQMGGICieFgJX}6=!<+1On<=N3I&!J18kJXdTf6#>RoK!rFkFg9J1M&+l0OL0| z@3V3r#VsTm9REbjIUR{Gshtb{l;hC34G_!{3a47k#4kvj3v`mCV2eqpQrgnP=`#)I zW_5_2tehDs*g6`xmd9eMiH%}%)JM~$!vy0C!k1B>2h+DE?pi2*u=Q$^IoX6#w$7Z8#@N3+usNP0Q!w2hyVZp diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.crc rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-9e75273d-7113-40e4-a327-453f3451dc8c similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-9e75273d-7113-40e4-a327-453f3451dc8c diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc new file mode 100644 index 0000000000000000000000000000000000000000..e175e8da445a91dd1006759d9ddf63949420999e GIT binary patch literal 12 TcmYc;N@ieSU}CsG`I8R-6y*c! literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..5def68f7f61e58f681686d4086173f9fea428d51 GIT binary patch literal 16 XcmYc;N@ieSU}DhrId0s>z1|Z5A7KPF literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt new file mode 100644 index 000000000..1b764aef2 --- /dev/null +++ b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 13:13:26 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/_SUCCESS b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..92c2ee4f33acec3fa52c2ba069dcb372b9ce82f9 GIT binary patch literal 16 XcmYc;N@ieSU}8`;O8I}%$)56wXXzxnxfqkvtI71=&POY*yWcVaaM zhB(PjL&9yVfPWbY7X=78CR-QOz{cu^*^{F7!Q>IdDChaQ5R&S4#Y!QMRAWRB@SQY! zHS!=1Ag9|iW#i=_>x%L%Jl7SN6m?kvGW8w6EH&tGC~%`Bv7m=WF}=1;bG=R z1iu@Q3>^PiYuerok8P`Exv^7#xPUY)1? zGM2xjzA?`=4YW!#JN!8jifVf;N-*v#NbvIRJ;ATx;|-MuDC$OXq)x8eR0J}H(&Hw-a literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc new file mode 100644 index 0000000000000000000000000000000000000000..66c4951841b92bd36f385f7c37d788ca9c45f69f GIT binary patch literal 16 XcmYc;N@ieSU}E@^v-XF~EXQyFDE$SF literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 new file mode 100644 index 0000000000000000000000000000000000000000..31232639d8682902191606d0e4982ce0b412c2e0 GIT binary patch literal 774 zcmV+h1Nr>%0ssI+1^@skwJ-f(LGP`VM&1X$LPEc*?j9v#v$0u`qdr{F=Hs_nH_7FAE z>(x-B7R90~EiW0rg*+|iu#C(3RtnvjbU=-t+kYaimggn&9nE(+Xo=i>3^_*XzWLSn zrqI1JRcUM<5DH3f1$xOgF2;OCG-lh&wqYA>{q5X3O^|13WK9;VJ};TI7xR5~wyDh1 zi5Y2BZDDuCj?w(SE^HPDndeBIH|2kjBfGx|AM%V%Js1lRtwKSf?c6z^b8)VYTIoDP z?|r0gSWafH3!k(I$h{Sn|3C$jBq>Ovm!+Vegr*_|?sPW4ilWlD7QzmKK){A=00?k! zrp;#3;AC(L0w!Lvx>S!o@?HZqQPGLaGzGIpssu>KrO`=t=2*P?X?^XoeAqrE#K;@QKb-Mo-cm( zUUyzMHiZmSk7#sH+c_|#?$5vr{x5mjq^v4XwN>62g|;fR5g;@PAw5=GXbpOo7yXOg zVDRlKy1v}S@wRu63U76IbxJ{7mPR@6Dc@omER;A# z>TF2TDiMxjBNMjwBTCpS7FLb7Ozu*v0oenqly@9idCz!)V+8L!0oejI#YK~}Vx@v4 zSOLPsA~K~UaskED1Z;AdU-srd%pBM;>7;Gstj?`e%j^oRlV1aNGA!^i$3kglSzwJk z4O9Y=?kfB~ngI%iLg3T7$b6GIl@uYIM%z*1IuQ~Pb!u{bEF}n~zgRnF{dG);*->HR zwZu|^K&jBjH3?bf|AJGnraU@IA5#hxRe^N-Ow}aq!5r+1EYV>1HV%C3K1=?g^caAX3whOcipr@%Nm@3x$O&t)Gyp z4r<6AlUp{vJZ=-Bbi+CYauySn?@6oejZqsL1*Hem?WzOr^bIE{dJrB)t5ISNl)OG2#nT$4VTZTE=hDo80facyqsWsdHrFW-mLiZ{_lCro|*XM6K+ z&vbTj7LMvxch^%!TtMy6+0_`R!l>cpclWm%p7!ciC4E$HU1s$q+iN->ug25X@r%dF zvIT{lp3(!bK6+RU=Plz0t6mTLYP?upbqS$yYN9*6ElGlAPg7latcSa4*Y|~f1tXrM zoB>nP3(j8g0OlgYiFp|3tGkhPS)>r%Z=8OdP>ea|J*nLIswbrdLd)zNI2XPfu`y*bDRr1apa6e*T{zr)GZuT%h$S Hj0gY#wW3}2 literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc new file mode 100644 index 0000000000000000000000000000000000000000..edeb9708277545d1bc428b01c4304a1c8b86c5ca GIT binary patch literal 20 bcmYc;N@ieSU}E@Hx|{dO6$aN%(Exq`K`{nw literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..8ab2a9563aa7a4c899e9271bff4aab7ca00c19a0 GIT binary patch literal 1064 zcmV+@1lRi?iwFP!000000Ns~gZ`(Eyz`u*0HcW0EJ8>Pq*{c*vnG27bmYBhV0qInRJd@~KW z_(~F6tFU;@MuiI(WAFn`B^SFTk^5)|f>VqFomM`p?XjVA^tfT>Af%3attCm) zQsKFcBuybvNux}=EY>z|df0|sN=VFL$^t}!<<)AgWiD`ZH8KOADTx>F6F}FxMDR$c zr8b^>%e&>x_@h>G$8+O)>W$gwGIqSX0%_hwp|uureS2$c%-o%bLc%R$*Et*;Wsu3RMSWF58OKkL`h5 z%^qfpQ$Tt+-SzeSz8>W=3Xbs}M9;f=7_aK9HwQ^nyWf)(2f~Z7kyR3>k#iqr>-+PT zJxirm6Z<%eVd&lbzED8J$*(D7nWEU&6U8ikE!6)nA6CvY^yJ zQXNJwvPgSqg^=H%X=x06NRbM|p+_l-ZM!TIX|A|jU69Xh|9I+76LXZ}Ce48CMIx1>tXf*Pz($r5< z1EU2MEui=&Ctkor6B6x=cnJ{}A+`tKhop8KTFfSAz2T%nrzRb%BKSotgS!5?mbsYdKt(viv-lK%j7h@;7hZ=ITls z0P*uv-#sxoaPH!X(>mi-y0|j>fycz>gL8}z%K^FBTATy`%89l9kHhTVY=LzaK4e_j!>g9)Aq zncDy4q<9#VDzW0j9xsX>Mz3_AoMAX+4?CmbyR+WlGOIrRF@P~@OdWjW4hZ(I*x`Z( i2L|gRFw7jZY^Gc&Nv3ikQYOk%mi-MbD`&m95C8yQc>>7* literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad rename to v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc index 1c47b9a3c9eae1262a59c31f8c62067028259bc0..2796480e9024ef6664a5fc3ea3e7c508fb9fd7d5 100644 GIT binary patch literal 12 TcmYc;N@ieSU}9KhQ~wPB63_#d literal 12 TcmYc;N@ieSU}A9Nc8>-C526At diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc index db7a7824cd79229b8fa039920ffc63383e227bdd..5def68f7f61e58f681686d4086173f9fea428d51 100644 GIT binary patch literal 16 XcmYc;N@ieSU}DhrId0s>z1|Z5A7KPF literal 16 XcmYc;N@ieSU}E_5UO7Zq$#*IMB`gHA diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt index e46de4296..9b284affa 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.130-bea04d9c79b5 - Created at 2024/05/20 13:48:16 \ No newline at end of file + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 15:22:20 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc index b47637bf17f5447178624421d6349b704793fc93..92c2ee4f33acec3fa52c2ba069dcb372b9ce82f9 100644 GIT binary patch literal 16 XcmYc;N@ieSU}8`;O8I}%9A~C62Th=5ws2BL(C)TnpDK@$umy`4!$?%Df z@QFxT0m;b^%|Qdd`T2FDfL)Lk**-x_^1FL?Vl@YbILS{#!fmU7e;ElE1qeANTNl*8 z#_ERIlcM&)x%L%Jl7SN z6m?kvGW8w6EH&tGC~%`Bv7m=WF}=1;bG=R1iu@Q3>^PiYuerok8mQO zyaWsOUow7V`8l)C*nHXGy@C49Cv%_Kde+cUZeZu6=6^JuB^$>Hy@_|teh)hNVtiW{ zA1*!>6SX4jL6kfQDax8XxjIqM4#AE1;{vWOFF#(Lr~NXPzofn~&ovFSN;5nBIT4C# zdo4;Z?kh;}^6ovsui@hjl?N#5MslQ1#8^kVvMvcE?+-$9A~C5}Th=5ws2BL(N7b?|DK@qqo0Iq`GJJZE z@QFxT0m;dC%|Qdd{c*ifz%Iy&Y@eVd`TYZ)Sj~YsoaCpW;I37`zl;gI1rEYYwl1iF zjnxgaCq?a}$s>qS&hvF4B-QPTl|mk=9%H_fX0OH&!~x_KJ%3X+1_xPJlyBj=uE3-y zRz+~7rb4AfMJ;Tp7$Kud!6L-gs)BLMZgm80L@np2WGR;{A(OqDO}0QM`v;%w{|SL? zss%O^()p27&`q9TX9Q-F0gW*0$RtqADa=gFqJ)PTN(8?fkPI;YS!>$f50CQD4Udqt z40MZ)TjRij34hz>zSYoaMU4S%(GY2-3q+rqeQPXrx9_O2ptnjpC6dCfb4zseOBAh7 zs1KfnJd88@(Eq8AdeMf!iRI6+NHI8w<54GyKL%oWWD>yx{$cwl*p@dpdA}CP*x@R~ zOR!-7CF4hypELW6&6f?{8>pXrGWVISXAK?I26j$*PJikw**GrfO}uONd(g=T=RwuIdCKX;DxKp%a2#*Wj~GOFQu=n!=ZszX=UerNJ3F< zZ$t?OKZ3*s@7@#lG5mE)WAi diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 index ef03f366e9eaa008e74a458e1cc7b2c9471b4867..31232639d8682902191606d0e4982ce0b412c2e0 100644 GIT binary patch delta 667 zcmV;M0%ZM&28ITI@d5wX=K*Bx=aOaKo!cngx&EX^ zB-Hb- zWZ#eLyPO8|V4SOgNK`%{N@S*D5njwic(wYj<_TM9WVMTbR!nj;(QL*c*v$Ia+J{^J zGUO%WV4I8iT&meo{hVLzPDG^RX&oBUlTy;&KWbXoTQ_c<>dSg-A6|7NNfUcf(N;F+ zo2>Q_HPP$UP@@*bqAM*g8NY=*E$6U|%lTFc-I;Viji1|pBCeL_CG#E4cR6T@+1-43QBJUddW5}#(YIIX4}iQVH<7z?c6#|kY{LQO%|*^FPXI$ z^L=)S!o@?HZqQPGLaGzGIpssu>KrO`=t=2*P?X?^XoeAqrE#K;@ zQKb+rWS%d6_Fi{hH#UU~RF7zMPun>#r0&naA8w2t0!ca;2x;hwdqW6y-Jhc$h=-Fb z0(oP%g6Sg2iV5TAjX0vl_fR_0XWX)8)JzB21cjqyW}FW^mfb=p@U0M~+KMnHpPqy^ z78#L5sg){;FtH=w^Od5b!;bJ@$dMO>!tTt-jtVpi6%7CY0000004TLD{U87V005h4 BJof+q delta 651 zcmV;60(AX`28afK^8x?>LIwZ;D77#BU_u2g7XbF`g(a{a}~PyVSPRo$F54 z!+j&c?SLNOivfPEr64YL-roX!0C)g?0MsMRu0Y;m9U2y_73Drb!cfP105- z<=fEq5w)4utD{CO3Y{x0Z!vxgSyJz?jLZ4f3Ei1^M2(-@eE@;jZQ|VAYkGxR+sA1N0uyx<5udb1sxTWNE6a{G$73sFgaBx+^Tvq zcFB=ujm|GtR?Y6#dRJ$3vn}d;_@cpH{BVC(T2R;eDL^g7*R1rQ{`OJ_GiAbhPwHD; zRjL#)g3R;9&))0K>&Bjtf$GwX?q@p(hSU8S_`{9SLoiJT10oGwaqkGBuKRN{1o6-U zlPUsvO}2vRBFKsf`7I?^ZHvS!pw2iXLLqhw~B4?LFLLMQI65T)9RFeaa# lgf7|snHU`zof+hTLP9VVLKA`Vj0_B@N&vXe B4MzX~ literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..051d3e03d56a7a32825d7404a65aab17a00602f9 GIT binary patch literal 185 zcmV;q07m~GiwFP!0000009B5`3c@fDME_+^3OQ6;O3h6KJt!zDUc^IMw@ol4l5D|H z`tNSNd07T#=Isp78jCj!(Rc^4EVH5#PzU82ZCtli4fz1F$X2BQji%ECaw*`2>6+dM z5-t?_h6d}KKFY&%XS?%8=ha*&Ytdqq>kw4BbC8LT5d62g|;fR5g;@PAw5=GXbpOo7yXOgVDRlKy1v}S@wRu63U76IbxJ{7mPR@6Dc@omER;A#>VIrV(kc;-VqtCV*fS$WTRf@1{lJOSAPHN{1fv|^=#Bv=8$#3C}KBys`8(*$gC znP2wiKg=B1G3lgjHgj}C*|uhy@Xl-vFAVLrYKCX{0}=yysh_>Xb$ z$h6tlJ=*DJDIFzriYD#}osJ+<(~nFQa<=jJoW=`&lRlYk_lV~zNGUjRZEomgj_#)~--pwRH_=8XOnv@md-HG4barwUj_Ou-*HcDZ zK<&`k)qfbM!l>cpclWm%p7!ciC4E$HU1s$q+iN->ug25X@r%dFvIT{lp3(!bK6+RU z=Plz0t6mTLYP?upbqS$yYN9*6ElGlAPg7latcSa4*Y|~f1tXrMoB>nP3(j8g0OlgY ziFp|3tGkhPS)>rH;DU1jJ0CS>ShyVZp delta 714 zcmV;*0yX{B1=R(R7k_G@Tid)b3T;(rBS2^pLY7Gm@aEW=ZQ3G~|2{j36DMh%nlteC|c$$Ds zF7wOY{D+wXJ0zX7jhxlFm1>z?p>^_Wz)pq*UglUR%`6M7k*9%5Aktlh-v={5!9WOn zToswGGpCXwgn!d$J4#$9LL#D$O|Fln1fld7Ysaj=jtMb6C~UkISSk=G75ca)A82= zI57|7Y literal 20 bcmYc;N@ieSU}AXxj(f*fTkAsOl7CGAMLY-K diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz index 19968eb85a27deaf4cfe83d5fbf5bc1e26024e44..d2c7ccb1c58d4fa54ca801b2151bd026682c12bc 100644 GIT binary patch literal 1062 zcmV+>1lju^iwFP!000000Ns~uZ`(Eyfd7j=ZJ6BEFUb;rvt7F}<|Kw41jPzNpv9BT zR-!bDF5(6H?>kBsMM_H36nz2e4~BF+-ka_oQI8HM5jaC<&M7XJ0)YBh@+dtDU&SbD4BEF?`7`EHws2VkmLJOb>sSkQrp_{YBha>qFES|e6t@i z{)q@%tFSny!@`A&F$5v=CF8p#k^6WDocS09I<0(G+Y>|Q=rP00flD3t#@)psT}vX; zQsJqMBuzo6q*10_7Hbcq96uj2-W;K$>@PWUWO^-`*G-Gk3@1h)@q6;z$+R3c&Do zQ&}soOc!OW-tEeJySN>X7S-Mvu9~)tV6=Oz$j|W~C3e-XmlNZmB!Wdmf=8I8l=`s% zLoH3PZyQ<^LPM(kiMcP9vgiHoC3G@Vj+W1qrY0{^bV1*uQq2x{ra%|!>U}LbRh8l}B7HDkFSeR+0 zOf}#}KkMi*dE&I{u!8)KD2COFp7aOKGcw zYxW;1oS&Wb&zsx+NhQsx^Gp~4Z$U%mP>S69*y zNRXfUzUU}uz7YZ@Qi%`cp@8@Kz{L}%b;hf7ab@@&kCB-h2SEx_M?yG{iw3fNWZ9ofeQ!ScbqyceIISFsDZ@#Xfa(@DNdkf_q>jf+XulBH;jaW#& zd(#ze1Rzpqdr&XzLtO!@_nMj4^R8$8@=k3_<%vXo#pcDLedRGRuR&EE+e+pr;|URR z#?-erevYe~&g)K>c6;Z&ey=m=4z`^^5ReWbgMQ}{=vnXL@+{Z}z5G-C!;0`7jPXoJ z)&55(#lxUfi4`CAcv18)diD3Bi{X?#>p#T5? literal 1063 zcmV+?1lao@iwFP!000000Ns~gZ`(Eyz`u*0Hcajs+i?=V*{faj!6n@DGt7T%FPWe zksou(c1p%L5l1(PQzlu=Q8MSU*Uj9MZxoD#AjglT>c;g6rM9)@^=kSKMYAv_`DQ<4 z{0kAbR$*~YhlL9lV+ca#OU8FgBKPqQIP)z|de}xxN)To+WdUMN^J+EMG8Y)S8k&JmmBdT-0?@TC5j+y# zQX9>@<=ygT^g%1RqnU9%@kaDh89UxxgEa5r$Xbh-zP&XzX6}y15uqMD#*r$t6@cOG zwz5`UnJ&s$y}On7esMP*EvmgUTs3VO!D#nbk^jekl-O0jSx$_Hk_Z+N2_9jVQtHP7 z47D`DzHMkx2qp1s-BJ}>CWXHa!0}plVCKp=iBwmZRhU#sw$;O^Le&A8OSe1?5_{lQ z(}(He6p$WHcRioo*P~p<;W60*?|Rn{qg8$N<{*h`_j{7!KzIo@vP$AKa_+-)ecx}{ zvs8LDae%WJMc&OH3k5Ws{F*|RDT;kLQOx4kLjC{pel^+mE`QZ{D&`Lr(bfq~7L*!D zs>A3-7HJQy5HcCmB+xSeYU5Agrb(Y#f)#qKg_1ih$gzpjfQ`YL8fq-kSfH^vV_~M1 zGSz?^{jA5k>g(r+yEfa_Xt$H)Kk=z(Vt}NLBNjqh2+{(NZ?nTjM@0<{jYhszn)*p< zV6>p31r*=p#0!{cLZY1!FCoGr#P$#bAZo{<#dLhuy&P9duH)}nNDY-Svg9+`w3N0= zxMp8bVQ_xl8#K55nM#^(edRu^E|93cGgCiIf~f;wEr)q2%g=)XLWLugzj^a}uCAmZ zkRU(x{iCCx`9=tsNF_d$#{%AizKbVL>x@_F;>z#`9wRe14uTY^8=UvbG^_|+5$}@@ljLnNh`_5xxUW2MSww26L#uFms zjHz#L{2W&|owuC3{{s1lr%|F#Ytq9-27|(=M z?SFDoJPb;eSn*+x7ex=FSAQ?gFr2c7o#ExhS$A-iRUiKtzz8)a4nA@RIQ>`bFz$y3 h3hN>;%pA0As$3{Zrg9-t3gs!w{swU;>2|mf0092>{x|>t diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc new file mode 100644 index 0000000000000000000000000000000000000000..dd555f5530818b965a42aefee5e49be3d67965d2 GIT binary patch literal 12 TcmYc;N@ieSU}8AWR@njo5<~+H literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 new file mode 100644 index 0000000000000000000000000000000000000000..446fb54911ffb22915fe6eeaa755c8b05942e6ed GIT binary patch literal 106 zcmWG#U|{e7VvVi(e-%6&nHU&%uq0;`89tq6%Eai%=nRx{sArhweA-??it)Oo@bgcW z#>NZ{=hzqyFtb~JyTK>V5PtHNMSJ973kHV%Ltkx}<&P>D*fKEi0(F7_*kA@m1_l5U CmK@0d literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc index 1b96b5393db01d08490ce1d23fd7ed5e96f1950b..394adb99dcf6ecf289b9389b9946e8e3357f051a 100644 GIT binary patch literal 12 TcmYc;N@ieSU}89au(=Qb6jlSb literal 12 TcmYc;N@ieSU}D&F?e8W46`llq diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc index 82e0d403587f5270439f388cc565f7e12900c5f4..6b72fb1f0640451828aa1213d4fd3e8a4b4ceb45 100644 GIT binary patch literal 16 XcmYc;N@ieSU}89RXRlpX-sMyPDd7eT literal 16 XcmYc;N@ieSU}9MRJ9@2H-S-dxC|Cu| diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt index e38d73d71..f5927612a 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.130-bea04d9c79b5 - Created at 2024/05/20 15:38:26 \ No newline at end of file + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 13:18:45 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc index f3ed5e11b94e6389a4fb59844c7068f80cdbd9ea..3181e59913472c64d7152b9911ae79796ba0f7c0 100644 GIT binary patch literal 16 XcmYc;N@ieSU}CsEY4f?&Zil=8D#ivT literal 16 XcmYc;N@ieSU}AWiKjZt7wp~U5D+mTW diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 index 259a7345f1d74a677c6926ae43e247f1edf887ee..92eda86fb7b54a469acad5cd89c0dac9695c9a5c 100644 GIT binary patch delta 675 zcmV;U0$lx`2A&3g2?GECaRvYYD77#BU~vWA7XU`CiYBmJZ3AQ^W}&uRQ%OX!(w^Rp zJU3Q1aS77`BKAJsb2#fUI*%h8fB<>`fdC;;%ZIC=iMG}8SJUWRv0ANZKdZT-@r^!0 zL_S=v#S8LR`-WU$_BxHDaWY2&F&ULA(Tmk&%-WMNa}{QP%`>{v33C~>+Qr>Wvm2LU zH|(WzAIJVLkH4CyZL;QbtL9fVe4e#Gkq8uW8l5Xt65`;)irUy)S8k>1%DQtOw?LV4 zEB5LYU$q%EZEbTx3TrP_6wP2QxoSx;dXn;2W4Mtg=A5RbZOG)21!}{JdzIU!Fl`cdgA?dYF{;I9{nrO?qzucy68po}}om(j? z@+_S&3X4|BUk%%=`96Ew)D~@VZ|R_}JA8EX;NUiPTkIFj@ax7#X>fU#^m$W$7+JDA zobe-1*)(*a@ZcyCBHYfM6FwE^`l`L0X9>Q4I1KE6xy)7J5eE^u*P_=$5&;H-Nx+;| zc7l!&nu-YA>1>A9M5T{yjG%)60D%AxhuUnY4oAZw0Fd~r)vfyUl4on-7>(wWp`kD| z9nWUdJe!ZF^U-ilHYP}!Q3jL|WiS93uPTjeS#ZieJrk|+xy4Sa+23CODwU6RMWr91 zEZB@gKkn0t5!Ah&22@*dMk{7ee|xdVLMOs{Q_QzM>r|@)m*li&gsDW*R1cfN9Tnn=XqgbPI@E@qdJxs~zcX$=4X0000004TLD J{U87V005w9L|Om< delta 675 zcmV;U0$lx`2A&3g2?GECZw3GWD77#BU~dK87XT)%iYA~Q=KpEzBOLakNe5OhQmZBP$ehH5s$^WXzm@g_-w^Zd9r+qgTV)8)DXR-J!&M1j(n-fyie`HZKgLdRBE~zwS`%dG#-&tmL*j^KTN5QkA8%w(1(aS*YcFN$4gLNHBJ z!c=PE2`WNx8Y1|o(;HqBmfp%SehvZv1Ohr6s;Vf?MzbLRkhoa0YS=bT}T(CS_y7BqPdvGNH@_Fui4I8raLu^|{cA!0)s^+jo_$bpUfa>)Y@A?=4HGal!Hu zmG4r24ISw}EchE)%Fn@wHAyLh5{x3>B-&(2 zljL?L_`gs7kS*IS>}3yo3A*Rrb9HpA8=bs8p%A}-RD?4xxVW56zxdt}5!!*?`PuYr zGMjz+j4#gjNwS+HhT@{BXjwv% z6lsVNv4|3sU>DF_q+rDre&fx5s2o_9j%1hXqSMxdjk-D->FdFc!WwTh)qWrm5zAc=S)GZvN?N#DN2*E_>eXfs`0;$ax8}{tJPTArv)2OU^ zL`s8tgF4I88PhN!4WE zA+>HA$Zk_*w!X@CDbdEf(E&M2h^}U4@Cka?ZczrYIF`=04!AclT*lEubQgZNX+KWnQY5Ol8bV3q1U1X8&bb2 kAlL;517%S2Sx*Q6091opZ~y=R literal 703 zcmV;w0zmyAiwFP!000000M%7pPunmQ{V#diqzpm3jghykw18G1Y7v@*D$8|lTW^k? z*$yMB^55rtIB^n?_Oge)RKDllbA5c`TOYk0Wq_VRDk``ad_JE}zWCk&Q`Ujr>B;2e zYKvX#!SU;Wys=hsuFx$w+p&E;?;Z*r=uzx z2{iyQuEMXZyg)%HCC;B}<(tAeN!`LR+Fm7Lgiu0t-sk#QDUjNHv0=~d>y#~SGmXot z2dp$mluEN*pI{i~B(0l2ArGFIAer(pW57Udd?|k0epMz|mcQ?=RwtR`!v%`&qucPiP5a@Z zTh34z|7cX2;2N|Oh{Vk#)JAt5>5Y5tZgJN(X%zl>H0Ez_5(*y7cx_%5-gXAk?-{~z1^Ntabq=Ez2$_tKO{e)FZj+63fTRmRRY{@zW zQoWAtlW6M;gNomO&#Q_JDaB^ms(Gu%)ol>j)U3)3o7wuOv$IXvNQ$(aT^YQFs0q*?IZ^LcIro literal 20 bcmYc;N@ieSU}A7zywj~%=f}-2vh(x-Kgb7i diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz index b83d7239aa0df3815dfebbf872bb2ff4c8c80fb9..c22d07b9f585fe64ceff1b498eac9a7b714748b1 100644 GIT binary patch literal 1027 zcmV+e1pNCSiwFP!000000Nqz@Z<|OE{x5vGQ*qBWP15kqPH|(&Nh~{xPOU7oW(Rv) zSU7fF#TW74Zx#&f0%@B{?Md|qi=CZ!=6N(cS)9aRxz?Ohcm%q7c(+a=7+bdGe}<#w zezc+_I0!tGD1r!twZW?(p%5rV%YCXm+#ge*SM>#IffC`(D{o2!tYvlx;%+)wG`2Im)qQEn)Zwus|4;llHM@SZywYAunz5Kj@Ch@NM37{lsfiTl zwxdNMlEgQ4D^(mB0skC;<8wWMxh<141`|q6z5V^ry3kBOZqqFnVQKIl8Oq0>f}G@jv%KFTP=bP1zXV8a&1MMV`o zEMbyFhaWV0Xt9Gq4hA=v)L<|}FEKQj)JCvIWHXA-{&!Ll_@g7XB=}wX{&^5_Ae@&pPf19U9%5D(!%tGcGwdjaoaLiKP-Z&C974#ypk0= zR|b(H5tP4v{a2x`q!Ey?*sQ*5-O{NyWSNlI%Eyu=&)?X%)d@WGEH-%K{;X9kx05gw zAnP;;=Slg_&6s(<>ErRH4y_7jN|?oMGT`MDEU#>fDfhW2Ol!Xd27HwLd;cDv|y$uA$%zEYkka@YY?RAn)wFd!9$*XGf3-9M72x22a3NkMGwB&@;z%{LgT7 zypK*8(}Tn_2}1}$Ivc#AF$1a<9q+mH@Ni6kT~`;V1zLnRZ>AF>VXe{wRnNIQ&y>(u zD8BPhiwlw--6WP=@mQc_F4XWWb5EhY={QZv@#9Hx<5I$yDlpzt_4#l>z()IQ#U;1n~c16t;Zp>y=P;pQMzmGj2k#UWcOlFCxy zB|{=5q`F{~)ivgK8&@8-5myRQGnumdSg^cP?Yqncj-jmPOtr*M_9-xmAT{T29$Y>`+i~GrVQQOY&*7Rj0Q-#Nl{6F!d#P0g-^4fSQNy;Olz$e^Nk~B`iP?HJH zZA*(msEBXtmZ~^168=7bz~^cLGgrk)1U8iFdb|5!bfKPr%w=04gTw~hYWg@`oDzCd zxmwR>56vu>ad=GjAkO@o$MLFpdwWnskNX3W2navHLDof-kjpgZd^vHB#WoB5|J#Sv z^?rEyd&MRNWKa=-f8r90Mp`V=Eh^$BPWjr?EXrZy24E~dtng9&!cR}Am zeFx|}@cwnU+i0(WUg5k|Z(U~fCM%40pt}RxUxoHJkZs_4dk6wZ8!N|RIvJc@P3kMx zp?e-OLuG;t@EOP4Qrc?an*W0e7o*YeqHXqRddX_kFfV2K z&Xq!_Nd)C@-~N%SYiS51$TzF+TDMH<4R|UQw(_Z9$%}Irw>pW3j>QIVyq~qIWo{e< z5>%B25h5<$xg9ghHeEbk*I`s)(W}n>wWMEV)yk=n%`Y;2h%G_FI~{ z@^SS89)jO^+3hJTBtLiJiWn&%)3EN0UO9wS`PSgI%Wf9kEcx|g+EvO6h5U+>i%t8% zwJz^L-5uLY=4j^$N!5(&FI)WB)rVfcN9ffkpxb_L0GG6PJ{ktSEnE$H?7BQ9uoV6ZO2V|`G`rrMd7WIAJ_RH}Vu*}v%0)A;-k005XO38er4 diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2 similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2 diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc index e08d4d12b482e6c65d7edabd267cd91cdc06caf5..b768134397bc0db94d53b99b3dc00b30a7d29a12 100644 GIT binary patch literal 12 TcmYc;N@ieSU}BILc_aw{5ETMx literal 12 TcmYc;N@ieSU}88p-EuDg6L+g(`8VeX`&_y zR#ppWrYVm+8rxc4r>x)E)9_4>6_Ll%+499!vN3!~emany(p~<(uH!tLbMJCiow4`x zb)4t%jZE1-D`wR(Ns1pWNj9xQnxskWy^V{xYy$7zJKDcqC8SYS+VRs}wpCsz!w%=S zjC+r5;|3$;K9u`xFC~QKq_S!-cgfAcOK#7`&6~WkBwerl@k5zpac2ASDkUvuDI5Ve!g>Pn@L0NIeKEY_TbAzP zK=XZ7DEsPrBb5;Ie%bW$>VdHU@hB7|9#f_?LYUhYT0C#lD*gp|9}Ut_hG~-*A=??N z^J9uZ6h#VA%I0LlvH93U0nz6}xeowENg0#mp(e^e;4@9QibFUPQ)hxx&~ctEV7md) zwQQA)VUb=Qf?2f}ycbL{?+bZ8wDp43Eo^-muLIutC4aS=f#vyCz+$VnYBi31cGjRw zr3rk!YL_jED)g8ZI(*6$VvjzFz6V$0gFQh3Xv$|b31UuE=2fT;0Y`to*7+R1{kV=&Q190CCcte;(-EHm0p;QJ{!&&W`EOG1DvbyW=uop s&l<*ZmEiJgRqHe(JM`3PDf^KSY+-Dw4FCWD00000D77#BAOHaX0DR3%!vFvP literal 712 zcmV;(0yq7$0ssJk1pojjwJ-f(fdnlT0EXy$C7|xO0b~u66xrla%rYykH`!`0yj>oX z-H_OV7|f?~Y)UKbAQE!`Z2)%wpOlu8PkilBRK?VpsEX&(WTGmpeECh~wUW^QIG8Ar5vmds_S3sK2(RKkhLIH@L?* z-t;`ty5djCZ6)8I*IqSA)FesQYrp)^CSAPQ#=K66lN+cbv7$gcK%VYdUkq>gmZ!To z(0pSx&c^y)NoUNwe>TOueqby>Knev3$doCS5$3jy7i+hvCI5!NiiwM$2%y4`R^h&Z+OZ&bW`%Gm6lMd2m+VuksXDp94+f}O`k+kI*)-jQ?fIs4liXma*r*PRuv5Z00000001bpFa00@0RR9&Yg9r2 diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz index 95672cd4519132e00409dd0cf4fd371121d4b1cd..a43d1f48c090f40c42a5311e08a0312eeda89c40 100644 GIT binary patch literal 585 zcmV-P0=E4hiwFP!000000L@j~irX*{{TDyApf0&A*~M>ZN=oR3WfS@mf>Gp2EGkPz zl6Thx|9d6b@-5ENzV*d;&YT$?jgRC@aH0&N7m$hy9)ydB`Row|38t(CgUQ|OZaSZT zx+jatDUOfRhbazFAmIcOH~|$&U{Qv3V#qCOC>kG4dtGE;GCLbw&P*IJ3%6S4L|<%? z^1+#l6vt6YwXhK7XOsY=Z+#@yFIf7>W>TDWLYiinXb%uZxuN$u#7C}9Sr1}@U=|U< zm6n)t-A$ILp2UBz6L_B95^14wAhX6Ap;g?G+5oNC3K~eW;E!BmZKkxtN`s6t330fT z5*PbQ-l4qOq&I*#1E+-QqF;cpQXp;5VWXR4aO%nu{e~jk8&<9mW(*jp1Crv^Q(Pwe zZl52%KChDb<44aMNEnk`F*k9z;d(wpM#hOzq7DD4vl-g$b+yxow&gk3)q8nBXTjpF z#!}VU8%51z4CI$_nFCV!(TFvl;A$&^$5cBX560PdfFeRdxMl)dNkcodg2K;2Fu;7o zAcGboZrN_uyQoJ)BL-?VJm%cxDoUW999phj;Cn59)_98FqJ*`$sS#)OoRS;f+>h5%}9H;h9eg5@j~Py%r5v7xh^&Rpv|)&5CbOKpcc- XNMH}QuXq0wS6co7SPOD?$OZra42=|m literal 586 zcmV-Q0=4}giwFP!000000L@j)Zrd;r{1=^CfDIfw%|kcWg$tt(1hJ8eVF(npwun-s zKvGEzL;qfq67{kiw6|Ua?d^X488Vq?oc63?^68tLxe1 zQw(@=lRS{Shxl`V0ttJNzzL{G0*i92Q$uc1L(w=k?e#GSliAqdbYLRJEL>|@5Ph;m z$_EE7GMq#i)xtuQA5aR6zV(qzzhLPjo5^tA32ByNqCG$u<%Zttki=Z=^B%+m!7L(z zD=jhQx|=LfJ&FHbC-5S>B+^3VKxU0KLaVqVwE%vd4jWw@gHun(bPz0A zYb;fry-?Il#z1}fN*das6%>9Ef&u0u z1{t&%am#wO+(bPZ8Zl6_;W6jV7f}lJ8dSXg zeVrqlq8uBOIh#3`b{jnU)WT(k%{+g5J=xhtQsmaK&}-3vby1&{P-V^((d_sgj*tYQ Y84}pT?d#qD#FdtR0EcpXCddW=0HNIwsQ>@~ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc index a927fa9da88f47ec8a2722411997135806834dff..9a55fd3077cae703357d718c3397f86d70fbbe58 100644 GIT binary patch literal 16 YcmYc;N@ieSU}8AAF=58?)jv-I04f~@!vFvP literal 16 XcmYc;N@ieSU}DI5yYt7nNggo(D^CVV diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz index 213bdb7aaaaac35564612c520537f123f0367231..dc89f5aa6a24e278ffe1b51005923fff57915a4f 100644 GIT binary patch delta 867 zcmV-p1DyQr2JHrr7k_bI$r~`3klJc0Xb-DUWMT)r8Yi+%1w;AoJ5C^p6L&4u`qDn2 z`rN?EuaV}uduEI}1Sm;~C; z@s8&Ri$hHCI9Grb(IU`%nv9Snb0v0xI;P>gvLuMOqH7PVD1S%d;Kq@pibfnHGp;)A z#66|n04WF9l@Wg%S15i%Mm%GW=T+C*!@X#iP@poV)~ zvN`y)6lTX&Uf0rHM-0yu67p>Dlp~Uo2n`Bk1&&m=bmeXZ2ow?JV$KjFj?-?`b&0## zDu8b^QwEVB-hX7uB9yjHw`PlC&yG$vEs7XHJhi0T7zOH|@$ibvFt&7jKASFl)7rh+ zDWq#(APMOFSP-mxlJDz=FuJ$I8jk%9$@}aMr>EsP?(jC(ZPDJ}up*^)jkbnsy37Hr zIrIbn$E5x4^GSj9h{4owsRW{2S<+{%D|`|?L64^iGJj_0TK|6demCB9u0H0ok&y!o zf#Q|PsLsT*Baz3l&O&(<>Vxpa9N5MnOH&_$3Y2033`I_Vqk%Qz*Ew%tygfvj>vfht zL-IPsO`aJnD{;(VScTOFy{wFWZH|J(_8fKsTiccrceM*cA}Tk;^<>m)UyTZt>jWFh zaLHr@iGLnocMYSJ7S8D(FzEMsoqpZzmsB`)z0eN;5VWm&Ci)LRP<FwD-f^okUO z>DdAK=g(hKb1emSh3Q53qT8{22!vEf1?<^pX9?^IT*49-mOBbG-do+Qi5rEXJfDV8 z2I4%*Xl~|AlBMmSDsYrg7^jM^{?bz|uc}s4rGJ`t!ag`oIAXUzn%A)@%_j)A^CzXN zA5upA%4XMDm`UE2aaj&Q;4s1ViY_E!4&NYN9d;FT74qdYwLy6T$a`ciH0_n&Eol#m z?wC;$M+c9QP*bWOBJeiRi$=5237XyZ*G|9DB>kY#9kg1F!Ro5nSPeS8J_-joZ1>W4 z%Utg}#Mfj5pDtMY9~}wbEJ_2*UODhj#$d=AY{U(eb7rvPwUXiy1i6Qx96PYv4&nHP tA-ZOWF(?;l2+9Mt!(^V719(X;o9O6Tl4t}-DWQ`j$zPa{F|NA}007w!r56AI delta 867 zcmV-p1DyQr2JHrr7k`yoNAm^@CZx8S3fjXe6q(o|UX2skrh=jT_Z=sY#EH9>YJF)R zP<`&-@ptU_;Bd4dj_1rcCOd-fetw%LBt%TWfRn)a~*D=F$g@imAJmrX_BtnA%S&1XlEnT@?0RlxtwU`scnB%M)ODFve;@Q#Zro}NMh-a2`lb}HTGag=X877vF&u7zxZ(6%I zTZMG(3nT%Zp9q3=Pl|nw2%|eotl`*?NYQ6^I6W>;afi2sZi9Ay#LAS~HQpGm*|Gqz z=FkuPACq48o=*y-M+~NhOC=EH%91{BUFnnR33@zEkbf~d*ZTLv_q%b_9egZiBPRzK z0>x{SQJ;xtN1}*jorUTsGzZ~{Ik1gEo~AhjH7Ml-7>a`aMgtqfZ*tzkczcK{*PASV zhU86(n>;gER^gbzuo|lkdRZC$+8iZ`?K$iOwze%J?rIx`MAUAG>&d9y8H`Gm>jV*H zxMDJbM1K#kyN1z93+MC?7+m)I-OHxiFR5_mdZr%$AZSzfO!Xgtp!!a%VVIp{*%c`W z)3XEe&!4|!=2{Bu3bTvwMYm)15D2M|3fQyD&l1=bxP&DvEO!)WytlepQ#TGnc{&ZD z48(by)7;FNrc2vFRpO|iFisU+{kf-FQB|X+T7Na~gne+FaKvtbG_PY*mQN6F=TAmg zKctNKwasp_Fqgb76wkoU-3Xxb~kTe2RM z-7%x2jt-t6p{7(nMBr_r7p;p{FC@LeimY1gD-yPPy*_RYI$^uj?etMM_!?ZSt}e27 z%Utg}#Mfj5pDtMY9~=qaEJ_2*Upeqk&S1zIMB)a@IWgGr+G+U+g4{z;jvd%-hj9GD t5Ir-*7?g`F1mywSVlvOm0lcJ;O>}fENi_nbl+a1i^e-*n_jJ1r0018spGN=y diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-3c042736-0e6c-4911-9b80-b9356af9df25.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-3c042736-0e6c-4911-9b80-b9356af9df25.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-3c042736-0e6c-4911-9b80-b9356af9df25 similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-3c042736-0e6c-4911-9b80-b9356af9df25