From 4e05a0d4380497222a54e891e595811204f96070 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 10:45:09 -0400 Subject: [PATCH 1/3] Remove concept of private crdqs --- .../model/cached_reference_dataset_query.py | 13 ++----- v03_pipeline/lib/paths.py | 36 ++++++++----------- v03_pipeline/lib/paths_test.py | 4 +-- .../updated_cached_reference_dataset_query.py | 4 +-- ...ted_cached_reference_dataset_query_test.py | 4 +-- ...annotations_table_with_new_samples_test.py | 4 +-- .../lib/tasks/write_imported_callset.py | 6 ++-- .../tasks/write_relatedness_check_table.py | 4 +-- .../write_relatedness_check_table_test.py | 4 +-- 9 files changed, 32 insertions(+), 47 deletions(-) diff --git a/v03_pipeline/lib/model/cached_reference_dataset_query.py b/v03_pipeline/lib/model/cached_reference_dataset_query.py index 42b9f6abd..e26fa9fbd 100644 --- a/v03_pipeline/lib/model/cached_reference_dataset_query.py +++ b/v03_pipeline/lib/model/cached_reference_dataset_query.py @@ -4,7 +4,7 @@ import hail as hl from v03_pipeline.lib.model.dataset_type import DatasetType -from v03_pipeline.lib.model.definitions import AccessControl, ReferenceGenome +from v03_pipeline.lib.model.definitions import ReferenceGenome from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.reference_data.queries import ( clinvar_path_variants, @@ -20,10 +20,6 @@ class CachedReferenceDatasetQuery(Enum): GNOMAD_QC = 'gnomad_qc' HIGH_AF_VARIANTS = 'high_af_variants' - @property - def access_control(self) -> AccessControl: - return AccessControl.PUBLIC - def dataset(self, dataset_type: DatasetType) -> str | None: return { CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: 'clinvar_mito' @@ -56,15 +52,10 @@ def for_reference_genome_dataset_type( reference_genome: ReferenceGenome, dataset_type: DatasetType, ) -> list['CachedReferenceDatasetQuery']: - crdqs = { + return { (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL): list(cls), (ReferenceGenome.GRCh38, DatasetType.MITO): [ CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ], (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL): list(cls), }.get((reference_genome, dataset_type), []) - if not Env.ACCESS_PRIVATE_REFERENCE_DATASETS: - return [ - crdq for crdq in crdqs if crdq.access_control == AccessControl.PUBLIC - ] - return crdqs diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 93669d43a..67f7234a5 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -40,6 +40,21 @@ def _v03_reference_data_prefix( reference_genome.value, ) +def cached_reference_dataset_query_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + cached_reference_dataset_query: CachedReferenceDatasetQuery, +) -> str: + return os.path.join( + _v03_reference_data_prefix( + cached_reference_dataset_query.access_control, + reference_genome, + ), + dataset_type.value, + 'cached_reference_dataset_queries', + f'{cached_reference_dataset_query.value}.ht', + ) + def family_table_path( reference_genome: ReferenceGenome, @@ -182,27 +197,6 @@ def sex_check_table_path( ) -def valid_cached_reference_dataset_query_path( - reference_genome: ReferenceGenome, - dataset_type: DatasetType, - cached_reference_dataset_query: CachedReferenceDatasetQuery, -) -> str | None: - if ( - not Env.ACCESS_PRIVATE_REFERENCE_DATASETS - and cached_reference_dataset_query.access_control == AccessControl.PRIVATE - ): - return None - return os.path.join( - _v03_reference_data_prefix( - cached_reference_dataset_query.access_control, - reference_genome, - ), - dataset_type.value, - 'cached_reference_dataset_queries', - f'{cached_reference_dataset_query.value}.ht', - ) - - def valid_reference_dataset_collection_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index fabb920e1..f31b8f8ef 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -17,7 +17,7 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, variant_annotations_table_path, ) @@ -26,7 +26,7 @@ class TestPaths(unittest.TestCase): def test_cached_reference_dataset_query_path(self) -> None: self.assertEqual( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py index 9177aa21f..92fc7718b 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py @@ -8,7 +8,7 @@ ReferenceDatasetCollection, ) from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, ) from v03_pipeline.lib.reference_data.compare_globals import ( @@ -56,7 +56,7 @@ def complete(self) -> bool: def output(self) -> luigi.Target: return GCSorLocalTarget( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, self.crdq, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py index 6541f8672..8dd9558c8 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py @@ -14,7 +14,7 @@ SampleType, ) from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, ) from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -167,7 +167,7 @@ def test_clinvar( # clinvar has version '2022-01-01' shutil.copytree( CLINVAR_CRDQ_PATH, - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 21fe5f532..80a8fe2b4 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -24,7 +24,7 @@ SampleType, ) from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, ) from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -341,7 +341,7 @@ def test_multiple_update_vat( ), ) coding_and_noncoding_variants_ht.write( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 345af90e2..04af5bfc9 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -20,7 +20,7 @@ from v03_pipeline.lib.paths import ( imported_callset_path, sex_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask @@ -86,7 +86,7 @@ def requires(self) -> list[luigi.Task]: ) if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, @@ -169,7 +169,7 @@ def create_table(self) -> hl.MatrixTable: validate_no_duplicate_variants(mt) validate_expected_contig_frequency(mt, self.reference_genome) coding_and_noncoding_ht = hl.read_table( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index be7b92e6e..a53f69430 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -5,7 +5,7 @@ from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env from v03_pipeline.lib.paths import ( relatedness_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask @@ -48,7 +48,7 @@ def requires(self) -> luigi.Task: ) if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, CachedReferenceDatasetQuery.GNOMAD_QC, diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py index dd33bb5e2..f3302b399 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py @@ -14,7 +14,7 @@ from v03_pipeline.lib.paths import ( imported_callset_path, relatedness_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.write_relatedness_check_table import ( WriteRelatednessCheckTableTask, @@ -45,7 +45,7 @@ class WriteRelatednessCheckTableTaskTest(MockedDatarootTestCase): def setUp(self) -> None: super().setUp() - self.gnomad_qc_path = valid_cached_reference_dataset_query_path( + self.gnomad_qc_path = cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.GNOMAD_QC, From 65da04f07ecaeef3113d029444a7fa78772d59b4 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 10:48:37 -0400 Subject: [PATCH 2/3] lint --- v03_pipeline/lib/model/cached_reference_dataset_query.py | 1 - v03_pipeline/lib/paths.py | 1 + v03_pipeline/lib/paths_test.py | 2 +- v03_pipeline/lib/tasks/write_imported_callset.py | 2 +- v03_pipeline/lib/tasks/write_relatedness_check_table.py | 2 +- v03_pipeline/lib/tasks/write_relatedness_check_table_test.py | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/lib/model/cached_reference_dataset_query.py b/v03_pipeline/lib/model/cached_reference_dataset_query.py index e26fa9fbd..02ff1c807 100644 --- a/v03_pipeline/lib/model/cached_reference_dataset_query.py +++ b/v03_pipeline/lib/model/cached_reference_dataset_query.py @@ -5,7 +5,6 @@ from v03_pipeline.lib.model.dataset_type import DatasetType from v03_pipeline.lib.model.definitions import ReferenceGenome -from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.reference_data.queries import ( clinvar_path_variants, gnomad_coding_and_noncoding_variants, diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 67f7234a5..5d35c3e43 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -40,6 +40,7 @@ def _v03_reference_data_prefix( reference_genome.value, ) + def cached_reference_dataset_query_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index f31b8f8ef..d6f0b10ba 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -8,6 +8,7 @@ ReferenceGenome, ) from v03_pipeline.lib.paths import ( + cached_reference_dataset_query_path, family_table_path, imported_callset_path, lookup_table_path, @@ -17,7 +18,6 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, - cached_reference_dataset_query_path, valid_reference_dataset_collection_path, variant_annotations_table_path, ) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 04af5bfc9..e5847380b 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -18,9 +18,9 @@ from v03_pipeline.lib.model import CachedReferenceDatasetQuery from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( + cached_reference_dataset_query_path, imported_callset_path, sex_check_table_path, - cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index a53f69430..1ba75446c 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -4,8 +4,8 @@ from v03_pipeline.lib.methods.relatedness import call_relatedness from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env from v03_pipeline.lib.paths import ( - relatedness_check_table_path, cached_reference_dataset_query_path, + relatedness_check_table_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py index f3302b399..239c6ea11 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py @@ -12,9 +12,9 @@ SampleType, ) from v03_pipeline.lib.paths import ( + cached_reference_dataset_query_path, imported_callset_path, relatedness_check_table_path, - cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.write_relatedness_check_table import ( WriteRelatednessCheckTableTask, From f90baef2995b24a804897321f993209d79e536dd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 11:19:51 -0400 Subject: [PATCH 3/3] fix logic --- v03_pipeline/lib/paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 5d35c3e43..14482d831 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -48,7 +48,7 @@ def cached_reference_dataset_query_path( ) -> str: return os.path.join( _v03_reference_data_prefix( - cached_reference_dataset_query.access_control, + AccessControl.PUBLIC, reference_genome, ), dataset_type.value,