From 4e05a0d4380497222a54e891e595811204f96070 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 10:45:09 -0400 Subject: [PATCH 01/49] Remove concept of private crdqs --- .../model/cached_reference_dataset_query.py | 13 ++----- v03_pipeline/lib/paths.py | 36 ++++++++----------- v03_pipeline/lib/paths_test.py | 4 +-- .../updated_cached_reference_dataset_query.py | 4 +-- ...ted_cached_reference_dataset_query_test.py | 4 +-- ...annotations_table_with_new_samples_test.py | 4 +-- .../lib/tasks/write_imported_callset.py | 6 ++-- .../tasks/write_relatedness_check_table.py | 4 +-- .../write_relatedness_check_table_test.py | 4 +-- 9 files changed, 32 insertions(+), 47 deletions(-) diff --git a/v03_pipeline/lib/model/cached_reference_dataset_query.py b/v03_pipeline/lib/model/cached_reference_dataset_query.py index 42b9f6abd..e26fa9fbd 100644 --- a/v03_pipeline/lib/model/cached_reference_dataset_query.py +++ b/v03_pipeline/lib/model/cached_reference_dataset_query.py @@ -4,7 +4,7 @@ import hail as hl from v03_pipeline.lib.model.dataset_type import DatasetType -from v03_pipeline.lib.model.definitions import AccessControl, ReferenceGenome +from v03_pipeline.lib.model.definitions import ReferenceGenome from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.reference_data.queries import ( clinvar_path_variants, @@ -20,10 +20,6 @@ class CachedReferenceDatasetQuery(Enum): GNOMAD_QC = 'gnomad_qc' HIGH_AF_VARIANTS = 'high_af_variants' - @property - def access_control(self) -> AccessControl: - return AccessControl.PUBLIC - def dataset(self, dataset_type: DatasetType) -> str | None: return { CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: 'clinvar_mito' @@ -56,15 +52,10 @@ def for_reference_genome_dataset_type( reference_genome: ReferenceGenome, dataset_type: DatasetType, ) -> list['CachedReferenceDatasetQuery']: - crdqs = { + return { (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL): list(cls), (ReferenceGenome.GRCh38, DatasetType.MITO): [ CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ], (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL): list(cls), }.get((reference_genome, dataset_type), []) - if not Env.ACCESS_PRIVATE_REFERENCE_DATASETS: - return [ - crdq for crdq in crdqs if crdq.access_control == AccessControl.PUBLIC - ] - return crdqs diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 93669d43a..67f7234a5 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -40,6 +40,21 @@ def _v03_reference_data_prefix( reference_genome.value, ) +def cached_reference_dataset_query_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + cached_reference_dataset_query: CachedReferenceDatasetQuery, +) -> str: + return os.path.join( + _v03_reference_data_prefix( + cached_reference_dataset_query.access_control, + reference_genome, + ), + dataset_type.value, + 'cached_reference_dataset_queries', + f'{cached_reference_dataset_query.value}.ht', + ) + def family_table_path( reference_genome: ReferenceGenome, @@ -182,27 +197,6 @@ def sex_check_table_path( ) -def valid_cached_reference_dataset_query_path( - reference_genome: ReferenceGenome, - dataset_type: DatasetType, - cached_reference_dataset_query: CachedReferenceDatasetQuery, -) -> str | None: - if ( - not Env.ACCESS_PRIVATE_REFERENCE_DATASETS - and cached_reference_dataset_query.access_control == AccessControl.PRIVATE - ): - return None - return os.path.join( - _v03_reference_data_prefix( - cached_reference_dataset_query.access_control, - reference_genome, - ), - dataset_type.value, - 'cached_reference_dataset_queries', - f'{cached_reference_dataset_query.value}.ht', - ) - - def valid_reference_dataset_collection_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index fabb920e1..f31b8f8ef 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -17,7 +17,7 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, variant_annotations_table_path, ) @@ -26,7 +26,7 @@ class TestPaths(unittest.TestCase): def test_cached_reference_dataset_query_path(self) -> None: self.assertEqual( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py index 9177aa21f..92fc7718b 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py @@ -8,7 +8,7 @@ ReferenceDatasetCollection, ) from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, ) from v03_pipeline.lib.reference_data.compare_globals import ( @@ -56,7 +56,7 @@ def complete(self) -> bool: def output(self) -> luigi.Target: return GCSorLocalTarget( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, self.crdq, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py index 6541f8672..8dd9558c8 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py @@ -14,7 +14,7 @@ SampleType, ) from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, ) from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -167,7 +167,7 @@ def test_clinvar( # clinvar has version '2022-01-01' shutil.copytree( CLINVAR_CRDQ_PATH, - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 21fe5f532..80a8fe2b4 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -24,7 +24,7 @@ SampleType, ) from v03_pipeline.lib.paths import ( - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, valid_reference_dataset_collection_path, ) from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -341,7 +341,7 @@ def test_multiple_update_vat( ), ) coding_and_noncoding_variants_ht.write( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 345af90e2..04af5bfc9 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -20,7 +20,7 @@ from v03_pipeline.lib.paths import ( imported_callset_path, sex_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask @@ -86,7 +86,7 @@ def requires(self) -> list[luigi.Task]: ) if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, @@ -169,7 +169,7 @@ def create_table(self) -> hl.MatrixTable: validate_no_duplicate_variants(mt) validate_expected_contig_frequency(mt, self.reference_genome) coding_and_noncoding_ht = hl.read_table( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index be7b92e6e..a53f69430 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -5,7 +5,7 @@ from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env from v03_pipeline.lib.paths import ( relatedness_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask @@ -48,7 +48,7 @@ def requires(self) -> luigi.Task: ) if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( - valid_cached_reference_dataset_query_path( + cached_reference_dataset_query_path( self.reference_genome, self.dataset_type, CachedReferenceDatasetQuery.GNOMAD_QC, diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py index dd33bb5e2..f3302b399 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py @@ -14,7 +14,7 @@ from v03_pipeline.lib.paths import ( imported_callset_path, relatedness_check_table_path, - valid_cached_reference_dataset_query_path, + cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.write_relatedness_check_table import ( WriteRelatednessCheckTableTask, @@ -45,7 +45,7 @@ class WriteRelatednessCheckTableTaskTest(MockedDatarootTestCase): def setUp(self) -> None: super().setUp() - self.gnomad_qc_path = valid_cached_reference_dataset_query_path( + self.gnomad_qc_path = cached_reference_dataset_query_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, CachedReferenceDatasetQuery.GNOMAD_QC, From 65da04f07ecaeef3113d029444a7fa78772d59b4 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 10:48:37 -0400 Subject: [PATCH 02/49] lint --- v03_pipeline/lib/model/cached_reference_dataset_query.py | 1 - v03_pipeline/lib/paths.py | 1 + v03_pipeline/lib/paths_test.py | 2 +- v03_pipeline/lib/tasks/write_imported_callset.py | 2 +- v03_pipeline/lib/tasks/write_relatedness_check_table.py | 2 +- v03_pipeline/lib/tasks/write_relatedness_check_table_test.py | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/lib/model/cached_reference_dataset_query.py b/v03_pipeline/lib/model/cached_reference_dataset_query.py index e26fa9fbd..02ff1c807 100644 --- a/v03_pipeline/lib/model/cached_reference_dataset_query.py +++ b/v03_pipeline/lib/model/cached_reference_dataset_query.py @@ -5,7 +5,6 @@ from v03_pipeline.lib.model.dataset_type import DatasetType from v03_pipeline.lib.model.definitions import ReferenceGenome -from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.reference_data.queries import ( clinvar_path_variants, gnomad_coding_and_noncoding_variants, diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 67f7234a5..5d35c3e43 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -40,6 +40,7 @@ def _v03_reference_data_prefix( reference_genome.value, ) + def cached_reference_dataset_query_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index f31b8f8ef..d6f0b10ba 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -8,6 +8,7 @@ ReferenceGenome, ) from v03_pipeline.lib.paths import ( + cached_reference_dataset_query_path, family_table_path, imported_callset_path, lookup_table_path, @@ -17,7 +18,6 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, - cached_reference_dataset_query_path, valid_reference_dataset_collection_path, variant_annotations_table_path, ) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 04af5bfc9..e5847380b 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -18,9 +18,9 @@ from v03_pipeline.lib.model import CachedReferenceDatasetQuery from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( + cached_reference_dataset_query_path, imported_callset_path, sex_check_table_path, - cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index a53f69430..1ba75446c 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -4,8 +4,8 @@ from v03_pipeline.lib.methods.relatedness import call_relatedness from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env from v03_pipeline.lib.paths import ( - relatedness_check_table_path, cached_reference_dataset_query_path, + relatedness_check_table_path, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py index f3302b399..239c6ea11 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py @@ -12,9 +12,9 @@ SampleType, ) from v03_pipeline.lib.paths import ( + cached_reference_dataset_query_path, imported_callset_path, relatedness_check_table_path, - cached_reference_dataset_query_path, ) from v03_pipeline.lib.tasks.write_relatedness_check_table import ( WriteRelatednessCheckTableTask, From f90baef2995b24a804897321f993209d79e536dd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 11:19:51 -0400 Subject: [PATCH 03/49] fix logic --- v03_pipeline/lib/paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 5d35c3e43..14482d831 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -48,7 +48,7 @@ def cached_reference_dataset_query_path( ) -> str: return os.path.join( _v03_reference_data_prefix( - cached_reference_dataset_query.access_control, + AccessControl.PUBLIC, reference_genome, ), dataset_type.value, From 7c8cbb2c0b8260a22179aa8312a920ab13891d40 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 13:12:38 -0400 Subject: [PATCH 04/49] Move SampleType out of BaseHailTableTask --- v03_pipeline/lib/tasks/base/base_hail_table.py | 3 +-- .../base/base_update_variant_annotations_table.py | 1 - .../base_update_variant_annotations_table_test.py | 2 -- v03_pipeline/lib/tasks/delete_family_table_test.py | 3 +-- v03_pipeline/lib/tasks/delete_family_tables.py | 1 - .../lib/tasks/delete_family_tables_test.py | 3 +-- .../lib/tasks/delete_project_family_tables.py | 1 - .../lib/tasks/delete_project_family_tables_test.py | 3 +-- v03_pipeline/lib/tasks/delete_project_table.py | 1 - .../update_cached_reference_dataset_queries.py | 2 -- ...update_cached_reference_dataset_queries_test.py | 14 -------------- ...ns_table_with_updated_reference_dataset_test.py | 4 ---- .../updated_cached_reference_dataset_query.py | 1 - .../updated_cached_reference_dataset_query_test.py | 3 --- .../updated_reference_dataset_collection_test.py | 3 --- v03_pipeline/lib/tasks/update_lookup_table.py | 2 ++ ...date_lookup_table_with_deleted_families_test.py | 4 +--- ...pdate_lookup_table_with_deleted_project_test.py | 4 +--- v03_pipeline/lib/tasks/update_project_table.py | 2 ++ ...ate_project_table_with_deleted_families_test.py | 3 +-- ...iant_annotations_table_with_deleted_families.py | 1 - ...annotations_table_with_deleted_families_test.py | 3 +-- ...riant_annotations_table_with_deleted_project.py | 1 - ..._annotations_table_with_deleted_project_test.py | 3 +-- ...e_variant_annotations_table_with_new_samples.py | 2 ++ v03_pipeline/lib/tasks/write_family_table.py | 2 ++ v03_pipeline/lib/tasks/write_imported_callset.py | 5 ++--- v03_pipeline/lib/tasks/write_metadata_for_run.py | 2 ++ v03_pipeline/lib/tasks/write_new_variants_table.py | 3 ++- .../lib/tasks/write_project_family_tables.py | 2 ++ .../lib/tasks/write_relatedness_check_table.py | 3 ++- .../tasks/write_remapped_and_subsetted_callset.py | 2 ++ 32 files changed, 29 insertions(+), 60 deletions(-) diff --git a/v03_pipeline/lib/tasks/base/base_hail_table.py b/v03_pipeline/lib/tasks/base/base_hail_table.py index e8c2b47e5..23480ffb9 100644 --- a/v03_pipeline/lib/tasks/base/base_hail_table.py +++ b/v03_pipeline/lib/tasks/base/base_hail_table.py @@ -2,7 +2,7 @@ import luigi from v03_pipeline.lib.logger import get_logger -from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget logger = get_logger(__name__) @@ -11,7 +11,6 @@ class BaseHailTableTask(luigi.Task): reference_genome = luigi.EnumParameter(enum=ReferenceGenome) dataset_type = luigi.EnumParameter(enum=DatasetType) - sample_type = luigi.EnumParameter(enum=SampleType) def output(self) -> luigi.Target: raise NotImplementedError diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index c04e4f060..32f5f8205 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -39,7 +39,6 @@ def requires(self) -> list[luigi.Task]: UpdatedReferenceDatasetCollectionTask( self.reference_genome, self.dataset_type, - self.sample_type, rdc, ) if Env.REFERENCE_DATA_AUTO_UPDATE diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py index 5ee33d4cd..05f90e097 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py @@ -8,7 +8,6 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, - SampleType, ) from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( @@ -59,7 +58,6 @@ def test_should_create_initialized_table(self, mock_update_rdc_task) -> None: vat_task = BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) self.assertTrue('annotations.ht' in vat_task.output().path) self.assertTrue(DatasetType.SNV_INDEL.value in vat_task.output().path) diff --git a/v03_pipeline/lib/tasks/delete_family_table_test.py b/v03_pipeline/lib/tasks/delete_family_table_test.py index d4f07c146..43e92bb6b 100644 --- a/v03_pipeline/lib/tasks/delete_family_table_test.py +++ b/v03_pipeline/lib/tasks/delete_family_table_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.paths import family_table_path from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase @@ -50,7 +50,6 @@ def test_delete_family_table_task(self) -> None: task = DeleteFamilyTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, family_guid='abc_1', ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/delete_family_tables.py b/v03_pipeline/lib/tasks/delete_family_tables.py index f8ff365a2..a68f4dc28 100644 --- a/v03_pipeline/lib/tasks/delete_family_tables.py +++ b/v03_pipeline/lib/tasks/delete_family_tables.py @@ -23,7 +23,6 @@ def run(self): DeleteFamilyTableTask( reference_genome=self.reference_genome, dataset_type=self.dataset_type, - sample_type=self.sample_type, family_guid=family_guid, ), ) diff --git a/v03_pipeline/lib/tasks/delete_family_tables_test.py b/v03_pipeline/lib/tasks/delete_family_tables_test.py index be8d99eea..535299602 100644 --- a/v03_pipeline/lib/tasks/delete_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_family_tables_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.paths import family_table_path from v03_pipeline.lib.tasks.delete_family_tables import ( DeleteFamilyTablesTask, @@ -38,7 +38,6 @@ def test_delete_project_family_tables_task(self) -> None: task = DeleteFamilyTablesTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, family_guids=['family_a', 'family_b'], ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables.py b/v03_pipeline/lib/tasks/delete_project_family_tables.py index e366edb99..befca9a45 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables.py @@ -35,7 +35,6 @@ def run(self): DeleteFamilyTableTask( reference_genome=self.reference_genome, dataset_type=self.dataset_type, - sample_type=self.sample_type, family_guid=family_guid, ), ) diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py index e86005f9a..3cb56f1c8 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.paths import family_table_path, project_table_path from v03_pipeline.lib.tasks.delete_project_family_tables import ( DeleteProjectFamilyTablesTask, @@ -149,7 +149,6 @@ def test_delete_project_family_tables_task(self) -> None: task = DeleteProjectFamilyTablesTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, project_guid='project_a', ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/delete_project_table.py b/v03_pipeline/lib/tasks/delete_project_table.py index 283d7f137..0a403ea3b 100644 --- a/v03_pipeline/lib/tasks/delete_project_table.py +++ b/v03_pipeline/lib/tasks/delete_project_table.py @@ -15,7 +15,6 @@ def requires(self) -> luigi.Task: return DeleteProjectFamilyTablesTask( self.reference_genome, self.dataset_type, - self.sample_type, self.project_guid, ) diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py index 5efbad471..bcae534ba 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py +++ b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py @@ -4,7 +4,6 @@ CachedReferenceDatasetQuery, DatasetType, ReferenceGenome, - SampleType, ) from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( UpdatedCachedReferenceDatasetQuery, @@ -14,7 +13,6 @@ class UpdateCachedReferenceDatasetQueries(luigi.Task): reference_genome = luigi.EnumParameter(enum=ReferenceGenome) dataset_type = luigi.EnumParameter(enum=DatasetType) - sample_type = luigi.EnumParameter(enum=SampleType) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py index 845ccfc45..794a77897 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py @@ -7,7 +7,6 @@ CachedReferenceDatasetQuery, DatasetType, ReferenceGenome, - SampleType, ) from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import ( UpdateCachedReferenceDatasetQueries, @@ -25,7 +24,6 @@ def test_37_snv_indel(self, mock_crdq_task): task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) worker.add(task) worker.run() @@ -35,25 +33,21 @@ def test_37_snv_indel(self, mock_crdq_task): mock.call( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ), mock.call( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, ), mock.call( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.GNOMAD_QC, ), mock.call( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS, ), ], @@ -65,7 +59,6 @@ def test_38_snv_indel(self, mock_crdq_task): task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) worker.add(task) worker.run() @@ -75,25 +68,21 @@ def test_38_snv_indel(self, mock_crdq_task): mock.call( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ), mock.call( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, ), mock.call( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.GNOMAD_QC, ), mock.call( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS, ), ], @@ -105,7 +94,6 @@ def test_38_mito(self, mock_crdq_task): task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, - sample_type=SampleType.WGS, ) worker.add(task) worker.run() @@ -115,7 +103,6 @@ def test_38_mito(self, mock_crdq_task): mock.call( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ), ], @@ -127,7 +114,6 @@ def test_38_sv(self, mock_crdq_task): task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SV, - sample_type=SampleType.WGS, ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index 67a5492bf..dc6304cf5 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -15,7 +15,6 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, - SampleType, ) from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -730,7 +729,6 @@ def test_update_vat_with_updated_rdc_snv_indel_38( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) worker = luigi.worker.Worker() worker.add(task) @@ -925,7 +923,6 @@ def test_update_vat_with_updated_rdc_mito_38( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, - sample_type=SampleType.WGS, ) worker = luigi.worker.Worker() worker.add(task) @@ -1068,7 +1065,6 @@ def test_update_vat_with_updated_rdc_snv_indel_37( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) worker = luigi.worker.Worker() worker.add(task) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py index 92fc7718b..ef54d471d 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py @@ -68,7 +68,6 @@ def requires(self) -> luigi.Task: return UpdatedReferenceDatasetCollectionTask( self.reference_genome, self.dataset_type, - self.sample_type, ReferenceDatasetCollection.COMBINED, ) if self.crdq.query_raw_dataset: diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py index 8dd9558c8..210a8cc8a 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py @@ -11,7 +11,6 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, - SampleType, ) from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, @@ -109,7 +108,6 @@ def test_gnomad_qc( task = UpdatedCachedReferenceDatasetQuery( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.GNOMAD_QC, ) worker.add(task) @@ -199,7 +197,6 @@ def _clinvar_path_variants(table, **_: Any): task = UpdatedCachedReferenceDatasetQuery( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py index 06c73559b..9995225c0 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py @@ -10,7 +10,6 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, - SampleType, ) from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -170,7 +169,6 @@ def test_update_task_with_empty_reference_data_table( task = UpdatedReferenceDatasetCollectionTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_dataset_collection=ReferenceDatasetCollection.COMBINED, ) worker.add(task) @@ -280,7 +278,6 @@ def test_update_task_with_existing_reference_dataset_collection_table( task = UpdatedReferenceDatasetCollectionTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_dataset_collection=ReferenceDatasetCollection.COMBINED, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index eb6068e76..0c389b713 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -7,6 +7,7 @@ join_lookup_hts, remove_family_guids, ) +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.model.constants import PROJECTS_EXCLUDED_FROM_LOOKUP from v03_pipeline.lib.tasks.base.base_update_lookup_table import ( BaseUpdateLookupTableTask, @@ -17,6 +18,7 @@ class UpdateLookupTableTask(BaseUpdateLookupTableTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py index 283dd5003..70915ef9d 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.tasks.update_lookup_table_with_deleted_families import ( UpdateLookupTableWithDeletedFamiliesTask, ) @@ -17,7 +17,6 @@ def test_delete_project_empty_table( worker = luigi.worker.Worker() task = UpdateLookupTableWithDeletedFamiliesTask( dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_genome=ReferenceGenome.GRCh38, project_guid='R0555_seqr_demo', family_guids=['abc'], @@ -132,7 +131,6 @@ def test_delete_project( worker = luigi.worker.Worker() task = UpdateLookupTableWithDeletedFamiliesTask( dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_genome=ReferenceGenome.GRCh38, project_guid='project_a', family_guids=['1', '3'], diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py index 5375d4c32..e40e034ec 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.tasks.update_lookup_table_with_deleted_project import ( UpdateLookupTableWithDeletedProjectTask, ) @@ -17,7 +17,6 @@ def test_delete_project_empty_table( worker = luigi.worker.Worker() task = UpdateLookupTableWithDeletedProjectTask( dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_genome=ReferenceGenome.GRCh38, project_guid='R0555_seqr_demo', ) @@ -131,7 +130,6 @@ def test_delete_project( worker = luigi.worker.Worker() task = UpdateLookupTableWithDeletedProjectTask( dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_genome=ReferenceGenome.GRCh38, project_guid='project_a', ) diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index 508c51c30..288b6b20c 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -7,6 +7,7 @@ join_family_entries_hts, remove_family_guids, ) +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.tasks.base.base_update_project_table import ( BaseUpdateProjectTableTask, ) @@ -16,6 +17,7 @@ class UpdateProjectTableTask(BaseUpdateProjectTableTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() project_remap_path = luigi.Parameter() project_pedigree_path = luigi.Parameter() diff --git a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py index 77936595c..b1f5cc5af 100644 --- a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py @@ -1,7 +1,7 @@ import hail as hl import luigi -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.update_project_table_with_deleted_families import ( UpdateProjectTableWithDeletedFamiliesTask, @@ -128,7 +128,6 @@ def test_update_project_with_deleted_families(self): worker = luigi.worker.Worker() task = UpdateProjectTableWithDeletedFamiliesTask( dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_genome=ReferenceGenome.GRCh38, project_guid='project_a', family_guids=['family_b'], diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families.py index 4db8ca582..a77f5280c 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families.py @@ -24,7 +24,6 @@ def requires(self) -> luigi.Task | None: if self.dataset_type.has_lookup_table: return UpdateLookupTableWithDeletedFamiliesTask( dataset_type=self.dataset_type, - sample_type=self.sample_type, reference_genome=self.reference_genome, project_guid=self.project_guid, family_guids=self.family_guids, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py index 266ddf19e..67410ef18 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py @@ -1,7 +1,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.paths import ( lookup_table_path, variant_annotations_table_path, @@ -139,7 +139,6 @@ def test_update_annotations_with_deleted_project(self) -> None: worker = luigi.worker.Worker() task = UpdateVariantAnnotationsTableWithDeletedFamiliesTask( dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_genome=ReferenceGenome.GRCh38, project_guid='project_a', family_guids=['2', '3'], diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project.py index 447c16e04..a0a97637b 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project.py @@ -19,7 +19,6 @@ def requires(self) -> luigi.Task | None: if self.dataset_type.has_lookup_table: return UpdateLookupTableWithDeletedProjectTask( dataset_type=self.dataset_type, - sample_type=self.sample_type, reference_genome=self.reference_genome, project_guid=self.project_guid, ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py index a77bc28b9..295a9577b 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py @@ -1,7 +1,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome from v03_pipeline.lib.paths import ( lookup_table_path, variant_annotations_table_path, @@ -147,7 +147,6 @@ def test_update_annotations_with_deleted_project(self) -> None: worker = luigi.worker.Worker() task = UpdateVariantAnnotationsTableWithDeletedProjectTask( dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, reference_genome=ReferenceGenome.GRCh38, project_guid='project_b', ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index 99e6c49df..a38bd2bc7 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -3,6 +3,7 @@ from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.misc.callsets import callset_project_pairs, get_callset_ht +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import ( lookup_table_path, new_variants_table_path, @@ -16,6 +17,7 @@ class UpdateVariantAnnotationsTableWithNewSamplesTask( BaseUpdateVariantAnnotationsTableTask, ): + sample_type = luigi.EnumParameter(enum=SampleType) callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index ef3025842..3ea156fdf 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -1,6 +1,7 @@ import hail as hl import luigi +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import family_table_path from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget @@ -10,6 +11,7 @@ class WriteFamilyTableTask(BaseWriteTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() project_guid = luigi.Parameter() project_remap_path = luigi.Parameter() diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index e5847380b..07b0bb37e 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -15,7 +15,7 @@ validate_sample_type, ) from v03_pipeline.lib.misc.vets import annotate_vets -from v03_pipeline.lib.model import CachedReferenceDatasetQuery +from v03_pipeline.lib.model import CachedReferenceDatasetQuery, SampleType from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, @@ -31,6 +31,7 @@ class WriteImportedCallsetTask(BaseWriteTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() imputed_sex_path = luigi.Parameter(default=None) filters_path = luigi.OptionalParameter( @@ -81,7 +82,6 @@ def requires(self) -> list[luigi.Task]: UpdatedCachedReferenceDatasetQuery( reference_genome=self.reference_genome, dataset_type=self.dataset_type, - sample_type=self.sample_type, crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, ) if Env.REFERENCE_DATA_AUTO_UPDATE @@ -103,7 +103,6 @@ def requires(self) -> list[luigi.Task]: WriteSexCheckTableTask( self.reference_genome, self.dataset_type, - self.sample_type, self.callset_path, self.imputed_sex_path, ), diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index 3ec7d4f64..80b39caca 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -4,6 +4,7 @@ import luigi from v03_pipeline.lib.misc.callsets import callset_project_pairs +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import metadata_for_run_path from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget @@ -13,6 +14,7 @@ class WriteMetadataForRunTask(BaseHailTableTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 734b07d73..abcfc5458 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -10,7 +10,7 @@ from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks from v03_pipeline.lib.misc.callsets import callset_project_pairs, get_callset_ht from v03_pipeline.lib.misc.math import constrain -from v03_pipeline.lib.model import Env, ReferenceDatasetCollection +from v03_pipeline.lib.model import Env, ReferenceDatasetCollection, SampleType from v03_pipeline.lib.paths import ( new_variants_table_path, variant_annotations_table_path, @@ -37,6 +37,7 @@ class WriteNewVariantsTableTask(BaseWriteTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() diff --git a/v03_pipeline/lib/tasks/write_project_family_tables.py b/v03_pipeline/lib/tasks/write_project_family_tables.py index fe96f441b..7bd59005b 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables.py @@ -1,12 +1,14 @@ import hail as hl import luigi +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask class WriteProjectFamilyTablesTask(BaseHailTableTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() project_guid = luigi.Parameter() project_remap_path = luigi.Parameter() diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index 1ba75446c..3893f5760 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -2,7 +2,7 @@ import luigi from v03_pipeline.lib.methods.relatedness import call_relatedness -from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env +from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env, SampleType from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, relatedness_check_table_path, @@ -16,6 +16,7 @@ class WriteRelatednessCheckTableTask(BaseWriteTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() def output(self) -> luigi.Target: diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 7998fb689..e72a2d576 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -14,6 +14,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask @@ -27,6 +28,7 @@ class WriteRemappedAndSubsettedCallsetTask(BaseWriteTask): + sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() project_guid = luigi.Parameter() project_remap_path = luigi.Parameter() From 4a2f978c5795aba393ec11ec9a5a3b4e11cbe31c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 13:31:50 -0400 Subject: [PATCH 05/49] cleanup --- v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index e72a2d576..079996d35 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -97,7 +97,6 @@ def requires(self) -> list[luigi.Task]: WriteSexCheckTableTask( self.reference_genome, self.dataset_type, - self.sample_type, self.callset_path, self.imputed_sex_path, ), From fb737f879d155006e21ba6e84ed6fc709888d173 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 13:46:20 -0400 Subject: [PATCH 06/49] fix --- v03_pipeline/lib/tasks/write_relatedness_check_table.py | 1 - 1 file changed, 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index 3893f5760..dc8bf17d6 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -44,7 +44,6 @@ def requires(self) -> luigi.Task: UpdatedCachedReferenceDatasetQuery( reference_genome=self.reference_genome, dataset_type=self.dataset_type, - sample_type=self.sample_type, crdq=CachedReferenceDatasetQuery.GNOMAD_QC, ) if Env.REFERENCE_DATA_AUTO_UPDATE From 2e2676b374b3de0970be67dc3863b4a92454c238 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 14:05:04 -0400 Subject: [PATCH 07/49] missed a few! --- v03_pipeline/lib/tasks/write_new_variants_table.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index abcfc5458..4779b5c42 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -85,7 +85,6 @@ def requires(self) -> list[luigi.Task]: UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( self.reference_genome, self.dataset_type, - self.sample_type, ), ] else: @@ -93,7 +92,6 @@ def requires(self) -> list[luigi.Task]: BaseUpdateVariantAnnotationsTableTask( self.reference_genome, self.dataset_type, - self.sample_type, ), ] if self.dataset_type.has_lookup_table: From 89754a415bc4316dcbddae897836f9736746d9c4 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 14:31:41 -0400 Subject: [PATCH 08/49] shitshow --- .../update_variant_annotations_table_with_new_samples_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 80a8fe2b4..d80d6cb87 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -234,7 +234,6 @@ def test_multiple_update_vat( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) @@ -653,7 +652,6 @@ def test_update_vat_grch37( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_VEP_DATA) @@ -719,7 +717,6 @@ def test_update_vat_without_accessing_private_datasets( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, ) ) shutil.rmtree( @@ -787,7 +784,6 @@ def test_mito_update_vat( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, - sample_type=SampleType.WGS, ) ) mock_register_alleles.side_effect = None From 3879c68e75dd711c11aae91416999ff063924a71 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 14:35:46 -0400 Subject: [PATCH 09/49] First one? --- .../lib/tasks/write_imported_callset.py | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 07b0bb37e..ed709afc1 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -34,10 +34,7 @@ class WriteImportedCallsetTask(BaseWriteTask): sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() imputed_sex_path = luigi.Parameter(default=None) - filters_path = luigi.OptionalParameter( - default=None, - description='Optional path to part two outputs from callset (VCF shards containing filter information)', - ) + filters_path = luigi.OptionalParameter(default=None) validate = luigi.BoolParameter( default=True, parsing=luigi.BoolParameter.EXPLICIT_PARSING, @@ -79,11 +76,7 @@ def requires(self) -> list[luigi.Task]: requirements = [ *requirements, ( - UpdatedCachedReferenceDatasetQuery( - reference_genome=self.reference_genome, - dataset_type=self.dataset_type, - crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, - ) + self.clone(UpdatedCachedReferenceDatasetQuery, crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS), if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( cached_reference_dataset_query_path( @@ -100,12 +93,7 @@ def requires(self) -> list[luigi.Task]: ): requirements = [ *requirements, - WriteSexCheckTableTask( - self.reference_genome, - self.dataset_type, - self.callset_path, - self.imputed_sex_path, - ), + self.clone(WriteSexCheckTableTask), ] return [ *requirements, From 2e1620f181bb46f794893034d1d98d6cbac7b1c8 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 14:52:52 -0400 Subject: [PATCH 10/49] flip order here --- v03_pipeline/lib/tasks/write_project_family_tables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables.py b/v03_pipeline/lib/tasks/write_project_family_tables.py index 7bd59005b..b7d83cd49 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables.py @@ -50,8 +50,8 @@ def run(self): update_project_table_task: luigi.Target = yield UpdateProjectTableTask( self.reference_genome, self.dataset_type, - self.sample_type, self.project_guid, + self.sample_type, self.callset_path, self.project_remap_path, self.project_pedigree_path, From df3e2591c7419bfc7bc413d52f86a2e182f15acd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 15:12:49 -0400 Subject: [PATCH 11/49] flip order --- v03_pipeline/lib/tasks/write_family_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index 3ea156fdf..73400983f 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -54,8 +54,8 @@ def requires(self) -> luigi.Task: return UpdateProjectTableTask( self.reference_genome, self.dataset_type, - self.sample_type, self.project_guid, + self.sample_type, self.callset_path, self.project_remap_path, self.project_pedigree_path, From d730d8c927702ec6156c4b33fd33bd8120c0b08f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 15:12:49 -0400 Subject: [PATCH 12/49] flip order --- v03_pipeline/lib/tasks/write_family_table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index 3ea156fdf..73400983f 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -54,8 +54,8 @@ def requires(self) -> luigi.Task: return UpdateProjectTableTask( self.reference_genome, self.dataset_type, - self.sample_type, self.project_guid, + self.sample_type, self.callset_path, self.project_remap_path, self.project_pedigree_path, From 98cffbbbdb34c681f3716d1dbd835900d226b0e6 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 16:33:51 -0400 Subject: [PATCH 13/49] try sharing these params? --- .../lib/tasks/base/base_loading_params.py | 32 +++++++++++++++++++ .../lib/tasks/write_imported_callset.py | 27 +++++----------- .../tasks/write_relatedness_check_table.py | 1 - .../lib/tasks/write_sex_check_table.py | 6 ++-- 4 files changed, 42 insertions(+), 24 deletions(-) create mode 100644 v03_pipeline/lib/tasks/base/base_loading_params.py diff --git a/v03_pipeline/lib/tasks/base/base_loading_params.py b/v03_pipeline/lib/tasks/base/base_loading_params.py new file mode 100644 index 000000000..d84960a30 --- /dev/null +++ b/v03_pipeline/lib/tasks/base/base_loading_params.py @@ -0,0 +1,32 @@ +import luigi + +from v03_pipeline.lib.model import SampleType + + +class BaseLoadingParams(luigi.Task): + # NB: + # These params are "inherited" with the special + # luigi.util.inherits function, copying params + # but nothing else. + sample_type = luigi.EnumParameter(enum=SampleType) + callset_path = luigi.Parameter() + imputed_sex_path = luigi.Parameter( + default=None, + description='Optional path to a tsv of imputed sex values from the DRAGEN GVS pipeline.', + ) + filters_path = luigi.Parameter( + default=None, + description='Optional path to part two outputs from callset (VCF shards containing filter information)', + ) + validate = luigi.BoolParameter( + default=True, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) + force = luigi.BoolParameter( + default=False, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) + check_sex_and_relatedness = luigi.BoolParameter( + default=False, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index ed709afc1..e6b179f9f 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -15,13 +15,14 @@ validate_sample_type, ) from v03_pipeline.lib.misc.vets import annotate_vets -from v03_pipeline.lib.model import CachedReferenceDatasetQuery, SampleType +from v03_pipeline.lib.model import CachedReferenceDatasetQuery from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, imported_callset_path, sex_check_table_path, ) +from v03_pipeline.lib.tasks.base.base_loading_params import BaseLoadingParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( @@ -29,25 +30,10 @@ ) from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask +luigi.util.inherits(BaseLoadingParams) -class WriteImportedCallsetTask(BaseWriteTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter(default=None) - filters_path = luigi.OptionalParameter(default=None) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - check_sex_and_relatedness = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) +class WriteImportedCallsetTask(BaseWriteTask): def complete(self) -> luigi.Target: if not self.force and super().complete(): mt = hl.read_matrix_table(self.output().path) @@ -76,7 +62,10 @@ def requires(self) -> list[luigi.Task]: requirements = [ *requirements, ( - self.clone(UpdatedCachedReferenceDatasetQuery, crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS), + self.clone( + UpdatedCachedReferenceDatasetQuery, + crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, + ) if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( cached_reference_dataset_query_path( diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index dc8bf17d6..ce5c95315 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -33,7 +33,6 @@ def requires(self) -> luigi.Task: WriteImportedCallsetTask( self.reference_genome, self.dataset_type, - self.sample_type, self.callset_path, ), ] diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py index 55cbc1387..801330689 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table.py @@ -21,9 +21,7 @@ def output(self) -> luigi.Target: ) def requires(self) -> luigi.Task: - return [ - RawFileTask(self.imputed_sex_path), - ] + return RawFileTask(self.imputed_sex_path) def create_table(self) -> hl.Table: - return import_imputed_sex(self.input()[0].path) + return import_imputed_sex(self.input().path) From 93cadea72d167826d6afd9a37049c93888b07114 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 16:42:07 -0400 Subject: [PATCH 14/49] that failed :/ --- v03_pipeline/lib/tasks/write_imported_callset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index e6b179f9f..720bdbc56 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -30,9 +30,8 @@ ) from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask -luigi.util.inherits(BaseLoadingParams) - +@luigi.util.inherits(BaseLoadingParams) class WriteImportedCallsetTask(BaseWriteTask): def complete(self) -> luigi.Target: if not self.force and super().complete(): From cb8b97496dc0cb10d0c0edb62c3f4f9bbdbada06 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 16:47:34 -0400 Subject: [PATCH 15/49] more scaffolding --- ...g_params.py => base_loading_run_params.py} | 2 +- .../lib/tasks/write_imported_callset.py | 5 +++-- .../tasks/write_relatedness_check_table.py | 19 +++++-------------- 3 files changed, 9 insertions(+), 17 deletions(-) rename v03_pipeline/lib/tasks/base/{base_loading_params.py => base_loading_run_params.py} (96%) diff --git a/v03_pipeline/lib/tasks/base/base_loading_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py similarity index 96% rename from v03_pipeline/lib/tasks/base/base_loading_params.py rename to v03_pipeline/lib/tasks/base/base_loading_run_params.py index d84960a30..ec3742d56 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -3,7 +3,7 @@ from v03_pipeline.lib.model import SampleType -class BaseLoadingParams(luigi.Task): +class BaseLoadingRunParams(luigi.Task): # NB: # These params are "inherited" with the special # luigi.util.inherits function, copying params diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 720bdbc56..9cd146af4 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -1,5 +1,6 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.misc.io import ( import_callset, @@ -22,7 +23,7 @@ imported_callset_path, sex_check_table_path, ) -from v03_pipeline.lib.tasks.base.base_loading_params import BaseLoadingParams +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( @@ -31,7 +32,7 @@ from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask -@luigi.util.inherits(BaseLoadingParams) +@luigi.util.inherits(BaseLoadingRunParams) class WriteImportedCallsetTask(BaseWriteTask): def complete(self) -> luigi.Target: if not self.force and super().complete(): diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index ce5c95315..7eeac08e8 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -2,11 +2,12 @@ import luigi from v03_pipeline.lib.methods.relatedness import call_relatedness -from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env, SampleType +from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, relatedness_check_table_path, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( @@ -15,10 +16,8 @@ from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask +@luigi.util.inherits(BaseLoadingRunParams) class WriteRelatednessCheckTableTask(BaseWriteTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_path = luigi.Parameter() - def output(self) -> luigi.Target: return GCSorLocalTarget( relatedness_check_table_path( @@ -30,21 +29,13 @@ def output(self) -> luigi.Target: def requires(self) -> luigi.Task: requirements = [ - WriteImportedCallsetTask( - self.reference_genome, - self.dataset_type, - self.callset_path, - ), + self.clone(WriteImportedCallsetTask), ] if Env.ACCESS_PRIVATE_REFERENCE_DATASETS: requirements = [ *requirements, ( - UpdatedCachedReferenceDatasetQuery( - reference_genome=self.reference_genome, - dataset_type=self.dataset_type, - crdq=CachedReferenceDatasetQuery.GNOMAD_QC, - ) + self.clone(UpdatedCachedReferenceDatasetQuery, crdq=CachedReferenceDatasetQuery.GNOMAD_QC) if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( cached_reference_dataset_query_path( From 92868de722f24a821f3cca456326d5240125cb9f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 16:49:55 -0400 Subject: [PATCH 16/49] ruff --- v03_pipeline/lib/tasks/write_relatedness_check_table.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index 7eeac08e8..86e4eb500 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -35,7 +35,10 @@ def requires(self) -> luigi.Task: requirements = [ *requirements, ( - self.clone(UpdatedCachedReferenceDatasetQuery, crdq=CachedReferenceDatasetQuery.GNOMAD_QC) + self.clone( + UpdatedCachedReferenceDatasetQuery, + crdq=CachedReferenceDatasetQuery.GNOMAD_QC, + ) if Env.REFERENCE_DATA_AUTO_UPDATE else HailTableTask( cached_reference_dataset_query_path( From ec8b1b22c9a3270b84c5a349d7e3a7264228f756 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 17:11:14 -0400 Subject: [PATCH 17/49] Another one --- .../write_remapped_and_subsetted_callset.py | 56 ++----------------- 1 file changed, 6 insertions(+), 50 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 079996d35..00f17daae 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -1,5 +1,6 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.misc.family_loading_failures import ( @@ -14,8 +15,8 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask @@ -27,30 +28,8 @@ logger = get_logger(__name__) +@luigi.util.inherits(BaseLoadingRunParams) class WriteRemappedAndSubsettedCallsetTask(BaseWriteTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_path = luigi.Parameter() - project_guid = luigi.Parameter() - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - check_sex_and_relatedness = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - def complete(self) -> luigi.Target: return not self.force and super().complete() @@ -66,20 +45,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [ - WriteImportedCallsetTask( - reference_genome=self.reference_genome, - dataset_type=self.dataset_type, - sample_type=self.sample_type, - callset_path=self.callset_path, - imputed_sex_path=self.imputed_sex_path, - # NB: filters_path is explicitly passed as None here - # to avoid carrying it throughout the rest of the pipeline. - # Only the primary import task itself should be aware of it. - filters_path=None, - validate=self.validate, - force=False, - check_sex_and_relatedness=self.check_sex_and_relatedness, - ), + self.clone(WriteImportedCallsetTask, force=False), RawFileTask(self.project_pedigree_path), ] if ( @@ -88,18 +54,8 @@ def requires(self) -> list[luigi.Task]: ): requirements = [ *requirements, - WriteRelatednessCheckTableTask( - self.reference_genome, - self.dataset_type, - self.sample_type, - self.callset_path, - ), - WriteSexCheckTableTask( - self.reference_genome, - self.dataset_type, - self.callset_path, - self.imputed_sex_path, - ), + self.clone(WriteRelatednessCheckTableTask), + self.close(WriteSexCheckTableTask), ] return requirements From 68bc804578c4303f6302ef75a9918a709534b592 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 12 Jun 2024 17:24:56 -0400 Subject: [PATCH 18/49] more hacking --- .../lib/tasks/write_metadata_for_run.py | 25 +++---------------- .../write_remapped_and_subsetted_callset.py | 4 +++ 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index 80b39caca..23beb9e19 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -2,40 +2,23 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.misc.callsets import callset_project_pairs -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import metadata_for_run_path from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( WriteRemappedAndSubsettedCallsetTask, ) +@luigi.util.inherits(BaseLoadingRunParams) class WriteMetadataForRunTask(BaseHailTableTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() project_pedigree_paths = luigi.ListParameter() - imputed_sex_paths = luigi.ListParameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - check_sex_and_relatedness = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) run_id = luigi.Parameter() def output(self) -> luigi.Target: @@ -52,7 +35,7 @@ def complete(self) -> bool: def requires(self) -> list[luigi.Task]: return [ - WriteRemappedAndSubsettedCallsetTask( + self.clone(WriteRemappedAndSubsettedCallsetTask, self.reference_genome, self.dataset_type, self.sample_type, diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 00f17daae..84dc75171 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -30,6 +30,10 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteRemappedAndSubsettedCallsetTask(BaseWriteTask): + project_guid = luigi.Parameter() + project_remap_path = luigi.Parameter() + project_pedigree_path = luigi.Parameter() + def complete(self) -> luigi.Target: return not self.force and super().complete() From 0c22f46079cdac9a55e6ae440567fc0c71127029 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 00:23:28 -0400 Subject: [PATCH 19/49] Fix write metadata --- .../lib/tasks/write_metadata_for_run.py | 30 +++++-------------- .../lib/tasks/write_metadata_for_run_test.py | 2 +- 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index 23beb9e19..6cf840207 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -4,7 +4,6 @@ import luigi import luigi.util -from v03_pipeline.lib.misc.callsets import callset_project_pairs from v03_pipeline.lib.paths import metadata_for_run_path from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams @@ -35,32 +34,17 @@ def complete(self) -> bool: def requires(self) -> list[luigi.Task]: return [ - self.clone(WriteRemappedAndSubsettedCallsetTask, - self.reference_genome, - self.dataset_type, - self.sample_type, - callset_path, - project_guid, - project_remap_path, - project_pedigree_path, - imputed_sex_path, - self.ignore_missing_samples_when_remapping, - self.validate, - self.force, - self.check_sex_and_relatedness, + self.clone( + WriteRemappedAndSubsettedCallsetTask, + project_guid=project_guid, + project_remap_path=project_remap_path, + project_pedigree_path=project_pedigree_path, ) - for ( - callset_path, - project_guid, - project_remap_path, - project_pedigree_path, - imputed_sex_path, - ) in callset_project_pairs( - self.callset_paths, + for (project_guid, project_remap_path, project_pedigree_path) in zip( self.project_guids, self.project_remap_paths, self.project_pedigree_paths, - self.imputed_sex_paths, + strict=False, ) ] diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index 13ced2c85..cf61fcc4f 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -19,7 +19,7 @@ def test_write_metadata_for_run_task(self) -> None: reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_VCF], + callset_path=TEST_VCF, project_guids=['R0113_test_project', 'R0114_project4'], project_remap_paths=[TEST_REMAP_2, TEST_REMAP_2], project_pedigree_paths=[TEST_PEDIGREE_3, TEST_PEDIGREE_4], From 4dec7b8a908716c667352c090f134dd86292c527 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 00:30:13 -0400 Subject: [PATCH 20/49] typo --- v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 84dc75171..81e5adf93 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -59,7 +59,7 @@ def requires(self) -> list[luigi.Task]: requirements = [ *requirements, self.clone(WriteRelatednessCheckTableTask), - self.close(WriteSexCheckTableTask), + self.clone(WriteSexCheckTableTask), ] return requirements From 379ae36fe3f8481c433acfd3237b4a9a542c7111 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 00:45:04 -0400 Subject: [PATCH 21/49] missing param --- v03_pipeline/lib/tasks/base/base_loading_run_params.py | 4 ++++ v03_pipeline/lib/tasks/write_metadata_for_run.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index ec3742d56..c29f03c21 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -18,6 +18,10 @@ class BaseLoadingRunParams(luigi.Task): default=None, description='Optional path to part two outputs from callset (VCF shards containing filter information)', ) + ignore_missing_samples_when_remapping = luigi.BoolParameter( + default=False, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) validate = luigi.BoolParameter( default=True, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index 6cf840207..f87a99a17 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -50,7 +50,7 @@ def requires(self) -> list[luigi.Task]: def run(self) -> None: metadata_json = { - 'callsets': self.callset_paths, + 'callsets': [self.callset], 'run_id': self.run_id, 'sample_type': self.sample_type.value, 'family_samples': {}, From e3cae7a4f3822bc54ddd14d9101ac47d0904ca32 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 00:45:52 -0400 Subject: [PATCH 22/49] typo --- v03_pipeline/lib/tasks/write_metadata_for_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index f87a99a17..c3e3f0542 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -50,7 +50,7 @@ def requires(self) -> list[luigi.Task]: def run(self) -> None: metadata_json = { - 'callsets': [self.callset], + 'callsets': [self.callset_path], 'run_id': self.run_id, 'sample_type': self.sample_type.value, 'family_samples': {}, From 6e2d31e0a2fb9667bd4997af473e2bc8e6a3114e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 01:52:50 -0400 Subject: [PATCH 23/49] making progress --- v03_pipeline/lib/misc/callsets.py | 57 +-------- .../lib/tasks/base/base_loading_run_params.py | 8 ++ .../lib/tasks/update_project_table.py | 3 + ...iant_annotations_table_with_new_samples.py | 78 ++---------- ...annotations_table_with_new_samples_test.py | 18 +-- v03_pipeline/lib/tasks/write_family_table.py | 35 +----- .../lib/tasks/write_new_variants_table.py | 117 ++++-------------- .../lib/tasks/write_project_family_tables.py | 44 ++----- .../tasks/write_relatedness_check_table.py | 1 + 9 files changed, 71 insertions(+), 290 deletions(-) diff --git a/v03_pipeline/lib/misc/callsets.py b/v03_pipeline/lib/misc/callsets.py index e65e9bc61..34ac4f8a7 100644 --- a/v03_pipeline/lib/misc/callsets.py +++ b/v03_pipeline/lib/misc/callsets.py @@ -6,14 +6,11 @@ from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path -def get_callset_ht( # noqa: PLR0913 +def get_callset_ht( reference_genome: ReferenceGenome, dataset_type: DatasetType, - callset_paths: list[str], + callset_path: str, project_guids: list[str], - project_remap_paths: list[str], - project_pedigree_paths: list[str], - imputed_sex_paths: list[str] | None, ): callset_hts = [ hl.read_matrix_table( @@ -24,58 +21,10 @@ def get_callset_ht( # noqa: PLR0913 project_guid, ), ).rows() - for (callset_path, project_guid, _, _, _) in callset_project_pairs( - callset_paths, - project_guids, - project_remap_paths, - project_pedigree_paths, - imputed_sex_paths, - ) + for project_guid in project_guids ] callset_ht = functools.reduce( (lambda ht1, ht2: ht1.union(ht2, unify=True)), callset_hts, ) return callset_ht.distinct() - - -def callset_project_pairs( - callset_paths: list[str], - project_guids: list[str], - project_remap_paths: list[str], - project_pedigree_paths: list[str], - imputed_sex_paths: list[str] | None, -): - if len(callset_paths) == len(project_guids): - return zip( - callset_paths, - project_guids, - project_remap_paths, - project_pedigree_paths, - imputed_sex_paths - if imputed_sex_paths is not None - else [None] * len(callset_paths), - strict=True, - ) - return ( - ( - callset_path, - project_guid, - project_remap_path, - project_pedigree_path, - imputed_sex_path, - ) - for callset_path, imputed_sex_path in zip( - callset_paths, - imputed_sex_paths - if imputed_sex_paths is not None - else [None] * len(callset_paths), - strict=False, - ) - for (project_guid, project_remap_path, project_pedigree_path) in zip( - project_guids, - project_remap_paths, - project_pedigree_paths, - strict=True, - ) - ) diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index c29f03c21..b41a26d73 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -34,3 +34,11 @@ class BaseLoadingRunParams(luigi.Task): default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) + is_new_gcnv_joint_call = luigi.BoolParameter( + default=False, + description='Is this a fully joint-called callset.', + ) + liftover_ref_path = luigi.OptionalParameter( + default='gs://hail-common/references/grch38_to_grch37.over.chain.gz', + description='Path to GRCh38 to GRCh37 coordinates file', + ) diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index 288b6b20c..18ae203e4 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -1,5 +1,6 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.misc.family_entries import ( @@ -14,8 +15,10 @@ from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( WriteRemappedAndSubsettedCallsetTask, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams +@luigi.util.inherits(BaseLoadingRunParams) class UpdateProjectTableTask(BaseUpdateProjectTableTask): sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index a38bd2bc7..5ba1448c9 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -1,64 +1,33 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.annotations.fields import get_fields -from v03_pipeline.lib.misc.callsets import callset_project_pairs, get_callset_ht -from v03_pipeline.lib.model import SampleType +from v03_pipeline.lib.misc.callsets import get_callset_ht from v03_pipeline.lib.paths import ( lookup_table_path, new_variants_table_path, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, ) from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask +@luigi.util.inherits(BaseLoadingRunParams) class UpdateVariantAnnotationsTableWithNewSamplesTask( BaseUpdateVariantAnnotationsTableTask, ): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() project_pedigree_paths = luigi.ListParameter() - imputed_sex_paths = luigi.ListParameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - liftover_ref_path = luigi.OptionalParameter( - default='gs://hail-common/references/grch38_to_grch37.over.chain.gz', - description='Path to GRCh38 to GRCh37 coordinates file', - ) run_id = luigi.Parameter() def requires(self) -> list[luigi.Task]: return [ *super().requires(), - WriteNewVariantsTableTask( - self.reference_genome, - self.dataset_type, - self.sample_type, - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - self.ignore_missing_samples_when_remapping, - self.validate, - self.force, - self.liftover_ref_path, - self.run_id, - ), + self.clone(WriteNewVariantsTableTask), ] def complete(self) -> bool: @@ -71,23 +40,11 @@ def complete(self) -> bool: [ updates.contains( hl.Struct( - callset=callset_path, + callset=self.callset_path, project_guid=project_guid, ), ) - for ( - callset_path, - project_guid, - _, - _, - _, - ) in callset_project_pairs( - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - ) + for project_guid in self.project_guids ], ), hl.read_table(self.output().path).updates, @@ -110,11 +67,8 @@ def update_table(self, ht: hl.Table) -> hl.Table: callset_ht = get_callset_ht( self.reference_genome, self.dataset_type, - self.callset_paths, + self.callset_path, self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, ) # new_variants_ht consists of variants present in the new callset, fully annotated, # but NOT present in the existing annotations table. @@ -142,20 +96,8 @@ def update_table(self, ht: hl.Table) -> hl.Table: return ht.annotate_globals( updates=ht.updates.union( { - hl.Struct(callset=callset_path, project_guid=project_guid) - for ( - callset_path, - project_guid, - _, - _, - _, - ) in callset_project_pairs( - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - ) + hl.Struct(callset=self.callset_path, project_guid=project_guid) + for project_guid in self.project_guids }, ), ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index d80d6cb87..411c0acf5 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -155,7 +155,7 @@ def test_missing_pedigree( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_SNV_INDEL_VCF], + callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=['bad_pedigree'], @@ -189,7 +189,7 @@ def test_missing_interval_reference( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_SNV_INDEL_VCF], + callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], @@ -351,7 +351,7 @@ def test_multiple_update_vat( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_SNV_INDEL_VCF], + callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], @@ -403,7 +403,7 @@ def test_multiple_update_vat( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_SNV_INDEL_VCF], + callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0114_project4'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_4], @@ -662,7 +662,7 @@ def test_update_vat_grch37( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_SNV_INDEL_VCF], + callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], @@ -735,7 +735,7 @@ def test_update_vat_without_accessing_private_datasets( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_SNV_INDEL_VCF], + callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], @@ -793,7 +793,7 @@ def test_mito_update_vat( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, sample_type=SampleType.WGS, - callset_paths=[TEST_MITO_MT], + callset_path=TEST_MITO_MT, project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], @@ -1058,7 +1058,7 @@ def test_sv_update_vat( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SV, sample_type=SampleType.WGS, - callset_paths=[TEST_SV_VCF], + callset_path=TEST_SV_VCF, project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], @@ -1620,7 +1620,7 @@ def test_gcnv_update_vat( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.GCNV, sample_type=SampleType.WES, - callset_paths=[TEST_GCNV_BED_FILE], + callset_path=TEST_GCNV_BED_FILE, project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index 73400983f..ce4c8679c 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -1,8 +1,9 @@ import hail as hl import luigi +import luigi.util -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import family_table_path +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.update_project_table import ( @@ -10,26 +11,11 @@ ) +@luigi.util.inherits(BaseLoadingRunParams) class WriteFamilyTableTask(BaseWriteTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_path = luigi.Parameter() project_guid = luigi.Parameter() project_remap_path = luigi.Parameter() project_pedigree_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - is_new_gcnv_joint_call = luigi.BoolParameter( - description='Is this a fully joint-called callset.', - ) family_guid = luigi.Parameter() def output(self) -> luigi.Target: @@ -51,20 +37,7 @@ def complete(self) -> bool: ) def requires(self) -> luigi.Task: - return UpdateProjectTableTask( - self.reference_genome, - self.dataset_type, - self.project_guid, - self.sample_type, - self.callset_path, - self.project_remap_path, - self.project_pedigree_path, - self.imputed_sex_path, - self.ignore_missing_samples_when_remapping, - self.validate, - False, - self.is_new_gcnv_joint_call, - ) + return self.clone(UpdateProjectTableTask, force=False) def create_table(self) -> hl.Table: project_ht = hl.read_table(self.input().path) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 4779b5c42..cc3e80545 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -2,20 +2,22 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.annotations.rdc_dependencies import ( get_rdc_annotation_dependencies, ) from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks -from v03_pipeline.lib.misc.callsets import callset_project_pairs, get_callset_ht +from v03_pipeline.lib.misc.callsets import get_callset_ht from v03_pipeline.lib.misc.math import constrain -from v03_pipeline.lib.model import Env, ReferenceDatasetCollection, SampleType +from v03_pipeline.lib.model import Env, ReferenceDatasetCollection from v03_pipeline.lib.paths import ( new_variants_table_path, variant_annotations_table_path, ) from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import load_gencode +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, ) @@ -36,29 +38,11 @@ GENCODE_RELEASE = 42 +@luigi.util.inherits(BaseLoadingRunParams) class WriteNewVariantsTableTask(BaseWriteTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() project_pedigree_paths = luigi.ListParameter() - imputed_sex_paths = luigi.ListParameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - liftover_ref_path = luigi.OptionalParameter( - default='gs://hail-common/references/grch38_to_grch37.over.chain.gz', - description='Path to GRCh38 to GRCh37 coordinates file', - ) run_id = luigi.Parameter() @property @@ -81,14 +65,14 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: if Env.REFERENCE_DATA_AUTO_UPDATE: - upstream_table_tasks = [ + requirements = [ UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( self.reference_genome, self.dataset_type, ), ] else: - upstream_table_tasks = [ + requirements = [ BaseUpdateVariantAnnotationsTableTask( self.reference_genome, self.dataset_type, @@ -96,55 +80,33 @@ def requires(self) -> list[luigi.Task]: ] if self.dataset_type.has_lookup_table: # NB: the lookup table task has remapped and subsetted callset tasks as dependencies. - upstream_table_tasks.extend( - [ - UpdateLookupTableTask( - self.reference_genome, - self.dataset_type, - self.sample_type, - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - self.ignore_missing_samples_when_remapping, - self.validate, - self.force, - ), - ], - ) + # Also note that force is passed here, + requirements = [ + *requirements, + self.clone(UpdateLookupTableTask), + ] else: - upstream_table_tasks.extend( + requirements.extend( [ - WriteRemappedAndSubsettedCallsetTask( - self.reference_genome, - self.dataset_type, - self.sample_type, - callset_path, - project_guid, - project_remap_path, - project_pedigree_path, - imputed_sex_path, - self.ignore_missing_samples_when_remapping, - self.validate, - False, + self.clone( + WriteRemappedAndSubsettedCallsetTask, + project_guid=project_guid, + project_remap_path=project_remap_path, + project_pedigree_path=project_pedigree_path, ) for ( - callset_path, project_guid, project_remap_path, project_pedigree_path, - imputed_sex_path, - ) in callset_project_pairs( - self.callset_paths, + ) in zip( self.project_guids, self.project_remap_paths, self.project_pedigree_paths, - self.imputed_sex_paths, + strict=False, ) ], ) - return upstream_table_tasks + return requirements def complete(self) -> bool: return super().complete() and hl.eval( @@ -153,23 +115,11 @@ def complete(self) -> bool: [ updates.contains( hl.Struct( - callset=callset_path, + callset=self.callset_path, project_guid=project_guid, ), ) - for ( - callset_path, - project_guid, - _, - _, - _, - ) in callset_project_pairs( - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - ) + for project_guid in self.project_guids ], ), hl.read_table(self.output().path).updates, @@ -180,11 +130,8 @@ def create_table(self) -> hl.Table: callset_ht = get_callset_ht( self.reference_genome, self.dataset_type, - self.callset_paths, + self.callset_path, self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, ) # 1) Identify new variants. @@ -260,19 +207,7 @@ def create_table(self) -> hl.Table: return new_variants_ht.select_globals( updates={ - hl.Struct(callset=callset_path, project_guid=project_guid) - for ( - callset_path, - project_guid, - _, - _, - _, - ) in callset_project_pairs( - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - ) + hl.Struct(callset=self.callset_path, project_guid=project_guid) + for project_guid in self.project_guids }, ) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables.py b/v03_pipeline/lib/tasks/write_project_family_tables.py index b7d83cd49..26253a1da 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables.py @@ -1,35 +1,18 @@ import hail as hl import luigi +import luigi.util -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask +@luigi.util.inherits(BaseLoadingRunParams) class WriteProjectFamilyTablesTask(BaseHailTableTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_path = luigi.Parameter() project_guid = luigi.Parameter() project_remap_path = luigi.Parameter() project_pedigree_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - is_new_gcnv_joint_call = luigi.BoolParameter( - default=False, - description='Is this a fully joint-called callset.', - ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -47,27 +30,14 @@ def complete(self) -> bool: def run(self): # https://luigi.readthedocs.io/en/stable/tasks.html#dynamic-dependencies - update_project_table_task: luigi.Target = yield UpdateProjectTableTask( - self.reference_genome, - self.dataset_type, - self.project_guid, - self.sample_type, - self.callset_path, - self.project_remap_path, - self.project_pedigree_path, - self.imputed_sex_path, - self.ignore_missing_samples_when_remapping, - self.validate, - False, - self.is_new_gcnv_joint_call, + update_project_table_task: luigi.Target = yield self.clone( + UpdateProjectTableTask, + force=False, ) project_ht = hl.read_table(update_project_table_task.path) family_guids = hl.eval(project_ht.globals.family_guids) for family_guid in family_guids: self.dynamic_write_family_table_tasks.add( - WriteFamilyTableTask( - **self.param_kwargs, - family_guid=family_guid, - ), + self.clone(WriteFamilyTableTask, family_guid=family_guid), ) yield self.dynamic_write_family_table_tasks diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index 86e4eb500..6b943c643 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -1,5 +1,6 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.methods.relatedness import call_relatedness from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env From 3d317cfb1fa90002250d2500b43270ccb15290a1 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 09:50:06 -0400 Subject: [PATCH 24/49] format --- v03_pipeline/lib/tasks/update_lookup_table.py | 59 ++++--------------- .../lib/tasks/update_lookup_table_test.py | 4 +- .../lib/tasks/update_project_table.py | 36 +---------- 3 files changed, 16 insertions(+), 83 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index 0c389b713..0d707f918 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -1,5 +1,6 @@ import hail as hl import luigi +import luigi.util from v03_pipeline.lib.misc.callsets import callset_project_pairs from v03_pipeline.lib.misc.lookup import ( @@ -7,8 +8,8 @@ join_lookup_hts, remove_family_guids, ) -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.model.constants import PROJECTS_EXCLUDED_FROM_LOOKUP +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_lookup_table import ( BaseUpdateLookupTableTask, ) @@ -17,25 +18,11 @@ ) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateLookupTableTask(BaseUpdateLookupTableTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_paths = luigi.ListParameter() project_guids = luigi.ListParameter() project_remap_paths = luigi.ListParameter() project_pedigree_paths = luigi.ListParameter() - imputed_sex_paths = luigi.ListParameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) def complete(self) -> bool: return ( @@ -47,23 +34,11 @@ def complete(self) -> bool: [ updates.contains( hl.Struct( - callset=callset_path, + callset=self.callset_path, project_guid=project_guid, ), ) - for ( - callset_path, - project_guid, - _, - _, - _, - ) in callset_project_pairs( - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - ) + for project_guid in self.project_guids ], ), hl.read_table(self.output().path).updates, @@ -73,31 +48,21 @@ def complete(self) -> bool: def requires(self) -> list[luigi.Task]: return [ - WriteRemappedAndSubsettedCallsetTask( - self.reference_genome, - self.dataset_type, - self.sample_type, - callset_path, - project_guid, - project_remap_path, - project_pedigree_path, - imputed_sex_path, - self.ignore_missing_samples_when_remapping, - self.validate, - False, + self.clone( + WriteRemappedAndSubsettedCallsetTask, + project_guid=project_guid, + project_remap_path=project_remap_path, + project_pedigree_path=project_pedigree_path, ) for ( - callset_path, project_guid, project_remap_path, project_pedigree_path, - imputed_sex_path, - ) in callset_project_pairs( - self.callset_paths, + ) in zip( self.project_guids, self.project_remap_paths, self.project_pedigree_paths, - self.imputed_sex_paths, + strict=False, ) ] diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index a81fe6a35..7aaa6329a 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -19,7 +19,7 @@ def test_skip_update_lookup_table_task(self) -> None: reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_VCF], + callset_path=TEST_VCF, project_guids=[ 'R0555_seqr_demo', ], # a project excluded from the lookup table @@ -52,7 +52,7 @@ def test_update_lookup_table_task(self) -> None: reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WGS, - callset_paths=[TEST_VCF], + callset_path=TEST_VCF, project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index 18ae203e4..c7ea539e4 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -8,39 +8,19 @@ join_family_entries_hts, remove_family_guids, ) -from v03_pipeline.lib.model import SampleType +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_project_table import ( BaseUpdateProjectTableTask, ) from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( WriteRemappedAndSubsettedCallsetTask, ) -from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams @luigi.util.inherits(BaseLoadingRunParams) class UpdateProjectTableTask(BaseUpdateProjectTableTask): - sample_type = luigi.EnumParameter(enum=SampleType) - callset_path = luigi.Parameter() project_remap_path = luigi.Parameter() project_pedigree_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter(default=None) - ignore_missing_samples_when_remapping = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - validate = luigi.BoolParameter( - default=True, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - force = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) - is_new_gcnv_joint_call = luigi.BoolParameter( - default=False, - description='Is this a fully joint-called callset.', - ) def complete(self) -> bool: return ( @@ -54,19 +34,7 @@ def complete(self) -> bool: ) def requires(self) -> luigi.Task: - return WriteRemappedAndSubsettedCallsetTask( - self.reference_genome, - self.dataset_type, - self.sample_type, - self.callset_path, - self.project_guid, - self.project_remap_path, - self.project_pedigree_path, - self.imputed_sex_path, - self.ignore_missing_samples_when_remapping, - self.validate, - False, - ) + return self.clone(WriteRemappedAndSubsettedCallsetTask, force=False) def update_table(self, ht: hl.Table) -> hl.Table: callset_mt = hl.read_matrix_table(self.input().path) From eff732a84bdfb3c9107c148344ca8cb7d7cec22e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 09:56:18 -0400 Subject: [PATCH 25/49] remove pairing --- v03_pipeline/lib/tasks/update_lookup_table.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index 0d707f918..fc26b53e5 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -2,7 +2,6 @@ import luigi import luigi.util -from v03_pipeline.lib.misc.callsets import callset_project_pairs from v03_pipeline.lib.misc.lookup import ( compute_callset_lookup_ht, join_lookup_hts, @@ -53,6 +52,7 @@ def requires(self) -> list[luigi.Task]: project_guid=project_guid, project_remap_path=project_remap_path, project_pedigree_path=project_pedigree_path, + force=False, ) for ( project_guid, @@ -69,15 +69,7 @@ def requires(self) -> list[luigi.Task]: def update_table(self, ht: hl.Table) -> hl.Table: # NB: there's a chance this many hail operations blows the DAG compute stack # in an unfortunate way. Please keep an eye out! - for i, (callset_path, project_guid, _, _, _) in enumerate( - callset_project_pairs( - self.callset_paths, - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - self.imputed_sex_paths, - ), - ): + for i, project_guid in enumerate(self.project_guids): if project_guid in PROJECTS_EXCLUDED_FROM_LOOKUP: ht = ht.annotate_globals( updates=ht.updates.add( From f39db4654341c3e17b69194ac26154f305ae02db Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 10:03:29 -0400 Subject: [PATCH 26/49] another --- v03_pipeline/lib/tasks/update_lookup_table.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index fc26b53e5..139b89b62 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -74,7 +74,7 @@ def update_table(self, ht: hl.Table) -> hl.Table: ht = ht.annotate_globals( updates=ht.updates.add( hl.Struct( - callset=callset_path, + callset=self.callset_path, project_guid=project_guid, ), ), @@ -100,7 +100,7 @@ def update_table(self, ht: hl.Table) -> hl.Table: project_families=ht.project_families, updates=ht.updates.add( hl.Struct( - callset=callset_path, + callset=self.callset_path, project_guid=project_guid, ), ), From 4cde8f438d79bca4d08466e8fedce01b5f0cecc3 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 11:04:57 -0400 Subject: [PATCH 27/49] missing test liftover --- v03_pipeline/lib/tasks/base/base_loading_run_params.py | 6 ++++-- v03_pipeline/lib/tasks/update_lookup_table_test.py | 3 +++ v03_pipeline/lib/tasks/update_project_table_test.py | 2 ++ v03_pipeline/lib/tasks/write_family_table_test.py | 4 ++++ v03_pipeline/lib/tasks/write_project_family_tables_test.py | 2 ++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index b41a26d73..1bfa204be 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -10,11 +10,13 @@ class BaseLoadingRunParams(luigi.Task): # but nothing else. sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter( + # HINT: OptionalParameter vs Parameter is significant here. + # The default Parameter will case `None` to the string "None". + imputed_sex_path = luigi.OptionalParameter( default=None, description='Optional path to a tsv of imputed sex values from the DRAGEN GVS pipeline.', ) - filters_path = luigi.Parameter( + filters_path = luigi.OptionalParameter( default=None, description='Optional path to part two outputs from callset (VCF shards containing filter information)', ) diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index 7aaa6329a..dc5f22059 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -7,6 +7,7 @@ ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' @@ -26,6 +27,7 @@ def test_skip_update_lookup_table_task(self) -> None: project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], validate=False, + liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) worker.run() @@ -57,6 +59,7 @@ def test_update_lookup_table_task(self) -> None: project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], validate=False, + liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index bc92b762f..07278a2d3 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -5,6 +5,7 @@ from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' @@ -22,6 +23,7 @@ def test_update_project_table_task(self) -> None: project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, validate=False, + liftover_ref_path=TEST_LIFTOVER, ) worker.add(upt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 1e205aa52..554735c59 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -5,6 +5,7 @@ from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_GCNV_BED_FILE = 'v03_pipeline/var/test/callsets/gcnv_1.tsv' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' @@ -26,6 +27,7 @@ def test_snv_write_family_table_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_3, family_guid='abc_1', validate=False, + liftover_ref_path=TEST_LIFTOVER, ) worker.add(wft_task) worker.run() @@ -163,6 +165,7 @@ def test_sv_write_family_table_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', validate=False, + liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) worker.run() @@ -415,6 +418,7 @@ def test_gcnv_write_family_table_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', validate=False, + liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index d3b713224..9943771d2 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -7,6 +7,7 @@ ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv' @@ -24,6 +25,7 @@ def test_snv_write_project_family_tables_task(self) -> None: project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, validate=False, + liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_project_family_tables) worker.run() From 391073f6a60e5f2ad5ce186ffee6be8848bbe3e1 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 10:18:39 -0400 Subject: [PATCH 28/49] ruff --- v03_pipeline/lib/tasks/update_lookup_table.py | 1 - v03_pipeline/lib/tasks/write_family_table.py | 1 - v03_pipeline/lib/tasks/write_imported_callset.py | 2 +- v03_pipeline/lib/tasks/write_new_variants_table.py | 2 +- v03_pipeline/lib/tasks/write_project_family_tables.py | 1 - v03_pipeline/lib/tasks/write_relatedness_check_table.py | 2 +- v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py | 1 - 7 files changed, 3 insertions(+), 7 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index 6b1797357..139b89b62 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -7,7 +7,6 @@ join_lookup_hts, remove_family_guids, ) -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.model.constants import PROJECTS_EXCLUDED_FROM_LOOKUP from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_lookup_table import ( diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index 1746d294b..ce4c8679c 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -2,7 +2,6 @@ import luigi import luigi.util -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import family_table_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 4df4086aa..9cd146af4 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -16,7 +16,7 @@ validate_sample_type, ) from v03_pipeline.lib.misc.vets import annotate_vets -from v03_pipeline.lib.model import CachedReferenceDatasetQuery, SampleType +from v03_pipeline.lib.model import CachedReferenceDatasetQuery from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 633d159fd..d58867e70 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -11,7 +11,7 @@ from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks from v03_pipeline.lib.misc.callsets import get_callset_ht from v03_pipeline.lib.misc.math import constrain -from v03_pipeline.lib.model import Env, ReferenceDatasetCollection, SampleType +from v03_pipeline.lib.model import Env, ReferenceDatasetCollection from v03_pipeline.lib.paths import ( new_variants_table_path, variant_annotations_table_path, diff --git a/v03_pipeline/lib/tasks/write_project_family_tables.py b/v03_pipeline/lib/tasks/write_project_family_tables.py index be7c4bcb9..26253a1da 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables.py @@ -2,7 +2,6 @@ import luigi import luigi.util -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index 3140c2d84..6b943c643 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -3,7 +3,7 @@ import luigi.util from v03_pipeline.lib.methods.relatedness import call_relatedness -from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env, SampleType +from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, relatedness_check_table_path, diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index a222ea1f7..81e5adf93 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -15,7 +15,6 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples -from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask From 965782325ae4c9ac9465a00a04a936b45c7ed9b7 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 11:01:37 -0400 Subject: [PATCH 29/49] ruff --- v03_pipeline/lib/tasks/write_new_variants_table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index d58867e70..d35dccc71 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -20,6 +20,7 @@ load_gencode_ensembl_to_refseq_id, load_gencode_gene_symbol_to_gene_id, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, ) From 907eee4edfa946bfc0d96b557fcee7c3c95c6f5e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 12:45:42 -0400 Subject: [PATCH 30/49] use predetermined filters/imputed_sex paths --- v03_pipeline/lib/model/environment.py | 18 ++++++--- v03_pipeline/lib/paths.py | 37 +++++++++++++++++++ v03_pipeline/lib/paths_test.py | 31 ++++++++++++++++ .../lib/tasks/base/base_loading_run_params.py | 10 ----- .../lib/tasks/write_imported_callset.py | 22 ++++++++--- .../lib/tasks/write_sex_check_table.py | 11 ++++-- 6 files changed, 105 insertions(+), 24 deletions(-) diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index d89567d8b..831f430c6 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -2,10 +2,6 @@ from dataclasses import dataclass # NB: using os.environ.get inside the dataclass defaults gives a lint error. -ACCESS_PRIVATE_REFERENCE_DATASETS = ( - os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' -) -REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1' HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108 HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data') LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp') @@ -19,22 +15,32 @@ ) VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None) VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None) -SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' + +# Allele registry secrets :/ ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None) PROJECT_ID = os.environ.get('PROJECT_ID', None) +# Feature Flags +ACCESS_PRIVATE_REFERENCE_DATASETS = ( + os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' +) +EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' +REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1' +SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' + @dataclass class Env: ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME - REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE + EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA LOADING_DATASETS: str = LOADING_DATASETS PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS PROJECT_ID: str | None = PROJECT_ID REFERENCE_DATASETS: str = REFERENCE_DATASETS + REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH VEP_CONFIG_URI: str | None = VEP_CONFIG_URI diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 14482d831..c6f987a77 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -1,5 +1,6 @@ import hashlib import os +import re from v03_pipeline.lib.model import ( AccessControl, @@ -9,6 +10,7 @@ PipelineVersion, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) @@ -73,6 +75,22 @@ def family_table_path( ) +def imputed_sex_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + callset_path: str, +) -> str: + return os.path.join( + _v03_pipeline_prefix( + Env.LOADING_DATASETS, + reference_genome, + dataset_type, + ), + 'imputed_sex', + f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv', + ) + + def imported_callset_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, @@ -198,6 +216,25 @@ def sex_check_table_path( ) +def valid_filters_path( + dataset_type: DatasetType, + sample_type: SampleType, + callset_path: str, +) -> str | None: + if ( + not Env.EXPECT_WES_FILTERS + or dataset_type != DatasetType.SNV_INDEL + or sample_type != SampleType.WES + or 'part_one_outputs' not in callset_path + ): + return None + return re.sub( + 'part_one_outputs/.*$', + 'part_two_outputs/*.filtered.*.vcf.gz', + callset_path, + ) + + def valid_reference_dataset_collection_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index d6f0b10ba..ccb0c47c7 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -6,11 +6,13 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, family_table_path, imported_callset_path, + imputed_sex_path, lookup_table_path, metadata_for_run_path, new_variants_table_path, @@ -18,6 +20,7 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, + valid_filters_path, valid_reference_dataset_collection_path, variant_annotations_table_path, ) @@ -54,6 +57,24 @@ def test_family_table_path(self) -> None: 'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht', ) + def test_valid_filters_path(self) -> None: + self.assertEqual( + valid_filters_path( + SampleType.WES, + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', + ), + None, + ) + with patch('v03_pipeline.lib.paths.Env') as mock_env: + mock_env.EXPECT_WES_FILTERS = True + self.assertEqual( + valid_filters_path( + SampleType.WES, + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', + ), + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz', + ) + def test_project_table_path(self) -> None: self.assertEqual( project_table_path( @@ -162,6 +183,16 @@ def test_imported_callset_path(self) -> None: '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', ) + def test_imputed_sex_path(self) -> None: + self.assertEqual( + imputed_sex_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + 'gs://abc.efg/callset.vcf.gz', + ), + '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv', + ) + def test_new_variants_table_path(self) -> None: self.assertEqual( new_variants_table_path( diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 1bfa204be..f24a0efbf 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -10,16 +10,6 @@ class BaseLoadingRunParams(luigi.Task): # but nothing else. sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() - # HINT: OptionalParameter vs Parameter is significant here. - # The default Parameter will case `None` to the string "None". - imputed_sex_path = luigi.OptionalParameter( - default=None, - description='Optional path to a tsv of imputed sex values from the DRAGEN GVS pipeline.', - ) - filters_path = luigi.OptionalParameter( - default=None, - description='Optional path to part two outputs from callset (VCF shards containing filter information)', - ) ignore_missing_samples_when_remapping = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 9cd146af4..b2ff5c356 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -16,12 +16,13 @@ validate_sample_type, ) from v03_pipeline.lib.misc.vets import annotate_vets -from v03_pipeline.lib.model import CachedReferenceDatasetQuery +from v03_pipeline.lib.model import CachedReferenceDatasetQuery, DatasetType, SampleType from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, imported_callset_path, sex_check_table_path, + valid_filters_path, ) from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask @@ -53,10 +54,18 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] - if self.filters_path: + if ( + Env.EXPECT_WES_FILTERS + and self.dataset_type == DatasetType.SNV_INDEL + and self.sample_type == SampleType.WES + ): requirements = [ *requirements, - CallsetTask(self.filters_path), + CallsetTask( + valid_filters_path( + self.dataset_type, self.sample_type, self.callset_path, + ), + ), ] if self.validate and self.dataset_type.can_run_validation: requirements = [ @@ -108,11 +117,14 @@ def additional_row_fields(self, mt): } def create_table(self) -> hl.MatrixTable: + filters_path = valid_filters_path( + self.dataset_type, self.sample_type, self.callset_path, + ) mt = import_callset( self.callset_path, self.reference_genome, self.dataset_type, - self.filters_path, + filters_path, ) mt = select_relevant_fields( mt, @@ -174,6 +186,6 @@ def create_table(self) -> hl.MatrixTable: ) return mt.annotate_globals( callset_path=self.callset_path, - filters_path=self.filters_path or hl.missing(hl.tstr), + filters_path=filters_path or hl.missing(hl.tstr), sample_type=self.sample_type.value, ) diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py index 801330689..b8b4bb9e7 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table.py @@ -2,14 +2,13 @@ import luigi from v03_pipeline.lib.misc.io import import_imputed_sex -from v03_pipeline.lib.paths import sex_check_table_path +from v03_pipeline.lib.paths import imputed_sex_path, sex_check_table_path from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask class WriteSexCheckTableTask(BaseWriteTask): callset_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter() def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -21,7 +20,13 @@ def output(self) -> luigi.Target: ) def requires(self) -> luigi.Task: - return RawFileTask(self.imputed_sex_path) + return RawFileTask( + imputed_sex_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) def create_table(self) -> hl.Table: return import_imputed_sex(self.input().path) From f1a8e63481372ff31581baf7288901aa895ff1be Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 12:51:13 -0400 Subject: [PATCH 31/49] ruff --- v03_pipeline/lib/tasks/write_imported_callset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index b2ff5c356..2b6f59e1a 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -63,7 +63,9 @@ def requires(self) -> list[luigi.Task]: *requirements, CallsetTask( valid_filters_path( - self.dataset_type, self.sample_type, self.callset_path, + self.dataset_type, + self.sample_type, + self.callset_path, ), ), ] @@ -118,7 +120,9 @@ def additional_row_fields(self, mt): def create_table(self) -> hl.MatrixTable: filters_path = valid_filters_path( - self.dataset_type, self.sample_type, self.callset_path, + self.dataset_type, + self.sample_type, + self.callset_path, ) mt = import_callset( self.callset_path, From 0270a88a506613be1d3cc8257583d8492b95a6df Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 13:03:13 -0400 Subject: [PATCH 32/49] formalize --- v03_pipeline/lib/model/dataset_type.py | 8 +++++++- v03_pipeline/lib/paths.py | 3 +-- v03_pipeline/lib/tasks/write_imported_callset.py | 5 ++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index b376a6ae8..e7af6983f 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -4,7 +4,7 @@ import hail as hl from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv -from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType MITO_MIN_HOM_THRESHOLD = 0.95 ZERO = 0.0 @@ -155,6 +155,12 @@ def has_gencode_ensembl_to_refseq_id_mapping( self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38 ) + def expect_filters( + self, + sample_type: SampleType, + ) -> bool: + return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES + @property def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool: return self == DatasetType.SV diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index c6f987a77..3ab830e5f 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -223,8 +223,7 @@ def valid_filters_path( ) -> str | None: if ( not Env.EXPECT_WES_FILTERS - or dataset_type != DatasetType.SNV_INDEL - or sample_type != SampleType.WES + or not dataset_type.expect_filters(sample_type) or 'part_one_outputs' not in callset_path ): return None diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 2b6f59e1a..9961f5edf 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -16,7 +16,7 @@ validate_sample_type, ) from v03_pipeline.lib.misc.vets import annotate_vets -from v03_pipeline.lib.model import CachedReferenceDatasetQuery, DatasetType, SampleType +from v03_pipeline.lib.model import CachedReferenceDatasetQuery from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, @@ -56,8 +56,7 @@ def requires(self) -> list[luigi.Task]: requirements = [] if ( Env.EXPECT_WES_FILTERS - and self.dataset_type == DatasetType.SNV_INDEL - and self.sample_type == SampleType.WES + and self.dataset_type.expect_filters(self.sample_type) ): requirements = [ *requirements, From e33306a2409a23698747457c6093818d1b2484d9 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 13:13:53 -0400 Subject: [PATCH 33/49] lint --- v03_pipeline/lib/tasks/write_imported_callset.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 9961f5edf..e049ad0e7 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -54,9 +54,8 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] - if ( - Env.EXPECT_WES_FILTERS - and self.dataset_type.expect_filters(self.sample_type) + if Env.EXPECT_WES_FILTERS and self.dataset_type.expect_filters( + self.sample_type ): requirements = [ *requirements, From a9c485b62ba52154508a1c958ebad325e6ca301b Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 13:16:42 -0400 Subject: [PATCH 34/49] lint --- v03_pipeline/lib/tasks/write_imported_callset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index e049ad0e7..87b42ac9e 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -55,7 +55,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] if Env.EXPECT_WES_FILTERS and self.dataset_type.expect_filters( - self.sample_type + self.sample_type, ): requirements = [ *requirements, From ac83cd0d3900f20f670632be9fae309dae7ae03b Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 14:30:19 -0400 Subject: [PATCH 35/49] Fix arg --- v03_pipeline/lib/paths_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index ccb0c47c7..b09b8f387 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -60,6 +60,7 @@ def test_family_table_path(self) -> None: def test_valid_filters_path(self) -> None: self.assertEqual( valid_filters_path( + DatasetType.SNV_INDEL, SampleType.WES, 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', ), @@ -69,6 +70,7 @@ def test_valid_filters_path(self) -> None: mock_env.EXPECT_WES_FILTERS = True self.assertEqual( valid_filters_path( + DatasetType.SNV_INDEL, SampleType.WES, 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', ), From 27c6bd8acfc4c638b77cbc3cc2e376db335821f7 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 14:52:56 -0400 Subject: [PATCH 36/49] Change parameters again --- v03_pipeline/lib/model/environment.py | 2 -- .../lib/tasks/base/base_loading_run_params.py | 14 +++++++++----- v03_pipeline/lib/tasks/update_lookup_table_test.py | 2 -- .../lib/tasks/update_project_table_test.py | 1 - ...iant_annotations_table_with_new_samples_test.py | 11 ++--------- v03_pipeline/lib/tasks/write_family_table_test.py | 3 --- v03_pipeline/lib/tasks/write_imported_callset.py | 12 ++++++------ .../lib/tasks/write_metadata_for_run_test.py | 2 -- .../lib/tasks/write_project_family_tables_test.py | 1 - .../tasks/write_remapped_and_subsetted_callset.py | 4 ++-- .../write_remapped_and_subsetted_callset_test.py | 6 ++---- 11 files changed, 21 insertions(+), 37 deletions(-) diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 831f430c6..e6695828a 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -24,7 +24,6 @@ ACCESS_PRIVATE_REFERENCE_DATASETS = ( os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' ) -EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1' SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' @@ -33,7 +32,6 @@ class Env: ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME - EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA LOADING_DATASETS: str = LOADING_DATASETS diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index f24a0efbf..08cc8e065 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -14,16 +14,20 @@ class BaseLoadingRunParams(luigi.Task): default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - validate = luigi.BoolParameter( + force = luigi.BoolParameter( + default=False, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) + skip_check_sex_and_relatedness = luigi.BoolParameter( default=True, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - force = luigi.BoolParameter( - default=False, + skip_expect_filters = luigi.BoolParameter( + default=True, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - check_sex_and_relatedness = luigi.BoolParameter( - default=False, + skip_validation = luigi.BoolParameter( + default=True, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) is_new_gcnv_joint_call = luigi.BoolParameter( diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index dc5f22059..c6ee96df7 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -26,7 +26,6 @@ def test_skip_update_lookup_table_task(self) -> None: ], # a project excluded from the lookup table project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) @@ -58,7 +57,6 @@ def test_update_lookup_table_task(self) -> None: project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 07278a2d3..0ab93f469 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -22,7 +22,6 @@ def test_update_project_table_task(self) -> None: project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, - validate=False, liftover_ref_path=TEST_LIFTOVER, ) worker.add(upt_task) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index a00d5e3ab..94aff28ed 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -163,7 +163,6 @@ def test_missing_pedigree( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=['bad_pedigree'], - validate=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -197,7 +196,6 @@ def test_missing_interval_reference( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -366,7 +364,7 @@ def test_multiple_update_vat( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=True, + skip_validation=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -418,7 +416,7 @@ def test_multiple_update_vat( project_guids=['R0114_project4'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_4], - validate=True, + skip_validation=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -689,7 +687,6 @@ def test_update_vat_grch37( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -769,7 +766,6 @@ def test_update_vat_without_accessing_private_datasets( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -827,7 +823,6 @@ def test_mito_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - validate=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -1092,7 +1087,6 @@ def test_sv_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - validate=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) @@ -1654,7 +1648,6 @@ def test_gcnv_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - validate=False, liftover_ref_path=TEST_LIFTOVER, run_id=TEST_RUN_ID, ) diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 554735c59..637104cc3 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -26,7 +26,6 @@ def test_snv_write_family_table_task(self) -> None: project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, family_guid='abc_1', - validate=False, liftover_ref_path=TEST_LIFTOVER, ) worker.add(wft_task) @@ -164,7 +163,6 @@ def test_sv_write_family_table_task(self) -> None: project_remap_path='not_a_real_file', project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', - validate=False, liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) @@ -417,7 +415,6 @@ def test_gcnv_write_family_table_task(self) -> None: project_remap_path='not_a_real_file', project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', - validate=False, liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 87b42ac9e..666114658 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -54,7 +54,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] - if Env.EXPECT_WES_FILTERS and self.dataset_type.expect_filters( + if not self.skip_expect_filters and self.dataset_type.expect_filters( self.sample_type, ): requirements = [ @@ -67,7 +67,7 @@ def requires(self) -> list[luigi.Task]: ), ), ] - if self.validate and self.dataset_type.can_run_validation: + if not self.skip_validation and self.dataset_type.can_run_validation: requirements = [ *requirements, ( @@ -86,7 +86,7 @@ def requires(self) -> list[luigi.Task]: ), ] if ( - self.check_sex_and_relatedness + not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): requirements = [ @@ -102,7 +102,7 @@ def additional_row_fields(self, mt): return { **( {'info.AF': hl.tarray(hl.tfloat64)} - if self.check_sex_and_relatedness + if not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness else {} ), @@ -154,7 +154,7 @@ def create_table(self) -> hl.MatrixTable: mt.locus.contig, ), ) - if self.validate and self.dataset_type.can_run_validation: + if not self.skip_validation and self.dataset_type.can_run_validation: validate_allele_type(mt) validate_no_duplicate_variants(mt) validate_expected_contig_frequency(mt, self.reference_genome) @@ -172,7 +172,7 @@ def create_table(self) -> hl.MatrixTable: self.sample_type, ) if ( - self.check_sex_and_relatedness + not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): sex_check_ht = hl.read_table( diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index cf61fcc4f..6c16782ed 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -23,8 +23,6 @@ def test_write_metadata_for_run_task(self) -> None: project_guids=['R0113_test_project', 'R0114_project4'], project_remap_paths=[TEST_REMAP_2, TEST_REMAP_2], project_pedigree_paths=[TEST_PEDIGREE_3, TEST_PEDIGREE_4], - validate=False, - check_sex_and_relatedness=False, run_id='run_123456', ) worker.add(write_metadata_for_run_task) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 9943771d2..18d818656 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -24,7 +24,6 @@ def test_snv_write_project_family_tables_task(self) -> None: project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, - validate=False, liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_project_family_tables) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 81e5adf93..ec468eb2c 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -53,7 +53,7 @@ def requires(self) -> list[luigi.Task]: RawFileTask(self.project_pedigree_path), ] if ( - self.check_sex_and_relatedness + not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): requirements = [ @@ -88,7 +88,7 @@ def create_table(self) -> hl.MatrixTable: families_failed_relatedness_check = {} families_failed_sex_check = {} if ( - self.check_sex_and_relatedness + not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): relatedness_check_ht = hl.read_table(self.input()[2].path) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 6cfd95098..a8e277dbb 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -82,8 +82,7 @@ def test_write_remapped_and_subsetted_callset_task( project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, - validate=False, - check_sex_and_relatedness=True, + skip_check_sex_and_relatedness=False, ) worker.add(wrsc_task) worker.run() @@ -116,8 +115,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( project_guid='R0114_project4', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, - validate=False, - check_sex_and_relatedness=True, + skip_check_sex_and_relatedness=False, ) worker.add(wrsc_task) worker.run() From 1840e1ea72e880b2ec8bb29c8bf46877a0fd4434 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 15:29:21 -0400 Subject: [PATCH 37/49] Update a bunch of args --- v03_pipeline/lib/paths.py | 3 +-- v03_pipeline/lib/paths_test.py | 20 +++++++++---------- .../lib/tasks/base/base_loading_run_params.py | 6 +++--- .../lib/tasks/update_lookup_table_test.py | 2 ++ .../lib/tasks/update_project_table_test.py | 1 + ...annotations_table_with_new_samples_test.py | 20 +++++++++++++++++-- .../lib/tasks/write_family_table_test.py | 6 ++++++ .../lib/tasks/write_metadata_for_run_test.py | 2 ++ .../tasks/write_project_family_tables_test.py | 2 ++ ...ite_remapped_and_subsetted_callset_test.py | 2 ++ 10 files changed, 46 insertions(+), 18 deletions(-) diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 3ab830e5f..8d5ff6335 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -222,8 +222,7 @@ def valid_filters_path( callset_path: str, ) -> str | None: if ( - not Env.EXPECT_WES_FILTERS - or not dataset_type.expect_filters(sample_type) + not dataset_type.expect_filters(sample_type) or 'part_one_outputs' not in callset_path ): return None diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index b09b8f387..50a080dc0 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -60,22 +60,20 @@ def test_family_table_path(self) -> None: def test_valid_filters_path(self) -> None: self.assertEqual( valid_filters_path( - DatasetType.SNV_INDEL, + DatasetType.MITO, SampleType.WES, 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', ), None, ) - with patch('v03_pipeline.lib.paths.Env') as mock_env: - mock_env.EXPECT_WES_FILTERS = True - self.assertEqual( - valid_filters_path( - DatasetType.SNV_INDEL, - SampleType.WES, - 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', - ), - 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz', - ) + self.assertEqual( + valid_filters_path( + DatasetType.SNV_INDEL, + SampleType.WES, + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', + ), + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz', + ) def test_project_table_path(self) -> None: self.assertEqual( diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 08cc8e065..06fdcc967 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -19,15 +19,15 @@ class BaseLoadingRunParams(luigi.Task): parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) skip_check_sex_and_relatedness = luigi.BoolParameter( - default=True, + default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) skip_expect_filters = luigi.BoolParameter( - default=True, + default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) skip_validation = luigi.BoolParameter( - default=True, + default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) is_new_gcnv_joint_call = luigi.BoolParameter( diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index c6ee96df7..6dac008e7 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -26,6 +26,7 @@ def test_skip_update_lookup_table_task(self) -> None: ], # a project excluded from the lookup table project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], + skip_validation=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) @@ -57,6 +58,7 @@ def test_update_lookup_table_task(self) -> None: project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], + skip_validation=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 0ab93f469..ff3f93c1f 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -22,6 +22,7 @@ def test_update_project_table_task(self) -> None: project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, + skip_validation=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(upt_task) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 94aff28ed..9991524a7 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -164,6 +164,8 @@ def test_missing_pedigree( project_remap_paths=[TEST_REMAP], project_pedigree_paths=['bad_pedigree'], liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -197,6 +199,8 @@ def test_missing_interval_reference( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -364,8 +368,9 @@ def test_multiple_update_vat( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - skip_validation=False, liftover_ref_path=TEST_LIFTOVER, + skip_validation=False, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_3) @@ -416,8 +421,9 @@ def test_multiple_update_vat( project_guids=['R0114_project4'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_4], - skip_validation=False, liftover_ref_path=TEST_LIFTOVER, + skip_validation=False, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_4) @@ -688,6 +694,8 @@ def test_update_vat_grch37( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -767,6 +775,8 @@ def test_update_vat_without_accessing_private_datasets( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -824,6 +834,8 @@ def test_mito_update_vat( project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) ) @@ -1088,6 +1100,8 @@ def test_sv_update_vat( project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) ) @@ -1649,6 +1663,8 @@ def test_gcnv_update_vat( project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) ) diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 637104cc3..2b455349d 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -26,6 +26,8 @@ def test_snv_write_family_table_task(self) -> None: project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, family_guid='abc_1', + skip_validation=True, + skip_check_sex_and_relatedness=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(wft_task) @@ -163,6 +165,8 @@ def test_sv_write_family_table_task(self) -> None: project_remap_path='not_a_real_file', project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', + skip_validation=True, + skip_check_sex_and_relatedness=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) @@ -415,6 +419,8 @@ def test_gcnv_write_family_table_task(self) -> None: project_remap_path='not_a_real_file', project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', + skip_validation=True, + skip_check_sex_and_relatedness=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index 6c16782ed..1feca2434 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -23,6 +23,8 @@ def test_write_metadata_for_run_task(self) -> None: project_guids=['R0113_test_project', 'R0114_project4'], project_remap_paths=[TEST_REMAP_2, TEST_REMAP_2], project_pedigree_paths=[TEST_PEDIGREE_3, TEST_PEDIGREE_4], + skip_check_sex_and_relatedness=True, + skip_validation=True, run_id='run_123456', ) worker.add(write_metadata_for_run_task) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 18d818656..2642fe486 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -24,6 +24,8 @@ def test_snv_write_project_family_tables_task(self) -> None: project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, + skip_validation=True, + skip_check_sex_and_relatedness=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_project_family_tables) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index a8e277dbb..c6576ba8a 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -82,6 +82,7 @@ def test_write_remapped_and_subsetted_callset_task( project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, + skip_validation=True, skip_check_sex_and_relatedness=False, ) worker.add(wrsc_task) @@ -115,6 +116,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( project_guid='R0114_project4', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, + skip_validation=True, skip_check_sex_and_relatedness=False, ) worker.add(wrsc_task) From a686ce8e5afef1726aa539e6f3dc413882da14b9 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 18:10:13 -0400 Subject: [PATCH 38/49] missed a few --- v03_pipeline/lib/tasks/update_lookup_table_test.py | 2 ++ v03_pipeline/lib/tasks/update_project_table_test.py | 1 + 2 files changed, 3 insertions(+) diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index 6dac008e7..ae28f33da 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -27,6 +27,7 @@ def test_skip_update_lookup_table_task(self) -> None: project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, + skip_check_sex_and_relatedness=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) @@ -59,6 +60,7 @@ def test_update_lookup_table_task(self) -> None: project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, + skip_check_sex_and_relatedness=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index ff3f93c1f..e45c98b1b 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -23,6 +23,7 @@ def test_update_project_table_task(self) -> None: project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, skip_validation=True, + skip_check_sex_and_relatedness=True, liftover_ref_path=TEST_LIFTOVER, ) worker.add(upt_task) From 47c581ecd91718a2b5b66f06a3f3e246c3fc0819 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 18:37:38 -0400 Subject: [PATCH 39/49] fix liftover --- v03_pipeline/lib/model/environment.py | 4 ++++ v03_pipeline/lib/tasks/base/base_loading_run_params.py | 4 ---- v03_pipeline/lib/tasks/update_lookup_table_test.py | 3 --- v03_pipeline/lib/tasks/update_project_table_test.py | 2 -- ..._variant_annotations_table_with_new_samples_test.py | 10 ---------- v03_pipeline/lib/tasks/write_family_table_test.py | 4 ---- v03_pipeline/lib/tasks/write_new_variants_table.py | 1 + .../lib/tasks/write_project_family_tables_test.py | 2 -- 8 files changed, 5 insertions(+), 25 deletions(-) diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index e6695828a..dd4e3bb3a 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -4,6 +4,9 @@ # NB: using os.environ.get inside the dataclass defaults gives a lint error. HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108 HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data') +LIFTOVER_REF_PATH = os.environ.get( + 'LIFTOVER_REF_PATH', 'gs://hail-common/references/grch38_to_grch37.over.chain.gz' +) LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp') PRIVATE_REFERENCE_DATASETS = os.environ.get( 'PRIVATE_REFERENCE_DATASETS', @@ -34,6 +37,7 @@ class Env: ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA + LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH LOADING_DATASETS: str = LOADING_DATASETS PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS PROJECT_ID: str | None = PROJECT_ID diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 06fdcc967..f5ce3b3e4 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -34,7 +34,3 @@ class BaseLoadingRunParams(luigi.Task): default=False, description='Is this a fully joint-called callset.', ) - liftover_ref_path = luigi.OptionalParameter( - default='gs://hail-common/references/grch38_to_grch37.over.chain.gz', - description='Path to GRCh38 to GRCh37 coordinates file', - ) diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index ae28f33da..758b6392c 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -7,7 +7,6 @@ ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' @@ -28,7 +27,6 @@ def test_skip_update_lookup_table_task(self) -> None: project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, skip_check_sex_and_relatedness=True, - liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) worker.run() @@ -61,7 +59,6 @@ def test_update_lookup_table_task(self) -> None: project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, skip_check_sex_and_relatedness=True, - liftover_ref_path=TEST_LIFTOVER, ) worker.add(uslt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index e45c98b1b..053be9cc1 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -5,7 +5,6 @@ from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' @@ -24,7 +23,6 @@ def test_update_project_table_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_3, skip_validation=True, skip_check_sex_and_relatedness=True, - liftover_ref_path=TEST_LIFTOVER, ) worker.add(upt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 9991524a7..90d067043 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -43,7 +43,6 @@ from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase from v03_pipeline.var.test.vep.mock_vep_data import MOCK_37_VEP_DATA, MOCK_38_VEP_DATA -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' @@ -163,7 +162,6 @@ def test_missing_pedigree( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=['bad_pedigree'], - liftover_ref_path=TEST_LIFTOVER, skip_validation=True, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -198,7 +196,6 @@ def test_missing_interval_reference( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - liftover_ref_path=TEST_LIFTOVER, skip_validation=True, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -368,7 +365,6 @@ def test_multiple_update_vat( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - liftover_ref_path=TEST_LIFTOVER, skip_validation=False, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -421,7 +417,6 @@ def test_multiple_update_vat( project_guids=['R0114_project4'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_4], - liftover_ref_path=TEST_LIFTOVER, skip_validation=False, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -693,7 +688,6 @@ def test_update_vat_grch37( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - liftover_ref_path=TEST_LIFTOVER, skip_validation=True, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -774,7 +768,6 @@ def test_update_vat_without_accessing_private_datasets( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - liftover_ref_path=TEST_LIFTOVER, skip_validation=True, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -833,7 +826,6 @@ def test_mito_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - liftover_ref_path=TEST_LIFTOVER, skip_validation=True, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -1099,7 +1091,6 @@ def test_sv_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - liftover_ref_path=TEST_LIFTOVER, skip_validation=True, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, @@ -1662,7 +1653,6 @@ def test_gcnv_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - liftover_ref_path=TEST_LIFTOVER, skip_validation=True, skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 2b455349d..516578348 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -5,7 +5,6 @@ from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_GCNV_BED_FILE = 'v03_pipeline/var/test/callsets/gcnv_1.tsv' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' @@ -28,7 +27,6 @@ def test_snv_write_family_table_task(self) -> None: family_guid='abc_1', skip_validation=True, skip_check_sex_and_relatedness=True, - liftover_ref_path=TEST_LIFTOVER, ) worker.add(wft_task) worker.run() @@ -167,7 +165,6 @@ def test_sv_write_family_table_task(self) -> None: family_guid='family_2_1', skip_validation=True, skip_check_sex_and_relatedness=True, - liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) worker.run() @@ -421,7 +418,6 @@ def test_gcnv_write_family_table_task(self) -> None: family_guid='family_2_1', skip_validation=True, skip_check_sex_and_relatedness=True, - liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_family_table_task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index d35dccc71..b07a7785f 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -62,6 +62,7 @@ def annotation_dependencies(self) -> dict[str, hl.Table]: deps['gencode_gene_symbol_to_gene_id_mapping'] = hl.literal( load_gencode_gene_symbol_to_gene_id(GENCODE_RELEASE, ''), ) + deps['liftover_ref_path'] = Env.LIFTOVER_REF_PATH return deps def output(self) -> luigi.Target: diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 2642fe486..37bb9a556 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -7,7 +7,6 @@ ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv' @@ -26,7 +25,6 @@ def test_snv_write_project_family_tables_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_4, skip_validation=True, skip_check_sex_and_relatedness=True, - liftover_ref_path=TEST_LIFTOVER, ) worker.add(write_project_family_tables) worker.run() From 41a23a579ba1b4b986fe5b44b0355f48f5b0f6dd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 18:52:32 -0400 Subject: [PATCH 40/49] ruff --- v03_pipeline/lib/model/environment.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index dd4e3bb3a..6a6a5c70e 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -5,7 +5,8 @@ HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108 HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data') LIFTOVER_REF_PATH = os.environ.get( - 'LIFTOVER_REF_PATH', 'gs://hail-common/references/grch38_to_grch37.over.chain.gz' + 'LIFTOVER_REF_PATH', + 'gs://hail-common/references/grch38_to_grch37.over.chain.gz', ) LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp') PRIVATE_REFERENCE_DATASETS = os.environ.get( From e5b446dfdc439469044713e52f07ff43b70a5364 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 19:10:22 -0400 Subject: [PATCH 41/49] reformat filters annotation --- v03_pipeline/lib/misc/io.py | 4 ---- .../lib/tasks/write_imported_callset.py | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index 91d07e851..992e2585c 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -121,7 +121,6 @@ def import_callset( callset_path: str, reference_genome: ReferenceGenome, dataset_type: DatasetType, - filters_path: str | None = None, ) -> hl.MatrixTable: if dataset_type == DatasetType.GCNV: mt = import_gcnv_bed_file(callset_path) @@ -131,9 +130,6 @@ def import_callset( mt = hl.read_matrix_table(callset_path) if dataset_type == DatasetType.SV: mt = mt.annotate_rows(variant_id=mt.rsid) - if filters_path: - filters_ht = import_vcf(filters_path, reference_genome).rows() - mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 666114658..d995a16c7 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -4,6 +4,7 @@ from v03_pipeline.lib.misc.io import ( import_callset, + import_vcf, select_relevant_fields, split_multi_hts, ) @@ -117,17 +118,22 @@ def additional_row_fields(self, mt): } def create_table(self) -> hl.MatrixTable: - filters_path = valid_filters_path( - self.dataset_type, - self.sample_type, - self.callset_path, - ) mt = import_callset( self.callset_path, self.reference_genome, self.dataset_type, - filters_path, ) + filters_path = None + if not self.skip_expect_filters and self.dataset_type.expect_filters( + self.sample_type, + ): + filters_path = valid_filters_path( + self.dataset_type, + self.sample_type, + self.callset_path, + ) + filters_ht = import_vcf(filters_path, self.reference_genome).rows() + mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) mt = select_relevant_fields( mt, self.dataset_type, From 3cd656b5733e6f5d4c9f7d1b7688a67051f52dd7 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 21:26:11 -0400 Subject: [PATCH 42/49] add env vars too! --- v03_pipeline/lib/model/environment.py | 4 ++++ v03_pipeline/lib/paths.py | 3 ++- v03_pipeline/lib/paths_test.py | 18 ++++++++------- .../lib/tasks/update_lookup_table_test.py | 2 -- .../lib/tasks/update_project_table_test.py | 1 - ...annotations_table_with_new_samples_test.py | 9 -------- .../lib/tasks/write_family_table_test.py | 3 --- .../lib/tasks/write_imported_callset.py | 22 ++++++++++++++----- .../lib/tasks/write_metadata_for_run_test.py | 1 - .../write_remapped_and_subsetted_callset.py | 4 +++- 10 files changed, 35 insertions(+), 32 deletions(-) diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index 6a6a5c70e..d12201bef 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -28,6 +28,8 @@ ACCESS_PRIVATE_REFERENCE_DATASETS = ( os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' ) +CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' +EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1' SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' @@ -36,6 +38,8 @@ class Env: ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME + CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS + EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 8d5ff6335..3caffa956 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -222,6 +222,7 @@ def valid_filters_path( callset_path: str, ) -> str | None: if ( + not Env.EXPECT_WES_FILTERS or not dataset_type.expect_filters(sample_type) or 'part_one_outputs' not in callset_path ): @@ -240,7 +241,7 @@ def valid_reference_dataset_collection_path( ) -> str | None: if ( not Env.ACCESS_PRIVATE_REFERENCE_DATASETS - and reference_dataset_collection.access_control == AccessControl.PRIVATE + or reference_dataset_collection.access_control == AccessControl.PUBLIC ): return None return os.path.join( diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index 50a080dc0..ff437cf45 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -66,14 +66,16 @@ def test_valid_filters_path(self) -> None: ), None, ) - self.assertEqual( - valid_filters_path( - DatasetType.SNV_INDEL, - SampleType.WES, - 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', - ), - 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz', - ) + with patch('v03_pipeline.lib.paths.Env') as mock_env: + mock_env.EXPECT_WES_FILTERS = True + self.assertEqual( + valid_filters_path( + DatasetType.SNV_INDEL, + SampleType.WES, + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', + ), + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz', + ) def test_project_table_path(self) -> None: self.assertEqual( diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index 758b6392c..8551d873e 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -26,7 +26,6 @@ def test_skip_update_lookup_table_task(self) -> None: project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, - skip_check_sex_and_relatedness=True, ) worker.add(uslt_task) worker.run() @@ -58,7 +57,6 @@ def test_update_lookup_table_task(self) -> None: project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, - skip_check_sex_and_relatedness=True, ) worker.add(uslt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 053be9cc1..c0a5a4e57 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -22,7 +22,6 @@ def test_update_project_table_task(self) -> None: project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, skip_validation=True, - skip_check_sex_and_relatedness=True, ) worker.add(upt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 90d067043..3d723cb3a 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -163,7 +163,6 @@ def test_missing_pedigree( project_remap_paths=[TEST_REMAP], project_pedigree_paths=['bad_pedigree'], skip_validation=True, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -197,7 +196,6 @@ def test_missing_interval_reference( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -366,7 +364,6 @@ def test_multiple_update_vat( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=False, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_3) @@ -418,7 +415,6 @@ def test_multiple_update_vat( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_4], skip_validation=False, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_4) @@ -689,7 +685,6 @@ def test_update_vat_grch37( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -769,7 +764,6 @@ def test_update_vat_without_accessing_private_datasets( project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], skip_validation=True, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -827,7 +821,6 @@ def test_mito_update_vat( project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], skip_validation=True, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) ) @@ -1092,7 +1085,6 @@ def test_sv_update_vat( project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], skip_validation=True, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) ) @@ -1654,7 +1646,6 @@ def test_gcnv_update_vat( project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], skip_validation=True, - skip_check_sex_and_relatedness=True, run_id=TEST_RUN_ID, ) ) diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 516578348..3adc96901 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -26,7 +26,6 @@ def test_snv_write_family_table_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_3, family_guid='abc_1', skip_validation=True, - skip_check_sex_and_relatedness=True, ) worker.add(wft_task) worker.run() @@ -164,7 +163,6 @@ def test_sv_write_family_table_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', skip_validation=True, - skip_check_sex_and_relatedness=True, ) worker.add(write_family_table_task) worker.run() @@ -417,7 +415,6 @@ def test_gcnv_write_family_table_task(self) -> None: project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', skip_validation=True, - skip_check_sex_and_relatedness=True, ) worker.add(write_family_table_task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index d995a16c7..be42fb750 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -55,8 +55,12 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] - if not self.skip_expect_filters and self.dataset_type.expect_filters( - self.sample_type, + if ( + Env.EXPECT_WES_FILTERS + and not self.skip_expect_filters + and self.dataset_type.expect_filters( + self.sample_type, + ) ): requirements = [ *requirements, @@ -87,7 +91,8 @@ def requires(self) -> list[luigi.Task]: ), ] if ( - not self.skip_check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): requirements = [ @@ -124,8 +129,12 @@ def create_table(self) -> hl.MatrixTable: self.dataset_type, ) filters_path = None - if not self.skip_expect_filters and self.dataset_type.expect_filters( - self.sample_type, + if ( + Env.EXPECT_WES_FILTERS + and not self.skip_expect_filters + and self.dataset_type.expect_filters( + self.sample_type, + ) ): filters_path = valid_filters_path( self.dataset_type, @@ -178,7 +187,8 @@ def create_table(self) -> hl.MatrixTable: self.sample_type, ) if ( - not self.skip_check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): sex_check_ht = hl.read_table( diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index 1feca2434..f5d733a79 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -23,7 +23,6 @@ def test_write_metadata_for_run_task(self) -> None: project_guids=['R0113_test_project', 'R0114_project4'], project_remap_paths=[TEST_REMAP_2, TEST_REMAP_2], project_pedigree_paths=[TEST_PEDIGREE_3, TEST_PEDIGREE_4], - skip_check_sex_and_relatedness=True, skip_validation=True, run_id='run_123456', ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index ec468eb2c..6d4753680 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -15,6 +15,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples +from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask @@ -88,7 +89,8 @@ def create_table(self) -> hl.MatrixTable: families_failed_relatedness_check = {} families_failed_sex_check = {} if ( - not self.skip_check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): relatedness_check_ht = hl.read_table(self.input()[2].path) From bab5b5e51cffc45407ef9d26485685344e38d05d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 21:34:48 -0400 Subject: [PATCH 43/49] ruff --- v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 6d4753680..fbd3c6c4a 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -54,6 +54,7 @@ def requires(self) -> list[luigi.Task]: RawFileTask(self.project_pedigree_path), ] if ( + Env.CHECK_SEX_AND_RELATEDNESS and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): From c6816afdc7460b26274e53bfd729ce672fe92415 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 21:37:04 -0400 Subject: [PATCH 44/49] ruff --- v03_pipeline/lib/paths.py | 4 ++-- .../lib/tasks/write_remapped_and_subsetted_callset.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 3caffa956..629898af2 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -222,8 +222,8 @@ def valid_filters_path( callset_path: str, ) -> str | None: if ( - not Env.EXPECT_WES_FILTERS or - not dataset_type.expect_filters(sample_type) + not Env.EXPECT_WES_FILTERS + or not dataset_type.expect_filters(sample_type) or 'part_one_outputs' not in callset_path ): return None diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index fbd3c6c4a..f5e9eb48e 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -54,8 +54,8 @@ def requires(self) -> list[luigi.Task]: RawFileTask(self.project_pedigree_path), ] if ( - Env.CHECK_SEX_AND_RELATEDNESS and - not self.skip_check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): requirements = [ From 56c71251ec5c59219063cace82087e160919e781 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 21:49:45 -0400 Subject: [PATCH 45/49] Fix logic --- v03_pipeline/lib/paths.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 629898af2..3ab830e5f 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -241,7 +241,7 @@ def valid_reference_dataset_collection_path( ) -> str | None: if ( not Env.ACCESS_PRIVATE_REFERENCE_DATASETS - or reference_dataset_collection.access_control == AccessControl.PUBLIC + and reference_dataset_collection.access_control == AccessControl.PRIVATE ): return None return os.path.join( From 8bfb13b2611f59477a4ca0ea485d69d5cd718eb8 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Jun 2024 22:18:52 -0400 Subject: [PATCH 46/49] add env mock --- .../lib/tasks/write_remapped_and_subsetted_callset_test.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index c6576ba8a..48f2b481a 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -1,4 +1,5 @@ import shutil +from unittest.mock import Mock, patch import hail as hl import luigi.worker @@ -83,7 +84,6 @@ def test_write_remapped_and_subsetted_callset_task( project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, skip_validation=True, - skip_check_sex_and_relatedness=False, ) worker.add(wrsc_task) worker.run() @@ -104,9 +104,12 @@ def test_write_remapped_and_subsetted_callset_task( ], ) + @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.Env') def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( self, + mock_env: Mock, ) -> None: + mock_env.CHECK_SEX_AND_RELATEDNESS = True worker = luigi.worker.Worker() wrsc_task = WriteRemappedAndSubsettedCallsetTask( reference_genome=ReferenceGenome.GRCh38, @@ -117,7 +120,6 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, skip_validation=True, - skip_check_sex_and_relatedness=False, ) worker.add(wrsc_task) worker.run() From 7056e34138b31a89ee5842601f0de984e5f2a33f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 21 Jun 2024 10:21:51 -0400 Subject: [PATCH 47/49] Update environment.py --- v03_pipeline/lib/model/environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index d12201bef..4681939ca 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -46,8 +46,8 @@ class Env: LOADING_DATASETS: str = LOADING_DATASETS PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS PROJECT_ID: str | None = PROJECT_ID - REFERENCE_DATASETS: str = REFERENCE_DATASETS REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE + REFERENCE_DATASETS: str = REFERENCE_DATASETS SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH VEP_CONFIG_URI: str | None = VEP_CONFIG_URI From 9fddc8769ed40b178b4dc7465378634945f914cd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 27 Jun 2024 12:01:28 -0400 Subject: [PATCH 48/49] update zip --- v03_pipeline/lib/tasks/update_lookup_table.py | 2 +- v03_pipeline/lib/tasks/write_metadata_for_run.py | 2 +- v03_pipeline/lib/tasks/write_new_variants_table.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index 139b89b62..eb04d1d48 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -62,7 +62,7 @@ def requires(self) -> list[luigi.Task]: self.project_guids, self.project_remap_paths, self.project_pedigree_paths, - strict=False, + strict=True, ) ] diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index c3e3f0542..b67cb1496 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -44,7 +44,7 @@ def requires(self) -> list[luigi.Task]: self.project_guids, self.project_remap_paths, self.project_pedigree_paths, - strict=False, + strict=True ) ] diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index b07a7785f..b70dc2a6f 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -113,7 +113,7 @@ def requires(self) -> list[luigi.Task]: self.project_guids, self.project_remap_paths, self.project_pedigree_paths, - strict=False, + strict=True, ) ], ) From 8cf46a0c4a7130e2c3a26118093337238d898e2f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 27 Jun 2024 12:05:17 -0400 Subject: [PATCH 49/49] ruff --- v03_pipeline/lib/tasks/write_metadata_for_run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index b67cb1496..69e470bb7 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -44,7 +44,7 @@ def requires(self) -> list[luigi.Task]: self.project_guids, self.project_remap_paths, self.project_pedigree_paths, - strict=True + strict=True, ) ]