diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index 91d07e851..992e2585c 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -121,7 +121,6 @@ def import_callset( callset_path: str, reference_genome: ReferenceGenome, dataset_type: DatasetType, - filters_path: str | None = None, ) -> hl.MatrixTable: if dataset_type == DatasetType.GCNV: mt = import_gcnv_bed_file(callset_path) @@ -131,9 +130,6 @@ def import_callset( mt = hl.read_matrix_table(callset_path) if dataset_type == DatasetType.SV: mt = mt.annotate_rows(variant_id=mt.rsid) - if filters_path: - filters_ht = import_vcf(filters_path, reference_genome).rows() - mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index b376a6ae8..e7af6983f 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -4,7 +4,7 @@ import hail as hl from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv -from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType MITO_MIN_HOM_THRESHOLD = 0.95 ZERO = 0.0 @@ -155,6 +155,12 @@ def has_gencode_ensembl_to_refseq_id_mapping( self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38 ) + def expect_filters( + self, + sample_type: SampleType, + ) -> bool: + return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES + @property def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool: return self == DatasetType.SV diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py index d89567d8b..d12201bef 100644 --- a/v03_pipeline/lib/model/environment.py +++ b/v03_pipeline/lib/model/environment.py @@ -2,12 +2,12 @@ from dataclasses import dataclass # NB: using os.environ.get inside the dataclass defaults gives a lint error. -ACCESS_PRIVATE_REFERENCE_DATASETS = ( - os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' -) -REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1' HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108 HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data') +LIFTOVER_REF_PATH = os.environ.get( + 'LIFTOVER_REF_PATH', + 'gs://hail-common/references/grch38_to_grch37.over.chain.gz', +) LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp') PRIVATE_REFERENCE_DATASETS = os.environ.get( 'PRIVATE_REFERENCE_DATASETS', @@ -19,22 +19,35 @@ ) VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None) VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None) -SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' + +# Allele registry secrets :/ ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None) PROJECT_ID = os.environ.get('PROJECT_ID', None) +# Feature Flags +ACCESS_PRIVATE_REFERENCE_DATASETS = ( + os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1' +) +CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1' +EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1' +REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1' +SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1' + @dataclass class Env: ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME - REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE + CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS + EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS HAIL_TMPDIR: str = HAIL_TMPDIR HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA + LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH LOADING_DATASETS: str = LOADING_DATASETS PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS PROJECT_ID: str | None = PROJECT_ID REFERENCE_DATASETS: str = REFERENCE_DATASETS + REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH VEP_CONFIG_URI: str | None = VEP_CONFIG_URI diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 14482d831..3ab830e5f 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -1,5 +1,6 @@ import hashlib import os +import re from v03_pipeline.lib.model import ( AccessControl, @@ -9,6 +10,7 @@ PipelineVersion, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) @@ -73,6 +75,22 @@ def family_table_path( ) +def imputed_sex_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + callset_path: str, +) -> str: + return os.path.join( + _v03_pipeline_prefix( + Env.LOADING_DATASETS, + reference_genome, + dataset_type, + ), + 'imputed_sex', + f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv', + ) + + def imported_callset_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, @@ -198,6 +216,24 @@ def sex_check_table_path( ) +def valid_filters_path( + dataset_type: DatasetType, + sample_type: SampleType, + callset_path: str, +) -> str | None: + if ( + not Env.EXPECT_WES_FILTERS + or not dataset_type.expect_filters(sample_type) + or 'part_one_outputs' not in callset_path + ): + return None + return re.sub( + 'part_one_outputs/.*$', + 'part_two_outputs/*.filtered.*.vcf.gz', + callset_path, + ) + + def valid_reference_dataset_collection_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index d6f0b10ba..ff437cf45 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -6,11 +6,13 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import ( cached_reference_dataset_query_path, family_table_path, imported_callset_path, + imputed_sex_path, lookup_table_path, metadata_for_run_path, new_variants_table_path, @@ -18,6 +20,7 @@ relatedness_check_table_path, remapped_and_subsetted_callset_path, sex_check_table_path, + valid_filters_path, valid_reference_dataset_collection_path, variant_annotations_table_path, ) @@ -54,6 +57,26 @@ def test_family_table_path(self) -> None: 'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht', ) + def test_valid_filters_path(self) -> None: + self.assertEqual( + valid_filters_path( + DatasetType.MITO, + SampleType.WES, + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', + ), + None, + ) + with patch('v03_pipeline.lib.paths.Env') as mock_env: + mock_env.EXPECT_WES_FILTERS = True + self.assertEqual( + valid_filters_path( + DatasetType.SNV_INDEL, + SampleType.WES, + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz', + ), + 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz', + ) + def test_project_table_path(self) -> None: self.assertEqual( project_table_path( @@ -162,6 +185,16 @@ def test_imported_callset_path(self) -> None: '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', ) + def test_imputed_sex_path(self) -> None: + self.assertEqual( + imputed_sex_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + 'gs://abc.efg/callset.vcf.gz', + ), + '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv', + ) + def test_new_variants_table_path(self) -> None: self.assertEqual( new_variants_table_path( diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 1bfa204be..f5ce3b3e4 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -10,29 +10,23 @@ class BaseLoadingRunParams(luigi.Task): # but nothing else. sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() - # HINT: OptionalParameter vs Parameter is significant here. - # The default Parameter will case `None` to the string "None". - imputed_sex_path = luigi.OptionalParameter( - default=None, - description='Optional path to a tsv of imputed sex values from the DRAGEN GVS pipeline.', - ) - filters_path = luigi.OptionalParameter( - default=None, - description='Optional path to part two outputs from callset (VCF shards containing filter information)', - ) ignore_missing_samples_when_remapping = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - validate = luigi.BoolParameter( - default=True, + force = luigi.BoolParameter( + default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - force = luigi.BoolParameter( + skip_check_sex_and_relatedness = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - check_sex_and_relatedness = luigi.BoolParameter( + skip_expect_filters = luigi.BoolParameter( + default=False, + parsing=luigi.BoolParameter.EXPLICIT_PARSING, + ) + skip_validation = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) @@ -40,7 +34,3 @@ class BaseLoadingRunParams(luigi.Task): default=False, description='Is this a fully joint-called callset.', ) - liftover_ref_path = luigi.OptionalParameter( - default='gs://hail-common/references/grch38_to_grch37.over.chain.gz', - description='Path to GRCh38 to GRCh37 coordinates file', - ) diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index dc5f22059..8551d873e 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -7,7 +7,6 @@ ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' @@ -26,8 +25,7 @@ def test_skip_update_lookup_table_task(self) -> None: ], # a project excluded from the lookup table project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, ) worker.add(uslt_task) worker.run() @@ -58,8 +56,7 @@ def test_update_lookup_table_task(self) -> None: project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, ) worker.add(uslt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 07278a2d3..c0a5a4e57 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -5,7 +5,6 @@ from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' @@ -22,8 +21,7 @@ def test_update_project_table_task(self) -> None: project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, ) worker.add(upt_task) worker.run() diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index a00d5e3ab..3d723cb3a 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -43,7 +43,6 @@ from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase from v03_pipeline.var.test.vep.mock_vep_data import MOCK_37_VEP_DATA, MOCK_38_VEP_DATA -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' @@ -163,8 +162,7 @@ def test_missing_pedigree( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=['bad_pedigree'], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -197,8 +195,7 @@ def test_missing_interval_reference( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -366,8 +363,7 @@ def test_multiple_update_vat( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=True, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=False, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_3) @@ -418,8 +414,7 @@ def test_multiple_update_vat( project_guids=['R0114_project4'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_4], - validate=True, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=False, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_4) @@ -689,8 +684,7 @@ def test_update_vat_grch37( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -769,8 +763,7 @@ def test_update_vat_without_accessing_private_datasets( project_guids=['R0113_test_project'], project_remap_paths=[TEST_REMAP], project_pedigree_paths=[TEST_PEDIGREE_3], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -827,8 +820,7 @@ def test_mito_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, run_id=TEST_RUN_ID, ) ) @@ -1092,8 +1084,7 @@ def test_sv_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, run_id=TEST_RUN_ID, ) ) @@ -1654,8 +1645,7 @@ def test_gcnv_update_vat( project_guids=['R0115_test_project2'], project_remap_paths=['not_a_real_file'], project_pedigree_paths=[TEST_PEDIGREE_5], - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, run_id=TEST_RUN_ID, ) ) diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 554735c59..3adc96901 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -5,7 +5,6 @@ from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_GCNV_BED_FILE = 'v03_pipeline/var/test/callsets/gcnv_1.tsv' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' @@ -26,8 +25,7 @@ def test_snv_write_family_table_task(self) -> None: project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, family_guid='abc_1', - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, ) worker.add(wft_task) worker.run() @@ -164,8 +162,7 @@ def test_sv_write_family_table_task(self) -> None: project_remap_path='not_a_real_file', project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, ) worker.add(write_family_table_task) worker.run() @@ -417,8 +414,7 @@ def test_gcnv_write_family_table_task(self) -> None: project_remap_path='not_a_real_file', project_pedigree_path=TEST_PEDIGREE_5, family_guid='family_2_1', - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, ) worker.add(write_family_table_task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 9cd146af4..be42fb750 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -4,6 +4,7 @@ from v03_pipeline.lib.misc.io import ( import_callset, + import_vcf, select_relevant_fields, split_multi_hts, ) @@ -22,6 +23,7 @@ cached_reference_dataset_query_path, imported_callset_path, sex_check_table_path, + valid_filters_path, ) from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask @@ -53,12 +55,24 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [] - if self.filters_path: + if ( + Env.EXPECT_WES_FILTERS + and not self.skip_expect_filters + and self.dataset_type.expect_filters( + self.sample_type, + ) + ): requirements = [ *requirements, - CallsetTask(self.filters_path), + CallsetTask( + valid_filters_path( + self.dataset_type, + self.sample_type, + self.callset_path, + ), + ), ] - if self.validate and self.dataset_type.can_run_validation: + if not self.skip_validation and self.dataset_type.can_run_validation: requirements = [ *requirements, ( @@ -77,7 +91,8 @@ def requires(self) -> list[luigi.Task]: ), ] if ( - self.check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): requirements = [ @@ -93,7 +108,7 @@ def additional_row_fields(self, mt): return { **( {'info.AF': hl.tarray(hl.tfloat64)} - if self.check_sex_and_relatedness + if not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness else {} ), @@ -112,8 +127,22 @@ def create_table(self) -> hl.MatrixTable: self.callset_path, self.reference_genome, self.dataset_type, - self.filters_path, ) + filters_path = None + if ( + Env.EXPECT_WES_FILTERS + and not self.skip_expect_filters + and self.dataset_type.expect_filters( + self.sample_type, + ) + ): + filters_path = valid_filters_path( + self.dataset_type, + self.sample_type, + self.callset_path, + ) + filters_ht = import_vcf(filters_path, self.reference_genome).rows() + mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters) mt = select_relevant_fields( mt, self.dataset_type, @@ -140,7 +169,7 @@ def create_table(self) -> hl.MatrixTable: mt.locus.contig, ), ) - if self.validate and self.dataset_type.can_run_validation: + if not self.skip_validation and self.dataset_type.can_run_validation: validate_allele_type(mt) validate_no_duplicate_variants(mt) validate_expected_contig_frequency(mt, self.reference_genome) @@ -158,7 +187,8 @@ def create_table(self) -> hl.MatrixTable: self.sample_type, ) if ( - self.check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): sex_check_ht = hl.read_table( @@ -174,6 +204,6 @@ def create_table(self) -> hl.MatrixTable: ) return mt.annotate_globals( callset_path=self.callset_path, - filters_path=self.filters_path or hl.missing(hl.tstr), + filters_path=filters_path or hl.missing(hl.tstr), sample_type=self.sample_type.value, ) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index cf61fcc4f..f5d733a79 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -23,8 +23,7 @@ def test_write_metadata_for_run_task(self) -> None: project_guids=['R0113_test_project', 'R0114_project4'], project_remap_paths=[TEST_REMAP_2, TEST_REMAP_2], project_pedigree_paths=[TEST_PEDIGREE_3, TEST_PEDIGREE_4], - validate=False, - check_sex_and_relatedness=False, + skip_validation=True, run_id='run_123456', ) worker.add(write_metadata_for_run_task) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index d35dccc71..b07a7785f 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -62,6 +62,7 @@ def annotation_dependencies(self) -> dict[str, hl.Table]: deps['gencode_gene_symbol_to_gene_id_mapping'] = hl.literal( load_gencode_gene_symbol_to_gene_id(GENCODE_RELEASE, ''), ) + deps['liftover_ref_path'] = Env.LIFTOVER_REF_PATH return deps def output(self) -> luigi.Target: diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 9943771d2..37bb9a556 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -7,7 +7,6 @@ ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv' @@ -24,8 +23,8 @@ def test_snv_write_project_family_tables_task(self) -> None: project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, - validate=False, - liftover_ref_path=TEST_LIFTOVER, + skip_validation=True, + skip_check_sex_and_relatedness=True, ) worker.add(write_project_family_tables) worker.run() diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 81e5adf93..f5e9eb48e 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -15,6 +15,7 @@ ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples +from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask @@ -53,7 +54,8 @@ def requires(self) -> list[luigi.Task]: RawFileTask(self.project_pedigree_path), ] if ( - self.check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): requirements = [ @@ -88,7 +90,8 @@ def create_table(self) -> hl.MatrixTable: families_failed_relatedness_check = {} families_failed_sex_check = {} if ( - self.check_sex_and_relatedness + Env.CHECK_SEX_AND_RELATEDNESS + and not self.skip_check_sex_and_relatedness and self.dataset_type.check_sex_and_relatedness ): relatedness_check_ht = hl.read_table(self.input()[2].path) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 6cfd95098..48f2b481a 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -1,4 +1,5 @@ import shutil +from unittest.mock import Mock, patch import hail as hl import luigi.worker @@ -82,8 +83,7 @@ def test_write_remapped_and_subsetted_callset_task( project_guid='R0113_test_project', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_3, - validate=False, - check_sex_and_relatedness=True, + skip_validation=True, ) worker.add(wrsc_task) worker.run() @@ -104,9 +104,12 @@ def test_write_remapped_and_subsetted_callset_task( ], ) + @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.Env') def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( self, + mock_env: Mock, ) -> None: + mock_env.CHECK_SEX_AND_RELATEDNESS = True worker = luigi.worker.Worker() wrsc_task = WriteRemappedAndSubsettedCallsetTask( reference_genome=ReferenceGenome.GRCh38, @@ -116,8 +119,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( project_guid='R0114_project4', project_remap_path=TEST_REMAP, project_pedigree_path=TEST_PEDIGREE_4, - validate=False, - check_sex_and_relatedness=True, + skip_validation=True, ) worker.add(wrsc_task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py index 801330689..b8b4bb9e7 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table.py @@ -2,14 +2,13 @@ import luigi from v03_pipeline.lib.misc.io import import_imputed_sex -from v03_pipeline.lib.paths import sex_check_table_path +from v03_pipeline.lib.paths import imputed_sex_path, sex_check_table_path from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask class WriteSexCheckTableTask(BaseWriteTask): callset_path = luigi.Parameter() - imputed_sex_path = luigi.Parameter() def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -21,7 +20,13 @@ def output(self) -> luigi.Target: ) def requires(self) -> luigi.Task: - return RawFileTask(self.imputed_sex_path) + return RawFileTask( + imputed_sex_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) def create_table(self) -> hl.Table: return import_imputed_sex(self.input().path)