broadinstitute · bpblanken · Jun 21, 2024 · Jun 20, 2024 · Jun 20, 2024 · Jun 20, 2024
diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py
@@ -121,7 +121,6 @@ def import_callset(
     callset_path: str,
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,
-    filters_path: str | None = None,
 ) -> hl.MatrixTable:
     if dataset_type == DatasetType.GCNV:
         mt = import_gcnv_bed_file(callset_path)
@@ -131,9 +130,6 @@ def import_callset(
         mt = hl.read_matrix_table(callset_path)
     if dataset_type == DatasetType.SV:
         mt = mt.annotate_rows(variant_id=mt.rsid)
-    if filters_path:
-        filters_ht = import_vcf(filters_path, reference_genome).rows()
-        mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
     return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields)
 
 

diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py
@@ -4,7 +4,7 @@
 import hail as hl
 
 from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv
-from v03_pipeline.lib.model.definitions import ReferenceGenome
+from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType
 
 MITO_MIN_HOM_THRESHOLD = 0.95
 ZERO = 0.0
@@ -155,6 +155,12 @@ def has_gencode_ensembl_to_refseq_id_mapping(
             self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38
         )
 
+    def expect_filters(self, sample_type: SampleType) -> bool:
+        """Return True only for SNV_INDEL paired with the WES sample type."""
+        if self != DatasetType.SNV_INDEL:
+            return False
+        return sample_type == SampleType.WES
+
     @property
     def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool:
         return self == DatasetType.SV

diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py
@@ -2,12 +2,12 @@
 from dataclasses import dataclass
 
 # NB: using os.environ.get inside the dataclass defaults gives a lint error.
-ACCESS_PRIVATE_REFERENCE_DATASETS = (
-    os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
-)
-REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
 HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp')  # noqa: S108
 HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data')
+LIFTOVER_REF_PATH = os.environ.get(
+    'LIFTOVER_REF_PATH',
+    'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
+)
 LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp')
 PRIVATE_REFERENCE_DATASETS = os.environ.get(
     'PRIVATE_REFERENCE_DATASETS',
@@ -19,22 +19,35 @@
 )
 VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
 VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)
-SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'
+
+# Allele registry secrets :/
 ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
 PROJECT_ID = os.environ.get('PROJECT_ID', None)
 
+# Feature Flags
+ACCESS_PRIVATE_REFERENCE_DATASETS = (
+    os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
+)
+CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1'
+EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1'
+REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
+SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'
+
 
 @dataclass
 class Env:
     ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS
     ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
-    REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
+    CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
+    EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
     HAIL_TMPDIR: str = HAIL_TMPDIR
     HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
+    LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH
     LOADING_DATASETS: str = LOADING_DATASETS
     PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
     PROJECT_ID: str | None = PROJECT_ID
     REFERENCE_DATASETS: str = REFERENCE_DATASETS
+    REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
     SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
     VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH
     VEP_CONFIG_URI: str | None = VEP_CONFIG_URI
diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py
@@ -1,5 +1,6 @@
 import hashlib
 import os
+import re
 
 from v03_pipeline.lib.model import (
     AccessControl,
@@ -9,6 +10,7 @@
     PipelineVersion,
     ReferenceDatasetCollection,
     ReferenceGenome,
+    SampleType,
 )
 
 
@@ -73,6 +75,22 @@ def family_table_path(
     )
 
 
+def imputed_sex_path(
+    reference_genome: ReferenceGenome,
+    dataset_type: DatasetType,
+    callset_path: str,
+) -> str:
+    """Build the imputed sex tsv path, keyed by the sha256 of callset_path."""
+    loading_prefix = _v03_pipeline_prefix(
+        Env.LOADING_DATASETS,
+        reference_genome,
+        dataset_type,
+    )
+    # Name the file after a digest of the callset path, not the path itself.
+    callset_digest = hashlib.sha256(callset_path.encode('utf8')).hexdigest()
+    return os.path.join(loading_prefix, 'imputed_sex', f'{callset_digest}.tsv')
+
+
 def imported_callset_path(
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,
@@ -198,6 +216,24 @@ def sex_check_table_path(
     )
 
 
+def valid_filters_path(
+    dataset_type: DatasetType,
+    sample_type: SampleType,
+    callset_path: str,
+) -> str | None:
+    """Map a part-one callset path to its part-two filters glob, else None."""
+    if not Env.EXPECT_WES_FILTERS or not dataset_type.expect_filters(sample_type):
+        return None
+    # Use subn so a path that mentions 'part_one_outputs' without the trailing
+    # '/' yields None instead of echoing the callset path back as a filters path.
+    filters_path, n_replaced = re.subn(
+        r'part_one_outputs/.*$',
+        'part_two_outputs/*.filtered.*.vcf.gz',
+        callset_path,
+    )
+    return filters_path if n_replaced else None
+
+
 def valid_reference_dataset_collection_path(
     reference_genome: ReferenceGenome,
     dataset_type: DatasetType,

diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py
@@ -6,18 +6,21 @@
     DatasetType,
     ReferenceDatasetCollection,
     ReferenceGenome,
+    SampleType,
 )
 from v03_pipeline.lib.paths import (
     cached_reference_dataset_query_path,
     family_table_path,
     imported_callset_path,
+    imputed_sex_path,
     lookup_table_path,
     metadata_for_run_path,
     new_variants_table_path,
     project_table_path,
     relatedness_check_table_path,
     remapped_and_subsetted_callset_path,
     sex_check_table_path,
+    valid_filters_path,
     valid_reference_dataset_collection_path,
     variant_annotations_table_path,
 )
@@ -54,6 +57,26 @@ def test_family_table_path(self) -> None:
                 'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
             )
 
+    def test_valid_filters_path(self) -> None:
+        callset_path = (
+            'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz'
+        )
+        # Unpatched Env: this case alone cannot tell the flag from the dataset type.
+        self.assertIsNone(
+            valid_filters_path(DatasetType.MITO, SampleType.WES, callset_path),
+        )
+        with patch('v03_pipeline.lib.paths.Env') as mock_env:
+            mock_env.EXPECT_WES_FILTERS = True
+            # Flag on: a dataset type that does not expect filters still gets None.
+            self.assertIsNone(
+                valid_filters_path(DatasetType.MITO, SampleType.WES, callset_path),
+            )
+            # Flag on, SNV_INDEL + WES: part_one_outputs/... becomes the part-two glob.
+            self.assertEqual(
+                valid_filters_path(DatasetType.SNV_INDEL, SampleType.WES, callset_path),
+                'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
+            )
+
     def test_project_table_path(self) -> None:
         self.assertEqual(
             project_table_path(
@@ -162,6 +185,16 @@ def test_imported_callset_path(self) -> None:
             '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
         )
 
+    def test_imputed_sex_path(self) -> None:
+        # imputed_sex_path names the file by the sha256 hex digest of the callset path.
+        expected_path = '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv'
+        actual_path = imputed_sex_path(
+            ReferenceGenome.GRCh38,
+            DatasetType.SNV_INDEL,
+            'gs://abc.efg/callset.vcf.gz',
+        )
+        self.assertEqual(actual_path, expected_path)
+
     def test_new_variants_table_path(self) -> None:
         self.assertEqual(
             new_variants_table_path(

diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py
@@ -10,37 +10,27 @@ class BaseLoadingRunParams(luigi.Task):
     # but nothing else.
     sample_type = luigi.EnumParameter(enum=SampleType)
     callset_path = luigi.Parameter()
-    # HINT: OptionalParameter vs Parameter is significant here.
-    # The default Parameter will case `None` to the string "None".
-    imputed_sex_path = luigi.OptionalParameter(
-        default=None,
-        description='Optional path to a tsv of imputed sex values from the DRAGEN GVS pipeline.',
-    )
-    filters_path = luigi.OptionalParameter(
-        default=None,
-        description='Optional path to part two outputs from callset (VCF shards containing filter information)',
-    )
     ignore_missing_samples_when_remapping = luigi.BoolParameter(
         default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
     )
-    validate = luigi.BoolParameter(
-        default=True,
+    force = luigi.BoolParameter(
+        default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
     )
-    force = luigi.BoolParameter(
+    skip_check_sex_and_relatedness = luigi.BoolParameter(
         default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
     )
-    check_sex_and_relatedness = luigi.BoolParameter(
+    skip_expect_filters = luigi.BoolParameter(
+        default=False,
+        parsing=luigi.BoolParameter.EXPLICIT_PARSING,
+    )
+    skip_validation = luigi.BoolParameter(
         default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
     )
     is_new_gcnv_joint_call = luigi.BoolParameter(
         default=False,
         description='Is this a fully joint-called callset.',
     )
-    liftover_ref_path = luigi.OptionalParameter(
-        default='gs://hail-common/references/grch38_to_grch37.over.chain.gz',
-        description='Path to GRCh38 to GRCh37 coordinates file',
-    )
diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py
@@ -7,7 +7,6 @@
 )
 from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
 
-TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz'
 TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
 TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
 TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
@@ -26,8 +25,7 @@ def test_skip_update_lookup_table_task(self) -> None:
             ],  # a project excluded from the lookup table
             project_remap_paths=[TEST_REMAP],
             project_pedigree_paths=[TEST_PEDIGREE_3],
-            validate=False,
-            liftover_ref_path=TEST_LIFTOVER,
+            skip_validation=True,
         )
         worker.add(uslt_task)
         worker.run()
@@ -58,8 +56,7 @@ def test_update_lookup_table_task(self) -> None:
             project_guids=['R0113_test_project'],
             project_remap_paths=[TEST_REMAP],
             project_pedigree_paths=[TEST_PEDIGREE_3],
-            validate=False,
-            liftover_ref_path=TEST_LIFTOVER,
+            skip_validation=True,
         )
         worker.add(uslt_task)
         worker.run()

diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py
@@ -5,7 +5,6 @@
 from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
 from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
 
-TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz'
 TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
 TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
 TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
@@ -22,8 +21,7 @@ def test_update_project_table_task(self) -> None:
             project_guid='R0113_test_project',
             project_remap_path=TEST_REMAP,
             project_pedigree_path=TEST_PEDIGREE_3,
-            validate=False,
-            liftover_ref_path=TEST_LIFTOVER,
+            skip_validation=True,
         )
         worker.add(upt_task)
         worker.run()