
Parameter refactoring + cleanup. #809


Merged
56 commits merged on Jul 1, 2024

Commits
4e05a0d
Remove concept of private crdqs
bpblanken Jun 12, 2024
65da04f
lint
bpblanken Jun 12, 2024
f90baef
fix logic
bpblanken Jun 12, 2024
7c8cbb2
Move SampleType out of BaseHailTableTask
bpblanken Jun 12, 2024
4a2f978
cleanup
bpblanken Jun 12, 2024
fb737f8
fix
bpblanken Jun 12, 2024
2e2676b
missed a few!
bpblanken Jun 12, 2024
89754a4
shitshow
bpblanken Jun 12, 2024
3879c68
First one?
bpblanken Jun 12, 2024
2e1620f
flip order here
bpblanken Jun 12, 2024
f93a721
Merge branch 'benb/split_out_sample_type' of github.com:broadinstitut…
bpblanken Jun 12, 2024
df3e259
flip order
bpblanken Jun 12, 2024
d730d8c
flip order
bpblanken Jun 12, 2024
96342b7
Merge branch 'benb/split_out_sample_type' of github.com:broadinstitut…
bpblanken Jun 12, 2024
98cffbb
try sharing these params?
bpblanken Jun 12, 2024
93cadea
that failed :/
bpblanken Jun 12, 2024
cb8b974
more scaffolding
bpblanken Jun 12, 2024
92868de
ruff
bpblanken Jun 12, 2024
ec8b1b2
Another one
bpblanken Jun 12, 2024
68bc804
more hacking
bpblanken Jun 12, 2024
0c22f46
Fix write metadata
bpblanken Jun 13, 2024
4dec7b8
typo
bpblanken Jun 13, 2024
379ae36
missing param
bpblanken Jun 13, 2024
e3cae7a
typo
bpblanken Jun 13, 2024
6e2d31e
making progress
bpblanken Jun 13, 2024
3d317cf
format
bpblanken Jun 13, 2024
eff732a
remove pairing
bpblanken Jun 13, 2024
f39db46
another
bpblanken Jun 13, 2024
e81a3e0
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipeline…
bpblanken Jun 13, 2024
b8e7802
Merge branch 'benb/split_out_sample_type' of github.com:broadinstitut…
bpblanken Jun 13, 2024
4cde8f4
missing test liftover
bpblanken Jun 13, 2024
56c341e
merge
bpblanken Jun 17, 2024
c26fe1d
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipeline…
bpblanken Jun 20, 2024
391073f
ruff
bpblanken Jun 20, 2024
9657823
ruff
bpblanken Jun 20, 2024
907eee4
use predetermined filters/imputed_sex paths
bpblanken Jun 20, 2024
f1a8e63
ruff
bpblanken Jun 20, 2024
0270a88
formalize
bpblanken Jun 20, 2024
e33306a
lint
bpblanken Jun 20, 2024
a9c485b
lint
bpblanken Jun 20, 2024
ac83cd0
Fix arg
bpblanken Jun 20, 2024
27c6bd8
Change parameters again
bpblanken Jun 20, 2024
1840e1e
Update a bunch of args
bpblanken Jun 20, 2024
a686ce8
missed a few
bpblanken Jun 20, 2024
47c581e
fix liftover
bpblanken Jun 20, 2024
41a23a5
ruff
bpblanken Jun 20, 2024
e5b446d
reformat filters annotation
bpblanken Jun 20, 2024
3cd656b
add env vars too!
bpblanken Jun 21, 2024
bab5b5e
ruff
bpblanken Jun 21, 2024
c6816af
ruff
bpblanken Jun 21, 2024
56c7125
Fix logic
bpblanken Jun 21, 2024
8bfb13b
add env mock
bpblanken Jun 21, 2024
7056e34
Update environment.py
bpblanken Jun 21, 2024
9fddc87
update zip
bpblanken Jun 27, 2024
20f3a20
Merge branch 'benb/single_callset_path' of github.com:broadinstitute/…
bpblanken Jun 27, 2024
8cf46a0
ruff
bpblanken Jun 27, 2024
57 changes: 3 additions & 54 deletions v03_pipeline/lib/misc/callsets.py
@@ -6,14 +6,11 @@
from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path


def get_callset_ht( # noqa: PLR0913
def get_callset_ht(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
callset_paths: list[str],
callset_path: str,
project_guids: list[str],
project_remap_paths: list[str],
project_pedigree_paths: list[str],
imputed_sex_paths: list[str] | None,
):
callset_hts = [
hl.read_matrix_table(
Expand All @@ -24,58 +21,10 @@ def get_callset_ht( # noqa: PLR0913
project_guid,
),
).rows()
for (callset_path, project_guid, _, _, _) in callset_project_pairs(
callset_paths,
project_guids,
project_remap_paths,
project_pedigree_paths,
imputed_sex_paths,
)
for project_guid in project_guids
]
callset_ht = functools.reduce(
(lambda ht1, ht2: ht1.union(ht2, unify=True)),
callset_hts,
)
return callset_ht.distinct()


def callset_project_pairs(
callset_paths: list[str],
project_guids: list[str],
project_remap_paths: list[str],
project_pedigree_paths: list[str],
imputed_sex_paths: list[str] | None,
):
if len(callset_paths) == len(project_guids):
return zip(
callset_paths,
project_guids,
project_remap_paths,
project_pedigree_paths,
imputed_sex_paths
if imputed_sex_paths is not None
else [None] * len(callset_paths),
strict=True,
)
return (
(
callset_path,
project_guid,
project_remap_path,
project_pedigree_path,
imputed_sex_path,
)
for callset_path, imputed_sex_path in zip(
callset_paths,
imputed_sex_paths
if imputed_sex_paths is not None
else [None] * len(callset_paths),
strict=False,
)
for (project_guid, project_remap_path, project_pedigree_path) in zip(
project_guids,
project_remap_paths,
project_pedigree_paths,
strict=True,
)
)
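The simplified `get_callset_ht` now pairs a single `callset_path` with every project GUID and unions the per-project tables, replacing the removed `callset_project_pairs` zip machinery. A Hail-free sketch of that shape, using sets of row keys as stand-ins for Hail tables (the function and key names here are illustrative, not the pipeline's real data):

```python
import functools

def get_callset_keys(callset_path: str, project_guids: list[str]) -> set[str]:
    # One "table" of row keys per project; in the real code this is
    # hl.read_matrix_table(remapped_and_subsetted_callset_path(...)).rows()
    # for each project GUID against the one callset.
    per_project = [
        {f'{callset_path}:{guid}:row1', f'{callset_path}:{guid}:row2'}
        for guid in project_guids
    ]
    # functools.reduce over set-union mirrors ht1.union(ht2, unify=True);
    # set semantics stand in for the trailing .distinct().
    return functools.reduce(lambda a, b: a | b, per_project, set())
```

The key simplification: there is no longer a cross-product or strict zip over parallel path lists — every project shares the one callset path.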
4 changes: 0 additions & 4 deletions v03_pipeline/lib/misc/io.py
@@ -121,7 +121,6 @@ def import_callset(
callset_path: str,
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
filters_path: str | None = None,
) -> hl.MatrixTable:
if dataset_type == DatasetType.GCNV:
mt = import_gcnv_bed_file(callset_path)
@@ -131,9 +130,6 @@
mt = hl.read_matrix_table(callset_path)
if dataset_type == DatasetType.SV:
mt = mt.annotate_rows(variant_id=mt.rsid)
if filters_path:
filters_ht = import_vcf(filters_path, reference_genome).rows()
mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields)


8 changes: 7 additions & 1 deletion v03_pipeline/lib/model/dataset_type.py
@@ -4,7 +4,7 @@
import hail as hl

from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv
from v03_pipeline.lib.model.definitions import ReferenceGenome
from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType

MITO_MIN_HOM_THRESHOLD = 0.95
ZERO = 0.0
@@ -155,6 +155,12 @@ def has_gencode_ensembl_to_refseq_id_mapping(
self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38
)

def expect_filters(
self,
sample_type: SampleType,
) -> bool:
return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES

@property
def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool:
return self == DatasetType.SV
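The new `expect_filters` predicate is small enough to sketch with stand-in enums (the real `DatasetType` and `SampleType` live in `v03_pipeline.lib.model` and have more members than shown here):

```python
from enum import Enum

class SampleType(Enum):
    WES = 'WES'
    WGS = 'WGS'

class DatasetType(Enum):
    SNV_INDEL = 'SNV_INDEL'
    MITO = 'MITO'

    def expect_filters(self, sample_type: SampleType) -> bool:
        # Only exome (WES) SNV/indel callsets are expected to ship with a
        # separate filters VCF delivered alongside the main callset.
        return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES
```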
25 changes: 19 additions & 6 deletions v03_pipeline/lib/model/environment.py
@@ -2,12 +2,12 @@
from dataclasses import dataclass

# NB: using os.environ.get inside the dataclass defaults gives a lint error.
ACCESS_PRIVATE_REFERENCE_DATASETS = (
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
)
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108
HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data')
LIFTOVER_REF_PATH = os.environ.get(
'LIFTOVER_REF_PATH',
'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
)
LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp')
PRIVATE_REFERENCE_DATASETS = os.environ.get(
'PRIVATE_REFERENCE_DATASETS',
@@ -19,21 +19,34 @@
)
VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'

# Allele registry secrets :/
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
PROJECT_ID = os.environ.get('PROJECT_ID', None)

# Feature Flags
ACCESS_PRIVATE_REFERENCE_DATASETS = (
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
)
CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1'
EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1'
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'


@dataclass
class Env:
ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS
ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
HAIL_TMPDIR: str = HAIL_TMPDIR
HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH
LOADING_DATASETS: str = LOADING_DATASETS
PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
PROJECT_ID: str | None = PROJECT_ID
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
REFERENCE_DATASETS: str = REFERENCE_DATASETS
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH
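The feature-flag convention grouped under `# Feature Flags` above is uniform: a flag is on only when its environment variable is exactly `'1'`; unset or any other value means off. A minimal sketch of that convention (the `flag` helper is illustrative — the module reads each variable inline rather than through a helper):

```python
import os

def flag(name: str) -> bool:
    # Matches the module's pattern: os.environ.get(NAME) == '1'.
    # Unset variables return None, so they compare falsy; values like
    # 'true' or 'yes' are deliberately NOT treated as enabled.
    return os.environ.get(name) == '1'
```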
36 changes: 36 additions & 0 deletions v03_pipeline/lib/paths.py
@@ -1,5 +1,6 @@
import hashlib
import os
import re

from v03_pipeline.lib.model import (
AccessControl,
@@ -9,6 +10,7 @@
PipelineVersion,
ReferenceDatasetCollection,
ReferenceGenome,
SampleType,
)


@@ -73,6 +75,22 @@ def family_table_path(
)


def imputed_sex_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
callset_path: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
Env.LOADING_DATASETS,
reference_genome,
dataset_type,
),
'imputed_sex',
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv',
)
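Like `imported_callset_path`, the new `imputed_sex_path` keys its per-callset artifact by a SHA-256 digest of the callset path, so the same callset always maps to the same file name. The digest step in isolation (the `callset_hash` helper name is illustrative):

```python
import hashlib

def callset_hash(callset_path: str) -> str:
    # Deterministic 64-hex-character digest of the callset path; used as
    # the file stem for imported_callset_path (.mt) and imputed_sex_path (.tsv).
    return hashlib.sha256(callset_path.encode('utf8')).hexdigest()
```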


def imported_callset_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
@@ -198,6 +216,24 @@ def sex_check_table_path(
)


def valid_filters_path(
dataset_type: DatasetType,
sample_type: SampleType,
callset_path: str,
) -> str | None:
if (
not Env.EXPECT_WES_FILTERS
or not dataset_type.expect_filters(sample_type)
or 'part_one_outputs' not in callset_path
Contributor: what are 'part_one_outputs' and 'part_two_outputs'?

Collaborator (author): @matren395 can give some context here! I think it's just that internal exome callsets have always been delivered in two different folders, and we run an extra step to join the filters part onto the first part!

Contributor: That's what I'd believe this is too, yes! WES callsets are usually delivered in two parts, with two folders, Part_One_Outputs and Part_Two_Outputs. Part_One_Outputs contains the "normal" sharded VCFs, one set of sharded VCFs per chromosome, complete with tons of annotations and information. Part_Two_Outputs is, I believe, just one VCF per chromosome, containing the sites to filter the Part_One VCFs to.

):
return None
return re.sub(
'part_one_outputs/.*$',
'part_two_outputs/*.filtered.*.vcf.gz',
callset_path,
)
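The path rewrite at the heart of `valid_filters_path` is a single regex substitution mapping a part-one callset glob onto the sibling `part_two_outputs` filters glob. It can be exercised standalone (the `filters_path_for` wrapper name is illustrative; the real function also gates on `Env.EXPECT_WES_FILTERS` and `expect_filters`):

```python
import re

def filters_path_for(callset_path: str) -> str:
    # Everything from 'part_one_outputs/' to the end of the path is replaced
    # with the filters glob under the sibling part_two_outputs folder.
    return re.sub(
        'part_one_outputs/.*$',
        'part_two_outputs/*.filtered.*.vcf.gz',
        callset_path,
    )
```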


def valid_reference_dataset_collection_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
33 changes: 33 additions & 0 deletions v03_pipeline/lib/paths_test.py
@@ -6,18 +6,21 @@
DatasetType,
ReferenceDatasetCollection,
ReferenceGenome,
SampleType,
)
from v03_pipeline.lib.paths import (
cached_reference_dataset_query_path,
family_table_path,
imported_callset_path,
imputed_sex_path,
lookup_table_path,
metadata_for_run_path,
new_variants_table_path,
project_table_path,
relatedness_check_table_path,
remapped_and_subsetted_callset_path,
sex_check_table_path,
valid_filters_path,
valid_reference_dataset_collection_path,
variant_annotations_table_path,
)
@@ -54,6 +57,26 @@ def test_family_table_path(self) -> None:
'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
)

def test_valid_filters_path(self) -> None:
self.assertEqual(
valid_filters_path(
DatasetType.MITO,
SampleType.WES,
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
),
None,
)
with patch('v03_pipeline.lib.paths.Env') as mock_env:
mock_env.EXPECT_WES_FILTERS = True
self.assertEqual(
valid_filters_path(
DatasetType.SNV_INDEL,
SampleType.WES,
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
),
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
)

def test_project_table_path(self) -> None:
self.assertEqual(
project_table_path(
@@ -162,6 +185,16 @@ def test_imported_callset_path(self) -> None:
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
)

def test_imputed_sex_path(self) -> None:
self.assertEqual(
imputed_sex_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
'gs://abc.efg/callset.vcf.gz',
),
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv',
)

def test_new_variants_table_path(self) -> None:
self.assertEqual(
new_variants_table_path(
36 changes: 36 additions & 0 deletions v03_pipeline/lib/tasks/base/base_loading_run_params.py
@@ -0,0 +1,36 @@
import luigi

from v03_pipeline.lib.model import SampleType


class BaseLoadingRunParams(luigi.Task):
# NB:
# These params are "inherited" with the special
# luigi.util.inherits function, copying params
# but nothing else.
sample_type = luigi.EnumParameter(enum=SampleType)
callset_path = luigi.Parameter()
ignore_missing_samples_when_remapping = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
force = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
skip_check_sex_and_relatedness = luigi.BoolParameter(
Collaborator (author): Some context on this:

  • I wanted options that defaulted to False here, so that we didn't need to pass a flag for our default case in Airflow (which we are doing now for s&r, and which has led to some confusion).
  • I wanted to limit these to options that would either be manually specified while debugging a run or be enabled depending on sample source.
  • The skip_ prefix is for consistency.
  • There are also env vars for the two features that we don't want to enable for local users, so we wouldn't have to hardcode skip_expect_filters=True and skip_check_sex_and_relatedness=True (though we could).
  • The "env var only" approach doesn't work well because env vars are set at the Dataproc cluster level, and I haven't been able to get job env vars to override cluster env vars correctly (one afternoon already wasted on this a few months ago).

default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
skip_expect_filters = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
skip_validation = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
is_new_gcnv_joint_call = luigi.BoolParameter(
default=False,
description='Is this a fully joint-called callset.',
Contributor: do we also need parsing=luigi.BoolParameter.EXPLICIT_PARSING here?

)
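As the `# NB:` comment in the class notes, `BaseLoadingRunParams` is consumed via `luigi.util.inherits`, which copies a task's parameter declarations onto another task without copying its `requires()`/`run()` behavior. A plain-Python analog of that copying (the `ParamHolder` and `WriteMetadataTask` names are illustrative stand-ins, not the pipeline's real classes):

```python
# Dict-of-defaults stands in for luigi's Parameter descriptors.
class ParamHolder:
    params = {'sample_type': None, 'callset_path': None, 'force': False}

def inherits(source):
    # Analog of luigi.util.inherits: merge the source's parameter
    # declarations into the decorated class, keeping its own params.
    def decorator(cls):
        merged = dict(source.params)
        merged.update(getattr(cls, 'params', {}))
        cls.params = merged
        return cls
    return decorator

@inherits(ParamHolder)
class WriteMetadataTask:
    params = {'run_id': None}
```

The point of the pattern: every loading task sees the same run parameters without re-declaring them, while each task keeps its own behavior.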