broadinstitute
diff --git a/‎v03_pipeline/lib/model/cached_reference_dataset_query.py
Lines changed: 15 additions & 116 deletions b/‎v03_pipeline/lib/model/cached_reference_dataset_query.py
Lines changed: 15 additions & 116 deletions
diff --git a/‎v03_pipeline/lib/reference_data/compare_globals.py
Lines changed: 12 additions & 12 deletions b/‎v03_pipeline/lib/reference_data/compare_globals.py
Lines changed: 12 additions & 12 deletions
diff --git a/‎v03_pipeline/lib/reference_data/config.py
Lines changed: 2 additions & 0 deletions b/‎v03_pipeline/lib/reference_data/config.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎v03_pipeline/lib/reference_data/dataset_table_operations.py
Lines changed: 13 additions & 12 deletions b/‎v03_pipeline/lib/reference_data/dataset_table_operations.py
Lines changed: 13 additions & 12 deletions
diff --git a/‎v03_pipeline/lib/reference_data/queries.py
Lines changed: 121 additions & 0 deletions b/‎v03_pipeline/lib/reference_data/queries.py
Lines changed: 121 additions & 0 deletions
@@ -1,127 +1,17 @@
 from collections.abc import Callable
 from enum import Enum
-from typing import Any
 
 import hail as hl
 
-from v03_pipeline.lib.annotations.enums import (
-    CLINVAR_PATHOGENICITIES_LOOKUP,
-    CONSEQUENCE_TERMS,
-)
-from v03_pipeline.lib.annotations.expression_helpers import (
-    get_expr_for_vep_sorted_transcript_consequences_array,
-    get_expr_for_worst_transcript_consequence_annotations_struct,
-)
 from v03_pipeline.lib.model.dataset_type import DatasetType
 from v03_pipeline.lib.model.definitions import AccessControl, ReferenceGenome
 from v03_pipeline.lib.model.environment import Env
-
-CLINVAR_PATH_RANGE = ('Pathogenic', 'Pathogenic/Likely_risk_allele')
-CLINVAR_LIKELY_PATH_RANGE = ('Pathogenic/Likely_pathogenic', 'Likely_risk_allele')
-CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
-    hl.enumerate(CONSEQUENCE_TERMS, index_first=False),
+from v03_pipeline.lib.reference_data.queries import (
+    clinvar_path_variants,
+    gnomad_coding_and_noncoding_variants,
+    gnomad_qc,
+    high_af_variants,
 )
-GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD = 0.90
-ONE_PERCENT = 0.01
-THREE_PERCENT = 0.03
-FIVE_PERCENT = 0.05
-TEN_PERCENT = 0.10
-
-
-def clinvar_path_variants(
-    ht: hl.Table,
-    dataset_type: DatasetType,
-    **_: Any,
-) -> hl.Table:
-    clinvar_field = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar'
-    ht = ht.select_globals()
-    ht = ht.select(
-        pathogenic=(
-            (
-                ht[clinvar_field].pathogenicity_id
-                >= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[0]]
-            )
-            & (
-                ht[clinvar_field].pathogenicity_id
-                <= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[1]]
-            )
-        ),
-        likely_pathogenic=(
-            (
-                ht[clinvar_field].pathogenicity_id
-                >= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[0]]
-            )
-            & (
-                ht[clinvar_field].pathogenicity_id
-                <= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[1]]
-            )
-        ),
-    )
-    return ht.filter(ht.pathogenic | ht.likely_pathogenic)
-
-
-def gnomad_coding_and_noncoding_variants(
-    ht: hl.Table,
-    reference_genome: ReferenceGenome,
-    **_: Any,
-) -> hl.Table:
-    filtered_contig = 'chr1' if reference_genome == ReferenceGenome.GRCh38 else '1'
-    ht = hl.filter_intervals(
-        ht,
-        [
-            hl.parse_locus_interval(
-                filtered_contig,
-                reference_genome=reference_genome.value,
-            ),
-        ],
-    )
-    ht = ht.filter(ht.freq[0].AF > GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD)
-    ht = ht.annotate(
-        sorted_transaction_consequences=(
-            get_expr_for_vep_sorted_transcript_consequences_array(
-                ht.vep,
-                omit_consequences=[],
-            )
-        ),
-    )
-    ht = ht.annotate(
-        main_transcript=(
-            get_expr_for_worst_transcript_consequence_annotations_struct(
-                ht.sorted_transaction_consequences,
-            )
-        ),
-    )
-    ht = ht.select(
-        coding=(
-            ht.main_transcript.major_consequence_rank
-            <= CONSEQUENCE_TERM_RANK_LOOKUP['synonymous_variant']
-        ),
-        noncoding=(
-            ht.main_transcript.major_consequence_rank
-            >= CONSEQUENCE_TERM_RANK_LOOKUP['downstream_gene_variant']
-        ),
-    )
-    return ht.filter(ht.coding | ht.noncoding)
-
-
-def high_af_variants(
-    ht: hl.Table,
-    **_: Any,
-) -> hl.Table:
-    ht = ht.select_globals()
-    ht = ht.filter(ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > ONE_PERCENT)
-    return ht.select(
-        is_gt_3_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > THREE_PERCENT,
-        is_gt_5_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > FIVE_PERCENT,
-        is_gt_10_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > TEN_PERCENT,
-    )
-
-
-def gnomad_qc(
-    ht: hl.Table,
-    **_: Any,
-) -> hl.Table:
-    return ht.select()
 
 
 class CachedReferenceDatasetQuery(Enum):
@@ -137,12 +27,21 @@ def access_control(self) -> AccessControl:
         return AccessControl.PUBLIC
 
     @property
-    def reference_dataset(self) -> str | None:
+    def dataset(self) -> str | None:
+        """Return the dataset name string associated with this query member.
+
+        The lookup uses ``dict.get``, so a member without an entry in the
+        mapping below yields ``None`` rather than raising ``KeyError``.
+        """
         return {
+            CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: 'clinvar',
             CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: 'gnomad_genomes',
             CachedReferenceDatasetQuery.GNOMAD_QC: 'gnomad_qc',
+            CachedReferenceDatasetQuery.HIGH_AF_VARIANTS: 'gnomad_genomes',
         }.get(self)
 
+    @property
+    def query_raw_dataset(self) -> bool:
+        """Return True for the two members listed below, False otherwise.
+
+        NOTE(review): presumably marks queries that run against the raw
+        reference dataset rather than a derived table - confirm with callers.
+        """
+        return self in (
+            CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
+            CachedReferenceDatasetQuery.GNOMAD_QC,
+        )
+
     @property
     def query(self) -> Callable[[hl.Table, ReferenceGenome], hl.Table]:
         return {
 
@@ -10,9 +10,7 @@
 from v03_pipeline.lib.reference_data.dataset_table_operations import (
     get_all_select_fields,
     get_enum_select_fields,
-    get_ht_path,
     import_ht_from_config_path,
-    parse_dataset_version,
 )
 
 logger = get_logger(__name__)
@@ -37,17 +35,15 @@ def from_dataset_configs(
         paths, versions, enums, selects = {}, {}, {}, {}
         for dataset in datasets:
             dataset_config = CONFIG[dataset][reference_genome.v02_value]
-            dataset_ht = import_ht_from_config_path(dataset_config, reference_genome)
-
-            paths[dataset] = get_ht_path(dataset_config)
-            versions[dataset] = hl.eval(
-                parse_dataset_version(
-                    dataset_ht,
-                    dataset,
-                    dataset_config,
-                ),
+            dataset_ht = import_ht_from_config_path(
+                dataset_config,
+                dataset,
+                reference_genome,
             )
-            enums[dataset] = dataset_config.get('enum_select', {})
+            dataset_ht_globals = hl.eval(dataset_ht.globals)
+            paths[dataset] = dataset_ht_globals.path
+            versions[dataset] = dataset_ht_globals.version
+            enums[dataset] = dict(dataset_ht_globals.enums)
             dataset_ht = dataset_ht.select(
                 **get_all_select_fields(dataset_ht, dataset_config),
             )
@@ -83,10 +79,14 @@ def from_ht(
 def get_datasets_to_update(
     ht1_globals: Globals,
     ht2_globals: Globals,
+    validate_selects: bool = True,
 ) -> list[str]:
     datasets_to_update = set()
 
     for field in dataclasses.fields(Globals):
+        if field.name == 'selects' and not validate_selects:
+            continue
+
         datasets_to_update.update(
             ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(),
         )
 
@@ -372,10 +372,12 @@ def custom_mpc_select(ht):
     },
     'gnomad_qc': {
         '37': {
+            'version': 'v2',
             'custom_import': import_matrix_table,
             'source_path': 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
         },
         '38': {
+            'version': 'v3.1',
             'custom_import': import_matrix_table,
             'source_path': 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt',
         },
 
@@ -53,7 +53,7 @@ def get_dataset_ht(
     reference_genome: ReferenceGenome,
 ) -> hl.Table:
     config = CONFIG[dataset][reference_genome.v02_value]
-    ht = import_ht_from_config_path(config, reference_genome)
+    ht = import_ht_from_config_path(config, dataset, reference_genome)
     if hasattr(ht, 'locus'):
         ht = ht.filter(
             hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
@@ -62,16 +62,6 @@ def get_dataset_ht(
     ht = ht.filter(config['filter'](ht)) if 'filter' in config else ht
     ht = ht.select(**get_all_select_fields(ht, config))
     ht = ht.transmute(**get_enum_select_fields(ht, config))
-    ht = ht.select_globals(
-        path=(config['source_path'] if 'custom_import' in config else config['path']),
-        version=parse_dataset_version(ht, dataset, config),
-        enums=hl.Struct(
-            **config.get(
-                'enum_select',
-                hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
-            ),
-        ),
-    )
     return ht.select(**{dataset: ht.row.drop(*ht.key)}).distinct()
 
 
@@ -81,14 +71,25 @@ def get_ht_path(config: dict) -> str:
 
 def import_ht_from_config_path(
     config: dict,
+    dataset: str,
     reference_genome: ReferenceGenome,
 ) -> hl.Table:
+    """Load a dataset table and record its provenance in the table globals.
+
+    Args:
+        config: Config entry for one dataset and reference genome. When it
+            has a ``custom_import`` callable, that is called with the path
+            and reference genome; otherwise the path is read with
+            ``hl.read_table``.
+        dataset: Dataset name, forwarded to ``parse_dataset_version``.
+        reference_genome: Passed through to ``custom_import`` when used.
+
+    Returns:
+        The loaded table with ``path``, ``version`` and ``enums`` globals
+        annotated onto it.
+    """
     path = get_ht_path(config)
-    return (
+    ht = (
         config['custom_import'](path, reference_genome)
         if 'custom_import' in config
         else hl.read_table(path)
     )
+    # NOTE(review): the fallback argument to config.get() below is evaluated
+    # on every call, even when 'enum_select' is present - confirm that
+    # hl.tstruct accepts these positional arguments.
+    return ht.annotate_globals(
+        path=path,
+        version=parse_dataset_version(ht, dataset, config),
+        enums=hl.Struct(
+            **config.get(
+                'enum_select',
+                hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
+            ),
+        ),
+    )
 
 
 def get_select_fields(selects: list | dict | None, base_ht: hl.Table) -> dict:
 
@@ -0,0 +1,121 @@
+from typing import Any
+
+import hail as hl
+
+from v03_pipeline.lib.annotations.enums import (
+    CLINVAR_PATHOGENICITIES_LOOKUP,
+    CONSEQUENCE_TERMS,
+)
+from v03_pipeline.lib.annotations.expression_helpers import (
+    get_expr_for_vep_sorted_transcript_consequences_array,
+    get_expr_for_worst_transcript_consequence_annotations_struct,
+)
+from v03_pipeline.lib.model.dataset_type import DatasetType
+from v03_pipeline.lib.model.definitions import ReferenceGenome
+
+CLINVAR_PATH_RANGE = ('Pathogenic', 'Pathogenic/Likely_risk_allele')
+CLINVAR_LIKELY_PATH_RANGE = ('Pathogenic/Likely_pathogenic', 'Likely_risk_allele')
+CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
+    hl.enumerate(CONSEQUENCE_TERMS, index_first=False),
+)
+GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD = 0.90
+ONE_PERCENT = 0.01
+THREE_PERCENT = 0.03
+FIVE_PERCENT = 0.05
+TEN_PERCENT = 0.10
+
+
+def clinvar_path_variants(
+    ht: hl.Table,
+    dataset_type: DatasetType,
+    **_: Any,
+) -> hl.Table:
+    """Flag ClinVar pathogenic / likely-pathogenic rows and keep only those.
+
+    Args:
+        ht: Table with a ``clinvar`` struct field (``clinvar_mito`` when
+            ``dataset_type`` is ``DatasetType.MITO``) that has a
+            ``pathogenicity_id`` member.
+        dataset_type: Decides which of the two ClinVar fields is read.
+        **_: Extra keyword arguments are accepted and ignored.
+
+    Returns:
+        A table with its globals dropped and two boolean non-key fields,
+        ``pathogenic`` and ``likely_pathogenic``, filtered to rows where at
+        least one of them is true.
+    """
+    clinvar_field = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar'
+    ht = ht.select_globals()
+    pathogenicity_id = ht[clinvar_field].pathogenicity_id
+
+    def within(bounds):
+        # Both ends of the (first_label, last_label) pair are inclusive.
+        first_label, last_label = bounds
+        return (pathogenicity_id >= CLINVAR_PATHOGENICITIES_LOOKUP[first_label]) & (
+            pathogenicity_id <= CLINVAR_PATHOGENICITIES_LOOKUP[last_label]
+        )
+
+    ht = ht.select(
+        pathogenic=within(CLINVAR_PATH_RANGE),
+        likely_pathogenic=within(CLINVAR_LIKELY_PATH_RANGE),
+    )
+    return ht.filter(ht.pathogenic | ht.likely_pathogenic)
+
+
+def gnomad_coding_and_noncoding_variants(
+    ht: hl.Table,
+    reference_genome: ReferenceGenome,
+    **_: Any,
+) -> hl.Table:
+    """Classify high-frequency chromosome 1 variants as coding or noncoding.
+
+    Args:
+        ht: Table keyed so that ``hl.filter_intervals`` applies, with a
+            ``freq`` array whose first element has an ``AF`` member, and a
+            ``vep`` field.
+        reference_genome: Selects the contig name (``chr1`` for GRCh38,
+            ``1`` otherwise) and the genome used to parse the interval.
+        **_: Extra keyword arguments are accepted and ignored.
+
+    Returns:
+        A table with two boolean non-key fields, ``coding`` and
+        ``noncoding``, filtered to rows where at least one is true. Rows
+        are first limited to the chosen contig and to ``freq[0].AF`` above
+        ``GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD``.
+    """
+    contig = 'chr1' if reference_genome == ReferenceGenome.GRCh38 else '1'
+    contig_interval = hl.parse_locus_interval(
+        contig,
+        reference_genome=reference_genome.value,
+    )
+    ht = hl.filter_intervals(ht, [contig_interval])
+    ht = ht.filter(ht.freq[0].AF > GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD)
+
+    # Two annotate steps: the second reads the field added by the first.
+    # Both fields are transient; the select below drops them.
+    sorted_consequences = (
+        get_expr_for_vep_sorted_transcript_consequences_array(
+            ht.vep,
+            omit_consequences=[],
+        )
+    )
+    ht = ht.annotate(sorted_transcript_consequences=sorted_consequences)
+    worst_consequence = (
+        get_expr_for_worst_transcript_consequence_annotations_struct(
+            ht.sorted_transcript_consequences,
+        )
+    )
+    ht = ht.annotate(main_transcript=worst_consequence)
+
+    # Ranks are positions in CONSEQUENCE_TERMS: at or before
+    # 'synonymous_variant' is coding, at or after 'downstream_gene_variant'
+    # is noncoding.
+    rank = ht.main_transcript.major_consequence_rank
+    ht = ht.select(
+        coding=rank <= CONSEQUENCE_TERM_RANK_LOOKUP['synonymous_variant'],
+        noncoding=rank >= CONSEQUENCE_TERM_RANK_LOOKUP['downstream_gene_variant'],
+    )
+    return ht.filter(ht.coding | ht.noncoding)
+
+
+def high_af_variants(
+    ht: hl.Table,
+    **_: Any,
+) -> hl.Table:
+    """Keep rows above 1% gnomAD genomes frequency and bucket them.
+
+    Args:
+        ht: Table with a ``gnomad_genomes`` struct that has an
+            ``AF_POPMAX_OR_GLOBAL`` member.
+        **_: Extra keyword arguments are accepted and ignored.
+
+    Returns:
+        A table with its globals dropped, limited to rows whose
+        ``AF_POPMAX_OR_GLOBAL`` exceeds ``ONE_PERCENT``, with three boolean
+        fields marking whether it also exceeds 3%, 5% and 10%.
+    """
+    ht = ht.select_globals()
+    ht = ht.filter(ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > ONE_PERCENT)
+    # Bind after the filter so the expression belongs to the filtered table.
+    allele_frequency = ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL
+    threshold_flags = {
+        'is_gt_3_percent': allele_frequency > THREE_PERCENT,
+        'is_gt_5_percent': allele_frequency > FIVE_PERCENT,
+        'is_gt_10_percent': allele_frequency > TEN_PERCENT,
+    }
+    return ht.select(**threshold_flags)
+
+
+def gnomad_qc(
+    ht: hl.Table,
+    **_: Any,
+) -> hl.Table:
+    """Return the table with no fields selected.
+
+    Args:
+        ht: Input table.
+        **_: Extra keyword arguments are accepted and ignored.
+
+    Returns:
+        The result of ``ht.select()`` with no arguments; unlike the other
+        queries in this module there is no ``select_globals`` call here.
+    """
+    return ht.select()