broadinstitute
diff --git a/‎v03_pipeline/bin/write_cached_reference_dataset_query_ht.py
Lines changed: 4 additions & 5 deletions b/‎v03_pipeline/bin/write_cached_reference_dataset_query_ht.py
Lines changed: 4 additions & 5 deletions
diff --git a/‎v03_pipeline/lib/annotations/constants.py
Lines changed: 5 additions & 0 deletions b/‎v03_pipeline/lib/annotations/constants.py
Lines changed: 5 additions & 0 deletions
diff --git a/‎v03_pipeline/lib/annotations/mito.py
Lines changed: 21 additions & 31 deletions b/‎v03_pipeline/lib/annotations/mito.py
Lines changed: 21 additions & 31 deletions
diff --git a/‎v03_pipeline/lib/annotations/snv_indel.py
Lines changed: 4 additions & 0 deletions b/‎v03_pipeline/lib/annotations/snv_indel.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎v03_pipeline/lib/annotations/snv_indel_test.py
Lines changed: 9 additions & 5 deletions b/‎v03_pipeline/lib/annotations/snv_indel_test.py
Lines changed: 9 additions & 5 deletions
diff --git a/‎v03_pipeline/lib/reference_data/compare_globals.py
Lines changed: 145 additions & 0 deletions b/‎v03_pipeline/lib/reference_data/compare_globals.py
Lines changed: 145 additions & 0 deletions
@@ -15,6 +15,9 @@
     valid_reference_dataset_collection_path,
 )
 from v03_pipeline.lib.reference_data.config import CONFIG
+from v03_pipeline.lib.reference_data.dataset_table_operations import (
+    import_ht_from_config_path,
+)
 
 
 def get_ht(
@@ -25,11 +28,7 @@ def get_ht(
     # If the query is defined over an uncombined reference dataset, use the combiner config.
     if query.reference_dataset:
         config = CONFIG[query.reference_dataset][reference_genome.v02_value]
-        return (
-            config['custom_import'](config['source_path'], reference_genome)
-            if 'custom_import' in config
-            else hl.read_table(config['path'])
-        )
+        return import_ht_from_config_path(config, reference_genome)
     return hl.read_table(
         valid_reference_dataset_collection_path(
             reference_genome,
 
@@ -0,0 +1,5 @@
+# Project GUIDs that the `gt_stats` annotations skip when accumulating
+# per-project sample counts.
+PROJECTS_EXCLUDED_FROM_GT_STATS = {
+    'R0555_seqr_demo',
+    'R0607_gregor_training_project_',
+    'R0608_gregor_training_project_',
+}
@@ -2,6 +2,7 @@
 
 import hail as hl
 
+from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
 from v03_pipeline.lib.annotations.enums import MITOTIP_PATHOGENICITIES
 
 MITOTIP_PATHOGENICITIES_LOOKUP = hl.dict(
@@ -14,31 +15,6 @@
 )
 
 
-def _AC_het(row: hl.StructExpression) -> hl.Int32Expression:  # noqa: N802
-    return sum(
-        row.heteroplasmic_samples[project_guid].length()
-        for project_guid in row.heteroplasmic_samples
-    )
-
-
-def _AC_hom(row: hl.StructExpression) -> hl.Int32Expression:  # noqa: N802
-    return sum(
-        row.homoplasmic_samples[project_guid].length()
-        for project_guid in row.homoplasmic_samples
-    )
-
-
-def _AN(row: hl.StructExpression) -> hl.Int32Expression:  # noqa: N802
-    return sum(
-        (
-            row.ref_samples[project_guid].length()
-            + row.heteroplasmic_samples[project_guid].length()
-            + row.homoplasmic_samples[project_guid].length()
-        )
-        for project_guid in row.ref_samples
-    )
-
-
 def common_low_heteroplasmy(ht: hl.Table, **_: Any) -> hl.Expression:
     return ht.common_low_heteroplasmy
 
@@ -90,12 +66,26 @@ def rsid(ht: hl.Table, **_: Any) -> hl.Expression:
     return ht.rsid.find(lambda x: hl.is_defined(x))
 
 
-def gt_stats(ht: hl.Table, sample_lookup_ht: hl.Table, **_: Any) -> hl.Expression:
+def gt_stats(
+    ht: hl.Table,
+    sample_lookup_ht: hl.Table,
+    **_: Any,
+) -> hl.Expression:
+    """Build the heteroplasmic/homoplasmic count and frequency struct.
+
+    The sample lookup row for each key of `ht` holds per-project sample sets
+    (`ref_samples`, `heteroplasmic_samples`, `homoplasmic_samples`). Their
+    sizes are summed over every project that is not listed in
+    PROJECTS_EXCLUDED_FROM_GT_STATS.
+
+    Returns a struct with `AC_het`, `AF_het`, `AC_hom`, `AF_hom` and `AN`,
+    where `AN` is the total number of ref + heteroplasmic + homoplasmic
+    samples and each `AF_*` is the matching `AC_*` divided by `AN`.
+    """
     row = sample_lookup_ht[ht.key]
+    # Accumulators start as Python ints and are extended with Hail expressions
+    # while looping over the project fields of the lookup row.
+    # NOTE(review): if every project is skipped (or there are none), AN stays
+    # the Python int 0 and the divisions below raise ZeroDivisionError --
+    # confirm that case cannot occur.
+    AC_het, AC_hom, AN = 0, 0, 0  # noqa: N806
+    for project_guid in row.ref_samples:
+        if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
+            continue
+        ref_samples_length = row.ref_samples[project_guid].length()
+        heteroplasmic_samples_length = row.heteroplasmic_samples[project_guid].length()
+        homoplasmic_samples_length = row.homoplasmic_samples[project_guid].length()
+        AC_het += heteroplasmic_samples_length # noqa: N806
+        AC_hom += homoplasmic_samples_length # noqa: N806
+        AN += (ref_samples_length + heteroplasmic_samples_length + homoplasmic_samples_length) # noqa: N806
     return hl.Struct(
-        AC_het=_AC_het(row),
-        AF_het=hl.float32(_AC_het(row) / _AN(row)),
-        AC_hom=_AC_hom(row),
-        AF_hom=hl.float32(_AC_hom(row) / _AN(row)),
-        AN=_AN(row),
+        AC_het=AC_het,
+        AF_het=hl.float32(AC_het / AN),
+        AC_hom=AC_hom,
+        AF_hom=hl.float32(AC_hom / AN),
+        AN=AN,
     )
@@ -3,6 +3,8 @@
 
 import hail as hl
 
+from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
+
 N_ALT_REF = 0
 N_ALT_HET = 1
 N_ALT_HOM = 2
@@ -37,6 +39,8 @@ def gt_stats(
     row = sample_lookup_ht[ht.key]
     AC, AN, hom = 0, 0, 0
     for project_guid in row.ref_samples:
+        if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
+            continue
         ref_samples_length = row.ref_samples[project_guid].length()
         het_samples_length = row.het_samples[project_guid].length()
         hom_samples_length = row.hom_samples[project_guid].length()
 
@@ -25,33 +25,37 @@ def test_allele_count_annotations(self) -> None:
             [
                 {
                     'id': 0,
-                    'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set()),
-                    'het_samples': hl.Struct(project_1={'b', 'd'}, project_2=set()),
-                    'hom_samples': hl.Struct(project_1={'e', 'f'}, project_2=set()),
+                    'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set(), R0607_gregor_training_project_=set()),
+                    'het_samples': hl.Struct(project_1={'b', 'd'}, project_2=set(), R0607_gregor_training_project_=set()),
+                    'hom_samples': hl.Struct(project_1={'e', 'f'}, project_2=set(), R0607_gregor_training_project_={'l', 'm'}),
                 },
                 {
                     'id': 1,
                     'ref_samples': hl.Struct(
                         project_1={'a', 'b', 'c', 'd', 'e', 'f'},
                         project_2=set(),
+                        R0607_gregor_training_project_={'l', 'm'},
                     ),
-                    'het_samples': hl.Struct(project_1=set(), project_2=set()),
-                    'hom_samples': hl.Struct(project_1=set(), project_2=set()),
+                    'het_samples': hl.Struct(project_1=set(), project_2=set(), R0607_gregor_training_project_=set()),
+                    'hom_samples': hl.Struct(project_1=set(), project_2=set(), R0607_gregor_training_project_=set()),
                 },
             ],
             hl.tstruct(
                 id=hl.tint32,
                 ref_samples=hl.tstruct(
                     project_1=hl.tset(hl.tstr),
                     project_2=hl.tset(hl.tstr),
+                    R0607_gregor_training_project_=hl.tset(hl.tstr),
                 ),
                 het_samples=hl.tstruct(
                     project_1=hl.tset(hl.tstr),
                     project_2=hl.tset(hl.tstr),
+                    R0607_gregor_training_project_=hl.tset(hl.tstr),
                 ),
                 hom_samples=hl.tstruct(
                     project_1=hl.tset(hl.tstr),
                     project_2=hl.tset(hl.tstr),
+                    R0607_gregor_training_project_=hl.tset(hl.tstr),
                 ),
             ),
             key='id',
 
@@ -0,0 +1,145 @@
+import logging
+from dataclasses import dataclass
+
+import hail as hl
+
+from v03_pipeline.lib.model import ReferenceGenome
+from v03_pipeline.lib.reference_data.config import CONFIG
+from v03_pipeline.lib.reference_data.dataset_table_operations import (
+    get_all_select_fields,
+    get_ht_path,
+    import_ht_from_config_path,
+    parse_dataset_version,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ReferenceDataGlobals:
+    """Plain-Python view of the `paths`, `versions` and `enums` table globals.
+
+    Built from an already-evaluated globals struct. Each of the three fields
+    is converted from (possibly nested) `hl.Struct` values into ordinary
+    dicts so callers can use `.get()` and `==` on them.
+
+    The hand-written `__init__` below is the one that runs: `@dataclass` does
+    not replace an `__init__` that the class defines itself.
+    """
+
+    # All three mappings are keyed by dataset name. The path / version values
+    # are presumably strings -- confirm against the code writing the globals.
+    paths: dict[str, str]
+    versions: dict[str, str]
+    enums: dict[str, dict[str, list[str]]]
+
+    def __init__(self, globals_struct: hl.Struct) -> None:
+        """Populate the fields from the matching attributes of `globals_struct`."""
+        self.paths = self._struct_to_dict(globals_struct.paths)
+        self.versions = self._struct_to_dict(globals_struct.versions)
+        self.enums = self._struct_to_dict(globals_struct.enums)
+
+    def _struct_to_dict(self, struct: hl.Struct) -> dict:
+        """Recursively convert `struct` into a dict.
+
+        Nested `hl.Struct` values become nested dicts; every other value is
+        stored unchanged.
+        """
+        result_dict = {}
+        for field in struct:
+            value = struct[field]
+            if isinstance(value, hl.Struct):
+                value = self._struct_to_dict(value)
+            result_dict[field] = value
+        return result_dict
+
+
+def get_datasets_to_update(
+    joined_ht: hl.Table,
+    datasets: list[str],
+    reference_genome: ReferenceGenome,
+) -> list[str]:
+    """Return the entries of `datasets` that need to be (re)written.
+
+    A dataset is selected when it is not a field of `joined_ht.row`, or when
+    `validate_joined_ht_globals_match_config` reports a mismatch for it. The
+    input order of `datasets` is preserved.
+    """
+    evaluated_globals = ReferenceDataGlobals(hl.eval(joined_ht.index_globals()))
+
+    def _needs_update(dataset: str) -> bool:
+        # Short-circuit: a dataset missing from the row is never validated.
+        if dataset not in joined_ht.row:
+            return True
+        return not validate_joined_ht_globals_match_config(
+            joined_ht,
+            evaluated_globals,
+            dataset,
+            reference_genome,
+        )
+
+    return [dataset for dataset in datasets if _needs_update(dataset)]
+
+
+def validate_joined_ht_globals_match_config(
+    joined_ht: hl.Table,
+    joined_ht_globals: ReferenceDataGlobals,
+    dataset: str,
+    reference_genome: ReferenceGenome,
+) -> bool:
+    """Check `dataset` in the joined table against its CONFIG entry.
+
+    Runs the version, path, enum and select comparisons (all four are always
+    evaluated), logs one info line per failed comparison, and returns True
+    only when every comparison passed.
+    """
+    config_entry = CONFIG[dataset][reference_genome.v02_value]
+    source_ht = import_ht_from_config_path(config_entry, reference_genome)
+
+    version_ok = ht_version_matches_config(
+        joined_ht_globals,
+        dataset,
+        config_entry,
+        source_ht,
+    )
+    path_ok = ht_path_matches_config(joined_ht_globals, dataset, config_entry)
+    enum_ok = ht_enums_match_config(joined_ht_globals, dataset, config_entry)
+    select_ok = ht_selects_match_config(
+        joined_ht,
+        dataset,
+        config_entry,
+        source_ht,
+    )
+
+    outcomes = {
+        'version': version_ok,
+        'path': path_ok,
+        'enum': enum_ok,
+        'select': select_ok,
+    }
+    failed_checks = [check for check, passed in outcomes.items() if passed is False]
+    for check in failed_checks:
+        logger.info(f'{check} mismatch for {dataset}')
+    return all(outcomes.values())
+
+
+def ht_version_matches_config(
+    joined_ht_globals: ReferenceDataGlobals,
+    dataset: str,
+    dataset_config: dict,
+    dataset_ht: hl.Table,
+) -> bool:
+    """Compare the recorded version of `dataset` with the freshly parsed one.
+
+    Returns False when the joined table's globals hold no version for
+    `dataset`; otherwise returns whether that version equals the evaluated
+    result of `parse_dataset_version`.
+    """
+    recorded_version = joined_ht_globals.versions.get(dataset)
+    if recorded_version is not None:
+        expected_version = hl.eval(
+            parse_dataset_version(dataset_ht, dataset, dataset_config),
+        )
+        return recorded_version == expected_version
+    # Nothing recorded for this dataset, so it cannot match.
+    return False
+
+
+def ht_path_matches_config(
+    joined_ht_globals: ReferenceDataGlobals,
+    dataset: str,
+    dataset_config: dict,
+) -> bool:
+    """Compare the recorded path of `dataset` with `get_ht_path(dataset_config)`.
+
+    Returns False when the joined table's globals hold no path for `dataset`;
+    in that case `get_ht_path` is not called.
+    """
+    recorded_path = joined_ht_globals.paths.get(dataset)
+    return recorded_path is not None and recorded_path == get_ht_path(
+        dataset_config,
+    )
+
+
+def ht_enums_match_config(
+    joined_ht_globals: ReferenceDataGlobals,
+    dataset: str,
+    dataset_config: dict,
+) -> bool:
+    """Compare the recorded enums of `dataset` with the config's `enum_select`.
+
+    A missing entry on either side is treated as an empty dict, so a dataset
+    without enums in both places counts as a match.
+    """
+    return joined_ht_globals.enums.get(dataset, {}) == dataset_config.get(
+        'enum_select',
+        {},
+    )
+
+
+def ht_selects_match_config(
+    joined_ht: hl.Table,
+    dataset: str,
+    dataset_config: dict,
+    dataset_ht: hl.Table,
+) -> bool:
+    """Check that `joined_ht[dataset]` carries exactly the configured fields.
+
+    The expected names are the keys returned by `get_all_select_fields`; the
+    actual names come from iterating `joined_ht[dataset]` (presumably the
+    struct's field names -- confirm).
+    """
+    present_fields = set(joined_ht[dataset])
+    expected_fields = set(get_all_select_fields(dataset_ht, dataset_config).keys())
+    # Two sets are equal exactly when their symmetric difference is empty.
+    return expected_fields == present_fields