Dev #1007

Merged 8 commits on Jan 7, 2025
2 changes: 1 addition & 1 deletion requirements.txt
@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.10
# by the following command:
#
# pip-compile --resolver=backtracking requirements.in
# pip-compile requirements.in
#
aiodns==2.0.0
# via hail
4 changes: 2 additions & 2 deletions v03_pipeline/bin/pipeline_worker.py
@@ -7,7 +7,7 @@

from v03_pipeline.api.model import LoadingPipelineRequest
from v03_pipeline.lib.logger import get_logger
from v03_pipeline.lib.model import Env
from v03_pipeline.lib.model import FeatureFlag
from v03_pipeline.lib.paths import (
loading_pipeline_queue_path,
project_pedigree_path,
@@ -54,7 +54,7 @@ def main():
'run_id': run_id,
**{k: v for k, v in lpr.model_dump().items() if k != 'projects_to_run'},
}
if Env.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
if FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
tasks = [
TriggerHailBackendReload(**loading_run_task_params),
]
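This file swaps the Env-based switch for a FeatureFlag-based one when deciding whether to enqueue TriggerHailBackendReload. Below is a minimal standalone sketch of the kind of environment-variable-backed flag that check suggests; the attribute name comes from the diff, but the implementation is an assumption, not the pipeline's actual FeatureFlag class.

import os

# Hypothetical sketch only: an env-var-backed flag of the kind the
# FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD check suggests.
class FeatureFlag:
    SHOULD_TRIGGER_HAIL_BACKEND_RELOAD: bool = (
        os.environ.get('SHOULD_TRIGGER_HAIL_BACKEND_RELOAD', '') == '1'
    )

if FeatureFlag.SHOULD_TRIGGER_HAIL_BACKEND_RELOAD:
    print('would enqueue TriggerHailBackendReload')
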
115 changes: 47 additions & 68 deletions v03_pipeline/lib/misc/family_loading_failures.py
@@ -16,93 +16,71 @@ def passes_relatedness_check(
relatedness_check_lookup: dict[tuple[str, str], list],
sample_id: str,
other_id: str,
relation: Relation,
expected_relation: Relation,
additional_allowed_relation: Relation | None,
) -> tuple[bool, str | None]:
# No relationship to check, return true
if other_id is None:
return True, None
coefficients = relatedness_check_lookup.get(
(min(sample_id, other_id), max(sample_id, other_id)),
)
if not coefficients or not np.allclose(
coefficients,
relation.coefficients,
atol=RELATEDNESS_TOLERANCE,
if not coefficients or not any(
np.allclose(
coefficients,
relation.coefficients,
atol=RELATEDNESS_TOLERANCE,
)
for relation in (
[expected_relation, additional_allowed_relation]
if additional_allowed_relation
else [expected_relation]
)
):
return (
False,
f'Sample {sample_id} has expected relation "{relation.value}" to {other_id} but has coefficients {coefficients or []}',
f'Sample {sample_id} has expected relation "{expected_relation.value}" to {other_id} but has coefficients {coefficients or []}',
)
return True, None


def all_relatedness_checks( # noqa: C901
def all_relatedness_checks(
relatedness_check_lookup: dict[tuple[str, str], list],
family: Family,
sample: Sample,
) -> list[str]:
failure_reasons = []
for parent_id in [sample.mother, sample.father]:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
parent_id,
Relation.PARENT,
)
if not success:
failure_reasons.append(reason)

for grandparent_id in [
sample.maternal_grandmother,
sample.maternal_grandfather,
sample.paternal_grandmother,
sample.paternal_grandfather,
for relationship_set, relation, additional_allowed_relation in [
([sample.mother, sample.father], Relation.PARENT_CHILD, None),
(
[
sample.maternal_grandmother,
sample.maternal_grandfather,
sample.paternal_grandmother,
sample.paternal_grandfather,
],
Relation.GRANDPARENT_GRANDCHILD,
None,
),
(sample.siblings, Relation.SIBLING, None),
(sample.half_siblings, Relation.HALF_SIBLING, Relation.SIBLING),
(sample.aunt_nephews, Relation.AUNT_NEPHEW, None),
]:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
grandparent_id,
Relation.GRANDPARENT,
)
if not success:
failure_reasons.append(reason)

for sibling_id in sample.siblings:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
sibling_id,
Relation.SIBLING,
)
if not success:
failure_reasons.append(reason)

for half_sibling_id in sample.half_siblings:
# NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those
# through as well.
success1, _ = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
half_sibling_id,
Relation.SIBLING,
)
success2, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
half_sibling_id,
Relation.HALF_SIBLING,
)
if not success1 and not success2:
failure_reasons.append(reason)

for aunt_nephew_id in sample.aunt_nephews:
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
aunt_nephew_id,
Relation.AUNT_NEPHEW,
)
if not success:
failure_reasons.append(reason)
for other_id in relationship_set:
# Handle case where relation is identified in the
# pedigree as a "dummy" but is not included in
# the list of samples to load.
if other_id not in family.samples:
continue
success, reason = passes_relatedness_check(
relatedness_check_lookup,
sample.sample_id,
other_id,
relation,
additional_allowed_relation,
)
if not success:
failure_reasons.append(reason)
return failure_reasons


@@ -162,6 +140,7 @@ def get_families_failed_relatedness_check(
for sample in family.samples.values():
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
if failure_reasons:
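The refactor collapses the per-relationship loops into a single table of (relatives, expected relation, additional allowed relation), and passes_relatedness_check now accepts an optional additional allowed relation so that a pedigree "half sibling" may also match the full sibling coefficients. A standalone sketch of that comparison follows; the coefficient vectors are copied from pedigree.py, while the tolerance value and helper name are assumptions for illustration only.

import numpy as np

RELATEDNESS_TOLERANCE = 0.2  # assumed value, for illustration only
SIBLING = [0.25, 0.5, 0.25, 0.5]
HALF_SIBLING = [0.5, 0.5, 0, 0.25]

def allowed(coefficients, expected, additional=None):
    # Mirrors the any()/np.allclose() logic above: pass if the observed
    # coefficients match the expected relation or, when provided, the
    # additional allowed one.
    candidates = [expected, additional] if additional else [expected]
    return any(
        np.allclose(coefficients, c, atol=RELATEDNESS_TOLERANCE) for c in candidates
    )

# A pair labeled "half sibling" in the pedigree but genetically full siblings
# still passes, because SIBLING is supplied as the additional allowed relation.
print(allowed([0.24, 0.52, 0.24, 0.48], HALF_SIBLING, SIBLING))  # True
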
50 changes: 46 additions & 4 deletions v03_pipeline/lib/misc/family_loading_failures_test.py
@@ -9,7 +9,7 @@
get_families_failed_sex_check,
)
from v03_pipeline.lib.misc.io import import_pedigree
from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families
from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
from v03_pipeline.lib.model import Sex

TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
@@ -104,7 +104,21 @@ def test_all_relatedness_checks(self):
paternal_grandfather='sample_3',
half_siblings=['sample_4'],
)
failure_reasons = all_relatedness_checks(relatedness_check_lookup, sample)
family = Family(
family_guid='family_1a',
samples={
'sample_1': sample,
'sample_2': Sample(sex=Sex.MALE, sample_id='sample_2'),
'sample_3': Sample(sex=Sex.MALE, sample_id='sample_3'),
'sample_4': Sample(sex=Sex.MALE, sample_id='sample_4'),
'sample_5': Sample(sex=Sex.MALE, sample_id='sample_5'),
},
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(failure_reasons, [])

# Defined grandparent missing in relatedness table
@@ -117,12 +131,13 @@ def test_all_relatedness_checks(self):
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(
failure_reasons,
[
'Sample sample_1 has expected relation "grandparent" to sample_5 but has coefficients []',
'Sample sample_1 has expected relation "grandparent_grandchild" to sample_5 but has coefficients []',
],
)

@@ -140,6 +155,7 @@ def test_all_relatedness_checks(self):
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(
@@ -167,16 +183,42 @@ def test_all_relatedness_checks(self):
)
failure_reasons = all_relatedness_checks(
relatedness_check_lookup,
family,
sample,
)
self.assertListEqual(
failure_reasons,
[
'Sample sample_1 has expected relation "parent" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]',
'Sample sample_1 has expected relation "parent_child" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]',
'Sample sample_1 has expected relation "sibling" to sample_4 but has coefficients [0.5, 0.5, 0, 0.25]',
],
)

# Some samples will include relationships with
# samples that are not expected to be included
# in the callset. These should not trigger relatedness
# failures.
sample = Sample(
sex=Sex.FEMALE,
sample_id='sample_1',
mother='sample_2',
)
family = Family(
family_guid='family_1a',
samples={
'sample_1': sample,
},
)
failure_reasons = all_relatedness_checks(
{},
family,
sample,
)
self.assertListEqual(
failure_reasons,
[],
)

def test_get_families_failed_sex_check(self):
sex_check_ht = hl.Table.parallelize(
[
8 changes: 4 additions & 4 deletions v03_pipeline/lib/misc/pedigree.py
@@ -8,17 +8,17 @@


class Relation(Enum):
PARENT = 'parent'
GRANDPARENT = 'grandparent'
PARENT_CHILD = 'parent_child'
GRANDPARENT_GRANDCHILD = 'grandparent_grandchild'
SIBLING = 'sibling'
HALF_SIBLING = 'half_sibling'
AUNT_NEPHEW = 'aunt_nephew'

@property
def coefficients(self):
return {
Relation.PARENT: [0, 1, 0, 0.5],
Relation.GRANDPARENT: [0.5, 0.5, 0, 0.25],
Relation.PARENT_CHILD: [0, 1, 0, 0.5],
Relation.GRANDPARENT_GRANDCHILD: [0.5, 0.5, 0, 0.25],
Relation.SIBLING: [0.25, 0.5, 0.25, 0.5],
Relation.HALF_SIBLING: [0.5, 0.5, 0, 0.25],
Relation.AUNT_NEPHEW: [0.5, 0.5, 0, 0.25],
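The enum members are renamed to name both sides of the relationship, while the coefficient vectors are unchanged. They read like [IBD0, IBD1, IBD2, PI_HAT] proportions of the sort produced by relatedness tools such as Hail's identity_by_descent; that interpretation is an assumption here, not something the diff states, but under it PI_HAT should equal 0.5 * IBD1 + IBD2 for every entry, which the values satisfy:

# Consistency check under the assumed [IBD0, IBD1, IBD2, PI_HAT] reading.
coefficients = {
    'parent_child': [0, 1, 0, 0.5],
    'grandparent_grandchild': [0.5, 0.5, 0, 0.25],
    'sibling': [0.25, 0.5, 0.25, 0.5],
    'half_sibling': [0.5, 0.5, 0, 0.25],
    'aunt_nephew': [0.5, 0.5, 0, 0.25],
}
for name, (ibd0, ibd1, ibd2, pi_hat) in coefficients.items():
    assert abs(pi_hat - (0.5 * ibd1 + ibd2)) < 1e-9, name

Note that grandparent_grandchild, half_sibling, and aunt_nephew share the same expected vector, so the check cannot distinguish those relationships from the coefficients alone.
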
22 changes: 5 additions & 17 deletions v03_pipeline/lib/misc/sample_ids.py
@@ -3,28 +3,16 @@
import hail as hl

from v03_pipeline.lib.logger import get_logger
from v03_pipeline.lib.misc.validation import SeqrValidationError

logger = get_logger(__name__)


class MatrixTableSampleSetError(Exception):
def __init__(self, message, missing_samples):
super().__init__(message)
self.missing_samples = missing_samples


def vcf_remap(mt: hl.MatrixTable) -> hl.MatrixTable:
# TODO: add logic from Mike to remap vcf samples delivered from Broad WGS
return mt


def remap_sample_ids(
mt: hl.MatrixTable,
project_remap_ht: hl.Table,
ignore_missing_samples_when_remapping: bool,
) -> hl.MatrixTable:
mt = vcf_remap(mt)

collected_remap = project_remap_ht.collect()
s_dups = [k for k, v in Counter([r.s for r in collected_remap]).items() if v > 1]
seqr_dups = [
@@ -33,7 +21,7 @@ def remap_sample_ids(

if len(s_dups) > 0 or len(seqr_dups) > 0:
msg = f'Duplicate s or seqr_id entries in remap file were found. Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.'
raise ValueError(msg)
raise SeqrValidationError(msg)

missing_samples = project_remap_ht.anti_join(mt.cols()).collect()
remap_count = len(collected_remap)
@@ -48,7 +36,7 @@
if ignore_missing_samples_when_remapping:
logger.info(message)
else:
raise MatrixTableSampleSetError(message, missing_samples)
raise SeqrValidationError(message)

mt = mt.annotate_cols(**project_remap_ht[mt.s])
remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id)
@@ -67,7 +55,7 @@ def subset_samples(
anti_join_ht_count = anti_join_ht.count()
if subset_count == 0:
message = '0 sample ids found the subset HT, something is probably wrong.'
raise MatrixTableSampleSetError(message, [])
raise SeqrValidationError(message)

if anti_join_ht_count != 0:
missing_samples = anti_join_ht.s.collect()
Expand All @@ -77,7 +65,7 @@ def subset_samples(
f"IDs that aren't in the callset: {missing_samples}\n"
f'All callset sample IDs:{mt.s.collect()}'
)
raise MatrixTableSampleSetError(message, missing_samples)
raise SeqrValidationError(message)
logger.info(f'Subsetted to {subset_count} sample ids')
mt = mt.semi_join_cols(sample_subset_ht)
return mt.filter_rows(hl.agg.any(hl.is_defined(mt.GT)))
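With MatrixTableSampleSetError removed, both the remap and subset failure paths now raise SeqrValidationError. A standalone sketch of the duplicate-detection step that raises the first of those errors follows; the helper name and the plain-dict input are assumptions for illustration, whereas the real code collects rows from a Hail Table.

from collections import Counter

class SeqrValidationError(Exception):
    pass

def check_remap_duplicates(remap_rows: list[dict]) -> None:
    # Any s or seqr_id appearing more than once in the remap file is an error.
    s_dups = [k for k, v in Counter(r['s'] for r in remap_rows).items() if v > 1]
    seqr_dups = [k for k, v in Counter(r['seqr_id'] for r in remap_rows).items() if v > 1]
    if s_dups or seqr_dups:
        msg = (
            'Duplicate s or seqr_id entries in remap file were found. '
            f'Duplicate s:{s_dups}. Duplicate seqr_id:{seqr_dups}.'
        )
        raise SeqrValidationError(msg)

check_remap_duplicates([{'s': 'a', 'seqr_id': 'x'}, {'s': 'b', 'seqr_id': 'y'}])  # passes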