broadinstitute
diff --git a/v03_pipeline/lib/methods/sex_check.py b/v03_pipeline/lib/methods/sex_check.py
Lines changed: 9 additions & 6 deletions
diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py
Lines changed: 4 additions & 4 deletions
diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py
Lines changed: 11 additions & 11 deletions
diff --git a/v03_pipeline/lib/misc/pedigree.py b/v03_pipeline/lib/misc/pedigree.py
Lines changed: 34 additions & 39 deletions
@@ -1,6 +1,6 @@
 import hail as hl
 
-from v03_pipeline.lib.model import Ploidy
+from v03_pipeline.lib.model import Sex
 
 IMPUTE_SEX_ANNOTATIONS = [
     'is_female',
@@ -13,6 +13,7 @@
 
 AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
 AAF_THRESHOLD: float = 0.05  # Alternate allele frequency threshold for `hl.impute_sex`.
+BIALLELIC: int = 2
 XX_FSTAT_THRESHOLD: float = (
     0.5  # F-stat threshold below which a sample will be called XX
 )
@@ -24,7 +25,9 @@
 def call_sex(mt: hl.MatrixTable) -> hl.Table:
     # Filter to SNVs and biallelics
     # NB: We should already have filtered biallelics, but just in case.
-    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
+    mt = mt.filter_rows(
+        (hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]),
+    )
 
     # Filter to PASS variants only (variants with empty or missing filter set)
     mt = mt.filter_rows(
@@ -41,13 +44,13 @@ def call_sex(mt: hl.MatrixTable) -> hl.Table:
     ht = ht.annotate(
         sex=(
             hl.case()
-            .when(hl.is_missing(ht.is_female), Ploidy.UNKNOWN.value)
-            .when(ht.is_female, Ploidy.FEMALE.value)
-            .default(Ploidy.MALE.value)
+            .when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value)
+            .when(ht.is_female, Sex.FEMALE.value)
+            .default(Sex.MALE.value)
         ),
     )
     ambiguous_perc = ht.aggregate(
-        hl.agg.fraction(ht.sex == Ploidy.UNKNOWN.value),
+        hl.agg.fraction(ht.sex == Sex.UNKNOWN.value),
     )
     if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
         msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous.  Please contact the methods team to investigate the callset.'
 
@@ -4,7 +4,7 @@
 import numpy as np
 
 from v03_pipeline.lib.misc.pedigree import Family, Relation, Sample
-from v03_pipeline.lib.model import Ploidy
+from v03_pipeline.lib.model import Sex
 
 
 def passes_relatedness_check(
@@ -22,7 +22,7 @@ def passes_relatedness_check(
     if not coefficients or not np.allclose(
         coefficients,
         relation.coefficients,
-        0.1,
+        atol=0.1,
     ):
         return (
             False,
@@ -121,13 +121,13 @@ def build_relatedness_check_lookup(
 def build_sex_check_lookup(
     sex_check_ht: hl.Table,
     remap_lookup: hl.dict,
-) -> dict[str, Ploidy]:
+) -> dict[str, Sex]:
     # Build sex check lookup
     sex_check_ht = sex_check_ht.key_by(
         s=remap_lookup.get(sex_check_ht.s, sex_check_ht.s),
     )
     sex_check_ht = sex_check_ht.select('sex')
-    return {r.s: Ploidy(r.sex) for r in sex_check_ht.collect()}
+    return {r.s: Sex(r.sex) for r in sex_check_ht.collect()}
 
 
 def get_families_failed_missing_samples(
 
@@ -10,7 +10,7 @@
 )
 from v03_pipeline.lib.misc.io import import_pedigree
 from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families
-from v03_pipeline.lib.model import Ploidy
+from v03_pipeline.lib.model import Sex
 
 TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
 
@@ -72,12 +72,12 @@ def test_build_sex_check_lookup(self):
         self.assertEqual(
             build_sex_check_lookup(ht, hl.dict({'ROS_006_18Y03226_D1': 'remapped_id'})),
             {
-                'remapped_id': Ploidy.MALE,
-                'ROS_006_18Y03227_D1': Ploidy.MALE,
-                'ROS_006_18Y03228_D1': Ploidy.MALE,
-                'ROS_007_19Y05919_D1': Ploidy.MALE,
-                'ROS_007_19Y05939_D1': Ploidy.FEMALE,
-                'ROS_007_19Y05987_D1': Ploidy.MALE,
+                'remapped_id': Sex.MALE,
+                'ROS_006_18Y03227_D1': Sex.MALE,
+                'ROS_006_18Y03228_D1': Sex.MALE,
+                'ROS_007_19Y05919_D1': Sex.MALE,
+                'ROS_007_19Y05939_D1': Sex.FEMALE,
+                'ROS_007_19Y05987_D1': Sex.MALE,
             },
         )
 
@@ -96,7 +96,7 @@ def test_all_relatedness_checks(self):
             ('sample_1', 'sample_4'): [0.25, 0.5, 0.25, 0.5],
         }
         sample = Sample(
-            sex=Ploidy.FEMALE,
+            sex=Sex.FEMALE,
             sample_id='sample_1',
             mother='sample_2',
             paternal_grandfather='sample_3',
@@ -107,7 +107,7 @@ def test_all_relatedness_checks(self):
 
         # Defined grandparent missing in relatedness table
         sample = Sample(
-            sex=Ploidy.FEMALE,
+            sex=Sex.FEMALE,
             sample_id='sample_1',
             mother='sample_2',
             paternal_grandfather='sample_3',
@@ -130,7 +130,7 @@ def test_all_relatedness_checks(self):
             ('sample_1', 'sample_4'): [0.5, 0.5, 0, 0.25],
         }
         sample = Sample(
-            sex=Ploidy.FEMALE,
+            sex=Sex.FEMALE,
             sample_id='sample_1',
             mother='sample_2',
             paternal_grandfather='sample_3',
@@ -157,7 +157,7 @@ def test_all_relatedness_checks(self):
             ],
         }
         sample = Sample(
-            sex=Ploidy.FEMALE,
+            sex=Sex.FEMALE,
             sample_id='sample_1',
             mother='sample_2',
             paternal_grandfather='sample_3',
 
@@ -4,7 +4,7 @@
 
 import hail as hl
 
-from v03_pipeline.lib.model import Ploidy
+from v03_pipeline.lib.model import Sex
 
 
 class Relation(Enum):
@@ -28,7 +28,7 @@ def coefficients(self):
 @dataclass
 class Sample:
     sample_id: str
-    sex: Ploidy
+    sex: Sex
     mother: str = None
     father: str = None
     maternal_grandmother: str = None
@@ -54,6 +54,23 @@ def is_aunt_nephew(self: 'Sample', other: 'Sample') -> bool:
             and (self.paternal_grandfather == other.father)
         )
 
+    def is_in_direct_lineage(self: 'Sample', other: 'Sample') -> bool:
+        return self.sample_id in {
+            other.mother,
+            other.father,
+            other.maternal_grandmother,
+            other.maternal_grandfather,
+            other.paternal_grandmother,
+            other.paternal_grandfather,
+        } or other.sample_id in {
+            self.mother,
+            self.father,
+            self.maternal_grandmother,
+            self.maternal_grandfather,
+            self.paternal_grandmother,
+            self.paternal_grandfather,
+        }
+
 
 @dataclass
 class Family:
@@ -69,7 +86,7 @@ def parse_direct_lineage(rows: list[hl.Struct]) -> dict[str, Sample]:  # noqa: C
         for row in rows:
             samples[row.s] = Sample(
                 sample_id=row.s,
-                sex=Ploidy(row.sex),
+                sex=Sex(row.sex),
                 mother=row.maternal_s,
                 father=row.paternal_s,
             )
@@ -107,56 +124,34 @@ def parse_collateral_lineage(
         # A sample_i that is siblings with sample_j, will list sample_j as a sibling, but
         # sample_j will not list sample_i as a sibling.  Relationships only appear in the
         # ibd table a single time, so we only need to check the pairing once.
-        for sample_i, sample_j in itertools.combinations(samples.keys(), 2):
-            # If other sample is already related, continue
-            if sample_j in {
-                samples[sample_i].mother,
-                samples[sample_i].father,
-                samples[sample_i].maternal_grandmother,
-                samples[sample_i].maternal_grandfather,
-                samples[sample_i].paternal_grandmother,
-                samples[sample_i].paternal_grandfather,
-            }:
+        for sample_i, sample_j in itertools.combinations(samples.values(), 2):
+            # If sample is already related from direct relationships, continue
+            if sample_i.is_in_direct_lineage(sample_j):
                 continue
 
             # If both parents are identified and the same, samples are siblings.
             if (
-                samples[sample_i].mother
-                and samples[sample_i].father
-                and (samples[sample_i].mother == samples[sample_j].mother)
-                and (samples[sample_i].father == samples[sample_j].father)
+                sample_i.mother
+                and sample_i.father
+                and (sample_i.mother == sample_j.mother)
+                and (sample_i.father == sample_j.father)
             ):
-                samples[sample_i].siblings.append(
-                    sample_j,
-                )
+                sample_i.siblings.append(sample_j.sample_id)
                 continue
 
             # If only a single parent is identified and the same, samples are half siblings
-            if (
-                samples[sample_i].mother
-                and samples[sample_i].mother == samples[sample_j].mother
-            ) or (
-                samples[sample_i].father
-                and samples[sample_i].father == samples[sample_j].father
+            if (sample_i.mother and sample_i.mother == sample_j.mother) or (
+                sample_i.father and sample_i.father == sample_j.father
             ):
-                samples[sample_i].half_siblings.append(
-                    sample_j,
-                )
+                sample_i.half_siblings.append(sample_j.sample_id)
                 continue
 
             # If either set of one's grandparents is identified and equal to the other's parents,
             # they're aunt/uncle related
-            # NB: because we will only check an  i, j pair of samples a single time, (itertools.combinations)
+            # NB: because we will only check an i, j pair of samples a single time, (itertools.combinations)
             # we need to check both grandparents_i == parents_j and parents_i == grandparents_j.
-            # fmt: off
-            if (
-                samples[sample_i].is_aunt_nephew(samples[sample_j])
-                or samples[sample_j].is_aunt_nephew(samples[sample_i])
-            ):
-                samples[sample_i].aunt_nephews.append(
-                    sample_j,
-                )
-            # fmt: on
+            if sample_i.is_aunt_nephew(sample_j) or sample_j.is_aunt_nephew(sample_i):
+                sample_i.aunt_nephews.append(sample_j.sample_id)
         return samples
 
     @classmethod