From aba2b942e0ff5fbdb6e6e25417ed022bf188e4c1 Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Wed, 12 Mar 2025 22:59:39 -0400
Subject: [PATCH 01/10] only hard fail on samples in pedigree

---
 v03_pipeline/lib/misc/validation.py        | 13 ++++++++++--
 v03_pipeline/lib/misc/validation_test.py   | 24 +++++++++++++++++++---
 v03_pipeline/lib/tasks/validate_callset.py |  9 +++++++-
 3 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
index 063312f47..25c3404e6 100644
--- a/v03_pipeline/lib/misc/validation.py
+++ b/v03_pipeline/lib/misc/validation.py
@@ -2,6 +2,7 @@
 
 import hail as hl
 
+from v03_pipeline.lib.misc.pedigree import Family
 from v03_pipeline.lib.model import (
     DatasetType,
     ReferenceGenome,
@@ -132,6 +133,7 @@ def validate_imputed_sex_ploidy(
     mt: hl.MatrixTable,
     # NB: sex_check_ht will be undefined if sex checking is disabled for the run
     sex_check_ht: hl.Table | None = None,
+    pedigree_families: set[Family] | None = None,
     **_: Any,
 ) -> None:
     if not sex_check_ht:
@@ -161,8 +163,15 @@ def validate_imputed_sex_ploidy(
     discrepant_samples = mt.aggregate_cols(
         hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
     )
-    if discrepant_samples:
-        sorted_discrepant_samples = sorted(discrepant_samples)
+    loading_samples = (
+        {sample_id for family in pedigree_families for sample_id in family.samples}
+        if pedigree_families
+        else set()
+    )
+    discrepant_loading_samples = discrepant_samples & loading_samples
+
+    if discrepant_loading_samples:
+        sorted_discrepant_samples = sorted(discrepant_loading_samples)
         msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
         raise SeqrValidationError(
             msg,
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
index b1057e1e2..eafa1e5ac 100644
--- a/v03_pipeline/lib/misc/validation_test.py
+++ b/v03_pipeline/lib/misc/validation_test.py
@@ -2,6 +2,7 @@
 
 import hail as hl
 
+from v03_pipeline.lib.misc.pedigree import Family, Sample
 from v03_pipeline.lib.misc.validation import (
     SeqrValidationError,
     validate_allele_type,
@@ -11,7 +12,7 @@
     validate_no_duplicate_variants,
     validate_sample_type,
 )
-from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
 
 TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
 TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
@@ -171,7 +172,20 @@ def test_validate_imputed_sex_ploidy(self) -> None:
             .key_rows_by('locus')
             .key_cols_by('s')
         )
-        validate_imputed_sex_ploidy(mt, sex_check_ht)
+        pedigree_families = {
+            Family(
+                family_guid='',
+                samples={
+                    female_sample: Sample(female_sample, Sex.FEMALE),
+                    male_sample_1: Sample(male_sample_1, Sex.MALE),
+                    x0_sample: Sample(x0_sample, Sex.X0),
+                    xxy_sample: Sample(xxy_sample, Sex.XXY),
+                    xyy_sample: Sample(xyy_sample, Sex.XYY),
+                    xxx_sample: Sample(xxx_sample, Sex.XXX),
+                },
+            ),
+        }
+        validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families)
 
         # All calls on Y chromosome are valid
         mt = (
@@ -211,7 +225,7 @@ def test_validate_imputed_sex_ploidy(self) -> None:
             .key_rows_by('locus')
             .key_cols_by('s')
         )
-        validate_imputed_sex_ploidy(mt, sex_check_ht)
+        validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families)
 
         # Invalid X chromosome case
         mt = (
@@ -259,8 +273,12 @@ def test_validate_imputed_sex_ploidy(self) -> None:
             validate_imputed_sex_ploidy,
             mt,
             sex_check_ht,
+            pedigree_families,
         )
 
+        # Invalid X chromosome case, but invalid samples are missing from pedigree
+        validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families=set())
+
     def test_validate_imported_field_types(self) -> None:
         mt = hl.read_matrix_table(TEST_MITO_MT)
         validate_imported_field_types(mt, DatasetType.MITO, {})
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
index 7d0818240..6f9f69eeb 100644
--- a/v03_pipeline/lib/tasks/validate_callset.py
+++ b/v03_pipeline/lib/tasks/validate_callset.py
@@ -2,6 +2,8 @@
 import luigi
 import luigi.util
 
+from v03_pipeline.lib.misc.io import import_pedigree
+from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
 from v03_pipeline.lib.misc.validation import (
     SeqrValidationError,
     validate_allele_type,
@@ -19,7 +21,7 @@
 from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset
 from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
-from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget
+from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, RawFileTask
 from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import (
     UpdatedReferenceDatasetTask,
 )
@@ -52,6 +54,10 @@ def get_validation_dependencies(self) -> dict[str, hl.Table]:
                     self.callset_path,
                 ),
             )
+            deps['pedigree_families'] = parse_pedigree_ht_to_families(
+                import_pedigree(self.input()[1].path),
+            )
+
         return deps
 
     def complete(self) -> luigi.Target:
@@ -74,6 +80,7 @@ def output(self) -> luigi.Target:
     def requires(self) -> list[luigi.Task]:
         requirements = [
             self.clone(WriteImportedCallsetTask),
+            RawFileTask(self.project_pedigree_paths[self.project_i]),
         ]
         if not self.skip_validation and self.dataset_type.can_run_validation:
             requirements = [

From 2615bb95369518b26bf8a3b297cafdc934412b2f Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Thu, 13 Mar 2025 16:10:13 -0400
Subject: [PATCH 02/10] make ploidy part of family validation

---
 .../lib/misc/family_loading_failures.py       |  41 ++++
 .../lib/misc/family_loading_failures_test.py  | 188 ++++++++++++++++++
 v03_pipeline/lib/misc/validation.py           |  52 -----
 v03_pipeline/lib/misc/validation_test.py      | 161 +--------------
 v03_pipeline/lib/tasks/validate_callset.py    |  48 +----
 .../write_remapped_and_subsetted_callset.py   |  18 ++
 ...ite_remapped_and_subsetted_callset_test.py |  31 ++-
 7 files changed, 281 insertions(+), 258 deletions(-)

diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py
index 438b1706e..a43500fbf 100644
--- a/v03_pipeline/lib/misc/family_loading_failures.py
+++ b/v03_pipeline/lib/misc/family_loading_failures.py
@@ -174,3 +174,44 @@ def get_families_failed_sex_check(
                     f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
                 )
     return dict(failed_families)
+
+
+def get_families_failed_imputed_sex_ploidy(
+    mt: hl.MatrixTable,
+    sex_check_ht: hl.Table,
+    families: set[Family],
+) -> dict[Family, str]:
+    mt = mt.select_cols(
+        discrepant=(
+            (
+                # All calls are diploid or missing but the sex is Male
+                hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
+                & (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value)
+            )
+            | (
+                # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
+                hl.agg.any(~mt.GT.is_diploid())
+                & hl.literal(
+                    {
+                        Sex.FEMALE.value,
+                        Sex.X0.value,
+                        Sex.XYY.value,
+                        Sex.XXY.value,
+                        Sex.XXX.value,
+                    },
+                ).contains(sex_check_ht[mt.s].predicted_sex)
+            )
+        ),
+    )
+    discrepant_samples = mt.aggregate_cols(
+        hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
+    )
+    failed_families = {}
+    for family in families:
+        discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples
+        if discrepant_loadable_samples:
+            sorted_discrepant_samples = sorted(discrepant_loadable_samples)
+            failed_families[
+                family
+            ] = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
+    return failed_families
diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py
index c9b1858b2..6e6c1c923 100644
--- a/v03_pipeline/lib/misc/family_loading_failures_test.py
+++ b/v03_pipeline/lib/misc/family_loading_failures_test.py
@@ -6,12 +6,14 @@
     all_relatedness_checks,
     build_relatedness_check_lookup,
     build_sex_check_lookup,
+    get_families_failed_imputed_sex_ploidy,
     get_families_failed_sex_check,
 )
 from v03_pipeline.lib.misc.io import import_pedigree
 from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
 from v03_pipeline.lib.model import Sex
 
+TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
 TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
 
 
@@ -250,3 +252,189 @@ def test_get_families_failed_sex_check(self):
                 ],
             ],
         )
+
+    def test_get_families_failed_imputed_sex_ploidy(self) -> None:
+        female_sample = 'HG00731_1'
+        male_sample_1 = 'HG00732_1'
+        male_sample_2 = 'HG00732_1'
+        x0_sample = 'NA20899_1'
+        xxy_sample = 'NA20889_1'
+        xyy_sample = 'NA20891_1'
+        xxx_sample = 'NA20892_1'
+
+        sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
+        families = {
+            Family(
+                family_guid='',
+                samples={
+                    female_sample: Sample(female_sample, Sex.FEMALE),
+                    male_sample_1: Sample(male_sample_1, Sex.MALE),
+                    male_sample_2: Sample(male_sample_2, Sex.MALE),
+                    x0_sample: Sample(x0_sample, Sex.X0),
+                    xxy_sample: Sample(xxy_sample, Sex.XXY),
+                    xyy_sample: Sample(xyy_sample, Sex.XYY),
+                    xxx_sample: Sample(xxx_sample, Sex.XXX),
+                },
+            ),
+        }
+
+        # All calls on X chromosome are valid
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrX',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.Call(alleles=[0, 0], phased=False),
+                            hl.Call(alleles=[0], phased=False),
+                            hl.Call(alleles=[0, 0], phased=False),  # X0
+                            hl.Call(alleles=[0, 0], phased=False),  # XXY
+                            hl.Call(alleles=[0, 0], phased=False),  # XYY
+                            hl.Call(alleles=[0, 0], phased=False),  # XXX
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
+        failed_families = get_families_failed_imputed_sex_ploidy(
+            mt,
+            sex_check_ht,
+            families,
+        )
+        self.assertDictEqual(failed_families, {})
+
+        # All calls on Y chromosome are valid
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrY',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.missing(hl.tcall),
+                            hl.Call(alleles=[0], phased=False),
+                            hl.missing(hl.tcall),  # X0
+                            hl.Call(alleles=[0, 0], phased=False),  # XXY
+                            hl.Call(alleles=[0, 0], phased=False),  # XYY
+                            hl.missing(hl.tcall),  # XXX
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
+        failed_families = get_families_failed_imputed_sex_ploidy(
+            mt,
+            sex_check_ht,
+            families,
+        )
+        self.assertDictEqual(failed_families, {})
+
+        # Invalid X chromosome case
+        mt = (
+            hl.MatrixTable.from_parts(
+                rows={
+                    'locus': [
+                        hl.Locus(
+                            contig='chrX',
+                            position=1,
+                            reference_genome='GRCh38',
+                        ),
+                    ],
+                },
+                cols={
+                    's': [
+                        female_sample,
+                        male_sample_1,
+                        male_sample_2,
+                        x0_sample,
+                        xxy_sample,
+                        xyy_sample,
+                        xxx_sample,
+                    ],
+                },
+                entries={
+                    'GT': [
+                        [
+                            hl.Call(alleles=[0], phased=False),  # invalid Female call
+                            hl.Call(alleles=[0], phased=False),  # valid Male call
+                            hl.missing(hl.tcall),  # invalid Male call
+                            hl.Call(alleles=[0], phased=False),  # invalid X0 call
+                            hl.Call(alleles=[0], phased=False),  # invalid XXY call
+                            hl.missing(hl.tcall),  # valid XYY call
+                            hl.Call(alleles=[0, 0], phased=False),  # valid XXX call
+                        ],
+                    ],
+                },
+            )
+            .key_rows_by('locus')
+            .key_cols_by('s')
+        )
+        failed_families = get_families_failed_imputed_sex_ploidy(
+            mt,
+            sex_check_ht,
+            families,
+        )
+        self.assertCountEqual(
+            failed_families.values(),
+            [
+                "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
+            ],
+        )
+
+        # Invalid X chromosome case, but only discrepant family samples are reported
+        families = {
+            Family(
+                family_guid='',
+                samples={female_sample: Sample(female_sample, Sex.FEMALE)},
+            ),
+        }
+        failed_families = get_families_failed_imputed_sex_ploidy(
+            mt,
+            sex_check_ht,
+            families,
+        )
+        self.assertCountEqual(
+            failed_families.values(),
+            [
+                "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1']",
+            ],
+        )
diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
index 25c3404e6..4bc735c23 100644
--- a/v03_pipeline/lib/misc/validation.py
+++ b/v03_pipeline/lib/misc/validation.py
@@ -2,12 +2,10 @@
 
 import hail as hl
 
-from v03_pipeline.lib.misc.pedigree import Family
 from v03_pipeline.lib.model import (
     DatasetType,
     ReferenceGenome,
     SampleType,
-    Sex,
 )
 
 AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
@@ -129,56 +127,6 @@ def _validate_field(
         raise SeqrValidationError(msg)
 
 
-def validate_imputed_sex_ploidy(
-    mt: hl.MatrixTable,
-    # NB: sex_check_ht will be undefined if sex checking is disabled for the run
-    sex_check_ht: hl.Table | None = None,
-    pedigree_families: set[Family] | None = None,
-    **_: Any,
-) -> None:
-    if not sex_check_ht:
-        return
-    mt = mt.select_cols(
-        discrepant=(
-            (
-                # All calls are diploid or missing but the sex is Male
-                hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
-                & (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value)
-            )
-            | (
-                # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
-                hl.agg.any(~mt.GT.is_diploid())
-                & hl.literal(
-                    {
-                        Sex.FEMALE.value,
-                        Sex.X0.value,
-                        Sex.XYY.value,
-                        Sex.XXY.value,
-                        Sex.XXX.value,
-                    },
-                ).contains(sex_check_ht[mt.s].predicted_sex)
-            )
-        ),
-    )
-    discrepant_samples = mt.aggregate_cols(
-        hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
-    )
-    loading_samples = (
-        {sample_id for family in pedigree_families for sample_id in family.samples}
-        if pedigree_families
-        else set()
-    )
-    discrepant_loading_samples = discrepant_samples & loading_samples
-
-    if discrepant_loading_samples:
-        sorted_discrepant_samples = sorted(discrepant_loading_samples)
-        msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
-        raise SeqrValidationError(
-            msg,
-            {'imputed_sex_ploidy_failures': sorted_discrepant_samples},
-        )
-
-
 def validate_sample_type(
     mt: hl.MatrixTable,
     coding_and_noncoding_variants_ht: hl.Table,
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
index eafa1e5ac..e9f5a1a84 100644
--- a/v03_pipeline/lib/misc/validation_test.py
+++ b/v03_pipeline/lib/misc/validation_test.py
@@ -2,19 +2,16 @@
 
 import hail as hl
 
-from v03_pipeline.lib.misc.pedigree import Family, Sample
 from v03_pipeline.lib.misc.validation import (
     SeqrValidationError,
     validate_allele_type,
     validate_expected_contig_frequency,
     validate_imported_field_types,
-    validate_imputed_sex_ploidy,
     validate_no_duplicate_variants,
     validate_sample_type,
 )
-from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
 
-TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
 TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
 
 
@@ -123,162 +120,6 @@ def test_validate_allele_type(self) -> None:
             DatasetType.SNV_INDEL,
         )
 
-    def test_validate_imputed_sex_ploidy(self) -> None:
-        female_sample = 'HG00731_1'
-        male_sample_1 = 'HG00732_1'
-        male_sample_2 = 'HG00732_1'
-        x0_sample = 'NA20899_1'
-        xxy_sample = 'NA20889_1'
-        xyy_sample = 'NA20891_1'
-        xxx_sample = 'NA20892_1'
-
-        sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
-
-        # All calls on X chromosome are valid
-        mt = (
-            hl.MatrixTable.from_parts(
-                rows={
-                    'locus': [
-                        hl.Locus(
-                            contig='chrX',
-                            position=1,
-                            reference_genome='GRCh38',
-                        ),
-                    ],
-                },
-                cols={
-                    's': [
-                        female_sample,
-                        male_sample_1,
-                        x0_sample,
-                        xxy_sample,
-                        xyy_sample,
-                        xxx_sample,
-                    ],
-                },
-                entries={
-                    'GT': [
-                        [
-                            hl.Call(alleles=[0, 0], phased=False),
-                            hl.Call(alleles=[0], phased=False),
-                            hl.Call(alleles=[0, 0], phased=False),  # X0
-                            hl.Call(alleles=[0, 0], phased=False),  # XXY
-                            hl.Call(alleles=[0, 0], phased=False),  # XYY
-                            hl.Call(alleles=[0, 0], phased=False),  # XXX
-                        ],
-                    ],
-                },
-            )
-            .key_rows_by('locus')
-            .key_cols_by('s')
-        )
-        pedigree_families = {
-            Family(
-                family_guid='',
-                samples={
-                    female_sample: Sample(female_sample, Sex.FEMALE),
-                    male_sample_1: Sample(male_sample_1, Sex.MALE),
-                    x0_sample: Sample(x0_sample, Sex.X0),
-                    xxy_sample: Sample(xxy_sample, Sex.XXY),
-                    xyy_sample: Sample(xyy_sample, Sex.XYY),
-                    xxx_sample: Sample(xxx_sample, Sex.XXX),
-                },
-            ),
-        }
-        validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families)
-
-        # All calls on Y chromosome are valid
-        mt = (
-            hl.MatrixTable.from_parts(
-                rows={
-                    'locus': [
-                        hl.Locus(
-                            contig='chrY',
-                            position=1,
-                            reference_genome='GRCh38',
-                        ),
-                    ],
-                },
-                cols={
-                    's': [
-                        female_sample,
-                        male_sample_1,
-                        x0_sample,
-                        xxy_sample,
-                        xyy_sample,
-                        xxx_sample,
-                    ],
-                },
-                entries={
-                    'GT': [
-                        [
-                            hl.missing(hl.tcall),
-                            hl.Call(alleles=[0], phased=False),
-                            hl.missing(hl.tcall),  # X0
-                            hl.Call(alleles=[0, 0], phased=False),  # XXY
-                            hl.Call(alleles=[0, 0], phased=False),  # XYY
-                            hl.missing(hl.tcall),  # XXX
-                        ],
-                    ],
-                },
-            )
-            .key_rows_by('locus')
-            .key_cols_by('s')
-        )
-        validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families)
-
-        # Invalid X chromosome case
-        mt = (
-            hl.MatrixTable.from_parts(
-                rows={
-                    'locus': [
-                        hl.Locus(
-                            contig='chrX',
-                            position=1,
-                            reference_genome='GRCh38',
-                        ),
-                    ],
-                },
-                cols={
-                    's': [
-                        female_sample,
-                        male_sample_1,
-                        male_sample_2,
-                        x0_sample,
-                        xxy_sample,
-                        xyy_sample,
-                        xxx_sample,
-                    ],
-                },
-                entries={
-                    'GT': [
-                        [
-                            hl.Call(alleles=[0], phased=False),  # invalid Female call
-                            hl.Call(alleles=[0], phased=False),  # valid Male call
-                            hl.missing(hl.tcall),  # invalid Male call
-                            hl.Call(alleles=[0], phased=False),  # invalid X0 call
-                            hl.Call(alleles=[0], phased=False),  # invalid XXY call
-                            hl.missing(hl.tcall),  # valid XYY call
-                            hl.Call(alleles=[0, 0], phased=False),  # valid XXX call
-                        ],
-                    ],
-                },
-            )
-            .key_rows_by('locus')
-            .key_cols_by('s')
-        )
-        self.assertRaisesRegex(
-            SeqrValidationError,
-            "Found samples with misaligned ploidy with their provided imputed sex \\(first 10, if applicable\\) : \\['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1'\\].*",
-            validate_imputed_sex_ploidy,
-            mt,
-            sex_check_ht,
-            pedigree_families,
-        )
-
-        # Invalid X chromosome case, but invalid samples are missing from pedigree
-        validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families=set())
-
     def test_validate_imported_field_types(self) -> None:
         mt = hl.read_matrix_table(TEST_MITO_MT)
         validate_imported_field_types(mt, DatasetType.MITO, {})
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
index 6f9f69eeb..cc9cb7dd1 100644
--- a/v03_pipeline/lib/tasks/validate_callset.py
+++ b/v03_pipeline/lib/tasks/validate_callset.py
@@ -2,31 +2,25 @@
 import luigi
 import luigi.util
 
-from v03_pipeline.lib.misc.io import import_pedigree
-from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
 from v03_pipeline.lib.misc.validation import (
     SeqrValidationError,
     validate_allele_type,
     validate_expected_contig_frequency,
-    validate_imputed_sex_ploidy,
     validate_no_duplicate_variants,
     validate_sample_type,
 )
-from v03_pipeline.lib.model.feature_flag import FeatureFlag
 from v03_pipeline.lib.paths import (
     imported_callset_path,
-    sex_check_table_path,
     valid_reference_dataset_path,
 )
 from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset
 from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
-from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, RawFileTask
+from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget
 from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import (
     UpdatedReferenceDatasetTask,
 )
 from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask
-from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
 from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
     WriteValidationErrorsForRunTask,
 )
@@ -42,22 +36,6 @@ def get_validation_dependencies(self) -> dict[str, hl.Table]:
                 ReferenceDataset.gnomad_coding_and_noncoding,
             ),
         )
-        if (
-            FeatureFlag.CHECK_SEX_AND_RELATEDNESS
-            and self.dataset_type.check_sex_and_relatedness
-            and not self.skip_check_sex_and_relatedness
-        ):
-            deps['sex_check_ht'] = hl.read_table(
-                sex_check_table_path(
-                    self.reference_genome,
-                    self.dataset_type,
-                    self.callset_path,
-                ),
-            )
-            deps['pedigree_families'] = parse_pedigree_ht_to_families(
-                import_pedigree(self.input()[1].path),
-            )
-
         return deps
 
     def complete(self) -> luigi.Target:
@@ -78,10 +56,7 @@ def output(self) -> luigi.Target:
         )
 
     def requires(self) -> list[luigi.Task]:
-        requirements = [
-            self.clone(WriteImportedCallsetTask),
-            RawFileTask(self.project_pedigree_paths[self.project_i]),
-        ]
+        requirements = [self.clone(WriteImportedCallsetTask)]
         if not self.skip_validation and self.dataset_type.can_run_validation:
             requirements = [
                 *requirements,
@@ -92,15 +67,6 @@ def requires(self) -> list[luigi.Task]:
                     )
                 ),
             ]
-        if (
-            FeatureFlag.CHECK_SEX_AND_RELATEDNESS
-            and self.dataset_type.check_sex_and_relatedness
-            and not self.skip_check_sex_and_relatedness
-        ):
-            requirements = [
-                *requirements,
-                self.clone(WriteSexCheckTableTask),
-            ]
         return [
             *requirements,
             CallsetTask(self.callset_path),
@@ -128,10 +94,14 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
                 callset_path=self.callset_path,
                 validated_sample_type=self.sample_type.value,
             )
-        validation_dependencies = self.get_validation_dependencies()
+        coding_and_noncoding_variants_ht = hl.read_table(
+            valid_reference_dataset_path(
+                self.reference_genome,
+                ReferenceDataset.gnomad_coding_and_noncoding,
+            ),
+        )
         for validation_f in [
             validate_allele_type,
-            validate_imputed_sex_ploidy,
             validate_no_duplicate_variants,
             validate_expected_contig_frequency,
             validate_sample_type,
@@ -139,8 +109,8 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
             try:
                 validation_f(
                     mt,
+                    coding_and_noncoding_variants_ht=coding_and_noncoding_variants_ht,
                     **self.param_kwargs,
-                    **validation_dependencies,
                 )
             except SeqrValidationError as e:
                 validation_exceptions.append(e)
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
index 2ef35f3a1..479621601 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
@@ -3,6 +3,7 @@
 import luigi.util
 
 from v03_pipeline.lib.misc.family_loading_failures import (
+    get_families_failed_imputed_sex_ploidy,
     get_families_failed_missing_samples,
     get_families_failed_relatedness_check,
     get_families_failed_sex_check,
@@ -109,6 +110,7 @@ def create_table(self) -> hl.MatrixTable:
         )
         families_failed_relatedness_check = {}
         families_failed_sex_check = {}
+        families_failed_imputed_sex_ploidy = {}
         if (
             FeatureFlag.CHECK_SEX_AND_RELATEDNESS
             and self.dataset_type.check_sex_and_relatedness
@@ -134,12 +136,21 @@ def create_table(self) -> hl.MatrixTable:
                 sex_check_ht,
                 remap_lookup,
             )
+            families_failed_imputed_sex_ploidy = get_families_failed_imputed_sex_ploidy(
+                callset_mt,
+                sex_check_ht,
+                families
+                - families_failed_missing_samples.keys()
+                - families_failed_relatedness_check.keys()
+                - families_failed_sex_check.keys(),
+            )
 
         loadable_families = (
             families
             - families_failed_missing_samples.keys()
             - families_failed_relatedness_check.keys()
             - families_failed_sex_check.keys()
+            - families_failed_imputed_sex_ploidy.keys()
         )
         if not len(loadable_families):
             msg = 'All families failed validation checks'
@@ -154,6 +165,9 @@ def create_table(self) -> hl.MatrixTable:
                             families_failed_relatedness_check,
                         ),
                         'sex_check': format_failures(families_failed_sex_check),
+                        'ploidy_check': format_failures(
+                            families_failed_imputed_sex_ploidy,
+                        ),
                     },
                 },
             )
@@ -203,5 +217,9 @@ def create_table(self) -> hl.MatrixTable:
                     format_failures(families_failed_sex_check)
                     or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr)))
                 ),
+                ploidy_check=(
+                    format_failures(families_failed_imputed_sex_ploidy)
+                    or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr)))
+                ),
             ),
         )
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
index a5ed24799..afac86773 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
@@ -114,6 +114,7 @@ def test_write_remapped_and_subsetted_callset_task(
                         missing_samples={},
                         relatedness_check={},
                         sex_check={},
+                        ploidy_check={},
                     ),
                     family_samples={'abc_1': ['HG00731_1', 'HG00732_1', 'HG00733_1']},
                 ),
@@ -121,7 +122,7 @@ def test_write_remapped_and_subsetted_callset_task(
         )
 
     @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.FeatureFlag')
-    def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family(
+    def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks(
         self,
         mock_ff: Mock,
     ) -> None:
@@ -143,8 +144,9 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family(
         worker.run()
         self.assertTrue(wrsc_task.complete())
         mt = hl.read_matrix_table(wrsc_task.output().path)
-        # NB: one "family"/"sample" has been removed because of a failed sex check!
-        self.assertEqual(mt.count(), (30, 12))
+        # NB: one "family"/"sample" has been removed because of a failed sex check,
+        # and 4 removed because of a failed ploidy check!
+        self.assertEqual(mt.count(), (30, 8))
         self.assertEqual(
             mt.globals.collect(),
             [
@@ -157,16 +159,12 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family(
                     ),
                     family_samples={
                         '123_1': ['NA19675_1'],
-                        '234_1': ['NA19678_1'],
                         '345_1': ['NA19679_1'],
                         '456_1': ['NA20870_1'],
-                        '567_1': ['NA20872_1'],
                         '678_1': ['NA20874_1'],
                         '789_1': ['NA20875_1'],
                         '890_1': ['NA20876_1'],
                         '901_1': ['NA20877_1'],
-                        'bcd_1': ['NA20878_1'],
-                        'cde_1': ['NA20881_1'],
                         'efg_1': ['NA20888_1'],
                     },
                     failed_family_samples=hl.Struct(
@@ -180,6 +178,24 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family(
                                 'samples': ['NA20885_1'],
                             },
                         },
+                        ploidy_check={
+                            '234_1': hl.Struct(
+                                samples=['NA19678_1'],
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']",
+                            ),
+                            '567_1': hl.Struct(
+                                samples=['NA20872_1'],
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']",
+                            ),
+                            'bcd_1': hl.Struct(
+                                samples=['NA20878_1'],
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']",
+                            ),
+                            'cde_1': hl.Struct(
+                                samples=['NA20881_1'],
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']",
+                            ),
+                        },
                     ),
                 ),
             ],
@@ -337,6 +353,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
                                 ],
                             },
                         },
+                        'ploidy_check': {},
                     },
                 },
             )

From 3493dcfe80cbe48a0fce4b425089e7c080d3c71a Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Thu, 13 Mar 2025 16:14:01 -0400
Subject: [PATCH 03/10] remove get_validation_dependencies

---
 v03_pipeline/lib/tasks/validate_callset.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
index cc9cb7dd1..cf1443097 100644
--- a/v03_pipeline/lib/tasks/validate_callset.py
+++ b/v03_pipeline/lib/tasks/validate_callset.py
@@ -28,16 +28,6 @@
 
 @luigi.util.inherits(BaseLoadingRunParams)
 class ValidateCallsetTask(BaseUpdateTask):
-    def get_validation_dependencies(self) -> dict[str, hl.Table]:
-        deps = {}
-        deps['coding_and_noncoding_variants_ht'] = hl.read_table(
-            valid_reference_dataset_path(
-                self.reference_genome,
-                ReferenceDataset.gnomad_coding_and_noncoding,
-            ),
-        )
-        return deps
-
     def complete(self) -> luigi.Target:
         if super().complete():
             mt = hl.read_matrix_table(self.output().path)

From 2983f79e10323f2c9aec0890f21bf3b46a311f86 Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Thu, 13 Mar 2025 18:24:00 -0400
Subject: [PATCH 04/10] one failure

---
 v03_pipeline/lib/tasks/write_project_family_tables_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py
index dd535f988..e93d4fada 100644
--- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py
+++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py
@@ -137,6 +137,7 @@ def test_snv_write_project_family_tables_task(self) -> None:
                 },
                 relatedness_check={},
                 sex_check={},
+                ploidy_check={},
             ),
         )
         # Project table still contains all family guids

From 049c2084a26d7745545a4c08ab52612151efb731 Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Mon, 17 Mar 2025 14:37:42 -0400
Subject: [PATCH 05/10] comments

---
 .../lib/misc/family_loading_failures.py       |   2 +-
 .../lib/misc/family_loading_failures_test.py  |   8 +-
 .../write_remapped_and_subsetted_callset.py   |  12 +-
 ...ite_remapped_and_subsetted_callset_test.py | 103 ++++++------------
 4 files changed, 43 insertions(+), 82 deletions(-)

diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py
index a43500fbf..aa8d988ac 100644
--- a/v03_pipeline/lib/misc/family_loading_failures.py
+++ b/v03_pipeline/lib/misc/family_loading_failures.py
@@ -177,9 +177,9 @@ def get_families_failed_sex_check(
 
 
 def get_families_failed_imputed_sex_ploidy(
+    families: set[Family],
     mt: hl.MatrixTable,
     sex_check_ht: hl.Table,
-    families: set[Family],
 ) -> dict[Family, str]:
     mt = mt.select_cols(
         discrepant=(
diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py
index 6e6c1c923..58564d4b2 100644
--- a/v03_pipeline/lib/misc/family_loading_failures_test.py
+++ b/v03_pipeline/lib/misc/family_loading_failures_test.py
@@ -317,9 +317,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
             .key_cols_by('s')
         )
         failed_families = get_families_failed_imputed_sex_ploidy(
+            families,
             mt,
             sex_check_ht,
-            families,
         )
         self.assertDictEqual(failed_families, {})
 
@@ -362,9 +362,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
             .key_cols_by('s')
         )
         failed_families = get_families_failed_imputed_sex_ploidy(
+            families,
             mt,
             sex_check_ht,
-            families,
         )
         self.assertDictEqual(failed_families, {})
 
@@ -409,9 +409,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
             .key_cols_by('s')
         )
         failed_families = get_families_failed_imputed_sex_ploidy(
+            families,
             mt,
             sex_check_ht,
-            families,
         )
         self.assertCountEqual(
             failed_families.values(),
@@ -428,9 +428,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
             ),
         }
         failed_families = get_families_failed_imputed_sex_ploidy(
+            families,
             mt,
             sex_check_ht,
-            families,
         )
         self.assertCountEqual(
             failed_families.values(),
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
index 479621601..a4388910e 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
@@ -129,20 +129,20 @@ def create_table(self) -> hl.MatrixTable:
                 relatedness_check_ht,
                 remap_lookup,
             )
-            families_failed_sex_check = get_families_failed_sex_check(
+            families_failed_imputed_sex_ploidy = get_families_failed_imputed_sex_ploidy(
                 families
                 - families_failed_missing_samples.keys()
                 - families_failed_relatedness_check.keys(),
-                sex_check_ht,
-                remap_lookup,
-            )
-            families_failed_imputed_sex_ploidy = get_families_failed_imputed_sex_ploidy(
                 callset_mt,
                 sex_check_ht,
+            )
+            families_failed_sex_check = get_families_failed_sex_check(
                 families
                 - families_failed_missing_samples.keys()
                 - families_failed_relatedness_check.keys()
-                - families_failed_sex_check.keys(),
+                - families_failed_imputed_sex_ploidy.keys(),
+                sex_check_ht,
+                remap_lookup,
             )
 
         loadable_families = (
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
index afac86773..9471aafa3 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
@@ -237,123 +237,84 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
             self.assertDictEqual(
                 json.load(f),
                 {
-                    'project_guids': [
-                        'R0114_project4',
-                    ],
-                    'error_messages': [
-                        'All families failed validation checks',
-                    ],
+                    'project_guids': ['R0114_project4'],
+                    'error_messages': ['All families failed validation checks'],
                     'failed_family_samples': {
                         'missing_samples': {
                             'efg_1': {
-                                'samples': [
-                                    'NA99999_1',
-                                ],
-                                'reasons': [
-                                    "Missing samples: {'NA99999_1'}",
-                                ],
+                                'samples': ['NA99999_1'],
+                                'reasons': ["Missing samples: {'NA99999_1'}"],
                             },
                         },
                         'relatedness_check': {},
                         'sex_check': {
-                            '789_1': {
-                                'samples': [
-                                    'NA20875_1',
-                                ],
+                            '890_1': {
+                                'samples': ['NA20876_1'],
                                 'reasons': [
-                                    'Sample NA20875_1 has pedigree sex M but imputed sex F',
+                                    'Sample NA20876_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
                             '456_1': {
-                                'samples': [
-                                    'NA20870_1',
-                                ],
+                                'samples': ['NA20870_1'],
                                 'reasons': [
                                     'Sample NA20870_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
                             '123_1': {
-                                'samples': [
-                                    'NA19675_1',
-                                ],
+                                'samples': ['NA19675_1'],
                                 'reasons': [
                                     'Sample NA19675_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
-                            'cde_1': {
-                                'samples': [
-                                    'NA20881_1',
-                                ],
+                            '678_1': {
+                                'samples': ['NA20874_1'],
                                 'reasons': [
-                                    'Sample NA20881_1 has pedigree sex F but imputed sex M',
+                                    'Sample NA20874_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
-                            '901_1': {
-                                'samples': [
-                                    'NA20877_1',
-                                ],
+                            '789_1': {
+                                'samples': ['NA20875_1'],
                                 'reasons': [
-                                    'Sample NA20877_1 has pedigree sex M but imputed sex F',
+                                    'Sample NA20875_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
-                            '678_1': {
-                                'samples': [
-                                    'NA20874_1',
-                                ],
+                            '901_1': {
+                                'samples': ['NA20877_1'],
                                 'reasons': [
-                                    'Sample NA20874_1 has pedigree sex M but imputed sex F',
+                                    'Sample NA20877_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
                             '345_1': {
-                                'samples': [
-                                    'NA19679_1',
-                                ],
+                                'samples': ['NA19679_1'],
                                 'reasons': [
                                     'Sample NA19679_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
-                            '890_1': {
-                                'samples': [
-                                    'NA20876_1',
-                                ],
-                                'reasons': [
-                                    'Sample NA20876_1 has pedigree sex M but imputed sex F',
-                                ],
-                            },
                             'def_1': {
-                                'samples': [
-                                    'NA20885_1',
-                                ],
+                                'samples': ['NA20885_1'],
                                 'reasons': [
                                     'Sample NA20885_1 has pedigree sex M but imputed sex F',
                                 ],
                             },
+                        },
+                        'ploidy_check': {
+                            'cde_1': {
+                                'samples': ['NA20881_1'],
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']",
+                            },
                             '234_1': {
-                                'samples': [
-                                    'NA19678_1',
-                                ],
-                                'reasons': [
-                                    'Sample NA19678_1 has pedigree sex F but imputed sex M',
-                                ],
+                                'samples': ['NA19678_1'],
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']",
                             },
                             'bcd_1': {
-                                'samples': [
-                                    'NA20878_1',
-                                ],
-                                'reasons': [
-                                    'Sample NA20878_1 has pedigree sex F but imputed sex M',
-                                ],
+                                'samples': ['NA20878_1'],
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']",
                             },
                             '567_1': {
-                                'samples': [
-                                    'NA20872_1',
-                                ],
-                                'reasons': [
-                                    'Sample NA20872_1 has pedigree sex F but imputed sex M',
-                                ],
+                                'samples': ['NA20872_1'],
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']",
                             },
                         },
-                        'ploidy_check': {},
                     },
                 },
             )

From 3d48d8a481c83fef2b83643fea51284824ccd92e Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Mon, 17 Mar 2025 17:03:05 -0400
Subject: [PATCH 06/10] add to metadata json

---
 v03_pipeline/lib/tasks/write_metadata_for_run.py      | 8 +++++++-
 v03_pipeline/lib/tasks/write_metadata_for_run_test.py | 1 +
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py
index cc012a926..71b115c2a 100644
--- a/v03_pipeline/lib/tasks/write_metadata_for_run.py
+++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py
@@ -48,6 +48,7 @@ def run(self) -> None:
                 'missing_samples': {},
                 'relatedness_check': {},
                 'sex_check': {},
+                'ploidy_check': {},
             },
             'relatedness_check_file_path': relatedness_check_tsv_path(
                 self.reference_genome,
@@ -62,7 +63,12 @@ def run(self) -> None:
                 **collected_globals['family_samples'],
                 **metadata_json['family_samples'],
             }
-            for key in ['missing_samples', 'relatedness_check', 'sex_check']:
+            for key in [
+                'missing_samples',
+                'relatedness_check',
+                'sex_check',
+                'ploidy_check',
+            ]:
                 metadata_json['failed_family_samples'][key] = {
                     **collected_globals['failed_family_samples'][key],
                     **metadata_json['failed_family_samples'][key],
diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
index dc007fbcb..471bb61c5 100644
--- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
+++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
@@ -50,6 +50,7 @@ def test_write_metadata_for_run_task(self) -> None:
                         },
                         'relatedness_check': {},
                         'sex_check': {},
+                        'ploidy_check': {},
                     },
                     'family_samples': {
                         'abc_1': [

From b583a26656a2a70720f9f5a4e5a1a72201cfc186 Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Tue, 18 Mar 2025 16:32:32 -0400
Subject: [PATCH 07/10] not applicable

---
 v03_pipeline/lib/misc/family_loading_failures.py |  2 +-
 .../lib/misc/family_loading_failures_test.py     |  4 ++--
 .../write_remapped_and_subsetted_callset_test.py | 16 ++++++++--------
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py
index aa8d988ac..8c3ce310a 100644
--- a/v03_pipeline/lib/misc/family_loading_failures.py
+++ b/v03_pipeline/lib/misc/family_loading_failures.py
@@ -213,5 +213,5 @@ def get_families_failed_imputed_sex_ploidy(
             sorted_discrepant_samples = sorted(discrepant_loadable_samples)
             failed_families[
                 family
-            ] = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
+            ] = f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}'
     return failed_families
diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py
index 58564d4b2..a7e4ec03b 100644
--- a/v03_pipeline/lib/misc/family_loading_failures_test.py
+++ b/v03_pipeline/lib/misc/family_loading_failures_test.py
@@ -416,7 +416,7 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
         self.assertCountEqual(
             failed_families.values(),
             [
-                "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
+                "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
             ],
         )
 
@@ -435,6 +435,6 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
         self.assertCountEqual(
             failed_families.values(),
             [
-                "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1']",
+                "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']",
             ],
         )
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
index 9471aafa3..e9fb4c763 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
@@ -181,19 +181,19 @@ def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks(
                         ploidy_check={
                             '234_1': hl.Struct(
                                 samples=['NA19678_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']",
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']",
                             ),
                             '567_1': hl.Struct(
                                 samples=['NA20872_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']",
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']",
                             ),
                             'bcd_1': hl.Struct(
                                 samples=['NA20878_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']",
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']",
                             ),
                             'cde_1': hl.Struct(
                                 samples=['NA20881_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']",
+                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']",
                             ),
                         },
                     ),
@@ -300,19 +300,19 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
                         'ploidy_check': {
                             'cde_1': {
                                 'samples': ['NA20881_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']",
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']",
                             },
                             '234_1': {
                                 'samples': ['NA19678_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']",
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']",
                             },
                             'bcd_1': {
                                 'samples': ['NA20878_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']",
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']",
                             },
                             '567_1': {
                                 'samples': ['NA20872_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']",
+                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']",
                             },
                         },
                     },

From 3a6c725ff46f785d79eaf9ab2fe94e8e7de1c230 Mon Sep 17 00:00:00 2001
From: Julia Klugherz <juliaklugherz@gmail.com>
Date: Tue, 18 Mar 2025 17:08:36 -0400
Subject: [PATCH 08/10] formatting

---
 .../lib/misc/family_loading_failures.py       |  8 +--
 .../lib/misc/family_loading_failures_test.py  |  8 ++-
 ...ite_remapped_and_subsetted_callset_test.py | 64 ++++++++++++-------
 3 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py
index 8c3ce310a..f047e3f50 100644
--- a/v03_pipeline/lib/misc/family_loading_failures.py
+++ b/v03_pipeline/lib/misc/family_loading_failures.py
@@ -206,12 +206,12 @@ def get_families_failed_imputed_sex_ploidy(
     discrepant_samples = mt.aggregate_cols(
         hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
     )
-    failed_families = {}
+    failed_families = defaultdict(list)
     for family in families:
         discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples
         if discrepant_loadable_samples:
             sorted_discrepant_samples = sorted(discrepant_loadable_samples)
-            failed_families[
-                family
-            ] = f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}'
+            failed_families[family].append(
+                f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}',
+            )
     return failed_families
diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py
index a7e4ec03b..2352542e0 100644
--- a/v03_pipeline/lib/misc/family_loading_failures_test.py
+++ b/v03_pipeline/lib/misc/family_loading_failures_test.py
@@ -416,7 +416,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
         self.assertCountEqual(
             failed_families.values(),
             [
-                "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
+                [
+                    "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
+                ],
             ],
         )
 
@@ -435,6 +437,8 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None:
         self.assertCountEqual(
             failed_families.values(),
             [
-                "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']",
+                [
+                    "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']",
+                ],
             ],
         )
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
index e9fb4c763..564eb2eed 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
@@ -179,22 +179,30 @@ def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks(
                             },
                         },
                         ploidy_check={
-                            '234_1': hl.Struct(
-                                samples=['NA19678_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']",
-                            ),
-                            '567_1': hl.Struct(
-                                samples=['NA20872_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']",
-                            ),
-                            'bcd_1': hl.Struct(
-                                samples=['NA20878_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']",
-                            ),
-                            'cde_1': hl.Struct(
-                                samples=['NA20881_1'],
-                                reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']",
-                            ),
+                            '234_1': {
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']",
+                                ],
+                                'samples': ['NA19678_1'],
+                            },
+                            '567_1': {
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']",
+                                ],
+                                'samples': ['NA20872_1'],
+                            },
+                            'bcd_1': {
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']",
+                                ],
+                                'samples': ['NA20878_1'],
+                            },
+                            'cde_1': {
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']",
+                                ],
+                                'samples': ['NA20881_1'],
+                            },
                         },
                     ),
                 ),
@@ -298,21 +306,29 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
                             },
                         },
                         'ploidy_check': {
-                            'cde_1': {
-                                'samples': ['NA20881_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']",
+                            '567_1': {
+                                'samples': ['NA20872_1'],
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']",
+                                ],
                             },
                             '234_1': {
                                 'samples': ['NA19678_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']",
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']",
+                                ],
                             },
                             'bcd_1': {
                                 'samples': ['NA20878_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']",
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']",
+                                ],
                             },
-                            '567_1': {
-                                'samples': ['NA20872_1'],
-                                'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']",
+                            'cde_1': {
+                                'samples': ['NA20881_1'],
+                                'reasons': [
+                                    "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']",
+                                ],
                             },
                         },
                     },

From 36b8f5e129e1f927876257332479e880c1a611d9 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <b.p.blankenmeister@gmail.com>
Date: Thu, 20 Mar 2025 18:33:26 +0800
Subject: [PATCH 09/10] bugfix: handle missing metrics in tdr (#1060)

* handle missing metrics in tdr

* ruff
---
 .../lib/misc/terra_data_repository.py         | 17 ++++++++++++----
 .../lib/misc/terra_data_repository_test.py    | 20 ++++++++++++++++++-
 2 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/v03_pipeline/lib/misc/terra_data_repository.py b/v03_pipeline/lib/misc/terra_data_repository.py
index 2428a429c..8e4cd975a 100644
--- a/v03_pipeline/lib/misc/terra_data_repository.py
+++ b/v03_pipeline/lib/misc/terra_data_repository.py
@@ -10,10 +10,10 @@
 from v03_pipeline.lib.misc.requests import requests_retry_session
 
 BIGQUERY_METRICS = [
-    'collaborator_sample_id',
     'predicted_sex',
     'contamination_rate',
     'percent_bases_at_20x',
+    'collaborator_sample_id',
     'mean_coverage',
 ]
 BIGQUERY_RESOURCE = 'bigquery'
@@ -63,9 +63,18 @@ def bq_metrics_query(bq_table_name: str) -> google.cloud.bigquery.table.RowItera
         msg = f'{bq_table_name} does not match expected pattern'
         raise ValueError(msg)
     client = bigquery.Client()
+
+    # not all columns are guaranteed to be present, coalesce if missing
+    table_ddl = next(
+        client.query_and_wait(
+            f"""
+        SELECT ddl FROM `{bq_table_name}`.INFORMATION_SCHEMA.TABLES where table_name='sample';
+        """,  # noqa: S608
+        ),
+    )[0]
+    metrics = [(m if m in table_ddl else f'NULL AS {m}') for m in BIGQUERY_METRICS]
     return client.query_and_wait(
         f"""
-        SELECT {','.join(BIGQUERY_METRICS)}
-        FROM `{bq_table_name}.sample`
-    """,  # noqa: S608
+        SELECT {','.join(metrics)} FROM `{bq_table_name}.sample`;
+        """,  # noqa: S608
     )
diff --git a/v03_pipeline/lib/misc/terra_data_repository_test.py b/v03_pipeline/lib/misc/terra_data_repository_test.py
index 70f26c81a..7084bcd8b 100644
--- a/v03_pipeline/lib/misc/terra_data_repository_test.py
+++ b/v03_pipeline/lib/misc/terra_data_repository_test.py
@@ -2,13 +2,14 @@
 import os
 import unittest
 from types import SimpleNamespace
-from unittest.mock import Mock, patch
+from unittest.mock import Mock, call, patch
 
 import responses
 
 from v03_pipeline.lib.misc.terra_data_repository import (
     TDR_ROOT_URL,
     _get_dataset_ids,
+    bq_metrics_query,
     gen_bq_table_names,
 )
 
@@ -301,3 +302,20 @@ def test_gen_bq_table_names(self, _: Mock) -> None:
                 'datarepo-aada2e3b.datarepo_RP_3059',
             ],
         )
+
+    @patch('v03_pipeline.lib.misc.terra_data_repository.bigquery.Client')
+    def test_bq_metrics_query_missing_metrics(
+        self,
+        mock_bq_client: Mock,
+        _: Mock,
+    ) -> None:
+        mock_bq_client.return_value.query_and_wait.return_value = iter(
+            [['predicted_sex,contamination_rate,percent_bases_at_20x']],
+        )
+        bq_metrics_query('datarepo-7242affb.datarepo_RP_3053')
+        self.assertEqual(
+            mock_bq_client.return_value.query_and_wait.mock_calls[1],
+            call(
+                '\n        SELECT predicted_sex,contamination_rate,percent_bases_at_20x,NULL AS collaborator_sample_id,NULL AS mean_coverage FROM `datarepo-7242affb.datarepo_RP_3053.sample`;\n        ',
+            ),
+        )

From a7701e68e1ff05268c994405185312f7227cf7b8 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Thu, 20 Mar 2025 11:24:43 -0400
Subject: [PATCH 10/10] Re-enable hl.impute_sex for WES samples

---
 v03_pipeline/lib/methods/sex_check.py         | 50 +++++++++++++
 v03_pipeline/lib/methods/sex_check_test.py    | 55 ++++++++++++++
 v03_pipeline/lib/model/definitions.py         |  4 +
 .../lib/tasks/write_imported_callset.py       | 12 ---
 .../lib/tasks/write_metadata_for_run_test.py  |  2 +-
 .../lib/tasks/write_sex_check_table.py        | 62 +++++++++++++---
 .../lib/tasks/write_sex_check_table_test.py   | 74 ++++++++++++++++++-
 7 files changed, 235 insertions(+), 24 deletions(-)
 create mode 100644 v03_pipeline/lib/methods/sex_check.py
 create mode 100644 v03_pipeline/lib/methods/sex_check_test.py

diff --git a/v03_pipeline/lib/methods/sex_check.py b/v03_pipeline/lib/methods/sex_check.py
new file mode 100644
index 000000000..0d9aced28
--- /dev/null
+++ b/v03_pipeline/lib/methods/sex_check.py
@@ -0,0 +1,50 @@
+import hail as hl
+
+from v03_pipeline.lib.model import Sex
+
+AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
+AAF_THRESHOLD: float = 0.05  # Alternate allele frequency threshold for `hl.impute_sex`.
+BIALLELIC: int = 2
+XX_FSTAT_THRESHOLD: float = (
+    0.5  # F-stat threshold below which a sample will be called XX
+)
+XY_FSTAT_THRESHOLD: float = (
+    0.75  # F-stat threshold above which a sample will be called XY.
+)
+
+
+def compute_sex_check_ht(mt: hl.MatrixTable) -> hl.Table:
+    # Filter to SNVs and biallelics
+    # NB: We should already have filtered biallelics, but just in case.
+    mt = mt.filter_rows(
+        (hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]),
+    )
+    mt = mt.filter_cols(hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)))
+
+    # Filter to PASS variants only (variants with empty or missing filter set)
+    mt = mt.filter_rows(
+        hl.is_missing(mt.filters) | (mt.filters.length() == 0),
+        keep=True,
+    )
+    impute_sex_ht = hl.impute_sex(
+        mt.GT,
+        male_threshold=XY_FSTAT_THRESHOLD,
+        female_threshold=XX_FSTAT_THRESHOLD,
+        aaf_threshold=AAF_THRESHOLD,
+    )
+    ht = mt.annotate_cols(**impute_sex_ht[mt.col_key]).cols()
+    ht = ht.select(
+        predicted_sex=(
+            hl.case()
+            .when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value)
+            .when(ht.is_female, Sex.FEMALE.value)
+            .default(Sex.MALE.value)
+        ),
+    )
+    ambiguous_perc = ht.aggregate(
+        hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value),
+    )
+    if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
+        msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous.  Please contact the methods team to investigate the callset.'
+        raise ValueError(msg)
+    return ht
diff --git a/v03_pipeline/lib/methods/sex_check_test.py b/v03_pipeline/lib/methods/sex_check_test.py
new file mode 100644
index 000000000..f87c07012
--- /dev/null
+++ b/v03_pipeline/lib/methods/sex_check_test.py
@@ -0,0 +1,55 @@
+import unittest
+from unittest.mock import patch
+
+import hail as hl
+
+from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht
+
+TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
+    'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
+)
+TEST_PEDIGREE = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
+
+
+class SexCheckTest(unittest.TestCase):
+    def test_compute_sex_check_ht(self):
+        mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
+        ht = compute_sex_check_ht(mt)
+        self.assertCountEqual(
+            ht.collect(),
+            [
+                hl.Struct(
+                    s='ROS_006_18Y03226_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03227_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03228_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05919_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05939_D1',
+                    predicted_sex='F',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05987_D1',
+                    predicted_sex='M',
+                ),
+            ],
+        )
+
+    def test_compute_sex_check_ht_ambiguous(self):
+        mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
+        with patch('v03_pipeline.lib.methods.sex_check.XY_FSTAT_THRESHOLD', 0.95):
+            self.assertRaises(
+                ValueError,
+                compute_sex_check_ht,
+                mt,
+            )
diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py
index 171686e29..216a37c4f 100644
--- a/v03_pipeline/lib/model/definitions.py
+++ b/v03_pipeline/lib/model/definitions.py
@@ -165,3 +165,7 @@ def allele_registry_gnomad_id(self) -> str:
 class SampleType(StrEnum):
     WES = 'WES'
     WGS = 'WGS'
+
+    @property
+    def predicted_sex_from_tdr(self):
+        return self == SampleType.WGS
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
index 75c62aeff..e547217d9 100644
--- a/v03_pipeline/lib/tasks/write_imported_callset.py
+++ b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -21,7 +21,6 @@
 from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
 from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget
-from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
 from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
     with_persisted_validation_errors,
 )
@@ -60,17 +59,6 @@ def requires(self) -> list[luigi.Task]:
                     ),
                 ),
             ]
-        if (
-            FeatureFlag.EXPECT_TDR_METRICS
-            and not self.skip_expect_tdr_metrics
-            and self.dataset_type.expect_tdr_metrics(
-                self.reference_genome,
-            )
-        ):
-            requirements = [
-                *requirements,
-                self.clone(WriteTDRMetricsFilesTask),
-            ]
         return [
             *requirements,
             CallsetTask(self.callset_path),
diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
index d1c9b1cf1..41e8e1c30 100644
--- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
+++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
@@ -24,7 +24,7 @@ class WriteMetadataForRunTaskTest(MockedDatarootTestCase):
     )
     @mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')
     @mock.patch(
-        'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask',
+        'v03_pipeline.lib.tasks.write_sex_check_table.WriteTDRMetricsFilesTask',
     )
     def test_write_metadata_for_run_task(
         self,
diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py
index f87233507..5a9ed6531 100644
--- a/v03_pipeline/lib/tasks/write_sex_check_table.py
+++ b/v03_pipeline/lib/tasks/write_sex_check_table.py
@@ -2,16 +2,37 @@
 import hailtop.fs as hfs
 import luigi
 
+from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht
 from v03_pipeline.lib.misc.io import import_imputed_sex
-from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_dir
+from v03_pipeline.lib.model.feature_flag import FeatureFlag
+from v03_pipeline.lib.paths import (
+    imported_callset_path,
+    sex_check_table_path,
+    tdr_metrics_dir,
+)
+from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
 from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask
 from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
 
 
+@luigi.util.inherits(BaseLoadingRunParams)
 class WriteSexCheckTableTask(BaseWriteTask):
     callset_path = luigi.Parameter()
 
+    @property
+    def predicted_sex_from_tdr(self):
+        # complicated enough to need a helper :/
+        return (
+            FeatureFlag.EXPECT_TDR_METRICS
+            and not self.skip_expect_tdr_metrics
+            and self.dataset_type.expect_tdr_metrics(
+                self.reference_genome,
+            )
+            and self.sample_type.predicted_sex_from_tdr
+        )
+
     def output(self) -> luigi.Target:
         return GCSorLocalTarget(
             sex_check_table_path(
@@ -21,16 +42,37 @@ def output(self) -> luigi.Target:
             ),
         )
 
-    def requires(self) -> luigi.Task:
-        return self.clone(WriteTDRMetricsFilesTask)
+    def requires(self) -> list[luigi.Task]:
+        requirements = []
+        if self.predicted_sex_from_tdr:
+            requirements = [
+                *requirements,
+                self.clone(WriteTDRMetricsFilesTask),
+            ]
+        else:
+            requirements = [
+                *requirements,
+                self.clone(WriteImportedCallsetTask),
+            ]
+        return requirements
 
     def create_table(self) -> hl.Table:
         ht = None
-        for tdr_metrics_file in hfs.ls(
-            tdr_metrics_dir(self.reference_genome, self.dataset_type),
-        ):
-            if not ht:
-                ht = import_imputed_sex(tdr_metrics_file.path)
-                continue
-            ht = ht.union(import_imputed_sex(tdr_metrics_file.path))
+        if self.predicted_sex_from_tdr:
+            for tdr_metrics_file in hfs.ls(
+                tdr_metrics_dir(self.reference_genome, self.dataset_type),
+            ):
+                if not ht:
+                    ht = import_imputed_sex(tdr_metrics_file.path)
+                    continue
+                ht = ht.union(import_imputed_sex(tdr_metrics_file.path))
+        else:
+            mt = hl.read_matrix_table(
+                imported_callset_path(
+                    self.reference_genome,
+                    self.dataset_type,
+                    self.callset_path,
+                ),
+            )
+            ht = compute_sex_check_ht(mt)
         return ht
diff --git a/v03_pipeline/lib/tasks/write_sex_check_table_test.py b/v03_pipeline/lib/tasks/write_sex_check_table_test.py
index fdf72d335..3b3e7bd4d 100644
--- a/v03_pipeline/lib/tasks/write_sex_check_table_test.py
+++ b/v03_pipeline/lib/tasks/write_sex_check_table_test.py
@@ -5,22 +5,31 @@
 import hail as hl
 import luigi.worker
 
-from v03_pipeline.lib.model import DatasetType, ReferenceGenome
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
 from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_path
 from v03_pipeline.lib.tasks.write_sex_check_table import (
     WriteSexCheckTableTask,
 )
 from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
 
+TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
+    'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
+)
+
 
 class WriteSexCheckTableTaskTest(MockedDatarootTestCase):
     @patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names')
     @patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query')
+    @patch(
+        'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag',
+    )
     def test_snv_sex_check_table_task(
         self,
+        mock_ff: Mock,
         mock_bq_metrics_query: Mock,
         mock_gen_bq_table_names: Mock,
     ) -> None:
+        mock_ff.EXPECT_TDR_METRICS = True
         mock_gen_bq_table_names.return_value = [
             'datarepo-7242affb.datarepo_RP_3053',
             'datarepo-5a72e31b.datarepo_RP_3056',
@@ -111,7 +120,12 @@ def test_snv_sex_check_table_task(
         write_sex_check_table = WriteSexCheckTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
             callset_path='na',
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-03',
         )
         worker.add(write_sex_check_table)
         worker.run()
@@ -143,3 +157,61 @@ def test_snv_sex_check_table_task(
             ),
         ) as f:
             self.assertTrue('collaborator_sample_id' in f.read())
+
+    @patch(
+        'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag',
+    )
+    def test_snv_wes_sex_check_table_task(
+        self,
+        mock_ff: Mock,
+    ) -> None:
+        mock_ff.EXPECT_TDR_METRICS = True
+        worker = luigi.worker.Worker()
+        write_sex_check_table = WriteSexCheckTableTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WES,
+            callset_path=TEST_SEX_AND_RELATEDNESS_CALLSET_MT,
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-04',
+        )
+        worker.add(write_sex_check_table)
+        worker.run()
+        sex_check_ht = hl.read_table(
+            sex_check_table_path(
+                ReferenceGenome.GRCh38,
+                DatasetType.SNV_INDEL,
+                TEST_SEX_AND_RELATEDNESS_CALLSET_MT,
+            ),
+        )
+        self.assertCountEqual(
+            sex_check_ht.collect(),
+            [
+                hl.Struct(
+                    s='ROS_006_18Y03226_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03227_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03228_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05919_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05939_D1',
+                    predicted_sex='F',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05987_D1',
+                    predicted_sex='M',
+                ),
+            ],
+        )