Re-enable hl.impute_sex for WES samples

bpblanken · bpblanken · commit a7701e68e1ff · 2025-03-20T11:24:43.000-04:00
diff --git a/v03_pipeline/lib/methods/sex_check.py b/v03_pipeline/lib/methods/sex_check.py
@@ -0,0 +1,69 @@
+import hail as hl
+
+from v03_pipeline.lib.model import Sex
+
+AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
+AAF_THRESHOLD: float = 0.05  # Alternate allele frequency threshold for `hl.impute_sex`.
+BIALLELIC: int = 2
+XX_FSTAT_THRESHOLD: float = (
+    0.5  # F-stat threshold below which a sample will be called XX.
+)
+XY_FSTAT_THRESHOLD: float = (
+    0.75  # F-stat threshold above which a sample will be called XY.
+)
+
+
+def compute_sex_check_ht(mt: hl.MatrixTable) -> hl.Table:
+    """Impute a sex for every sample in `mt` with `hl.impute_sex`.
+
+    Rows are restricted to biallelic SNVs with a missing or empty `filters`
+    set, and samples with any non-diploid genotype call are dropped, before
+    imputation runs.
+
+    Returns a table of the remaining samples with a single `predicted_sex`
+    field holding a `Sex` value: UNKNOWN when `is_female` is missing, FEMALE
+    when it is true and MALE otherwise.
+
+    Raises ValueError when no samples remain after filtering, or when more
+    than AMBIGUOUS_THRESHOLD_PERC of the samples are called UNKNOWN.
+    """
+    # Filter to SNVs and biallelics
+    # NB: We should already have filtered biallelics, but just in case.
+    mt = mt.filter_rows(
+        (hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]),
+    )
+    # Keep only samples whose genotype calls are all diploid or missing.
+    mt = mt.filter_cols(hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)))
+
+    # Filter to PASS variants only (variants with empty or missing filter set)
+    mt = mt.filter_rows(
+        hl.is_missing(mt.filters) | (mt.filters.length() == 0),
+        keep=True,
+    )
+    impute_sex_ht = hl.impute_sex(
+        mt.GT,
+        male_threshold=XY_FSTAT_THRESHOLD,
+        female_threshold=XX_FSTAT_THRESHOLD,
+        aaf_threshold=AAF_THRESHOLD,
+    )
+    ht = mt.annotate_cols(**impute_sex_ht[mt.col_key]).cols()
+    ht = ht.select(
+        predicted_sex=(
+            hl.case()
+            .when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value)
+            .when(ht.is_female, Sex.FEMALE.value)
+            .default(Sex.MALE.value)
+        ),
+    )
+    ambiguous_perc = ht.aggregate(
+        hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value),
+    )
+    # With zero samples the fraction has no denominator and is expected to come
+    # back as None; fail clearly rather than with a TypeError on `>` below.
+    if ambiguous_perc is None:
+        msg = 'No samples remain after removing samples with non-diploid genotype calls, so sex cannot be imputed.'
+        raise ValueError(msg)
+    if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
+        msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous.  Please contact the methods team to investigate the callset.'
+        raise ValueError(msg)
+    return ht
diff --git a/v03_pipeline/lib/methods/sex_check_test.py b/v03_pipeline/lib/methods/sex_check_test.py
@@ -0,0 +1,59 @@
+import unittest
+from unittest.mock import patch
+
+import hail as hl
+
+from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht
+
+TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
+    'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
+)
+
+
+class SexCheckTest(unittest.TestCase):
+    def test_compute_sex_check_ht(self):
+        """Every sample in the fixture callset gets the expected predicted sex."""
+        mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
+        ht = compute_sex_check_ht(mt)
+        self.assertCountEqual(
+            ht.collect(),
+            [
+                hl.Struct(
+                    s='ROS_006_18Y03226_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03227_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03228_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05919_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05939_D1',
+                    predicted_sex='F',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05987_D1',
+                    predicted_sex='M',
+                ),
+            ],
+        )
+
+    def test_compute_sex_check_ht_ambiguous(self):
+        """Too many ambiguous samples must raise the ambiguity ValueError."""
+        mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
+        # A stricter XY threshold should leave more samples uncalled, which
+        # should trip the ambiguity check; pin the message, not just the type.
+        with patch('v03_pipeline.lib.methods.sex_check.XY_FSTAT_THRESHOLD', 0.95):
+            self.assertRaisesRegex(
+                ValueError,
+                'identified as ambiguous',
+                compute_sex_check_ht,
+                mt,
+            )
diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py
@@ -165,3 +165,7 @@ def allele_registry_gnomad_id(self) -> str:
 class SampleType(StrEnum):
     WES = 'WES'
     WGS = 'WGS'
+
+    @property
+    def predicted_sex_from_tdr(self):  # True only for WGS.
+        return self == SampleType.WGS
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -21,7 +21,6 @@
 from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
 from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget
-from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
 from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
     with_persisted_validation_errors,
 )
@@ -60,17 +59,6 @@ def requires(self) -> list[luigi.Task]:
                     ),
                 ),
             ]
-        if (
-            FeatureFlag.EXPECT_TDR_METRICS
-            and not self.skip_expect_tdr_metrics
-            and self.dataset_type.expect_tdr_metrics(
-                self.reference_genome,
-            )
-        ):
-            requirements = [
-                *requirements,
-                self.clone(WriteTDRMetricsFilesTask),
-            ]
         return [
             *requirements,
             CallsetTask(self.callset_path),
diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
@@ -24,7 +24,7 @@ class WriteMetadataForRunTaskTest(MockedDatarootTestCase):
     )
     @mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')
     @mock.patch(
-        'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask',
+        'v03_pipeline.lib.tasks.write_sex_check_table.WriteTDRMetricsFilesTask',
     )
     def test_write_metadata_for_run_task(
         self,
diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py
@@ -2,16 +2,37 @@
 import hailtop.fs as hfs
 import luigi
 
+from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht
 from v03_pipeline.lib.misc.io import import_imputed_sex
-from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_dir
+from v03_pipeline.lib.model.feature_flag import FeatureFlag
+from v03_pipeline.lib.paths import (
+    imported_callset_path,
+    sex_check_table_path,
+    tdr_metrics_dir,
+)
+from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
 from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
 from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask
 from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
 
 
+@luigi.util.inherits(BaseLoadingRunParams)
 class WriteSexCheckTableTask(BaseWriteTask):
     callset_path = luigi.Parameter()
 
+    @property
+    def predicted_sex_from_tdr(self):
+        # Truthy only when every TDR-metrics condition below holds; else sex is imputed.
+        return (
+            FeatureFlag.EXPECT_TDR_METRICS
+            and not self.skip_expect_tdr_metrics
+            and self.dataset_type.expect_tdr_metrics(
+                self.reference_genome,
+            )
+            and self.sample_type.predicted_sex_from_tdr
+        )
+
     def output(self) -> luigi.Target:
         return GCSorLocalTarget(
             sex_check_table_path(
@@ -21,16 +42,37 @@ def output(self) -> luigi.Target:
             ),
         )
 
-    def requires(self) -> luigi.Task:
-        return self.clone(WriteTDRMetricsFilesTask)
+    def requires(self) -> list[luigi.Task]:
+        requirements = []
+        if self.predicted_sex_from_tdr:  # sex is read from the TDR metrics files
+            requirements = [
+                *requirements,
+                self.clone(WriteTDRMetricsFilesTask),
+            ]
+        else:  # sex is imputed from the imported callset
+            requirements = [
+                *requirements,
+                self.clone(WriteImportedCallsetTask),
+            ]
+        return requirements
 
     def create_table(self) -> hl.Table:
         ht = None
-        for tdr_metrics_file in hfs.ls(
-            tdr_metrics_dir(self.reference_genome, self.dataset_type),
-        ):
-            if not ht:
-                ht = import_imputed_sex(tdr_metrics_file.path)
-                continue
-            ht = ht.union(import_imputed_sex(tdr_metrics_file.path))
+        if self.predicted_sex_from_tdr:  # union the imputed sex from every TDR metrics file
+            for tdr_metrics_file in hfs.ls(  # NOTE(review): an empty listing leaves ht as None
+                tdr_metrics_dir(self.reference_genome, self.dataset_type),
+            ):
+                if not ht:  # the first file seeds the table
+                    ht = import_imputed_sex(tdr_metrics_file.path)
+                    continue
+                ht = ht.union(import_imputed_sex(tdr_metrics_file.path))
+        else:  # impute sex from the genotypes of the imported callset
+            mt = hl.read_matrix_table(
+                imported_callset_path(
+                    self.reference_genome,
+                    self.dataset_type,
+                    self.callset_path,
+                ),
+            )
+            ht = compute_sex_check_ht(mt)
         return ht
diff --git a/v03_pipeline/lib/tasks/write_sex_check_table_test.py b/v03_pipeline/lib/tasks/write_sex_check_table_test.py
@@ -5,22 +5,31 @@
 import hail as hl
 import luigi.worker
 
-from v03_pipeline.lib.model import DatasetType, ReferenceGenome
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
 from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_path
 from v03_pipeline.lib.tasks.write_sex_check_table import (
     WriteSexCheckTableTask,
 )
 from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
 
+TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
+    'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
+)
+
 
 class WriteSexCheckTableTaskTest(MockedDatarootTestCase):
     @patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names')
     @patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query')
+    @patch(
+        'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag',
+    )
     def test_snv_sex_check_table_task(
         self,
+        mock_ff: Mock,
         mock_bq_metrics_query: Mock,
         mock_gen_bq_table_names: Mock,
     ) -> None:
+        mock_ff.EXPECT_TDR_METRICS = True
         mock_gen_bq_table_names.return_value = [
             'datarepo-7242affb.datarepo_RP_3053',
             'datarepo-5a72e31b.datarepo_RP_3056',
@@ -111,7 +120,12 @@ def test_snv_sex_check_table_task(
         write_sex_check_table = WriteSexCheckTableTask(
             reference_genome=ReferenceGenome.GRCh38,
             dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WGS,
             callset_path='na',
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-03',
         )
         worker.add(write_sex_check_table)
         worker.run()
@@ -143,3 +157,61 @@ def test_snv_sex_check_table_task(
             ),
         ) as f:
             self.assertTrue('collaborator_sample_id' in f.read())
+
+    @patch(
+        'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag',
+    )
+    def test_snv_wes_sex_check_table_task(
+        self,
+        mock_ff: Mock,
+    ) -> None:
+        mock_ff.EXPECT_TDR_METRICS = True  # flag on; WES should still take the imputed-sex path
+        worker = luigi.worker.Worker()
+        write_sex_check_table = WriteSexCheckTableTask(
+            reference_genome=ReferenceGenome.GRCh38,
+            dataset_type=DatasetType.SNV_INDEL,
+            sample_type=SampleType.WES,
+            callset_path=TEST_SEX_AND_RELATEDNESS_CALLSET_MT,
+            project_guids=['R0113_test_project'],
+            project_remap_paths=['test_remap'],
+            project_pedigree_paths=['test_pedigree'],
+            run_id='manual__2024-04-04',
+        )
+        worker.add(write_sex_check_table)
+        worker.run()
+        sex_check_ht = hl.read_table(
+            sex_check_table_path(
+                ReferenceGenome.GRCh38,
+                DatasetType.SNV_INDEL,
+                TEST_SEX_AND_RELATEDNESS_CALLSET_MT,
+            ),
+        )
+        self.assertCountEqual(
+            sex_check_ht.collect(),
+            [
+                hl.Struct(
+                    s='ROS_006_18Y03226_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03227_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_006_18Y03228_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05919_D1',
+                    predicted_sex='M',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05939_D1',
+                    predicted_sex='F',
+                ),
+                hl.Struct(
+                    s='ROS_007_19Y05987_D1',
+                    predicted_sex='M',
+                ),
+            ],
+        )

Original file line number	Diff line number	Diff line change
`@@ -24,7 +24,7 @@ class WriteMetadataForRunTaskTest(MockedDatarootTestCase):`
`24`	`24`	`)`
`25`	`25`	`@mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')`
`26`	`26`	`@mock.patch(`
`27`		`- 'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask',`
	`27`	`+ 'v03_pipeline.lib.tasks.write_sex_check_table.WriteTDRMetricsFilesTask',`
`28`	`28`	`)`
`29`	`29`	`def test_write_metadata_for_run_task(`
`30`	`30`	`self,`