Skip to content

Commit a7701e6

Browse files
committed
Re-enable hl.impute_sex for WES samples
1 parent 10b4c02 commit a7701e6

File tree

7 files changed

+235
-24
lines changed

7 files changed

+235
-24
lines changed

v03_pipeline/lib/methods/sex_check.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.model import Sex
4+
5+
AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
6+
AAF_THRESHOLD: float = 0.05 # Alternate allele frequency threshold for `hl.impute_sex`.
7+
BIALLELIC: int = 2
8+
XX_FSTAT_THRESHOLD: float = (
9+
0.5 # F-stat threshold below which a sample will be called XX
10+
)
11+
XY_FSTAT_THRESHOLD: float = (
12+
0.75 # F-stat threshold above which a sample will be called XY.
13+
)
14+
15+
16+
def compute_sex_check_ht(mt: hl.MatrixTable) -> hl.Table:
17+
# Filter to SNVs and biallelics
18+
# NB: We should already have filtered biallelics, but just in case.
19+
mt = mt.filter_rows(
20+
(hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]),
21+
)
22+
mt = mt.filter_cols(hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)))
23+
24+
# Filter to PASS variants only (variants with empty or missing filter set)
25+
mt = mt.filter_rows(
26+
hl.is_missing(mt.filters) | (mt.filters.length() == 0),
27+
keep=True,
28+
)
29+
impute_sex_ht = hl.impute_sex(
30+
mt.GT,
31+
male_threshold=XY_FSTAT_THRESHOLD,
32+
female_threshold=XX_FSTAT_THRESHOLD,
33+
aaf_threshold=AAF_THRESHOLD,
34+
)
35+
ht = mt.annotate_cols(**impute_sex_ht[mt.col_key]).cols()
36+
ht = ht.select(
37+
predicted_sex=(
38+
hl.case()
39+
.when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value)
40+
.when(ht.is_female, Sex.FEMALE.value)
41+
.default(Sex.MALE.value)
42+
),
43+
)
44+
ambiguous_perc = ht.aggregate(
45+
hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value),
46+
)
47+
if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
48+
msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous. Please contact the methods team to investigate the callset.'
49+
raise ValueError(msg)
50+
return ht
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import unittest
2+
from unittest.mock import patch
3+
4+
import hail as hl
5+
6+
from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht
7+
8+
TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
9+
'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
10+
)
11+
TEST_PEDIGREE = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
12+
13+
14+
class SexCheckTest(unittest.TestCase):
15+
def test_compute_sex_check_ht(self):
16+
mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
17+
ht = compute_sex_check_ht(mt)
18+
self.assertCountEqual(
19+
ht.collect(),
20+
[
21+
hl.Struct(
22+
s='ROS_006_18Y03226_D1',
23+
predicted_sex='M',
24+
),
25+
hl.Struct(
26+
s='ROS_006_18Y03227_D1',
27+
predicted_sex='M',
28+
),
29+
hl.Struct(
30+
s='ROS_006_18Y03228_D1',
31+
predicted_sex='M',
32+
),
33+
hl.Struct(
34+
s='ROS_007_19Y05919_D1',
35+
predicted_sex='M',
36+
),
37+
hl.Struct(
38+
s='ROS_007_19Y05939_D1',
39+
predicted_sex='F',
40+
),
41+
hl.Struct(
42+
s='ROS_007_19Y05987_D1',
43+
predicted_sex='M',
44+
),
45+
],
46+
)
47+
48+
def test_compute_sex_check_ht_ambiguous(self):
49+
mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
50+
with patch('v03_pipeline.lib.methods.sex_check.XY_FSTAT_THRESHOLD', 0.95):
51+
self.assertRaises(
52+
ValueError,
53+
compute_sex_check_ht,
54+
mt,
55+
)

v03_pipeline/lib/model/definitions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,7 @@ def allele_registry_gnomad_id(self) -> str:
165165
class SampleType(StrEnum):
166166
WES = 'WES'
167167
WGS = 'WGS'
168+
169+
@property
170+
def predicted_sex_from_tdr(self):
171+
return self == SampleType.WGS

v03_pipeline/lib/tasks/write_imported_callset.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
2222
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
2323
from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget
24-
from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
2524
from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
2625
with_persisted_validation_errors,
2726
)
@@ -60,17 +59,6 @@ def requires(self) -> list[luigi.Task]:
6059
),
6160
),
6261
]
63-
if (
64-
FeatureFlag.EXPECT_TDR_METRICS
65-
and not self.skip_expect_tdr_metrics
66-
and self.dataset_type.expect_tdr_metrics(
67-
self.reference_genome,
68-
)
69-
):
70-
requirements = [
71-
*requirements,
72-
self.clone(WriteTDRMetricsFilesTask),
73-
]
7462
return [
7563
*requirements,
7664
CallsetTask(self.callset_path),

v03_pipeline/lib/tasks/write_metadata_for_run_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class WriteMetadataForRunTaskTest(MockedDatarootTestCase):
2424
)
2525
@mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')
2626
@mock.patch(
27-
'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask',
27+
'v03_pipeline.lib.tasks.write_sex_check_table.WriteTDRMetricsFilesTask',
2828
)
2929
def test_write_metadata_for_run_task(
3030
self,

v03_pipeline/lib/tasks/write_sex_check_table.py

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,16 +2,37 @@
22
import hailtop.fs as hfs
33
import luigi
44

5+
from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht
56
from v03_pipeline.lib.misc.io import import_imputed_sex
6-
from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_dir
7+
from v03_pipeline.lib.model.feature_flag import FeatureFlag
8+
from v03_pipeline.lib.paths import (
9+
imported_callset_path,
10+
sex_check_table_path,
11+
tdr_metrics_dir,
12+
)
13+
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
714
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
815
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
16+
from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask
917
from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
1018

1119

20+
@luigi.util.inherits(BaseLoadingRunParams)
1221
class WriteSexCheckTableTask(BaseWriteTask):
1322
callset_path = luigi.Parameter()
1423

24+
@property
25+
def predicted_sex_from_tdr(self):
26+
# complicated enough to need a helper :/
27+
return (
28+
FeatureFlag.EXPECT_TDR_METRICS
29+
and not self.skip_expect_tdr_metrics
30+
and self.dataset_type.expect_tdr_metrics(
31+
self.reference_genome,
32+
)
33+
and self.sample_type.predicted_sex_from_tdr
34+
)
35+
1536
def output(self) -> luigi.Target:
1637
return GCSorLocalTarget(
1738
sex_check_table_path(
@@ -21,16 +42,37 @@ def output(self) -> luigi.Target:
2142
),
2243
)
2344

24-
def requires(self) -> luigi.Task:
25-
return self.clone(WriteTDRMetricsFilesTask)
45+
def requires(self) -> list[luigi.Task]:
46+
requirements = []
47+
if self.predicted_sex_from_tdr:
48+
requirements = [
49+
*requirements,
50+
self.clone(WriteTDRMetricsFilesTask),
51+
]
52+
else:
53+
requirements = [
54+
*requirements,
55+
self.clone(WriteImportedCallsetTask),
56+
]
57+
return requirements
2658

2759
def create_table(self) -> hl.Table:
2860
ht = None
29-
for tdr_metrics_file in hfs.ls(
30-
tdr_metrics_dir(self.reference_genome, self.dataset_type),
31-
):
32-
if not ht:
33-
ht = import_imputed_sex(tdr_metrics_file.path)
34-
continue
35-
ht = ht.union(import_imputed_sex(tdr_metrics_file.path))
61+
if self.predicted_sex_from_tdr:
62+
for tdr_metrics_file in hfs.ls(
63+
tdr_metrics_dir(self.reference_genome, self.dataset_type),
64+
):
65+
if not ht:
66+
ht = import_imputed_sex(tdr_metrics_file.path)
67+
continue
68+
ht = ht.union(import_imputed_sex(tdr_metrics_file.path))
69+
else:
70+
mt = hl.read_matrix_table(
71+
imported_callset_path(
72+
self.reference_genome,
73+
self.dataset_type,
74+
self.callset_path,
75+
),
76+
)
77+
ht = compute_sex_check_ht(mt)
3678
return ht

v03_pipeline/lib/tasks/write_sex_check_table_test.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,31 @@
55
import hail as hl
66
import luigi.worker
77

8-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome
8+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
99
from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_path
1010
from v03_pipeline.lib.tasks.write_sex_check_table import (
1111
WriteSexCheckTableTask,
1212
)
1313
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
1414

15+
TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
16+
'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
17+
)
18+
1519

1620
class WriteSexCheckTableTaskTest(MockedDatarootTestCase):
1721
@patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names')
1822
@patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query')
23+
@patch(
24+
'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag',
25+
)
1926
def test_snv_sex_check_table_task(
2027
self,
28+
mock_ff: Mock,
2129
mock_bq_metrics_query: Mock,
2230
mock_gen_bq_table_names: Mock,
2331
) -> None:
32+
mock_ff.EXPECT_TDR_METRICS = True
2433
mock_gen_bq_table_names.return_value = [
2534
'datarepo-7242affb.datarepo_RP_3053',
2635
'datarepo-5a72e31b.datarepo_RP_3056',
@@ -111,7 +120,12 @@ def test_snv_sex_check_table_task(
111120
write_sex_check_table = WriteSexCheckTableTask(
112121
reference_genome=ReferenceGenome.GRCh38,
113122
dataset_type=DatasetType.SNV_INDEL,
123+
sample_type=SampleType.WGS,
114124
callset_path='na',
125+
project_guids=['R0113_test_project'],
126+
project_remap_paths=['test_remap'],
127+
project_pedigree_paths=['test_pedigree'],
128+
run_id='manual__2024-04-03',
115129
)
116130
worker.add(write_sex_check_table)
117131
worker.run()
@@ -143,3 +157,61 @@ def test_snv_sex_check_table_task(
143157
),
144158
) as f:
145159
self.assertTrue('collaborator_sample_id' in f.read())
160+
161+
@patch(
162+
'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag',
163+
)
164+
def test_snv_wes_sex_check_table_task(
165+
self,
166+
mock_ff: Mock,
167+
) -> None:
168+
mock_ff.EXPECT_TDR_METRICS = True
169+
worker = luigi.worker.Worker()
170+
write_sex_check_table = WriteSexCheckTableTask(
171+
reference_genome=ReferenceGenome.GRCh38,
172+
dataset_type=DatasetType.SNV_INDEL,
173+
sample_type=SampleType.WES,
174+
callset_path=TEST_SEX_AND_RELATEDNESS_CALLSET_MT,
175+
project_guids=['R0113_test_project'],
176+
project_remap_paths=['test_remap'],
177+
project_pedigree_paths=['test_pedigree'],
178+
run_id='manual__2024-04-04',
179+
)
180+
worker.add(write_sex_check_table)
181+
worker.run()
182+
sex_check_ht = hl.read_table(
183+
sex_check_table_path(
184+
ReferenceGenome.GRCh38,
185+
DatasetType.SNV_INDEL,
186+
TEST_SEX_AND_RELATEDNESS_CALLSET_MT,
187+
),
188+
)
189+
self.assertCountEqual(
190+
sex_check_ht.collect(),
191+
[
192+
hl.Struct(
193+
s='ROS_006_18Y03226_D1',
194+
predicted_sex='M',
195+
),
196+
hl.Struct(
197+
s='ROS_006_18Y03227_D1',
198+
predicted_sex='M',
199+
),
200+
hl.Struct(
201+
s='ROS_006_18Y03228_D1',
202+
predicted_sex='M',
203+
),
204+
hl.Struct(
205+
s='ROS_007_19Y05919_D1',
206+
predicted_sex='M',
207+
),
208+
hl.Struct(
209+
s='ROS_007_19Y05939_D1',
210+
predicted_sex='F',
211+
),
212+
hl.Struct(
213+
s='ROS_007_19Y05987_D1',
214+
predicted_sex='M',
215+
),
216+
],
217+
)

0 commit comments

Comments
 (0)