Skip to content

Commit 09c54da

Browse files
authored
Merge pull request #1061 from broadinstitute/dev
Dev
2 parents bb2af2d + a7701e6 commit 09c54da

18 files changed

+601
-327
lines changed

v03_pipeline/lib/methods/sex_check.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import hail as hl
2+
3+
from v03_pipeline.lib.model import Sex
4+
5+
AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
6+
AAF_THRESHOLD: float = 0.05 # Alternate allele frequency threshold for `hl.impute_sex`.
7+
BIALLELIC: int = 2
8+
XX_FSTAT_THRESHOLD: float = (
9+
0.5 # F-stat threshold below which a sample will be called XX
10+
)
11+
XY_FSTAT_THRESHOLD: float = (
12+
0.75 # F-stat threshold above which a sample will be called XY.
13+
)
14+
15+
16+
def compute_sex_check_ht(mt: hl.MatrixTable) -> hl.Table:
17+
# Filter to SNVs and biallelics
18+
# NB: We should already have filtered biallelics, but just in case.
19+
mt = mt.filter_rows(
20+
(hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]),
21+
)
22+
mt = mt.filter_cols(hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)))
23+
24+
# Filter to PASS variants only (variants with empty or missing filter set)
25+
mt = mt.filter_rows(
26+
hl.is_missing(mt.filters) | (mt.filters.length() == 0),
27+
keep=True,
28+
)
29+
impute_sex_ht = hl.impute_sex(
30+
mt.GT,
31+
male_threshold=XY_FSTAT_THRESHOLD,
32+
female_threshold=XX_FSTAT_THRESHOLD,
33+
aaf_threshold=AAF_THRESHOLD,
34+
)
35+
ht = mt.annotate_cols(**impute_sex_ht[mt.col_key]).cols()
36+
ht = ht.select(
37+
predicted_sex=(
38+
hl.case()
39+
.when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value)
40+
.when(ht.is_female, Sex.FEMALE.value)
41+
.default(Sex.MALE.value)
42+
),
43+
)
44+
ambiguous_perc = ht.aggregate(
45+
hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value),
46+
)
47+
if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
48+
msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous. Please contact the methods team to investigate the callset.'
49+
raise ValueError(msg)
50+
return ht
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
import unittest
2+
from unittest.mock import patch
3+
4+
import hail as hl
5+
6+
from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht
7+
8+
TEST_SEX_AND_RELATEDNESS_CALLSET_MT = (
9+
'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt'
10+
)
11+
TEST_PEDIGREE = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
12+
13+
14+
class SexCheckTest(unittest.TestCase):
15+
def test_compute_sex_check_ht(self):
16+
mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
17+
ht = compute_sex_check_ht(mt)
18+
self.assertCountEqual(
19+
ht.collect(),
20+
[
21+
hl.Struct(
22+
s='ROS_006_18Y03226_D1',
23+
predicted_sex='M',
24+
),
25+
hl.Struct(
26+
s='ROS_006_18Y03227_D1',
27+
predicted_sex='M',
28+
),
29+
hl.Struct(
30+
s='ROS_006_18Y03228_D1',
31+
predicted_sex='M',
32+
),
33+
hl.Struct(
34+
s='ROS_007_19Y05919_D1',
35+
predicted_sex='M',
36+
),
37+
hl.Struct(
38+
s='ROS_007_19Y05939_D1',
39+
predicted_sex='F',
40+
),
41+
hl.Struct(
42+
s='ROS_007_19Y05987_D1',
43+
predicted_sex='M',
44+
),
45+
],
46+
)
47+
48+
def test_compute_sex_check_ht_ambiguous(self):
49+
mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT)
50+
with patch('v03_pipeline.lib.methods.sex_check.XY_FSTAT_THRESHOLD', 0.95):
51+
self.assertRaises(
52+
ValueError,
53+
compute_sex_check_ht,
54+
mt,
55+
)

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,44 @@ def get_families_failed_sex_check(
174174
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
175175
)
176176
return dict(failed_families)
177+
178+
179+
def get_families_failed_imputed_sex_ploidy(
180+
families: set[Family],
181+
mt: hl.MatrixTable,
182+
sex_check_ht: hl.Table,
183+
) -> dict[Family, str]:
184+
mt = mt.select_cols(
185+
discrepant=(
186+
(
187+
# All calls are diploid or missing but the sex is Male
188+
hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
189+
& (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value)
190+
)
191+
| (
192+
# At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
193+
hl.agg.any(~mt.GT.is_diploid())
194+
& hl.literal(
195+
{
196+
Sex.FEMALE.value,
197+
Sex.X0.value,
198+
Sex.XYY.value,
199+
Sex.XXY.value,
200+
Sex.XXX.value,
201+
},
202+
).contains(sex_check_ht[mt.s].predicted_sex)
203+
)
204+
),
205+
)
206+
discrepant_samples = mt.aggregate_cols(
207+
hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
208+
)
209+
failed_families = defaultdict(list)
210+
for family in families:
211+
discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples
212+
if discrepant_loadable_samples:
213+
sorted_discrepant_samples = sorted(discrepant_loadable_samples)
214+
failed_families[family].append(
215+
f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}',
216+
)
217+
return failed_families

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
all_relatedness_checks,
77
build_relatedness_check_lookup,
88
build_sex_check_lookup,
9+
get_families_failed_imputed_sex_ploidy,
910
get_families_failed_sex_check,
1011
)
1112
from v03_pipeline.lib.misc.io import import_pedigree
1213
from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
1314
from v03_pipeline.lib.model import Sex
1415

16+
TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
1517
TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
1618

1719

@@ -250,3 +252,193 @@ def test_get_families_failed_sex_check(self):
250252
],
251253
],
252254
)
255+
256+
def test_get_families_failed_imputed_sex_ploidy(self) -> None:
257+
female_sample = 'HG00731_1'
258+
male_sample_1 = 'HG00732_1'
259+
male_sample_2 = 'HG00732_1'
260+
x0_sample = 'NA20899_1'
261+
xxy_sample = 'NA20889_1'
262+
xyy_sample = 'NA20891_1'
263+
xxx_sample = 'NA20892_1'
264+
265+
sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
266+
families = {
267+
Family(
268+
family_guid='',
269+
samples={
270+
female_sample: Sample(female_sample, Sex.FEMALE),
271+
male_sample_1: Sample(male_sample_1, Sex.MALE),
272+
male_sample_2: Sample(male_sample_2, Sex.MALE),
273+
x0_sample: Sample(x0_sample, Sex.X0),
274+
xxy_sample: Sample(xxy_sample, Sex.XXY),
275+
xyy_sample: Sample(xyy_sample, Sex.XYY),
276+
xxx_sample: Sample(xxx_sample, Sex.XXX),
277+
},
278+
),
279+
}
280+
281+
# All calls on X chromosome are valid
282+
mt = (
283+
hl.MatrixTable.from_parts(
284+
rows={
285+
'locus': [
286+
hl.Locus(
287+
contig='chrX',
288+
position=1,
289+
reference_genome='GRCh38',
290+
),
291+
],
292+
},
293+
cols={
294+
's': [
295+
female_sample,
296+
male_sample_1,
297+
x0_sample,
298+
xxy_sample,
299+
xyy_sample,
300+
xxx_sample,
301+
],
302+
},
303+
entries={
304+
'GT': [
305+
[
306+
hl.Call(alleles=[0, 0], phased=False),
307+
hl.Call(alleles=[0], phased=False),
308+
hl.Call(alleles=[0, 0], phased=False), # X0
309+
hl.Call(alleles=[0, 0], phased=False), # XXY
310+
hl.Call(alleles=[0, 0], phased=False), # XYY
311+
hl.Call(alleles=[0, 0], phased=False), # XXX
312+
],
313+
],
314+
},
315+
)
316+
.key_rows_by('locus')
317+
.key_cols_by('s')
318+
)
319+
failed_families = get_families_failed_imputed_sex_ploidy(
320+
families,
321+
mt,
322+
sex_check_ht,
323+
)
324+
self.assertDictEqual(failed_families, {})
325+
326+
# All calls on Y chromosome are valid
327+
mt = (
328+
hl.MatrixTable.from_parts(
329+
rows={
330+
'locus': [
331+
hl.Locus(
332+
contig='chrY',
333+
position=1,
334+
reference_genome='GRCh38',
335+
),
336+
],
337+
},
338+
cols={
339+
's': [
340+
female_sample,
341+
male_sample_1,
342+
x0_sample,
343+
xxy_sample,
344+
xyy_sample,
345+
xxx_sample,
346+
],
347+
},
348+
entries={
349+
'GT': [
350+
[
351+
hl.missing(hl.tcall),
352+
hl.Call(alleles=[0], phased=False),
353+
hl.missing(hl.tcall), # X0
354+
hl.Call(alleles=[0, 0], phased=False), # XXY
355+
hl.Call(alleles=[0, 0], phased=False), # XYY
356+
hl.missing(hl.tcall), # XXX
357+
],
358+
],
359+
},
360+
)
361+
.key_rows_by('locus')
362+
.key_cols_by('s')
363+
)
364+
failed_families = get_families_failed_imputed_sex_ploidy(
365+
families,
366+
mt,
367+
sex_check_ht,
368+
)
369+
self.assertDictEqual(failed_families, {})
370+
371+
# Invalid X chromosome case
372+
mt = (
373+
hl.MatrixTable.from_parts(
374+
rows={
375+
'locus': [
376+
hl.Locus(
377+
contig='chrX',
378+
position=1,
379+
reference_genome='GRCh38',
380+
),
381+
],
382+
},
383+
cols={
384+
's': [
385+
female_sample,
386+
male_sample_1,
387+
male_sample_2,
388+
x0_sample,
389+
xxy_sample,
390+
xyy_sample,
391+
xxx_sample,
392+
],
393+
},
394+
entries={
395+
'GT': [
396+
[
397+
hl.Call(alleles=[0], phased=False), # invalid Female call
398+
hl.Call(alleles=[0], phased=False), # valid Male call
399+
hl.missing(hl.tcall), # invalid Male call
400+
hl.Call(alleles=[0], phased=False), # invalid X0 call
401+
hl.Call(alleles=[0], phased=False), # invalid XXY call
402+
hl.missing(hl.tcall), # valid XYY call
403+
hl.Call(alleles=[0, 0], phased=False), # valid XXX call
404+
],
405+
],
406+
},
407+
)
408+
.key_rows_by('locus')
409+
.key_cols_by('s')
410+
)
411+
failed_families = get_families_failed_imputed_sex_ploidy(
412+
families,
413+
mt,
414+
sex_check_ht,
415+
)
416+
self.assertCountEqual(
417+
failed_families.values(),
418+
[
419+
[
420+
"Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
421+
],
422+
],
423+
)
424+
425+
# Invalid X chromosome case, but only discrepant family samples are reported
426+
families = {
427+
Family(
428+
family_guid='',
429+
samples={female_sample: Sample(female_sample, Sex.FEMALE)},
430+
),
431+
}
432+
failed_families = get_families_failed_imputed_sex_ploidy(
433+
families,
434+
mt,
435+
sex_check_ht,
436+
)
437+
self.assertCountEqual(
438+
failed_families.values(),
439+
[
440+
[
441+
"Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']",
442+
],
443+
],
444+
)

v03_pipeline/lib/misc/terra_data_repository.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010
from v03_pipeline.lib.misc.requests import requests_retry_session
1111

1212
BIGQUERY_METRICS = [
13-
'collaborator_sample_id',
1413
'predicted_sex',
1514
'contamination_rate',
1615
'percent_bases_at_20x',
16+
'collaborator_sample_id',
1717
'mean_coverage',
1818
]
1919
BIGQUERY_RESOURCE = 'bigquery'
@@ -63,9 +63,18 @@ def bq_metrics_query(bq_table_name: str) -> google.cloud.bigquery.table.RowItera
6363
msg = f'{bq_table_name} does not match expected pattern'
6464
raise ValueError(msg)
6565
client = bigquery.Client()
66+
67+
# not all columns are guaranteed to be present, coalesce if missing
68+
table_ddl = next(
69+
client.query_and_wait(
70+
f"""
71+
SELECT ddl FROM `{bq_table_name}`.INFORMATION_SCHEMA.TABLES where table_name='sample';
72+
""", # noqa: S608
73+
),
74+
)[0]
75+
metrics = [(m if m in table_ddl else f'NULL AS {m}') for m in BIGQUERY_METRICS]
6676
return client.query_and_wait(
6777
f"""
68-
SELECT {','.join(BIGQUERY_METRICS)}
69-
FROM `{bq_table_name}.sample`
70-
""", # noqa: S608
78+
SELECT {','.join(metrics)} FROM `{bq_table_name}.sample`;
79+
""", # noqa: S608
7180
)

0 commit comments

Comments
 (0)