Skip to content

Commit 10b4c02

Browse files
authored
Merge pull request #1057 from broadinstitute/relax-ploidy-validation
move ploidy checks from callset integrity validation to family/sample validation
2 parents 36b8f5e + 3a6c725 commit 10b4c02

10 files changed

+334
-298
lines changed

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,44 @@ def get_families_failed_sex_check(
174174
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
175175
)
176176
return dict(failed_families)
177+
178+
179+
def get_families_failed_imputed_sex_ploidy(
    families: set[Family],
    mt: hl.MatrixTable,
    sex_check_ht: hl.Table,
) -> dict[Family, list[str]]:
    """Report families containing samples whose genotype ploidy disagrees with their imputed sex.

    A sample is flagged as discrepant when either:
      * its predicted sex is Male but every ``GT`` call in ``mt`` is diploid or missing, or
      * its predicted sex is Female, X0, XYY, XXY or XXX but at least one ``GT`` call
        is not diploid.

    Args:
        families: Families whose samples should be checked.
        mt: MatrixTable with a ``GT`` entry field and an ``s`` column field.
        sex_check_ht: Table indexed by sample id (``mt.s``) exposing a
            ``predicted_sex`` field.

    Returns:
        A plain dict mapping each failing family to a one-element list holding the
        failure message, which names the family's discrepant samples in sorted
        order.  Families with no discrepant samples are absent.
    """
    # Bind the joined field once so both branches below share a single lookup.
    predicted_sex = sex_check_ht[mt.s].predicted_sex
    mt = mt.select_cols(
        discrepant=(
            (
                # All calls are diploid or missing but the sex is Male
                hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
                & (predicted_sex == Sex.MALE.value)
            )
            | (
                # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
                hl.agg.any(~mt.GT.is_diploid())
                & hl.literal(
                    {
                        Sex.FEMALE.value,
                        Sex.X0.value,
                        Sex.XYY.value,
                        Sex.XXY.value,
                        Sex.XXX.value,
                    },
                ).contains(predicted_sex)
            )
        ),
    )
    discrepant_samples = mt.aggregate_cols(
        hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
    )
    failed_families = defaultdict(list)
    for family in families:
        # Only samples that belong to this family are reported against it.
        discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples
        if discrepant_loadable_samples:
            sorted_discrepant_samples = sorted(discrepant_loadable_samples)
            failed_families[family].append(
                f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}',
            )
    # Hand back a plain dict (matching get_families_failed_sex_check) so that a
    # caller's lookup of a passing family cannot silently insert an empty entry.
    return dict(failed_families)

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 192 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
all_relatedness_checks,
77
build_relatedness_check_lookup,
88
build_sex_check_lookup,
9+
get_families_failed_imputed_sex_ploidy,
910
get_families_failed_sex_check,
1011
)
1112
from v03_pipeline.lib.misc.io import import_pedigree
1213
from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
1314
from v03_pipeline.lib.model import Sex
1415

16+
TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
1517
TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
1618

1719

@@ -250,3 +252,193 @@ def test_get_families_failed_sex_check(self):
250252
],
251253
],
252254
)
255+
256+
def test_get_families_failed_imputed_sex_ploidy(self) -> None:
    """Exercise get_families_failed_imputed_sex_ploidy on single-locus matrices.

    Scenarios, in order:
      1. chrX calls for which no family is expected to fail.
      2. chrY calls for which no family is expected to fail.
      3. chrX calls where several samples are expected to be reported.
      4. The same matrix as (3) with a family holding only the female sample,
         so only that sample is expected in the message.
    """
    female_sample = 'HG00731_1'
    male_sample_1 = 'HG00732_1'
    # NOTE(review): male_sample_2 carries the same ID as male_sample_1, so the
    # family's samples dict ends up with one entry for it and the third matrix
    # has two columns keyed by that ID.  Looks like a copy/paste slip - confirm
    # against the sex-check fixture before switching to a distinct sample.
    male_sample_2 = 'HG00732_1'
    x0_sample = 'NA20899_1'
    xxy_sample = 'NA20889_1'
    xyy_sample = 'NA20891_1'
    xxx_sample = 'NA20892_1'

    # The three call shapes used throughout the scenarios.
    haploid = hl.Call(alleles=[0], phased=False)
    diploid = hl.Call(alleles=[0, 0], phased=False)
    no_call = hl.missing(hl.tcall)

    def build_mt(contig: str, samples: list[str], calls: list) -> hl.MatrixTable:
        # One locus at position 1 of `contig`, one GT call per sample.
        return (
            hl.MatrixTable.from_parts(
                rows={
                    'locus': [
                        hl.Locus(
                            contig=contig,
                            position=1,
                            reference_genome='GRCh38',
                        ),
                    ],
                },
                cols={'s': samples},
                entries={'GT': [calls]},
            )
            .key_rows_by('locus')
            .key_cols_by('s')
        )

    sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
    families = {
        Family(
            family_guid='',
            samples={
                female_sample: Sample(female_sample, Sex.FEMALE),
                male_sample_1: Sample(male_sample_1, Sex.MALE),
                male_sample_2: Sample(male_sample_2, Sex.MALE),
                x0_sample: Sample(x0_sample, Sex.X0),
                xxy_sample: Sample(xxy_sample, Sex.XXY),
                xyy_sample: Sample(xyy_sample, Sex.XYY),
                xxx_sample: Sample(xxx_sample, Sex.XXX),
            },
        ),
    }
    six_samples = [
        female_sample,
        male_sample_1,
        x0_sample,
        xxy_sample,
        xyy_sample,
        xxx_sample,
    ]

    # All calls on X chromosome are valid
    mt = build_mt(
        'chrX',
        six_samples,
        [
            diploid,
            haploid,
            diploid,  # X0
            diploid,  # XXY
            diploid,  # XYY
            diploid,  # XXX
        ],
    )
    failed_families = get_families_failed_imputed_sex_ploidy(
        families,
        mt,
        sex_check_ht,
    )
    self.assertDictEqual(failed_families, {})

    # All calls on Y chromosome are valid
    mt = build_mt(
        'chrY',
        six_samples,
        [
            no_call,
            haploid,
            no_call,  # X0
            diploid,  # XXY
            diploid,  # XYY
            no_call,  # XXX
        ],
    )
    failed_families = get_families_failed_imputed_sex_ploidy(
        families,
        mt,
        sex_check_ht,
    )
    self.assertDictEqual(failed_families, {})

    # Invalid X chromosome case
    mt = build_mt(
        'chrX',
        [
            female_sample,
            male_sample_1,
            male_sample_2,
            x0_sample,
            xxy_sample,
            xyy_sample,
            xxx_sample,
        ],
        [
            haploid,  # invalid Female call
            haploid,  # valid Male call
            no_call,  # invalid Male call
            haploid,  # invalid X0 call
            haploid,  # invalid XXY call
            no_call,  # valid XYY call
            diploid,  # valid XXX call
        ],
    )
    failed_families = get_families_failed_imputed_sex_ploidy(
        families,
        mt,
        sex_check_ht,
    )
    self.assertCountEqual(
        failed_families.values(),
        [
            [
                "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
            ],
        ],
    )

    # Invalid X chromosome case, but only discrepant family samples are reported
    families = {
        Family(
            family_guid='',
            samples={female_sample: Sample(female_sample, Sex.FEMALE)},
        ),
    }
    failed_families = get_families_failed_imputed_sex_ploidy(
        families,
        mt,
        sex_check_ht,
    )
    self.assertCountEqual(
        failed_families.values(),
        [
            [
                "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']",
            ],
        ],
    )

v03_pipeline/lib/misc/validation.py

Lines changed: 0 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
DatasetType,
77
ReferenceGenome,
88
SampleType,
9-
Sex,
109
)
1110

1211
AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
@@ -148,48 +147,6 @@ def _validate_field(
148147
raise SeqrValidationError(msg)
149148

150149

151-
def validate_imputed_sex_ploidy(
    mt: hl.MatrixTable,
    # NB: sex_check_ht will be undefined if sex checking is disabled for the run
    sex_check_ht: hl.Table | None = None,
    **_: Any,
) -> None:
    """Raise if any sample's genotype ploidy disagrees with its imputed sex.

    A sample is discrepant when its predicted sex is Male but every ``GT`` call
    is diploid or missing, or when its predicted sex is Female, X0, XYY, XXY or
    XXX but at least one ``GT`` call is not diploid.

    Args:
        mt: MatrixTable with a ``GT`` entry field and an ``s`` column field.
        sex_check_ht: Table indexed by sample id exposing ``predicted_sex``.
            When it is None the check is skipped entirely.
        **_: Extra keyword arguments are accepted and ignored.

    Raises:
        SeqrValidationError: if at least one discrepant sample is found.  The
            message lists the first ten sorted sample ids; the full sorted list
            is passed under the ``imputed_sex_ploidy_failures`` key.
    """
    # Explicit None test: the parameter is optional, and relying on the
    # truthiness of a Table object obscures that intent.
    if sex_check_ht is None:
        return
    # Bind the joined field once so both branches below share a single lookup.
    predicted_sex = sex_check_ht[mt.s].predicted_sex
    mt = mt.select_cols(
        discrepant=(
            (
                # All calls are diploid or missing but the sex is Male
                hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
                & (predicted_sex == Sex.MALE.value)
            )
            | (
                # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
                hl.agg.any(~mt.GT.is_diploid())
                & hl.literal(
                    {
                        Sex.FEMALE.value,
                        Sex.X0.value,
                        Sex.XYY.value,
                        Sex.XXY.value,
                        Sex.XXX.value,
                    },
                ).contains(predicted_sex)
            )
        ),
    )
    discrepant_samples = mt.aggregate_cols(
        hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
    )
    if discrepant_samples:
        sorted_discrepant_samples = sorted(discrepant_samples)
        msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
        raise SeqrValidationError(
            msg,
            {'imputed_sex_ploidy_failures': sorted_discrepant_samples},
        )
191-
192-
193150
def validate_sample_type(
194151
mt: hl.MatrixTable,
195152
coding_and_noncoding_variants_ht: hl.Table,

0 commit comments

Comments
 (0)