Skip to content

Commit 2615bb9

Browse files
committed
make ploidy part of family validation
1 parent aba2b94 commit 2615bb9

7 files changed

+281
-258
lines changed

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,44 @@ def get_families_failed_sex_check(
174174
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
175175
)
176176
return dict(failed_families)
177+
178+
179+
def get_families_failed_imputed_sex_ploidy(
180+
mt: hl.MatrixTable,
181+
sex_check_ht: hl.Table,
182+
families: set[Family],
183+
) -> dict[Family, str]:
184+
mt = mt.select_cols(
185+
discrepant=(
186+
(
187+
# All calls are diploid or missing but the sex is Male
188+
hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
189+
& (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value)
190+
)
191+
| (
192+
# At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
193+
hl.agg.any(~mt.GT.is_diploid())
194+
& hl.literal(
195+
{
196+
Sex.FEMALE.value,
197+
Sex.X0.value,
198+
Sex.XYY.value,
199+
Sex.XXY.value,
200+
Sex.XXX.value,
201+
},
202+
).contains(sex_check_ht[mt.s].predicted_sex)
203+
)
204+
),
205+
)
206+
discrepant_samples = mt.aggregate_cols(
207+
hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
208+
)
209+
failed_families = {}
210+
for family in families:
211+
discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples
212+
if discrepant_loadable_samples:
213+
sorted_discrepant_samples = sorted(discrepant_loadable_samples)
214+
failed_families[
215+
family
216+
] = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
217+
return failed_families

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
all_relatedness_checks,
77
build_relatedness_check_lookup,
88
build_sex_check_lookup,
9+
get_families_failed_imputed_sex_ploidy,
910
get_families_failed_sex_check,
1011
)
1112
from v03_pipeline.lib.misc.io import import_pedigree
1213
from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families
1314
from v03_pipeline.lib.model import Sex
1415

16+
TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
1517
TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
1618

1719

@@ -250,3 +252,189 @@ def test_get_families_failed_sex_check(self):
250252
],
251253
],
252254
)
255+
256+
def test_get_families_failed_imputed_sex_ploidy(self) -> None:
257+
female_sample = 'HG00731_1'
258+
male_sample_1 = 'HG00732_1'
259+
male_sample_2 = 'HG00732_1'
260+
x0_sample = 'NA20899_1'
261+
xxy_sample = 'NA20889_1'
262+
xyy_sample = 'NA20891_1'
263+
xxx_sample = 'NA20892_1'
264+
265+
sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
266+
families = {
267+
Family(
268+
family_guid='',
269+
samples={
270+
female_sample: Sample(female_sample, Sex.FEMALE),
271+
male_sample_1: Sample(male_sample_1, Sex.MALE),
272+
male_sample_2: Sample(male_sample_2, Sex.MALE),
273+
x0_sample: Sample(x0_sample, Sex.X0),
274+
xxy_sample: Sample(xxy_sample, Sex.XXY),
275+
xyy_sample: Sample(xyy_sample, Sex.XYY),
276+
xxx_sample: Sample(xxx_sample, Sex.XXX),
277+
},
278+
),
279+
}
280+
281+
# All calls on X chromosome are valid
282+
mt = (
283+
hl.MatrixTable.from_parts(
284+
rows={
285+
'locus': [
286+
hl.Locus(
287+
contig='chrX',
288+
position=1,
289+
reference_genome='GRCh38',
290+
),
291+
],
292+
},
293+
cols={
294+
's': [
295+
female_sample,
296+
male_sample_1,
297+
x0_sample,
298+
xxy_sample,
299+
xyy_sample,
300+
xxx_sample,
301+
],
302+
},
303+
entries={
304+
'GT': [
305+
[
306+
hl.Call(alleles=[0, 0], phased=False),
307+
hl.Call(alleles=[0], phased=False),
308+
hl.Call(alleles=[0, 0], phased=False), # X0
309+
hl.Call(alleles=[0, 0], phased=False), # XXY
310+
hl.Call(alleles=[0, 0], phased=False), # XYY
311+
hl.Call(alleles=[0, 0], phased=False), # XXX
312+
],
313+
],
314+
},
315+
)
316+
.key_rows_by('locus')
317+
.key_cols_by('s')
318+
)
319+
failed_families = get_families_failed_imputed_sex_ploidy(
320+
mt,
321+
sex_check_ht,
322+
families,
323+
)
324+
self.assertDictEqual(failed_families, {})
325+
326+
# All calls on Y chromosome are valid
327+
mt = (
328+
hl.MatrixTable.from_parts(
329+
rows={
330+
'locus': [
331+
hl.Locus(
332+
contig='chrY',
333+
position=1,
334+
reference_genome='GRCh38',
335+
),
336+
],
337+
},
338+
cols={
339+
's': [
340+
female_sample,
341+
male_sample_1,
342+
x0_sample,
343+
xxy_sample,
344+
xyy_sample,
345+
xxx_sample,
346+
],
347+
},
348+
entries={
349+
'GT': [
350+
[
351+
hl.missing(hl.tcall),
352+
hl.Call(alleles=[0], phased=False),
353+
hl.missing(hl.tcall), # X0
354+
hl.Call(alleles=[0, 0], phased=False), # XXY
355+
hl.Call(alleles=[0, 0], phased=False), # XYY
356+
hl.missing(hl.tcall), # XXX
357+
],
358+
],
359+
},
360+
)
361+
.key_rows_by('locus')
362+
.key_cols_by('s')
363+
)
364+
failed_families = get_families_failed_imputed_sex_ploidy(
365+
mt,
366+
sex_check_ht,
367+
families,
368+
)
369+
self.assertDictEqual(failed_families, {})
370+
371+
# Invalid X chromosome case
372+
mt = (
373+
hl.MatrixTable.from_parts(
374+
rows={
375+
'locus': [
376+
hl.Locus(
377+
contig='chrX',
378+
position=1,
379+
reference_genome='GRCh38',
380+
),
381+
],
382+
},
383+
cols={
384+
's': [
385+
female_sample,
386+
male_sample_1,
387+
male_sample_2,
388+
x0_sample,
389+
xxy_sample,
390+
xyy_sample,
391+
xxx_sample,
392+
],
393+
},
394+
entries={
395+
'GT': [
396+
[
397+
hl.Call(alleles=[0], phased=False), # invalid Female call
398+
hl.Call(alleles=[0], phased=False), # valid Male call
399+
hl.missing(hl.tcall), # invalid Male call
400+
hl.Call(alleles=[0], phased=False), # invalid X0 call
401+
hl.Call(alleles=[0], phased=False), # invalid XXY call
402+
hl.missing(hl.tcall), # valid XYY call
403+
hl.Call(alleles=[0, 0], phased=False), # valid XXX call
404+
],
405+
],
406+
},
407+
)
408+
.key_rows_by('locus')
409+
.key_cols_by('s')
410+
)
411+
failed_families = get_families_failed_imputed_sex_ploidy(
412+
mt,
413+
sex_check_ht,
414+
families,
415+
)
416+
self.assertCountEqual(
417+
failed_families.values(),
418+
[
419+
"Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']",
420+
],
421+
)
422+
423+
# Invalid X chromosome case, but only discrepant family samples are reported
424+
families = {
425+
Family(
426+
family_guid='',
427+
samples={female_sample: Sample(female_sample, Sex.FEMALE)},
428+
),
429+
}
430+
failed_families = get_families_failed_imputed_sex_ploidy(
431+
mt,
432+
sex_check_ht,
433+
families,
434+
)
435+
self.assertCountEqual(
436+
failed_families.values(),
437+
[
438+
"Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1']",
439+
],
440+
)

v03_pipeline/lib/misc/validation.py

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,10 @@
22

33
import hail as hl
44

5-
from v03_pipeline.lib.misc.pedigree import Family
65
from v03_pipeline.lib.model import (
76
DatasetType,
87
ReferenceGenome,
98
SampleType,
10-
Sex,
119
)
1210

1311
AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
@@ -129,56 +127,6 @@ def _validate_field(
129127
raise SeqrValidationError(msg)
130128

131129

132-
def validate_imputed_sex_ploidy(
133-
mt: hl.MatrixTable,
134-
# NB: sex_check_ht will be undefined if sex checking is disabled for the run
135-
sex_check_ht: hl.Table | None = None,
136-
pedigree_families: set[Family] | None = None,
137-
**_: Any,
138-
) -> None:
139-
if not sex_check_ht:
140-
return
141-
mt = mt.select_cols(
142-
discrepant=(
143-
(
144-
# All calls are diploid or missing but the sex is Male
145-
hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))
146-
& (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value)
147-
)
148-
| (
149-
# At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX
150-
hl.agg.any(~mt.GT.is_diploid())
151-
& hl.literal(
152-
{
153-
Sex.FEMALE.value,
154-
Sex.X0.value,
155-
Sex.XYY.value,
156-
Sex.XXY.value,
157-
Sex.XXX.value,
158-
},
159-
).contains(sex_check_ht[mt.s].predicted_sex)
160-
)
161-
),
162-
)
163-
discrepant_samples = mt.aggregate_cols(
164-
hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
165-
)
166-
loading_samples = (
167-
{sample_id for family in pedigree_families for sample_id in family.samples}
168-
if pedigree_families
169-
else set()
170-
)
171-
discrepant_loading_samples = discrepant_samples & loading_samples
172-
173-
if discrepant_loading_samples:
174-
sorted_discrepant_samples = sorted(discrepant_loading_samples)
175-
msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
176-
raise SeqrValidationError(
177-
msg,
178-
{'imputed_sex_ploidy_failures': sorted_discrepant_samples},
179-
)
180-
181-
182130
def validate_sample_type(
183131
mt: hl.MatrixTable,
184132
coding_and_noncoding_variants_ht: hl.Table,

0 commit comments

Comments
 (0)