1
+ from collections import defaultdict
2
+
1
3
import hail as hl
2
4
import numpy as np
3
5
@@ -10,83 +12,93 @@ def passes_relatedness_check(
10
12
sample_id : str ,
11
13
other_id : str ,
12
14
relation : Relation ,
13
- ) -> bool :
15
+ ) -> tuple [ bool , str | None ] :
14
16
# No relationship to check, return true
15
17
if other_id is None :
16
- return True
18
+ return True , None
17
19
coefficients = relatedness_check_lookup .get (
18
20
(min (sample_id , other_id ), max (sample_id , other_id )),
19
21
)
20
- if not coefficients :
21
- return False
22
- return np .allclose (
22
+ if not coefficients or not np .allclose (
23
23
coefficients ,
24
24
relation .coefficients ,
25
25
0.1 ,
26
- )
26
+ ):
27
+ return (
28
+ False ,
29
+ f'Sample { sample_id } has expected relation "{ relation .value } " to { other_id } but has coefficients { coefficients or []} ' ,
30
+ )
31
+ return True , None
27
32
28
33
29
- def passes_all_relatedness_checks ( # noqa: C901
34
+ def all_relatedness_checks ( # noqa: C901
30
35
relatedness_check_lookup : dict [tuple [str , str ], list ],
31
36
sample : Sample ,
32
- ) -> bool :
37
+ ) -> list [str ]:
38
+ failure_reasons = []
33
39
for parent_id in [sample .mother , sample .father ]:
34
- if not passes_relatedness_check (
40
+ success , reason = passes_relatedness_check (
35
41
relatedness_check_lookup ,
36
42
sample .sample_id ,
37
43
parent_id ,
38
44
Relation .PARENT ,
39
- ):
40
- return False
45
+ )
46
+ if not success :
47
+ failure_reasons .append (reason )
41
48
42
49
for grandparent_id in [
43
50
sample .maternal_grandmother ,
44
51
sample .maternal_grandfather ,
45
52
sample .paternal_grandmother ,
46
53
sample .paternal_grandfather ,
47
54
]:
48
- if not passes_relatedness_check (
55
+ success , reason = passes_relatedness_check (
49
56
relatedness_check_lookup ,
50
57
sample .sample_id ,
51
58
grandparent_id ,
52
59
Relation .GRANDPARENT ,
53
- ):
54
- return False
60
+ )
61
+ if not success :
62
+ failure_reasons .append (reason )
55
63
56
64
for sibling_id in sample .siblings :
57
- if not passes_relatedness_check (
65
+ success , reason = passes_relatedness_check (
58
66
relatedness_check_lookup ,
59
67
sample .sample_id ,
60
68
sibling_id ,
61
69
Relation .SIBLING ,
62
- ):
63
- return False
70
+ )
71
+ if not success :
72
+ failure_reasons .append (reason )
64
73
65
74
for half_sibling_id in sample .half_siblings :
66
75
# NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those
67
76
# through as well.
68
- if not passes_relatedness_check (
77
+ success1 , _ = passes_relatedness_check (
69
78
relatedness_check_lookup ,
70
79
sample .sample_id ,
71
80
half_sibling_id ,
72
- Relation .HALF_SIBLING ,
73
- ) and not passes_relatedness_check (
81
+ Relation .SIBLING ,
82
+ )
83
+ success2 , reason = passes_relatedness_check (
74
84
relatedness_check_lookup ,
75
85
sample .sample_id ,
76
86
half_sibling_id ,
77
- Relation .SIBLING ,
78
- ):
79
- return False
87
+ Relation .HALF_SIBLING ,
88
+ )
89
+ if not success1 and not success2 :
90
+ failure_reasons .append (reason )
80
91
81
92
for aunt_nephew_id in sample .aunt_nephews :
82
- if not passes_relatedness_check (
93
+ success , reason = passes_relatedness_check (
83
94
relatedness_check_lookup ,
84
95
sample .sample_id ,
85
96
aunt_nephew_id ,
86
97
Relation .AUNT_NEPHEW ,
87
- ):
88
- return False
89
- return True
98
+ )
99
+ if not success :
100
+ failure_reasons .append (reason )
101
+ return failure_reasons
90
102
91
103
92
104
def build_relatedness_check_lookup (
@@ -99,7 +111,9 @@ def build_relatedness_check_lookup(
99
111
j = remap_lookup .get (relatedness_check_ht .j , relatedness_check_ht .j ),
100
112
)
101
113
return {
102
- (r .i , r .j ): list (r .drop ('i' , 'j' ).values ())
114
+ # NB: samples are sorted in the original ibd but not necessarily
115
+ # sorted after remapping
116
+ (min (r .i , r .j ), max (r .i , r .j )): list (r .drop ('i' , 'j' ).values ())
103
117
for r in relatedness_check_ht .collect ()
104
118
}
105
119
@@ -119,43 +133,50 @@ def build_sex_check_lookup(
119
133
def get_families_failed_missing_samples (
120
134
mt : hl .MatrixTable ,
121
135
families : set [Family ],
122
- ) -> set [Family ]:
136
+ ) -> dict [Family , list [ str ] ]:
123
137
callset_samples = set (mt .cols ().s .collect ())
124
- failed_families = set ()
138
+ failed_families = {}
125
139
for family in families :
126
- if len (family .samples .keys () - callset_samples ) > 0 :
127
- failed_families .add (family )
140
+ missing_samples = family .samples .keys () - callset_samples
141
+ if len (missing_samples ) > 0 :
142
+ # NB: This is an array of a single element for consistency with
143
+ # the other checks.
144
+ failed_families [family ] = [f'Missing samples: { missing_samples } ' ]
128
145
return failed_families
129
146
130
147
131
148
def get_families_failed_relatedness_check (
132
149
families : set [Family ],
133
150
relatedness_check_ht : hl .Table ,
134
151
remap_lookup : hl .dict ,
135
- ) -> set [Family ]:
152
+ ) -> dict [Family , list [ str ] ]:
136
153
relatedness_check_lookup = build_relatedness_check_lookup (
137
154
relatedness_check_ht ,
138
155
remap_lookup ,
139
156
)
140
- failed_families = set ( )
157
+ failed_families = defaultdict ( list )
141
158
for family in families :
142
159
for sample in family .samples .values ():
143
- if not passes_all_relatedness_checks (relatedness_check_lookup , sample ):
144
- failed_families .add (family )
145
- break
146
- return failed_families
160
+ failure_reasons = all_relatedness_checks (
161
+ relatedness_check_lookup ,
162
+ sample ,
163
+ )
164
+ if failure_reasons :
165
+ failed_families [family ].extend (failure_reasons )
166
+ return dict (failed_families )
147
167
148
168
149
169
def get_families_failed_sex_check (
150
170
families : set [Family ],
151
171
sex_check_ht : hl .Table ,
152
172
remap_lookup : hl .dict ,
153
- ) -> set [Family ]:
173
+ ) -> dict [Family , list [ str ] ]:
154
174
sex_check_lookup = build_sex_check_lookup (sex_check_ht , remap_lookup )
155
- failed_families = set ( )
175
+ failed_families = defaultdict ( list )
156
176
for family in families :
157
177
for sample_id in family .samples :
158
178
if family .samples [sample_id ].sex != sex_check_lookup [sample_id ]:
159
- failed_families .add (family )
160
- break
161
- return failed_families
179
+ failed_families [family ].append (
180
+ f'Sample { sample_id } has pedigree sex { family .samples [sample_id ].sex .value } but imputed sex { sex_check_lookup [sample_id ].value } ' ,
181
+ )
182
+ return dict (failed_families )
0 commit comments