Skip to content

Commit 2adaa72

Browse files
authored
Merge pull request #717 from broadinstitute/dev
main <- dev
2 parents 1987548 + adf2869 commit 2adaa72

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+284
-78
lines changed

v03_pipeline/lib/annotations/enums.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@
198198
CLINVAR_PATHOGENICITIES = [
199199
'Pathogenic',
200200
'Pathogenic/Likely_pathogenic',
201+
'Pathogenic/Likely_pathogenic/Established_risk_allele',
201202
'Pathogenic/Likely_pathogenic/Likely_risk_allele',
202203
'Pathogenic/Likely_risk_allele',
203204
'Likely_pathogenic',

v03_pipeline/lib/logger.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
'': {
2121
'level': 'INFO',
2222
'handlers': ['default'],
23-
'propagate': True,
23+
'propagate': False,
2424
},
2525
'py4j': {
2626
'level': 'CRITICAL',
Lines changed: 64 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from collections import defaultdict
2+
13
import hail as hl
24
import numpy as np
35

@@ -10,83 +12,93 @@ def passes_relatedness_check(
1012
sample_id: str,
1113
other_id: str,
1214
relation: Relation,
13-
) -> bool:
15+
) -> tuple[bool, str | None]:
1416
# No relationship to check, return true
1517
if other_id is None:
16-
return True
18+
return True, None
1719
coefficients = relatedness_check_lookup.get(
1820
(min(sample_id, other_id), max(sample_id, other_id)),
1921
)
20-
if not coefficients:
21-
return False
22-
return np.allclose(
22+
if not coefficients or not np.allclose(
2323
coefficients,
2424
relation.coefficients,
2525
0.1,
26-
)
26+
):
27+
return (
28+
False,
29+
f'Sample {sample_id} has expected relation "{relation.value}" to {other_id} but has coefficients {coefficients or []}',
30+
)
31+
return True, None
2732

2833

29-
def passes_all_relatedness_checks( # noqa: C901
34+
def all_relatedness_checks( # noqa: C901
3035
relatedness_check_lookup: dict[tuple[str, str], list],
3136
sample: Sample,
32-
) -> bool:
37+
) -> list[str]:
38+
failure_reasons = []
3339
for parent_id in [sample.mother, sample.father]:
34-
if not passes_relatedness_check(
40+
success, reason = passes_relatedness_check(
3541
relatedness_check_lookup,
3642
sample.sample_id,
3743
parent_id,
3844
Relation.PARENT,
39-
):
40-
return False
45+
)
46+
if not success:
47+
failure_reasons.append(reason)
4148

4249
for grandparent_id in [
4350
sample.maternal_grandmother,
4451
sample.maternal_grandfather,
4552
sample.paternal_grandmother,
4653
sample.paternal_grandfather,
4754
]:
48-
if not passes_relatedness_check(
55+
success, reason = passes_relatedness_check(
4956
relatedness_check_lookup,
5057
sample.sample_id,
5158
grandparent_id,
5259
Relation.GRANDPARENT,
53-
):
54-
return False
60+
)
61+
if not success:
62+
failure_reasons.append(reason)
5563

5664
for sibling_id in sample.siblings:
57-
if not passes_relatedness_check(
65+
success, reason = passes_relatedness_check(
5866
relatedness_check_lookup,
5967
sample.sample_id,
6068
sibling_id,
6169
Relation.SIBLING,
62-
):
63-
return False
70+
)
71+
if not success:
72+
failure_reasons.append(reason)
6473

6574
for half_sibling_id in sample.half_siblings:
6675
# NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those
6776
# through as well.
68-
if not passes_relatedness_check(
77+
success1, _ = passes_relatedness_check(
6978
relatedness_check_lookup,
7079
sample.sample_id,
7180
half_sibling_id,
72-
Relation.HALF_SIBLING,
73-
) and not passes_relatedness_check(
81+
Relation.SIBLING,
82+
)
83+
success2, reason = passes_relatedness_check(
7484
relatedness_check_lookup,
7585
sample.sample_id,
7686
half_sibling_id,
77-
Relation.SIBLING,
78-
):
79-
return False
87+
Relation.HALF_SIBLING,
88+
)
89+
if not success1 and not success2:
90+
failure_reasons.append(reason)
8091

8192
for aunt_nephew_id in sample.aunt_nephews:
82-
if not passes_relatedness_check(
93+
success, reason = passes_relatedness_check(
8394
relatedness_check_lookup,
8495
sample.sample_id,
8596
aunt_nephew_id,
8697
Relation.AUNT_NEPHEW,
87-
):
88-
return False
89-
return True
98+
)
99+
if not success:
100+
failure_reasons.append(reason)
101+
return failure_reasons
90102

91103

92104
def build_relatedness_check_lookup(
@@ -99,7 +111,9 @@ def build_relatedness_check_lookup(
99111
j=remap_lookup.get(relatedness_check_ht.j, relatedness_check_ht.j),
100112
)
101113
return {
102-
(r.i, r.j): list(r.drop('i', 'j').values())
114+
# NB: samples are sorted in the original ibd but not necessarily
115+
# sorted after remapping
116+
(min(r.i, r.j), max(r.i, r.j)): list(r.drop('i', 'j').values())
103117
for r in relatedness_check_ht.collect()
104118
}
105119

@@ -119,43 +133,50 @@ def build_sex_check_lookup(
119133
def get_families_failed_missing_samples(
120134
mt: hl.MatrixTable,
121135
families: set[Family],
122-
) -> set[Family]:
136+
) -> dict[Family, list[str]]:
123137
callset_samples = set(mt.cols().s.collect())
124-
failed_families = set()
138+
failed_families = {}
125139
for family in families:
126-
if len(family.samples.keys() - callset_samples) > 0:
127-
failed_families.add(family)
140+
missing_samples = family.samples.keys() - callset_samples
141+
if len(missing_samples) > 0:
142+
# NB: This is an array of a single element for consistency with
143+
# the other checks.
144+
failed_families[family] = [f'Missing samples: {missing_samples}']
128145
return failed_families
129146

130147

131148
def get_families_failed_relatedness_check(
132149
families: set[Family],
133150
relatedness_check_ht: hl.Table,
134151
remap_lookup: hl.dict,
135-
) -> set[Family]:
152+
) -> dict[Family, list[str]]:
136153
relatedness_check_lookup = build_relatedness_check_lookup(
137154
relatedness_check_ht,
138155
remap_lookup,
139156
)
140-
failed_families = set()
157+
failed_families = defaultdict(list)
141158
for family in families:
142159
for sample in family.samples.values():
143-
if not passes_all_relatedness_checks(relatedness_check_lookup, sample):
144-
failed_families.add(family)
145-
break
146-
return failed_families
160+
failure_reasons = all_relatedness_checks(
161+
relatedness_check_lookup,
162+
sample,
163+
)
164+
if failure_reasons:
165+
failed_families[family].extend(failure_reasons)
166+
return dict(failed_families)
147167

148168

149169
def get_families_failed_sex_check(
150170
families: set[Family],
151171
sex_check_ht: hl.Table,
152172
remap_lookup: hl.dict,
153-
) -> set[Family]:
173+
) -> dict[Family, list[str]]:
154174
sex_check_lookup = build_sex_check_lookup(sex_check_ht, remap_lookup)
155-
failed_families = set()
175+
failed_families = defaultdict(list)
156176
for family in families:
157177
for sample_id in family.samples:
158178
if family.samples[sample_id].sex != sex_check_lookup[sample_id]:
159-
failed_families.add(family)
160-
break
161-
return failed_families
179+
failed_families[family].append(
180+
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
181+
)
182+
return dict(failed_families)

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 88 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,17 @@
33
import hail as hl
44

55
from v03_pipeline.lib.misc.family_loading_failures import (
6+
all_relatedness_checks,
67
build_relatedness_check_lookup,
78
build_sex_check_lookup,
8-
passes_all_relatedness_checks,
9+
get_families_failed_sex_check,
910
)
10-
from v03_pipeline.lib.misc.pedigree import Sample
11+
from v03_pipeline.lib.misc.io import import_pedigree
12+
from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families
1113
from v03_pipeline.lib.model import Ploidy
1214

15+
TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
16+
1317

1418
class FamilyLoadingFailuresTest(unittest.TestCase):
1519
def test_build_relatedness_check_lookup(self):
@@ -40,7 +44,7 @@ def test_build_relatedness_check_lookup(self):
4044
hl.dict({'ROS_006_18Y03226_D1': 'remapped_id'}),
4145
),
4246
{
43-
('remapped_id', 'ROS_007_19Y05939_D1'): [
47+
('ROS_007_19Y05939_D1', 'remapped_id'): [
4448
0.0,
4549
1.0,
4650
0.0,
@@ -77,7 +81,7 @@ def test_build_sex_check_lookup(self):
7781
},
7882
)
7983

80-
def test_passes_all_relatedness_checks(self):
84+
def test_all_relatedness_checks(self):
8185
relatedness_check_lookup = {
8286
# Parent
8387
('sample_1', 'sample_2'): [
@@ -98,9 +102,8 @@ def test_passes_all_relatedness_checks(self):
98102
paternal_grandfather='sample_3',
99103
half_siblings=['sample_4'],
100104
)
101-
self.assertTrue(
102-
passes_all_relatedness_checks(relatedness_check_lookup, sample),
103-
)
105+
failure_reasons = all_relatedness_checks(relatedness_check_lookup, sample)
106+
self.assertListEqual(failure_reasons, [])
104107

105108
# Defined grandparent missing in relatedness table
106109
sample = Sample(
@@ -110,8 +113,15 @@ def test_passes_all_relatedness_checks(self):
110113
paternal_grandfather='sample_3',
111114
paternal_grandmother='sample_5',
112115
)
113-
self.assertFalse(
114-
passes_all_relatedness_checks(relatedness_check_lookup, sample),
116+
failure_reasons = all_relatedness_checks(
117+
relatedness_check_lookup,
118+
sample,
119+
)
120+
self.assertListEqual(
121+
failure_reasons,
122+
[
123+
'Sample sample_1 has expected relation "grandparent" to sample_5 but has coefficients []',
124+
],
115125
)
116126

117127
# Sibling is actually a half sibling.
@@ -126,6 +136,73 @@ def test_passes_all_relatedness_checks(self):
126136
paternal_grandfather='sample_3',
127137
siblings=['sample_4'],
128138
)
129-
self.assertFalse(
130-
passes_all_relatedness_checks(relatedness_check_lookup, sample),
139+
failure_reasons = all_relatedness_checks(
140+
relatedness_check_lookup,
141+
sample,
142+
)
143+
self.assertListEqual(
144+
failure_reasons,
145+
[
146+
'Sample sample_1 has expected relation "sibling" to sample_4 but has coefficients [0.5, 0.5, 0, 0.25]',
147+
],
148+
)
149+
150+
relatedness_check_lookup = {
151+
**relatedness_check_lookup,
152+
('sample_1', 'sample_2'): [
153+
0.5,
154+
0.5,
155+
0.5,
156+
0.5,
157+
],
158+
}
159+
sample = Sample(
160+
sex=Ploidy.FEMALE,
161+
sample_id='sample_1',
162+
mother='sample_2',
163+
paternal_grandfather='sample_3',
164+
siblings=['sample_4'],
165+
)
166+
failure_reasons = all_relatedness_checks(
167+
relatedness_check_lookup,
168+
sample,
169+
)
170+
self.assertListEqual(
171+
failure_reasons,
172+
[
173+
'Sample sample_1 has expected relation "parent" to sample_2 but has coefficients [0.5, 0.5, 0.5, 0.5]',
174+
'Sample sample_1 has expected relation "sibling" to sample_4 but has coefficients [0.5, 0.5, 0, 0.25]',
175+
],
176+
)
177+
178+
def test_get_families_failed_sex_check(self):
179+
sex_check_ht = hl.Table.parallelize(
180+
[
181+
{'s': 'ROS_006_18Y03226_D1', 'sex': 'M'},
182+
{'s': 'ROS_006_18Y03227_D1', 'sex': 'F'},
183+
{'s': 'ROS_006_18Y03228_D1', 'sex': 'F'},
184+
{'s': 'ROS_007_19Y05919_D1', 'sex': 'F'},
185+
{'s': 'ROS_007_19Y05939_D1', 'sex': 'F'},
186+
{'s': 'ROS_007_19Y05987_D1', 'sex': 'F'},
187+
],
188+
hl.tstruct(
189+
s=hl.tstr,
190+
sex=hl.tstr,
191+
),
192+
key='s',
193+
)
194+
pedigree_ht = import_pedigree(TEST_PEDIGREE_6)
195+
failed_families = get_families_failed_sex_check(
196+
parse_pedigree_ht_to_families(pedigree_ht),
197+
sex_check_ht,
198+
{},
199+
)
200+
self.assertCountEqual(
201+
failed_families.values(),
202+
[
203+
[
204+
'Sample ROS_006_18Y03226_D1 has pedigree sex F but imputed sex M',
205+
'Sample ROS_006_18Y03227_D1 has pedigree sex M but imputed sex F',
206+
],
207+
],
131208
)

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ def parsed_clnsig(ht: hl.Table):
6464
'Likely_pathogenic,_low_penetrance',
6565
'Likely_pathogenic|low_penetrance',
6666
)
67+
.replace(
68+
'/Pathogenic,_low_penetrance/Established_risk_allele',
69+
'/Established_risk_allele|low_penetrance',
70+
)
6771
.replace(
6872
'/Pathogenic,_low_penetrance',
6973
'|low_penetrance',

0 commit comments

Comments
 (0)