Skip to content

Commit 239e869

Browse files
committed
Merge branch 'benb/lookup_table_refactor' of github.com:broadinstitute/seqr-loading-pipelines into benb/migration2
2 parents fadb07e + 29a8f42 commit 239e869

File tree

181 files changed

+1779
-586
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

181 files changed

+1779
-586
lines changed

v03_pipeline/lib/methods/relatedness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def call_relatedness(
3030
# - brew install lz4
3131
# - CXXFLAGS='-I/opt/homebrew/include/' HAIL_COMPILE_NATIVES=1 make -C hail install
3232
# Hail issue here: https://discuss.hail.is/t/noclassdeffounderror-could-not-initialize-class-is-hail-methods-ibsffi/2453
33-
kin_ht = hl.identity_by_descent(mt, maf=mt.info.AF[1], min=0.10, max=1.0)
33+
kin_ht = hl.identity_by_descent(mt, maf=mt.info.AF[0], min=0.10, max=1.0)
3434
kin_ht = kin_ht.key_by('i', 'j')
3535
return kin_ht.select(
3636
ibd0=kin_ht.ibd.Z0,

v03_pipeline/lib/misc/family_entries.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -55,16 +55,16 @@ def compute_callset_family_entries_ht(
5555

5656

5757
def globalize_ids(ht: hl.Table) -> hl.Table:
58-
row = ht.take(1)
59-
has_family_entries = row and len(row[0].family_entries) > 0
58+
row = ht.take(1)[0] if ht.count() > 0 else None
59+
has_family_entries = row and len(row.family_entries) > 0
6060
ht = ht.annotate_globals(
6161
family_guids=(
62-
[fe[0].family_guid for fe in row[0].family_entries]
62+
[fe[0].family_guid for fe in row.family_entries]
6363
if has_family_entries
6464
else hl.empty_array(hl.tstr)
6565
),
6666
family_samples=(
67-
{fe[0].family_guid: [e.s for e in fe] for fe in row[0].family_entries}
67+
{fe[0].family_guid: [e.s for e in fe] for fe in row.family_entries}
6868
if has_family_entries
6969
else hl.empty_dict(hl.tstr, hl.tarray(hl.tstr))
7070
),
@@ -93,11 +93,10 @@ def deglobalize_ids(ht: hl.Table) -> hl.Table:
9393
return ht.drop('family_guids', 'family_samples')
9494

9595

96-
def remove_new_callset_family_guids(
96+
def remove_family_guids(
9797
ht: hl.Table,
98-
family_guids: list[str],
98+
family_guids: hl.SetExpression,
9999
) -> hl.Table:
100-
family_guids = hl.set(family_guids)
101100
# Remove families from the existing project table structure (both the entries arrays and the globals are mutated)
102101
family_indexes_to_keep = hl.array(
103102
hl.enumerate(ht.globals.family_guids)
@@ -142,7 +141,7 @@ def join_family_entries_hts(ht: hl.Table, callset_ht: hl.Table) -> hl.Table:
142141
.default(ht.family_entries.extend(ht.family_entries_1))
143142
),
144143
)
145-
# NB: transume because we want to drop the *_1 fields, but preserve other globals
144+
# NB: transmute because we want to drop the *_1 fields, but preserve other globals
146145
return ht.transmute_globals(
147146
family_guids=ht.family_guids.extend(ht.family_guids_1),
148147
family_samples=hl.dict(

v03_pipeline/lib/misc/family_entries_test.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
deglobalize_ids,
88
globalize_ids,
99
join_family_entries_hts,
10-
remove_new_callset_family_guids,
10+
remove_family_guids,
1111
)
1212
from v03_pipeline.lib.model import DatasetType
1313

@@ -202,7 +202,7 @@ def test_globalize_and_deglobalize(self) -> None:
202202
],
203203
)
204204

205-
def test_remove_new_callset_family_guids(self) -> None:
205+
def test_remove_family_guids(self) -> None:
206206
family_entries_ht = hl.Table.parallelize(
207207
[
208208
{
@@ -244,7 +244,10 @@ def test_remove_new_callset_family_guids(self) -> None:
244244
},
245245
),
246246
)
247-
family_entries_ht = remove_new_callset_family_guids(family_entries_ht, ['012'])
247+
family_entries_ht = remove_family_guids(
248+
family_entries_ht,
249+
hl.set(['012']),
250+
)
248251
self.assertCountEqual(
249252
family_entries_ht.globals.collect(),
250253
[
@@ -326,7 +329,7 @@ def test_remove_new_callset_family_guids_all_families(self) -> None:
326329
},
327330
),
328331
)
329-
ht = remove_new_callset_family_guids(family_entries_ht, ['012', '123'])
332+
ht = remove_family_guids(family_entries_ht, hl.set(['012', '123']))
330333
self.assertCountEqual(
331334
ht.globals.collect(),
332335
[hl.Struct(family_guids=[], family_samples={})],
Lines changed: 64 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from collections import defaultdict
2+
13
import hail as hl
24
import numpy as np
35

@@ -10,83 +12,93 @@ def passes_relatedness_check(
1012
sample_id: str,
1113
other_id: str,
1214
relation: Relation,
13-
) -> bool:
15+
) -> tuple[bool, str | None]:
1416
# No relationship to check, return true
1517
if other_id is None:
16-
return True
18+
return True, None
1719
coefficients = relatedness_check_lookup.get(
1820
(min(sample_id, other_id), max(sample_id, other_id)),
1921
)
20-
if not coefficients:
21-
return False
22-
return np.allclose(
22+
if not coefficients or not np.allclose(
2323
coefficients,
2424
relation.coefficients,
2525
0.1,
26-
)
26+
):
27+
return (
28+
False,
29+
f'Sample {sample_id} has expected relation "{relation.value}" to {other_id} but has coefficients {coefficients or []}',
30+
)
31+
return True, None
2732

2833

29-
def passes_all_relatedness_checks( # noqa: C901
34+
def all_relatedness_checks( # noqa: C901
3035
relatedness_check_lookup: dict[tuple[str, str], list],
3136
sample: Sample,
32-
) -> bool:
37+
) -> list[str]:
38+
failure_reasons = []
3339
for parent_id in [sample.mother, sample.father]:
34-
if not passes_relatedness_check(
40+
success, reason = passes_relatedness_check(
3541
relatedness_check_lookup,
3642
sample.sample_id,
3743
parent_id,
3844
Relation.PARENT,
39-
):
40-
return False
45+
)
46+
if not success:
47+
failure_reasons.append(reason)
4148

4249
for grandparent_id in [
4350
sample.maternal_grandmother,
4451
sample.maternal_grandfather,
4552
sample.paternal_grandmother,
4653
sample.paternal_grandfather,
4754
]:
48-
if not passes_relatedness_check(
55+
success, reason = passes_relatedness_check(
4956
relatedness_check_lookup,
5057
sample.sample_id,
5158
grandparent_id,
5259
Relation.GRANDPARENT,
53-
):
54-
return False
60+
)
61+
if not success:
62+
failure_reasons.append(reason)
5563

5664
for sibling_id in sample.siblings:
57-
if not passes_relatedness_check(
65+
success, reason = passes_relatedness_check(
5866
relatedness_check_lookup,
5967
sample.sample_id,
6068
sibling_id,
6169
Relation.SIBLING,
62-
):
63-
return False
70+
)
71+
if not success:
72+
failure_reasons.append(reason)
6473

6574
for half_sibling_id in sample.half_siblings:
6675
# NB: A "half sibling" parsed from the pedigree may actually be a sibling, so we allow those
6776
# through as well.
68-
if not passes_relatedness_check(
77+
success1, _ = passes_relatedness_check(
6978
relatedness_check_lookup,
7079
sample.sample_id,
7180
half_sibling_id,
72-
Relation.HALF_SIBLING,
73-
) and not passes_relatedness_check(
81+
Relation.SIBLING,
82+
)
83+
success2, reason = passes_relatedness_check(
7484
relatedness_check_lookup,
7585
sample.sample_id,
7686
half_sibling_id,
77-
Relation.SIBLING,
78-
):
79-
return False
87+
Relation.HALF_SIBLING,
88+
)
89+
if not success1 and not success2:
90+
failure_reasons.append(reason)
8091

8192
for aunt_nephew_id in sample.aunt_nephews:
82-
if not passes_relatedness_check(
93+
success, reason = passes_relatedness_check(
8394
relatedness_check_lookup,
8495
sample.sample_id,
8596
aunt_nephew_id,
8697
Relation.AUNT_NEPHEW,
87-
):
88-
return False
89-
return True
98+
)
99+
if not success:
100+
failure_reasons.append(reason)
101+
return failure_reasons
90102

91103

92104
def build_relatedness_check_lookup(
@@ -99,7 +111,9 @@ def build_relatedness_check_lookup(
99111
j=remap_lookup.get(relatedness_check_ht.j, relatedness_check_ht.j),
100112
)
101113
return {
102-
(r.i, r.j): list(r.drop('i', 'j').values())
114+
# NB: samples are sorted in the original ibd but not necessarily
115+
# sorted after remapping
116+
(min(r.i, r.j), max(r.i, r.j)): list(r.drop('i', 'j').values())
103117
for r in relatedness_check_ht.collect()
104118
}
105119

@@ -119,43 +133,50 @@ def build_sex_check_lookup(
119133
def get_families_failed_missing_samples(
120134
mt: hl.MatrixTable,
121135
families: set[Family],
122-
) -> set[Family]:
136+
) -> dict[Family, list[str]]:
123137
callset_samples = set(mt.cols().s.collect())
124-
failed_families = set()
138+
failed_families = {}
125139
for family in families:
126-
if len(family.samples.keys() - callset_samples) > 0:
127-
failed_families.add(family)
140+
missing_samples = family.samples.keys() - callset_samples
141+
if len(missing_samples) > 0:
142+
# NB: This is an array of a single element for consistency with
143+
# the other checks.
144+
failed_families[family] = [f'Missing samples: {missing_samples}']
128145
return failed_families
129146

130147

131148
def get_families_failed_relatedness_check(
132149
families: set[Family],
133150
relatedness_check_ht: hl.Table,
134151
remap_lookup: hl.dict,
135-
) -> set[Family]:
152+
) -> dict[Family, list[str]]:
136153
relatedness_check_lookup = build_relatedness_check_lookup(
137154
relatedness_check_ht,
138155
remap_lookup,
139156
)
140-
failed_families = set()
157+
failed_families = defaultdict(list)
141158
for family in families:
142159
for sample in family.samples.values():
143-
if not passes_all_relatedness_checks(relatedness_check_lookup, sample):
144-
failed_families.add(family)
145-
break
146-
return failed_families
160+
failure_reasons = all_relatedness_checks(
161+
relatedness_check_lookup,
162+
sample,
163+
)
164+
if failure_reasons:
165+
failed_families[family].extend(failure_reasons)
166+
return dict(failed_families)
147167

148168

149169
def get_families_failed_sex_check(
150170
families: set[Family],
151171
sex_check_ht: hl.Table,
152172
remap_lookup: hl.dict,
153-
) -> set[Family]:
173+
) -> dict[Family, list[str]]:
154174
sex_check_lookup = build_sex_check_lookup(sex_check_ht, remap_lookup)
155-
failed_families = set()
175+
failed_families = defaultdict(list)
156176
for family in families:
157177
for sample_id in family.samples:
158178
if family.samples[sample_id].sex != sex_check_lookup[sample_id]:
159-
failed_families.add(family)
160-
break
161-
return failed_families
179+
failed_families[family].append(
180+
f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}',
181+
)
182+
return dict(failed_families)

0 commit comments

Comments
 (0)