Skip to content

Commit 6ce3b54

Browse files
authored
Merge branch 'dev' into benb/lookup_table_refactor
2 parents 29a8f42 + d305d5f commit 6ce3b54

27 files changed

+384
-96
lines changed

v03_pipeline/lib/methods/sex_check.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import hail as hl
22

3-
from v03_pipeline.lib.model import Ploidy
3+
from v03_pipeline.lib.model import Sex
44

55
IMPUTE_SEX_ANNOTATIONS = [
66
'is_female',
@@ -13,6 +13,7 @@
1313

1414
AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
1515
AAF_THRESHOLD: float = 0.05 # Alternate allele frequency threshold for `hl.impute_sex`.
16+
BIALLELIC: int = 2
1617
XX_FSTAT_THRESHOLD: float = (
1718
0.5 # F-stat threshold below which a sample will be called XX
1819
)
@@ -24,7 +25,9 @@
2425
def call_sex(mt: hl.MatrixTable) -> hl.Table:
2526
# Filter to SNVs and biallelics
2627
# NB: We should already have filtered biallelics, but just in case.
27-
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
28+
mt = mt.filter_rows(
29+
(hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]),
30+
)
2831

2932
# Filter to PASS variants only (variants with empty or missing filter set)
3033
mt = mt.filter_rows(
@@ -41,13 +44,13 @@ def call_sex(mt: hl.MatrixTable) -> hl.Table:
4144
ht = ht.annotate(
4245
sex=(
4346
hl.case()
44-
.when(hl.is_missing(ht.is_female), Ploidy.UNKNOWN.value)
45-
.when(ht.is_female, Ploidy.FEMALE.value)
46-
.default(Ploidy.MALE.value)
47+
.when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value)
48+
.when(ht.is_female, Sex.FEMALE.value)
49+
.default(Sex.MALE.value)
4750
),
4851
)
4952
ambiguous_perc = ht.aggregate(
50-
hl.agg.fraction(ht.sex == Ploidy.UNKNOWN.value),
53+
hl.agg.fraction(ht.sex == Sex.UNKNOWN.value),
5154
)
5255
if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
5356
msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous. Please contact the methods team to investigate the callset.'

v03_pipeline/lib/misc/family_loading_failures.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55

66
from v03_pipeline.lib.misc.pedigree import Family, Relation, Sample
7-
from v03_pipeline.lib.model import Ploidy
7+
from v03_pipeline.lib.model import Sex
88

99

1010
def passes_relatedness_check(
@@ -22,7 +22,7 @@ def passes_relatedness_check(
2222
if not coefficients or not np.allclose(
2323
coefficients,
2424
relation.coefficients,
25-
0.1,
25+
atol=0.1,
2626
):
2727
return (
2828
False,
@@ -121,13 +121,13 @@ def build_relatedness_check_lookup(
121121
def build_sex_check_lookup(
122122
sex_check_ht: hl.Table,
123123
remap_lookup: hl.dict,
124-
) -> dict[str, Ploidy]:
124+
) -> dict[str, Sex]:
125125
# Build sex check lookup
126126
sex_check_ht = sex_check_ht.key_by(
127127
s=remap_lookup.get(sex_check_ht.s, sex_check_ht.s),
128128
)
129129
sex_check_ht = sex_check_ht.select('sex')
130-
return {r.s: Ploidy(r.sex) for r in sex_check_ht.collect()}
130+
return {r.s: Sex(r.sex) for r in sex_check_ht.collect()}
131131

132132

133133
def get_families_failed_missing_samples(

v03_pipeline/lib/misc/family_loading_failures_test.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
)
1111
from v03_pipeline.lib.misc.io import import_pedigree
1212
from v03_pipeline.lib.misc.pedigree import Sample, parse_pedigree_ht_to_families
13-
from v03_pipeline.lib.model import Ploidy
13+
from v03_pipeline.lib.model import Sex
1414

1515
TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv'
1616

@@ -72,12 +72,12 @@ def test_build_sex_check_lookup(self):
7272
self.assertEqual(
7373
build_sex_check_lookup(ht, hl.dict({'ROS_006_18Y03226_D1': 'remapped_id'})),
7474
{
75-
'remapped_id': Ploidy.MALE,
76-
'ROS_006_18Y03227_D1': Ploidy.MALE,
77-
'ROS_006_18Y03228_D1': Ploidy.MALE,
78-
'ROS_007_19Y05919_D1': Ploidy.MALE,
79-
'ROS_007_19Y05939_D1': Ploidy.FEMALE,
80-
'ROS_007_19Y05987_D1': Ploidy.MALE,
75+
'remapped_id': Sex.MALE,
76+
'ROS_006_18Y03227_D1': Sex.MALE,
77+
'ROS_006_18Y03228_D1': Sex.MALE,
78+
'ROS_007_19Y05919_D1': Sex.MALE,
79+
'ROS_007_19Y05939_D1': Sex.FEMALE,
80+
'ROS_007_19Y05987_D1': Sex.MALE,
8181
},
8282
)
8383

@@ -96,7 +96,7 @@ def test_all_relatedness_checks(self):
9696
('sample_1', 'sample_4'): [0.25, 0.5, 0.25, 0.5],
9797
}
9898
sample = Sample(
99-
sex=Ploidy.FEMALE,
99+
sex=Sex.FEMALE,
100100
sample_id='sample_1',
101101
mother='sample_2',
102102
paternal_grandfather='sample_3',
@@ -107,7 +107,7 @@ def test_all_relatedness_checks(self):
107107

108108
# Defined grandparent missing in relatedness table
109109
sample = Sample(
110-
sex=Ploidy.FEMALE,
110+
sex=Sex.FEMALE,
111111
sample_id='sample_1',
112112
mother='sample_2',
113113
paternal_grandfather='sample_3',
@@ -130,7 +130,7 @@ def test_all_relatedness_checks(self):
130130
('sample_1', 'sample_4'): [0.5, 0.5, 0, 0.25],
131131
}
132132
sample = Sample(
133-
sex=Ploidy.FEMALE,
133+
sex=Sex.FEMALE,
134134
sample_id='sample_1',
135135
mother='sample_2',
136136
paternal_grandfather='sample_3',
@@ -157,7 +157,7 @@ def test_all_relatedness_checks(self):
157157
],
158158
}
159159
sample = Sample(
160-
sex=Ploidy.FEMALE,
160+
sex=Sex.FEMALE,
161161
sample_id='sample_1',
162162
mother='sample_2',
163163
paternal_grandfather='sample_3',

v03_pipeline/lib/misc/pedigree.py

Lines changed: 34 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import hail as hl
66

7-
from v03_pipeline.lib.model import Ploidy
7+
from v03_pipeline.lib.model import Sex
88

99

1010
class Relation(Enum):
@@ -28,7 +28,7 @@ def coefficients(self):
2828
@dataclass
2929
class Sample:
3030
sample_id: str
31-
sex: Ploidy
31+
sex: Sex
3232
mother: str = None
3333
father: str = None
3434
maternal_grandmother: str = None
@@ -54,6 +54,23 @@ def is_aunt_nephew(self: 'Sample', other: 'Sample') -> bool:
5454
and (self.paternal_grandfather == other.father)
5555
)
5656

57+
def is_in_direct_lineage(self: 'Sample', other: 'Sample') -> bool:
58+
return self.sample_id in {
59+
other.mother,
60+
other.father,
61+
other.maternal_grandmother,
62+
other.maternal_grandfather,
63+
other.paternal_grandmother,
64+
other.paternal_grandfather,
65+
} or other.sample_id in {
66+
self.mother,
67+
self.father,
68+
self.maternal_grandmother,
69+
self.maternal_grandfather,
70+
self.paternal_grandmother,
71+
self.paternal_grandfather,
72+
}
73+
5774

5875
@dataclass
5976
class Family:
@@ -69,7 +86,7 @@ def parse_direct_lineage(rows: list[hl.Struct]) -> dict[str, Sample]: # noqa: C
6986
for row in rows:
7087
samples[row.s] = Sample(
7188
sample_id=row.s,
72-
sex=Ploidy(row.sex),
89+
sex=Sex(row.sex),
7390
mother=row.maternal_s,
7491
father=row.paternal_s,
7592
)
@@ -107,56 +124,34 @@ def parse_collateral_lineage(
107124
# A sample_i that is siblings with sample_j, will list sample_j as as sibling, but
108125
# sample_j will not list sample_i as a sibling. Relationships only appear in the
109126
# ibd table a single time, so we only need to check the pairing once.
110-
for sample_i, sample_j in itertools.combinations(samples.keys(), 2):
111-
# If other sample is already related, continue
112-
if sample_j in {
113-
samples[sample_i].mother,
114-
samples[sample_i].father,
115-
samples[sample_i].maternal_grandmother,
116-
samples[sample_i].maternal_grandfather,
117-
samples[sample_i].paternal_grandmother,
118-
samples[sample_i].paternal_grandfather,
119-
}:
127+
for sample_i, sample_j in itertools.combinations(samples.values(), 2):
128+
# If sample is already related from direct relationships, continue
129+
if sample_i.is_in_direct_lineage(sample_j):
120130
continue
121131

122132
# If both parents are identified and the same, samples are siblings.
123133
if (
124-
samples[sample_i].mother
125-
and samples[sample_i].father
126-
and (samples[sample_i].mother == samples[sample_j].mother)
127-
and (samples[sample_i].father == samples[sample_j].father)
134+
sample_i.mother
135+
and sample_i.father
136+
and (sample_i.mother == sample_j.mother)
137+
and (sample_i.father == sample_j.father)
128138
):
129-
samples[sample_i].siblings.append(
130-
sample_j,
131-
)
139+
sample_i.siblings.append(sample_j.sample_id)
132140
continue
133141

134142
# If only a single parent is identified and the same, samples are half siblings
135-
if (
136-
samples[sample_i].mother
137-
and samples[sample_i].mother == samples[sample_j].mother
138-
) or (
139-
samples[sample_i].father
140-
and samples[sample_i].father == samples[sample_j].father
143+
if (sample_i.mother and sample_i.mother == sample_j.mother) or (
144+
sample_i.father and sample_i.father == sample_j.father
141145
):
142-
samples[sample_i].half_siblings.append(
143-
sample_j,
144-
)
146+
sample_i.half_siblings.append(sample_j.sample_id)
145147
continue
146148

147149
# If either set of one's grandparents is identified and equal to the other's parents,
148150
# they're aunt/uncle related
149-
# NB: because we will only check an i, j pair of samples a single time, (itertools.combinations)
151+
# NB: because we will only check an i, j pair of samples a single time, (itertools.combinations)
150152
# we need to check both grandparents_i == parents_j and parents_i == grandparents_j.
151-
# fmt: off
152-
if (
153-
samples[sample_i].is_aunt_nephew(samples[sample_j])
154-
or samples[sample_j].is_aunt_nephew(samples[sample_i])
155-
):
156-
samples[sample_i].aunt_nephews.append(
157-
sample_j,
158-
)
159-
# fmt: on
153+
if sample_i.is_aunt_nephew(sample_j) or sample_j.is_aunt_nephew(sample_i):
154+
sample_i.aunt_nephews.append(sample_j.sample_id)
160155
return samples
161156

162157
@classmethod

0 commit comments

Comments
 (0)