Skip to content

Commit c0bca33

Browse files
authored
Filter training projects (#670)
1 parent ec656fb commit c0bca33

File tree

4 files changed

+39
-36
lines changed

4 files changed

+39
-36
lines changed
v03_pipeline/lib/annotations/constants.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,5 @@
1+
PROJECTS_EXCLUDED_FROM_GT_STATS = {
2+
# TODO get final list of projects
3+
'R0607_gregor_training_project_',
4+
'R0610_gregor_training_project_',
5+
}

v03_pipeline/lib/annotations/mito.py

Lines changed: 21 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import hail as hl
44

5+
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
56
from v03_pipeline.lib.annotations.enums import MITOTIP_PATHOGENICITIES
67

78
MITOTIP_PATHOGENICITIES_LOOKUP = hl.dict(
@@ -14,31 +15,6 @@
1415
)
1516

1617

17-
def _AC_het(row: hl.StructExpression) -> hl.Int32Expression: # noqa: N802
18-
return sum(
19-
row.heteroplasmic_samples[project_guid].length()
20-
for project_guid in row.heteroplasmic_samples
21-
)
22-
23-
24-
def _AC_hom(row: hl.StructExpression) -> hl.Int32Expression: # noqa: N802
25-
return sum(
26-
row.homoplasmic_samples[project_guid].length()
27-
for project_guid in row.homoplasmic_samples
28-
)
29-
30-
31-
def _AN(row: hl.StructExpression) -> hl.Int32Expression: # noqa: N802
32-
return sum(
33-
(
34-
row.ref_samples[project_guid].length()
35-
+ row.heteroplasmic_samples[project_guid].length()
36-
+ row.homoplasmic_samples[project_guid].length()
37-
)
38-
for project_guid in row.ref_samples
39-
)
40-
41-
4218
def common_low_heteroplasmy(ht: hl.Table, **_: Any) -> hl.Expression:
4319
return ht.common_low_heteroplasmy
4420

@@ -90,12 +66,26 @@ def rsid(ht: hl.Table, **_: Any) -> hl.Expression:
9066
return ht.rsid.find(lambda x: hl.is_defined(x))
9167

9268

93-
def gt_stats(ht: hl.Table, sample_lookup_ht: hl.Table, **_: Any) -> hl.Expression:
69+
def gt_stats(
70+
ht: hl.Table,
71+
sample_lookup_ht: hl.Table,
72+
**_: Any,
73+
) -> hl.Expression:
9474
row = sample_lookup_ht[ht.key]
75+
AC_het, AC_hom, AN = 0, 0, 0 # noqa: N806
76+
for project_guid in row.ref_samples:
77+
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
78+
continue
79+
ref_samples_length = row.ref_samples[project_guid].length()
80+
heteroplasmic_samples_length = row.heteroplasmic_samples[project_guid].length()
81+
homoplasmic_samples_length = row.homoplasmic_samples[project_guid].length()
82+
AC_het += heteroplasmic_samples_length # noqa: N806
83+
AC_hom += homoplasmic_samples_length # noqa: N806
84+
AN += (ref_samples_length + heteroplasmic_samples_length + homoplasmic_samples_length) # noqa: N806
9585
return hl.Struct(
96-
AC_het=_AC_het(row),
97-
AF_het=hl.float32(_AC_het(row) / _AN(row)),
98-
AC_hom=_AC_hom(row),
99-
AF_hom=hl.float32(_AC_hom(row) / _AN(row)),
100-
AN=_AN(row),
86+
AC_het=AC_het,
87+
AF_het=hl.float32(AC_het / AN),
88+
AC_hom=AC_hom,
89+
AF_hom=hl.float32(AC_hom / AN),
90+
AN=AN,
10191
)

v03_pipeline/lib/annotations/snv_indel.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import hail as hl
55

6+
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
7+
68
N_ALT_REF = 0
79
N_ALT_HET = 1
810
N_ALT_HOM = 2
@@ -37,6 +39,8 @@ def gt_stats(
3739
row = sample_lookup_ht[ht.key]
3840
AC, AN, hom = 0, 0, 0
3941
for project_guid in row.ref_samples:
42+
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
43+
continue
4044
ref_samples_length = row.ref_samples[project_guid].length()
4145
het_samples_length = row.het_samples[project_guid].length()
4246
hom_samples_length = row.hom_samples[project_guid].length()

v03_pipeline/lib/annotations/snv_indel_test.py

Lines changed: 9 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -25,33 +25,37 @@ def test_allele_count_annotations(self) -> None:
2525
[
2626
{
2727
'id': 0,
28-
'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set()),
29-
'het_samples': hl.Struct(project_1={'b', 'd'}, project_2=set()),
30-
'hom_samples': hl.Struct(project_1={'e', 'f'}, project_2=set()),
28+
'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set(), R0607_gregor_training_project_=set()),
29+
'het_samples': hl.Struct(project_1={'b', 'd'}, project_2=set(), R0607_gregor_training_project_=set()),
30+
'hom_samples': hl.Struct(project_1={'e', 'f'}, project_2=set(), R0607_gregor_training_project_={'l', 'm'}),
3131
},
3232
{
3333
'id': 1,
3434
'ref_samples': hl.Struct(
3535
project_1={'a', 'b', 'c', 'd', 'e', 'f'},
3636
project_2=set(),
37+
R0607_gregor_training_project_={'l', 'm'},
3738
),
38-
'het_samples': hl.Struct(project_1=set(), project_2=set()),
39-
'hom_samples': hl.Struct(project_1=set(), project_2=set()),
39+
'het_samples': hl.Struct(project_1=set(), project_2=set(), R0607_gregor_training_project_=set()),
40+
'hom_samples': hl.Struct(project_1=set(), project_2=set(), R0607_gregor_training_project_=set()),
4041
},
4142
],
4243
hl.tstruct(
4344
id=hl.tint32,
4445
ref_samples=hl.tstruct(
4546
project_1=hl.tset(hl.tstr),
4647
project_2=hl.tset(hl.tstr),
48+
R0607_gregor_training_project_=hl.tset(hl.tstr),
4749
),
4850
het_samples=hl.tstruct(
4951
project_1=hl.tset(hl.tstr),
5052
project_2=hl.tset(hl.tstr),
53+
R0607_gregor_training_project_=hl.tset(hl.tstr),
5154
),
5255
hom_samples=hl.tstruct(
5356
project_1=hl.tset(hl.tstr),
5457
project_2=hl.tset(hl.tstr),
58+
R0607_gregor_training_project_=hl.tset(hl.tstr),
5559
),
5660
),
5761
key='id',

0 commit comments

Comments
 (0)