Skip to content

Commit ef64f37

Browse files
authored
Merge pull request #737 from broadinstitute/dev
Dev
2 parents 141ea1b + 35f001f commit ef64f37

25 files changed: +1647 additions, -961 deletions (lines changed)

v03_pipeline/lib/annotations/fields_test.py

Lines changed: 23 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -95,25 +95,38 @@ def test_get_formatting_fields(self, mock_vep: Mock, mock_validate: Mock) -> Non
9595
expected_fields,
9696
)
9797

98-
def test_get_sample_lookup_table_fields(
98+
def test_get_lookup_table_fields(
9999
self,
100100
) -> None:
101-
sample_lookup_ht = hl.Table.parallelize(
101+
lookup_ht = hl.Table.parallelize(
102102
[
103103
{
104104
'locus': hl.Locus('chr1', 1, ReferenceGenome.GRCh38.value),
105105
'alleles': ['A', 'C'],
106-
'ref_samples': hl.Struct(project_1={'a', 'c'}),
107-
'het_samples': hl.Struct(project_1={'b', 'd'}),
108-
'hom_samples': hl.Struct(project_1={'e', 'f'}),
106+
'project_stats': [
107+
[
108+
hl.Struct(
109+
ref_samples=2,
110+
het_samples=2,
111+
hom_samples=2,
112+
),
113+
],
114+
],
109115
},
110116
],
111117
hl.tstruct(
112118
locus=hl.tlocus(ReferenceGenome.GRCh38.value),
113119
alleles=hl.tarray(hl.tstr),
114-
ref_samples=hl.tstruct(project_1=hl.tset(hl.tstr)),
115-
het_samples=hl.tstruct(project_1=hl.tset(hl.tstr)),
116-
hom_samples=hl.tstruct(project_1=hl.tset(hl.tstr)),
120+
project_stats=hl.tarray(
121+
hl.tarray(
122+
hl.tstruct(
123+
**{
124+
field: hl.tint32
125+
for field in DatasetType.SNV_INDEL.lookup_table_fields_and_genotype_filter_fns
126+
},
127+
),
128+
),
129+
),
117130
),
118131
key=('locus', 'alleles'),
119132
globals=hl.Struct(
@@ -132,8 +145,8 @@ def test_get_sample_lookup_table_fields(
132145
list(
133146
get_fields(
134147
ht,
135-
DatasetType.SNV_INDEL.sample_lookup_table_annotation_fns,
136-
sample_lookup_ht=sample_lookup_ht,
148+
DatasetType.SNV_INDEL.lookup_table_annotation_fns,
149+
lookup_ht=lookup_ht,
137150
dataset_type=DatasetType.SNV_INDEL,
138151
reference_genome=ReferenceGenome.GRCh38,
139152
).keys(),

v03_pipeline/lib/annotations/mito.py

Lines changed: 10 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@
22

33
import hail as hl
44

5-
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
65
from v03_pipeline.lib.annotations.enums import MITOTIP_PATHOGENICITIES
76

87
MITOTIP_PATHOGENICITIES_LOOKUP = hl.dict(
@@ -68,24 +67,18 @@ def rsid(ht: hl.Table, **_: Any) -> hl.Expression:
6867

6968
def gt_stats(
7069
ht: hl.Table,
71-
sample_lookup_ht: hl.Table,
70+
lookup_ht: hl.Table,
7271
**_: Any,
7372
) -> hl.Expression:
74-
row = sample_lookup_ht[ht.key]
75-
AC_het, AC_hom, AN = 0, 0, 0 # noqa: N806
76-
for project_guid in row.ref_samples:
77-
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
78-
continue
79-
ref_samples_length = row.ref_samples[project_guid].length()
80-
heteroplasmic_samples_length = row.heteroplasmic_samples[project_guid].length()
81-
homoplasmic_samples_length = row.homoplasmic_samples[project_guid].length()
82-
AC_het += heteroplasmic_samples_length # noqa: N806
83-
AC_hom += homoplasmic_samples_length # noqa: N806
84-
AN += ( # noqa: N806
85-
ref_samples_length
86-
+ heteroplasmic_samples_length
87-
+ homoplasmic_samples_length
88-
)
73+
row = lookup_ht[ht.key]
74+
ref_samples = hl.sum(hl.flatten(row.project_stats.ref_samples))
75+
heteroplasmic_samples = hl.sum(hl.flatten(row.project_stats.heteroplasmic_samples))
76+
homoplasmic_samples = hl.sum(hl.flatten(row.project_stats.homoplasmic_samples))
77+
AC_het = heteroplasmic_samples # noqa: N806
78+
AC_hom = homoplasmic_samples # noqa: N806
79+
AN = ( # noqa: N806
80+
ref_samples + heteroplasmic_samples + homoplasmic_samples
81+
)
8982
return hl.Struct(
9083
AC_het=AC_het,
9184
AF_het=hl.float32(AC_het / AN),

v03_pipeline/lib/annotations/mito_test.py

Lines changed: 36 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import hail as hl
44

55
from v03_pipeline.lib.annotations.mito import gt_stats
6+
from v03_pipeline.lib.model import DatasetType
67

78

89
class MITOTest(unittest.TestCase):
@@ -21,51 +22,55 @@ def test_allele_count_annotations(self) -> None:
2122
),
2223
key='id',
2324
)
24-
sample_lookup_ht = hl.Table.parallelize(
25+
lookup_ht = hl.Table.parallelize(
2526
[
2627
{
2728
'id': 0,
28-
'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set()),
29-
'heteroplasmic_samples': hl.Struct(
30-
project_1={'b', 'd'},
31-
project_2=set(),
32-
),
33-
'homoplasmic_samples': hl.Struct(
34-
project_1={'e', 'f'},
35-
project_2=set(),
36-
),
29+
'project_stats': [
30+
[
31+
hl.Struct(
32+
ref_samples=2,
33+
heteroplasmic_samples=2,
34+
homoplasmic_samples=2,
35+
),
36+
],
37+
[],
38+
],
3739
},
3840
{
3941
'id': 1,
40-
'ref_samples': hl.Struct(
41-
project_1={'a', 'b', 'c', 'd', 'e', 'f'},
42-
project_2=set(),
43-
),
44-
'heteroplasmic_samples': hl.Struct(
45-
project_1=set(),
46-
project_2=set(),
47-
),
48-
'homoplasmic_samples': hl.Struct(project_1=set(), project_2=set()),
42+
'project_stats': [
43+
[
44+
hl.Struct(
45+
ref_samples=6,
46+
heteroplasmic_samples=0,
47+
homoplasmic_samples=0,
48+
),
49+
],
50+
[],
51+
],
4952
},
5053
],
5154
hl.tstruct(
5255
id=hl.tint32,
53-
ref_samples=hl.tstruct(
54-
project_1=hl.tset(hl.tstr),
55-
project_2=hl.tset(hl.tstr),
56-
),
57-
heteroplasmic_samples=hl.tstruct(
58-
project_1=hl.tset(hl.tstr),
59-
project_2=hl.tset(hl.tstr),
60-
),
61-
homoplasmic_samples=hl.tstruct(
62-
project_1=hl.tset(hl.tstr),
63-
project_2=hl.tset(hl.tstr),
56+
project_stats=hl.tarray(
57+
hl.tarray(
58+
hl.tstruct(
59+
**{
60+
field: hl.tint32
61+
for field in DatasetType.MITO.lookup_table_fields_and_genotype_filter_fns
62+
},
63+
),
64+
),
6465
),
6566
),
6667
key='id',
68+
globals=hl.Struct(
69+
project_guids=['project_1', 'project_2'],
70+
project_families={'project_1': ['a'], 'project_2': []},
71+
),
6772
)
68-
ht = ht.select(gt_stats=gt_stats(ht, sample_lookup_ht))
73+
ht = ht.select(gt_stats=gt_stats(ht, lookup_ht))
6974
self.assertCountEqual(
7075
ht.collect(),
7176
[

v03_pipeline/lib/annotations/snv_indel.py

Lines changed: 8 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -3,8 +3,6 @@
33

44
import hail as hl
55

6-
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
7-
86
N_ALT_REF = 0
97
N_ALT_HET = 1
108
N_ALT_HOM = 2
@@ -33,24 +31,16 @@ def DP(mt: hl.MatrixTable, **_: Any) -> hl.Expression: # noqa: N802
3331

3432
def gt_stats(
3533
ht: hl.Table,
36-
sample_lookup_ht: hl.Table,
34+
lookup_ht: hl.Table,
3735
**_: Any,
3836
) -> hl.Expression:
39-
row = sample_lookup_ht[ht.key]
40-
AC, AN, hom = 0, 0, 0
41-
for project_guid in row.ref_samples:
42-
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
43-
continue
44-
ref_samples_length = row.ref_samples[project_guid].length()
45-
het_samples_length = row.het_samples[project_guid].length()
46-
hom_samples_length = row.hom_samples[project_guid].length()
47-
AC += (
48-
ref_samples_length * N_ALT_REF
49-
+ het_samples_length * N_ALT_HET
50-
+ hom_samples_length * N_ALT_HOM
51-
)
52-
AN += 2 * (ref_samples_length + het_samples_length + hom_samples_length)
53-
hom += hom_samples_length
37+
row = lookup_ht[ht.key]
38+
ref_samples = hl.sum(hl.flatten(row.project_stats.ref_samples))
39+
het_samples = hl.sum(hl.flatten(row.project_stats.het_samples))
40+
hom_samples = hl.sum(hl.flatten(row.project_stats.hom_samples))
41+
AC = ref_samples * N_ALT_REF + het_samples * N_ALT_HET + hom_samples * N_ALT_HOM
42+
AN = 2 * (ref_samples + het_samples + hom_samples)
43+
hom = hom_samples
5444
return hl.Struct(
5545
AC=AC,
5646
AN=AN,

v03_pipeline/lib/annotations/snv_indel_test.py

Lines changed: 34 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
import hail as hl
44

55
from v03_pipeline.lib.annotations.snv_indel import gt_stats
6+
from v03_pipeline.lib.model import DatasetType
67

78

89
class SNVTest(unittest.TestCase):
@@ -21,66 +22,53 @@ def test_allele_count_annotations(self) -> None:
2122
),
2223
key='id',
2324
)
24-
sample_lookup_ht = hl.Table.parallelize(
25+
lookup_ht = hl.Table.parallelize(
2526
[
2627
{
2728
'id': 0,
28-
'ref_samples': hl.Struct(
29-
project_1={'a', 'c'},
30-
project_2=set(),
31-
R0607_gregor_training_project_=set(),
32-
),
33-
'het_samples': hl.Struct(
34-
project_1={'b', 'd'},
35-
project_2=set(),
36-
R0607_gregor_training_project_=set(),
37-
),
38-
'hom_samples': hl.Struct(
39-
project_1={'e', 'f'},
40-
project_2=set(),
41-
R0607_gregor_training_project_={'l', 'm'},
42-
),
29+
'project_stats': [
30+
[
31+
hl.Struct(
32+
ref_samples=2,
33+
het_samples=2,
34+
hom_samples=2,
35+
),
36+
None,
37+
],
38+
[],
39+
],
4340
},
4441
{
4542
'id': 1,
46-
'ref_samples': hl.Struct(
47-
project_1={'a', 'b', 'c', 'd', 'e', 'f'},
48-
project_2=set(),
49-
R0607_gregor_training_project_={'l', 'm'},
50-
),
51-
'het_samples': hl.Struct(
52-
project_1=set(),
53-
project_2=set(),
54-
R0607_gregor_training_project_=set(),
55-
),
56-
'hom_samples': hl.Struct(
57-
project_1=set(),
58-
project_2=set(),
59-
R0607_gregor_training_project_=set(),
60-
),
43+
'project_stats': [
44+
[
45+
hl.Struct(
46+
ref_samples=6,
47+
het_samples=0,
48+
hom_samples=0,
49+
),
50+
None,
51+
],
52+
[],
53+
],
6154
},
6255
],
6356
hl.tstruct(
6457
id=hl.tint32,
65-
ref_samples=hl.tstruct(
66-
project_1=hl.tset(hl.tstr),
67-
project_2=hl.tset(hl.tstr),
68-
R0607_gregor_training_project_=hl.tset(hl.tstr),
69-
),
70-
het_samples=hl.tstruct(
71-
project_1=hl.tset(hl.tstr),
72-
project_2=hl.tset(hl.tstr),
73-
R0607_gregor_training_project_=hl.tset(hl.tstr),
74-
),
75-
hom_samples=hl.tstruct(
76-
project_1=hl.tset(hl.tstr),
77-
project_2=hl.tset(hl.tstr),
78-
R0607_gregor_training_project_=hl.tset(hl.tstr),
58+
project_stats=hl.tarray(
59+
hl.tarray(
60+
hl.tstruct(
61+
**{
62+
field: hl.tint32
63+
for field in DatasetType.SNV_INDEL.lookup_table_fields_and_genotype_filter_fns
64+
},
65+
),
66+
),
7967
),
8068
),
8169
key='id',
8270
)
83-
ht = ht.select(gt_stats=gt_stats(ht, sample_lookup_ht))
71+
ht = ht.select(gt_stats=gt_stats(ht, lookup_ht))
8472
self.assertCountEqual(
8573
ht.collect(),
8674
[

0 commit comments

Comments (0)