1
1
import hail as hl
2
2
3
- from v03_pipeline .lib .model import DatasetType
3
+ from v03_pipeline .lib .model import DatasetType , SampleType
4
4
5
5
6
6
def compute_callset_lookup_ht (
7
7
dataset_type : DatasetType ,
8
8
mt : hl .MatrixTable ,
9
9
project_guid : str ,
10
+ sample_type : SampleType ,
10
11
) -> hl .Table :
11
12
sample_id_to_family_guid = hl .dict (
12
13
{
@@ -38,7 +39,7 @@ def compute_callset_lookup_ht(
38
39
],
39
40
),
40
41
).rows ()
41
- ht = globalize_ids (ht , project_guid )
42
+ ht = globalize_ids (ht , project_guid , sample_type )
42
43
return ht .annotate (
43
44
project_stats = [
44
45
# Set a family to missing if all values are 0
@@ -52,13 +53,14 @@ def compute_callset_lookup_ht(
52
53
)
53
54
54
55
55
- def globalize_ids (ht : hl .Table , project_guid : str ) -> hl .Table :
56
+ def globalize_ids (ht : hl .Table , project_guid : str , sample_type : SampleType ) -> hl .Table :
56
57
row = ht .take (1 )[0 ] if ht .count () > 0 else None
57
58
has_project_stats = row and len (row .project_stats ) > 0
59
+ project_key = (project_guid , sample_type .value )
58
60
ht = ht .annotate_globals (
59
- project_guids = [ project_guid ],
61
+ project_sample_types = [ project_key ],
60
62
project_families = (
61
- {project_guid : [fs .family_guid for fs in ps ] for ps in row .project_stats }
63
+ {project_key : [fs .family_guid for fs in ps ] for ps in row .project_stats }
62
64
if has_project_stats
63
65
else hl .empty_dict (hl .tstr , hl .tarray (hl .tstr ))
64
66
),
@@ -73,14 +75,16 @@ def globalize_ids(ht: hl.Table, project_guid: str) -> hl.Table:
73
75
def remove_family_guids (
74
76
ht : hl .Table ,
75
77
project_guid : str ,
78
+ sample_type : SampleType ,
76
79
family_guids : hl .SetExpression ,
77
80
) -> hl .Table :
78
- if project_guid not in hl .eval (ht .globals .project_families ):
81
+ project_key = (project_guid , sample_type .value )
82
+ if project_key not in hl .eval (ht .globals .project_families ):
79
83
return ht
80
- project_i = ht .project_guids .index (project_guid )
84
+ project_i = ht .project_sample_types .index (project_key )
81
85
family_indexes_to_keep = hl .eval (
82
86
hl .array (
83
- hl .enumerate (ht .globals .project_families [project_guid ])
87
+ hl .enumerate (ht .globals .project_families [project_key ])
84
88
.filter (lambda item : ~ family_guids .contains (item [1 ]))
85
89
.map (lambda item : item [0 ]),
86
90
),
@@ -112,11 +116,11 @@ def remove_family_guids(
112
116
ht .project_families .items ().map (
113
117
lambda item : (
114
118
hl .if_else (
115
- item [0 ] != project_guid ,
119
+ item [0 ] != project_key ,
116
120
item ,
117
121
(
118
122
item [0 ],
119
- ht .project_families [project_guid ].filter (
123
+ ht .project_families [project_key ].filter (
120
124
lambda family_guid : ~ family_guids .contains (family_guid ),
121
125
),
122
126
),
@@ -130,13 +134,15 @@ def remove_family_guids(
130
134
def remove_project (
131
135
ht : hl .Table ,
132
136
project_guid : str ,
137
+ sample_type : SampleType ,
133
138
) -> hl .Table :
134
- existing_project_guids = hl .eval (ht .globals .project_guids )
135
- if project_guid not in existing_project_guids :
139
+ existing_projects = hl .eval (ht .globals .project_sample_types )
140
+ project_key = (project_guid , sample_type .value )
141
+ if project_key not in existing_projects :
136
142
return ht
137
143
project_indexes_to_keep = hl .eval (
138
- hl .enumerate (existing_project_guids )
139
- .filter (lambda item : item [1 ] != project_guid )
144
+ hl .enumerate (existing_projects )
145
+ .filter (lambda item : item [1 ] != project_key )
140
146
.map (lambda item : item [0 ]),
141
147
)
142
148
ht = ht .annotate (
@@ -149,11 +155,11 @@ def remove_project(
149
155
)
150
156
ht = ht .filter (hl .any (ht .project_stats .map (hl .is_defined )))
151
157
return ht .annotate_globals (
152
- project_guids = ht .project_guids .filter (
153
- lambda p : p != project_guid ,
158
+ project_sample_types = ht .project_sample_types .filter (
159
+ lambda p : p != project_key ,
154
160
),
155
161
project_families = hl .dict (
156
- ht .project_families .items ().filter (lambda item : item [0 ] != project_guid ),
162
+ ht .project_families .items ().filter (lambda item : item [0 ] != project_key ),
157
163
),
158
164
)
159
165
@@ -163,8 +169,8 @@ def join_lookup_hts(
163
169
callset_ht : hl .Table ,
164
170
) -> hl .Table :
165
171
ht = ht .join (callset_ht , 'outer' )
166
- project_guid = ht .project_guids_1 [0 ]
167
- ht_project_i = ht .project_guids .index (project_guid )
172
+ project_key = ht .project_sample_types_1 [0 ]
173
+ ht_project_i = ht .project_sample_types .index (project_key )
168
174
ht = ht .select (
169
175
# We have 6 unique cases here.
170
176
# 1) The project has not been loaded before, the row is missing
@@ -183,14 +189,14 @@ def join_lookup_hts(
183
189
hl .case ()
184
190
.when (
185
191
(hl .is_missing (ht_project_i ) & hl .is_missing (ht .project_stats )),
186
- ht .project_guids .map (
192
+ ht .project_sample_types .map (
187
193
lambda _ : hl .missing (ht .project_stats .dtype .element_type ),
188
194
).extend (ht .project_stats_1 ),
189
195
)
190
196
.when (
191
197
(hl .is_missing (ht_project_i ) & hl .is_missing (ht .project_stats_1 )),
192
198
ht .project_stats .extend (
193
- ht .project_guids_1 .map (
199
+ ht .project_sample_types_1 .map (
194
200
lambda _ : hl .missing (ht .project_stats_1 .dtype .element_type ),
195
201
),
196
202
),
@@ -201,7 +207,7 @@ def join_lookup_hts(
201
207
)
202
208
.when (
203
209
hl .is_missing (ht .project_stats ),
204
- hl .enumerate (ht .project_guids ).starmap (
210
+ hl .enumerate (ht .project_sample_types ).starmap (
205
211
# Add a missing project_stats value for every loaded project,
206
212
# then add a missing value for every family for "this project"
207
213
# and extend the new families on the right.
@@ -230,7 +236,7 @@ def join_lookup_hts(
230
236
i != ht_project_i ,
231
237
ps ,
232
238
ps .extend (
233
- ht .project_families_1 [project_guid ].map (
239
+ ht .project_families_1 [project_key ].map (
234
240
lambda _ : hl .missing (
235
241
ht .project_stats .dtype .element_type .element_type ,
236
242
),
@@ -256,26 +262,26 @@ def join_lookup_hts(
256
262
),
257
263
)
258
264
# NB: re-derive these expressions here, because `ht` was reassigned by the select above and the earlier references were built from the previous table.
259
- project_guid = ht .project_guids_1 [0 ]
260
- ht_project_i = ht .project_guids .index (project_guid )
265
+ project_key = ht .project_sample_types_1 [0 ]
266
+ ht_project_i = ht .project_sample_types .index (project_key )
261
267
return ht .transmute_globals (
262
- project_guids = hl .if_else (
268
+ project_sample_types = hl .if_else (
263
269
hl .is_missing (ht_project_i ),
264
- ht .project_guids .extend (ht .project_guids_1 ),
265
- ht .project_guids ,
270
+ ht .project_sample_types .extend (ht .project_sample_types_1 ),
271
+ ht .project_sample_types ,
266
272
),
267
273
project_families = hl .if_else (
268
274
hl .is_missing (ht_project_i ),
269
275
hl .dict (ht .project_families .items ().extend (ht .project_families_1 .items ())),
270
276
hl .dict (
271
277
ht .project_families .items ().map (
272
278
lambda item : hl .if_else (
273
- item [0 ] != project_guid ,
279
+ item [0 ] != project_key ,
274
280
item ,
275
281
(
276
282
item [0 ],
277
- ht .project_families [project_guid ].extend (
278
- ht .project_families_1 [project_guid ],
283
+ ht .project_families [project_key ].extend (
284
+ ht .project_families_1 [project_key ],
279
285
),
280
286
),
281
287
),
0 commit comments