Skip to content

Commit 474d0ca

Browse files
authored
Reference Data Update Type Equality Check (#789)
* Finish validity check test * ruff * update dbnsfp field * More types * more types * ugh * twiddle it back * update type * more tweaks * lint * fix floats * decompose * ruff format * Update compare_globals_test.py
1 parent 82140db commit 474d0ca

File tree

72 files changed

+109
-36
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+109
-36
lines changed

v03_pipeline/lib/reference_data/compare_globals.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ class Globals:
2121
paths: dict[str, str]
2222
versions: dict[str, str]
2323
enums: dict[str, dict[str, list[str]]]
24-
selects: dict[str, set[str]]
24+
selects: dict[str, dict[str, hl.dtype]]
2525

2626
def __getitem__(self, name: str):
2727
return getattr(self, name)
@@ -50,7 +50,11 @@ def from_dataset_configs(
5050
dataset_ht = dataset_ht.transmute(
5151
**get_enum_select_fields(dataset_ht, dataset_config),
5252
)
53-
selects[dataset] = set(dataset_ht.row) - set(dataset_ht.key)
53+
selects[dataset] = {
54+
k: v.dtype
55+
for k, v in dict(dataset_ht.row).items()
56+
if k not in set(dataset_ht.key)
57+
}
5458
return cls(paths, versions, enums, selects)
5559

5660
@classmethod
@@ -69,32 +73,52 @@ def from_ht(
6973
if dataset in ht.row:
7074
# NB: handle an edge case (mito high constraint) where we annotate a bool from the reference dataset collection
7175
selects[dataset] = (
72-
set(ht[dataset])
76+
{k: v.dtype for k, v in dict(ht[dataset]).items()}
7377
if isinstance(ht[dataset], hl.StructExpression)
74-
else set()
78+
else {}
7579
)
7680
return cls(paths, versions, enums, selects)
7781

7882

83+
def validate_selects_types(
84+
ht1_globals: Globals,
85+
ht2_globals: Globals,
86+
dataset: str,
87+
) -> None:
88+
# Assert that all shared annotations have identical types
89+
shared_selects = (
90+
ht1_globals['selects'][dataset].keys()
91+
& ht2_globals['selects'].get(dataset).keys()
92+
)
93+
mismatched_select_types = [
94+
(select, ht2_globals['selects'][dataset][select])
95+
for select in shared_selects
96+
if (
97+
ht1_globals['selects'][dataset][select]
98+
!= ht2_globals['selects'][dataset][select]
99+
)
100+
]
101+
if mismatched_select_types:
102+
msg = f'Unexpected field types detected in {dataset}: {mismatched_select_types}'
103+
raise ValueError(msg)
104+
105+
79106
def get_datasets_to_update(
80107
ht1_globals: Globals,
81108
ht2_globals: Globals,
82109
validate_selects: bool = True,
83110
) -> list[str]:
84111
datasets_to_update = set()
85-
86112
for field in dataclasses.fields(Globals):
87113
if field.name == 'selects' and not validate_selects:
88114
continue
89-
90115
datasets_to_update.update(
91116
ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(),
92117
)
93118
for dataset in ht1_globals[field.name].keys() & ht2_globals[field.name].keys():
94-
if ht1_globals[field.name].get(dataset) != ht2_globals[field.name].get(
95-
dataset,
96-
):
119+
if field.name == 'selects':
120+
validate_selects_types(ht1_globals, ht2_globals, dataset)
121+
if ht1_globals[field.name][dataset] != ht2_globals[field.name][dataset]:
97122
logger.info(f'{field.name} mismatch for {dataset}')
98123
datasets_to_update.add(dataset)
99-
100124
return sorted(datasets_to_update)

v03_pipeline/lib/reference_data/compare_globals_test.py

Lines changed: 64 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -103,8 +103,15 @@ def test_create_globals_from_dataset_configs(
103103
self.assertTrue(
104104
dataset_config_globals.selects
105105
== {
106-
'a': {'test_select', 'test_enum_id'},
107-
'b': {'test_select', 'field2', 'test_enum_id'},
106+
'a': {
107+
'test_select': hl.tint32,
108+
'test_enum_id': hl.tint32,
109+
},
110+
'b': {
111+
'test_select': hl.tint32,
112+
'field2': hl.tint32,
113+
'test_enum_id': hl.tint32,
114+
},
108115
},
109116
)
110117

@@ -134,7 +141,11 @@ def test_create_globals_from_dataset_configs_single_dataset(self, mock_read_tabl
134141
self.assertTrue(
135142
dataset_config_globals.selects
136143
== {
137-
'b': {'test_select', 'field2', 'test_enum_id'},
144+
'b': {
145+
'test_select': hl.tint32,
146+
'field2': hl.tint32,
147+
'test_enum_id': hl.tint32,
148+
},
138149
},
139150
)
140151

@@ -186,8 +197,8 @@ def test_from_rdc_or_annotations_ht(self):
186197
self.assertTrue(
187198
rdc_globals.selects
188199
== {
189-
'gnomad_non_coding_constraint': {'z_score'},
190-
'screen': {'region_type_ids'},
200+
'gnomad_non_coding_constraint': {'z_score': hl.tfloat32},
201+
'screen': {'region_type_ids': hl.tarray(hl.tint32)},
191202
},
192203
)
193204

@@ -198,13 +209,13 @@ def test_get_datasets_to_update_version_different(self):
198209
# 'a' has a different version, 'c' is missing version in ht2_globals
199210
versions={'a': 'v2', 'b': 'v2', 'c': 'v1'},
200211
enums={'a': {}, 'b': {}, 'c': {}},
201-
selects={'a': set(), 'b': set()},
212+
selects={'a': {}, 'b': {}},
202213
),
203214
ht2_globals=Globals(
204215
paths={'a': 'a_path', 'b': 'b_path'},
205216
versions={'a': 'v1', 'b': 'v2'},
206217
enums={'a': {}, 'b': {}},
207-
selects={'a': set(), 'b': set()},
218+
selects={'a': {}, 'b': {}},
208219
),
209220
)
210221
self.assertTrue(result == ['a', 'c'])
@@ -216,13 +227,13 @@ def test_get_datasets_to_update_path_different(self):
216227
paths={'a': 'a_path', 'b': 'old_b_path', 'c': 'extra_c_path'},
217228
versions={'a': 'v1', 'b': 'v2'},
218229
enums={'a': {}, 'b': {}},
219-
selects={'a': set(), 'b': set()},
230+
selects={'a': {}, 'b': {}},
220231
),
221232
ht2_globals=Globals(
222233
paths={'a': 'a_path', 'b': 'b_path'},
223234
versions={'a': 'v1', 'b': 'v2'},
224235
enums={'a': {}, 'b': {}},
225-
selects={'a': set(), 'b': set()},
236+
selects={'a': {}, 'b': {}},
226237
),
227238
)
228239
self.assertTrue(result == ['b', 'c'])
@@ -238,13 +249,13 @@ def test_get_datasets_to_update_enum_different(self):
238249
'b': {'enum_key_1': []},
239250
'c': {},
240251
},
241-
selects={'a': set(), 'b': set()},
252+
selects={'a': {}, 'b': {}},
242253
),
243254
ht2_globals=Globals(
244255
paths={'a': 'a_path', 'b': 'b_path'},
245256
versions={'a': 'v1', 'b': 'v2'},
246257
enums={'a': {'test_enum': ['C', 'D']}, 'b': {'enum_key_2': []}},
247-
selects={'a': set(), 'b': set()},
258+
selects={'a': {}, 'b': {}},
248259
),
249260
)
250261
self.assertTrue(result == ['a', 'b', 'c'])
@@ -257,16 +268,54 @@ def test_get_datasets_to_update_select_different(self):
257268
enums={'a': {}, 'b': {}},
258269
# 'a' has extra select, 'b' has different select, 'c' is missing select in ht2_globals
259270
selects={
260-
'a': {'field1', 'field2'},
261-
'b': {'test_select'},
262-
'c': set('test_select'),
271+
'a': {'field1': hl.tint32, 'field2': hl.tint32},
272+
'b': {'test_select': hl.tint32},
273+
'c': {'test_select': hl.tint32},
263274
},
264275
),
265276
ht2_globals=Globals(
266277
paths={'a': 'a_path', 'b': 'b_path'},
267278
versions={'a': 'v1', 'b': 'v2'},
268279
enums={'a': {}, 'b': {}},
269-
selects={'a': {'field1'}, 'b': {'test_select_2'}},
280+
selects={'a': {'field1': hl.tint32}, 'b': {'test_select_2': hl.tint32}},
270281
),
271282
)
272283
self.assertTrue(result == ['a', 'b', 'c'])
284+
285+
def test_get_datasets_to_update_select_type_validation(self):
286+
self.assertRaisesRegex(
287+
ValueError,
288+
"Unexpected field types detected in a: \\[\\('field1', dtype\\('int32'\\)\\)\\]",
289+
get_datasets_to_update,
290+
ht1_globals=Globals(
291+
paths={'a': 'a_path'},
292+
versions={'a': 'v1'},
293+
enums={'a': {}},
294+
selects={
295+
'a': {'field1': hl.tarray(hl.tint32)},
296+
},
297+
),
298+
ht2_globals=Globals(
299+
paths={'a': 'a_path'},
300+
versions={'a': 'v1'},
301+
enums={'a': {}},
302+
selects={'a': {'field1': hl.tint32, 'field2': hl.tint32}},
303+
),
304+
)
305+
result = get_datasets_to_update(
306+
ht1_globals=Globals(
307+
paths={'a': 'a_path'},
308+
versions={'a': 'v1'},
309+
enums={'a': {}},
310+
selects={
311+
'a': {'field1': hl.tarray(hl.tint32)},
312+
},
313+
),
314+
ht2_globals=Globals(
315+
paths={'a': 'a_path'},
316+
versions={'a': 'v1'},
317+
enums={'a': {}},
318+
selects={'a': {'field1': hl.tarray(hl.tint32), 'field2': hl.tint32}},
319+
),
320+
)
321+
self.assertTrue(result == ['a'])

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545
hl.tstruct(
4646
locus=hl.tlocus('GRCh38'),
4747
alleles=hl.tarray(hl.tstr),
48-
PHRED=hl.tint32,
48+
PHRED=hl.tfloat32,
4949
),
5050
key=['locus', 'alleles'],
5151
globals=hl.Struct(
@@ -760,7 +760,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
760760
conditions=None,
761761
),
762762
dbnsfp=hl.Struct(
763-
REVEL_score=0.043,
763+
REVEL_score=0.0430000014603138,
764764
SIFT_score=None,
765765
Polyphen2_HVAR_score=None,
766766
MutationTaster_pred_id=0,
@@ -1168,7 +1168,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
11681168
conditions=None,
11691169
),
11701170
dbnsfp=hl.Struct(
1171-
REVEL_score=0.043,
1171+
REVEL_score=0.0430000014603138,
11721172
SIFT_score=None,
11731173
Polyphen2_HVAR_score=None,
11741174
MutationTaster_pred_id=0,
Binary file not shown.
Binary file not shown.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
2-
Written with version 0.2.128-eead8100a1c1
3-
Created at 2024/05/09 20:02:21
2+
Written with version 0.2.130-bea04d9c79b5
3+
Created at 2024/05/20 13:48:16
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
2-
Written with version 0.2.128-eead8100a1c1
3-
Created at 2024/03/21 11:28:13
2+
Written with version 0.2.130-bea04d9c79b5
3+
Created at 2024/05/20 15:38:26
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
2-
Written with version 0.2.128-eead8100a1c1
3-
Created at 2024/03/21 11:35:30
2+
Written with version 0.2.130-bea04d9c79b5
3+
Created at 2024/05/20 14:08:17
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
2-
Written with version 0.2.114-cc8d36408b36
3-
Created at 2023/07/13 19:51:12
2+
Written with version 0.2.130-bea04d9c79b5
3+
Created at 2024/05/20 13:22:32
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)