Skip to content

Reference Data Update Type Equality Check #789

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 34 additions & 10 deletions v03_pipeline/lib/reference_data/compare_globals.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ class Globals:
paths: dict[str, str]
versions: dict[str, str]
enums: dict[str, dict[str, list[str]]]
selects: dict[str, set[str]]
selects: dict[str, dict[str, hl.dtype]]

def __getitem__(self, name: str):
return getattr(self, name)
Expand Down Expand Up @@ -50,7 +50,11 @@ def from_dataset_configs(
dataset_ht = dataset_ht.transmute(
**get_enum_select_fields(dataset_ht, dataset_config),
)
selects[dataset] = set(dataset_ht.row) - set(dataset_ht.key)
selects[dataset] = {
k: v.dtype
for k, v in dict(dataset_ht.row).items()
if k not in set(dataset_ht.key)
}
return cls(paths, versions, enums, selects)

@classmethod
Expand All @@ -69,32 +73,52 @@ def from_ht(
if dataset in ht.row:
# NB: handle an edge case (mito high constraint) where we annotate a bool from the reference dataset collection
selects[dataset] = (
set(ht[dataset])
{k: v.dtype for k, v in dict(ht[dataset]).items()}
if isinstance(ht[dataset], hl.StructExpression)
else set()
else {}
)
return cls(paths, versions, enums, selects)


def validate_selects_types(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea here is to just ensure that any fields shared between the existing table and an "update" are identically typed.

imo we're getting very close to needing a statically defined schema for these, but trying to squeeze that in was much messier.

ht1_globals: Globals,
ht2_globals: Globals,
dataset: str,
) -> None:
# Assert that all shared annotations have identical types
shared_selects = (
ht1_globals['selects'][dataset].keys()
& ht2_globals['selects'].get(dataset).keys()
)
mismatched_select_types = [
(select, ht2_globals['selects'][dataset][select])
for select in shared_selects
if (
ht1_globals['selects'][dataset][select]
!= ht2_globals['selects'][dataset][select]
)
]
if mismatched_select_types:
msg = f'Unexpected field types detected in {dataset}: {mismatched_select_types}'
raise ValueError(msg)


def get_datasets_to_update(
ht1_globals: Globals,
ht2_globals: Globals,
validate_selects: bool = True,
) -> list[str]:
datasets_to_update = set()

for field in dataclasses.fields(Globals):
if field.name == 'selects' and not validate_selects:
continue

datasets_to_update.update(
ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(),
)
for dataset in ht1_globals[field.name].keys() & ht2_globals[field.name].keys():
if ht1_globals[field.name].get(dataset) != ht2_globals[field.name].get(
dataset,
):
if field.name == 'selects':
validate_selects_types(ht1_globals, ht2_globals, dataset)
if ht1_globals[field.name][dataset] != ht2_globals[field.name][dataset]:
logger.info(f'{field.name} mismatch for {dataset}')
datasets_to_update.add(dataset)

return sorted(datasets_to_update)
80 changes: 65 additions & 15 deletions v03_pipeline/lib/reference_data/compare_globals_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,15 @@ def test_create_globals_from_dataset_configs(
self.assertTrue(
dataset_config_globals.selects
== {
'a': {'test_select', 'test_enum_id'},
'b': {'test_select', 'field2', 'test_enum_id'},
'a': {
'test_select': hl.tint32,
'test_enum_id': hl.tint32,
},
'b': {
'test_select': hl.tint32,
'field2': hl.tint32,
'test_enum_id': hl.tint32,
},
},
)

Expand Down Expand Up @@ -134,7 +141,11 @@ def test_create_globals_from_dataset_configs_single_dataset(self, mock_read_tabl
self.assertTrue(
dataset_config_globals.selects
== {
'b': {'test_select', 'field2', 'test_enum_id'},
'b': {
'test_select': hl.tint32,
'field2': hl.tint32,
'test_enum_id': hl.tint32,
},
},
)

Expand Down Expand Up @@ -183,11 +194,12 @@ def test_from_rdc_or_annotations_ht(self):
self.assertTrue(
rdc_globals.enums == {'screen': {'region_type': ['C', 'D']}},
)
print(rdc_globals.selects)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this is left in by mistake?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep 🤦 I'm going to add a lint rule for this...

self.assertTrue(
rdc_globals.selects
== {
'gnomad_non_coding_constraint': {'z_score'},
'screen': {'region_type_ids'},
'gnomad_non_coding_constraint': {'z_score': hl.tfloat32},
'screen': {'region_type_ids': hl.tarray(hl.tint32)},
},
)

Expand All @@ -198,13 +210,13 @@ def test_get_datasets_to_update_version_different(self):
# 'a' has a different version, 'c' is missing version in ht2_globals
versions={'a': 'v2', 'b': 'v2', 'c': 'v1'},
enums={'a': {}, 'b': {}, 'c': {}},
selects={'a': set(), 'b': set()},
selects={'a': {}, 'b': {}},
),
ht2_globals=Globals(
paths={'a': 'a_path', 'b': 'b_path'},
versions={'a': 'v1', 'b': 'v2'},
enums={'a': {}, 'b': {}},
selects={'a': set(), 'b': set()},
selects={'a': {}, 'b': {}},
),
)
self.assertTrue(result == ['a', 'c'])
Expand All @@ -216,13 +228,13 @@ def test_get_datasets_to_update_path_different(self):
paths={'a': 'a_path', 'b': 'old_b_path', 'c': 'extra_c_path'},
versions={'a': 'v1', 'b': 'v2'},
enums={'a': {}, 'b': {}},
selects={'a': set(), 'b': set()},
selects={'a': {}, 'b': {}},
),
ht2_globals=Globals(
paths={'a': 'a_path', 'b': 'b_path'},
versions={'a': 'v1', 'b': 'v2'},
enums={'a': {}, 'b': {}},
selects={'a': set(), 'b': set()},
selects={'a': {}, 'b': {}},
),
)
self.assertTrue(result == ['b', 'c'])
Expand All @@ -238,13 +250,13 @@ def test_get_datasets_to_update_enum_different(self):
'b': {'enum_key_1': []},
'c': {},
},
selects={'a': set(), 'b': set()},
selects={'a': {}, 'b': {}},
),
ht2_globals=Globals(
paths={'a': 'a_path', 'b': 'b_path'},
versions={'a': 'v1', 'b': 'v2'},
enums={'a': {'test_enum': ['C', 'D']}, 'b': {'enum_key_2': []}},
selects={'a': set(), 'b': set()},
selects={'a': {}, 'b': {}},
),
)
self.assertTrue(result == ['a', 'b', 'c'])
Expand All @@ -257,16 +269,54 @@ def test_get_datasets_to_update_select_different(self):
enums={'a': {}, 'b': {}},
# 'a' has extra select, 'b' has different select, 'c' is missing select in ht2_globals
selects={
'a': {'field1', 'field2'},
'b': {'test_select'},
'c': set('test_select'),
'a': {'field1': hl.tint32, 'field2': hl.tint32},
'b': {'test_select': hl.tint32},
'c': {'test_select': hl.tint32},
},
),
ht2_globals=Globals(
paths={'a': 'a_path', 'b': 'b_path'},
versions={'a': 'v1', 'b': 'v2'},
enums={'a': {}, 'b': {}},
selects={'a': {'field1'}, 'b': {'test_select_2'}},
selects={'a': {'field1': hl.tint32}, 'b': {'test_select_2': hl.tint32}},
),
)
self.assertTrue(result == ['a', 'b', 'c'])

def test_get_datasets_to_update_select_type_validation(self):
self.assertRaisesRegex(
ValueError,
"Unexpected field types detected in a: \\[\\('field1', dtype\\('int32'\\)\\)\\]",
get_datasets_to_update,
ht1_globals=Globals(
paths={'a': 'a_path'},
versions={'a': 'v1'},
enums={'a': {}},
selects={
'a': {'field1': hl.tarray(hl.tint32)},
},
),
ht2_globals=Globals(
paths={'a': 'a_path'},
versions={'a': 'v1'},
enums={'a': {}},
selects={'a': {'field1': hl.tint32, 'field2': hl.tint32}},
),
)
result = get_datasets_to_update(
ht1_globals=Globals(
paths={'a': 'a_path'},
versions={'a': 'v1'},
enums={'a': {}},
selects={
'a': {'field1': hl.tarray(hl.tint32)},
},
),
ht2_globals=Globals(
paths={'a': 'a_path'},
versions={'a': 'v1'},
enums={'a': {}},
selects={'a': {'field1': hl.tarray(hl.tint32), 'field2': hl.tint32}},
),
)
self.assertTrue(result == ['a'])
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
hl.tstruct(
locus=hl.tlocus('GRCh38'),
alleles=hl.tarray(hl.tstr),
PHRED=hl.tint32,
PHRED=hl.tfloat32,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we had a few types in our tests/mock data that weren't right and doing this work caught them 🤩 !

),
key=['locus', 'alleles'],
globals=hl.Struct(
Expand Down Expand Up @@ -760,7 +760,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
conditions=None,
),
dbnsfp=hl.Struct(
REVEL_score=0.043,
REVEL_score=0.0430000014603138,
SIFT_score=None,
Polyphen2_HVAR_score=None,
MutationTaster_pred_id=0,
Expand Down Expand Up @@ -1168,7 +1168,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
conditions=None,
),
dbnsfp=hl.Struct(
REVEL_score=0.043,
REVEL_score=0.0430000014603138,
SIFT_score=None,
Polyphen2_HVAR_score=None,
MutationTaster_pred_id=0,
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/05/09 20:02:21
Written with version 0.2.130-bea04d9c79b5
Created at 2024/05/20 13:48:16
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/03/21 11:28:13
Written with version 0.2.130-bea04d9c79b5
Created at 2024/05/20 15:38:26
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.128-eead8100a1c1
Created at 2024/03/21 11:35:30
Written with version 0.2.130-bea04d9c79b5
Created at 2024/05/20 14:08:17
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
Written with version 0.2.114-cc8d36408b36
Created at 2023/07/13 19:51:12
Written with version 0.2.130-bea04d9c79b5
Created at 2024/05/20 13:22:32
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.