Skip to content

Commit b23394a

Browse files
committed
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipelines
2 parents 4b32dc5 + fe5e913 commit b23394a

7 files changed

+167
-201
lines changed

v03_pipeline/lib/misc/family_entries.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -107,7 +107,7 @@ def remove_new_callset_family_guids(
107107
family_entries=(
108108
hl.array(family_indexes_to_keep).map(lambda i: ht.family_entries[i])
109109
if len(family_indexes_to_keep) > 0
110-
else hl.missing(ht.family_entries.dtype.element_type)
110+
else hl.missing(ht.family_entries.dtype)
111111
),
112112
)
113113
return ht.annotate_globals(

v03_pipeline/lib/reference_data/compare_globals.py

Lines changed: 22 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,6 @@
44

55
from v03_pipeline.lib.logger import get_logger
66
from v03_pipeline.lib.model import (
7-
DatasetType,
8-
ReferenceDatasetCollection,
97
ReferenceGenome,
108
)
119
from v03_pipeline.lib.reference_data.config import CONFIG
@@ -22,8 +20,8 @@
2220

2321
@dataclasses.dataclass
2422
class Globals:
25-
paths: dict[str]
26-
versions: dict[str]
23+
paths: dict[str, str]
24+
versions: dict[str, str]
2725
enums: dict[str, dict[str, list[str]]]
2826
selects: dict[str, set[str]]
2927

@@ -33,12 +31,11 @@ def __getitem__(self, name: str):
3331
@classmethod
3432
def from_dataset_configs(
3533
cls,
36-
rdc: ReferenceDatasetCollection,
37-
dataset_type: DatasetType,
3834
reference_genome: ReferenceGenome,
35+
datasets: list[str],
3936
):
4037
paths, versions, enums, selects = {}, {}, {}, {}
41-
for dataset in rdc.datasets(dataset_type):
38+
for dataset in datasets:
4239
dataset_config = CONFIG[dataset][reference_genome.v02_value]
4340
dataset_ht = import_ht_from_config_path(dataset_config, reference_genome)
4441

@@ -64,17 +61,21 @@ def from_dataset_configs(
6461
def from_ht(
6562
cls,
6663
ht: hl.Table,
67-
rdc: ReferenceDatasetCollection,
68-
dataset_type: DatasetType,
64+
datasets: list[str],
6965
):
7066
rdc_globals_struct = hl.eval(ht.globals)
7167
paths = dict(rdc_globals_struct.paths)
7268
versions = dict(rdc_globals_struct.versions)
7369
# enums are nested structs
7470
enums = {k: dict(v) for k, v in rdc_globals_struct.enums.items()}
7571

72+
for global_dict in [paths, versions, enums]:
73+
for dataset in list(global_dict.keys()):
74+
if dataset not in datasets:
75+
global_dict.pop(dataset)
76+
7677
selects = {}
77-
for dataset in rdc.datasets(dataset_type):
78+
for dataset in datasets:
7879
if dataset in ht.row:
7980
# NB: handle an edge case (mito high constraint) where we annotate a bool from the reference dataset collection
8081
selects[dataset] = (
@@ -86,30 +87,20 @@ def from_ht(
8687

8788

8889
def get_datasets_to_update(
89-
rdc: ReferenceDatasetCollection,
9090
ht1_globals: Globals,
9191
ht2_globals: Globals,
92-
dataset_type: DatasetType,
9392
) -> list[str]:
94-
return [
95-
dataset
96-
for dataset in rdc.datasets(dataset_type)
97-
if not validate_globals_match(rdc, ht1_globals, ht2_globals, dataset)
98-
]
99-
93+
datasets_to_update = set()
10094

101-
def validate_globals_match(
102-
rdc: ReferenceDatasetCollection,
103-
ht1_globals: Globals,
104-
ht2_globals: Globals,
105-
dataset: str,
106-
) -> bool:
107-
results = []
10895
for field in dataclasses.fields(Globals):
109-
result = ht1_globals[field.name].get(dataset) == ht2_globals[field.name].get(
110-
dataset,
96+
datasets_to_update.update(
97+
ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(),
11198
)
112-
if result is False:
113-
logger.info(f'{field.name} mismatch for {dataset}, {rdc.value}')
114-
results.append(result)
115-
return all(results)
99+
for dataset in ht1_globals[field.name].keys() & ht2_globals[field.name].keys():
100+
if ht1_globals[field.name].get(dataset) != ht2_globals[field.name].get(
101+
dataset,
102+
):
103+
logger.info(f'{field.name} mismatch for {dataset}')
104+
datasets_to_update.add(dataset)
105+
106+
return sorted(datasets_to_update)

v03_pipeline/lib/reference_data/compare_globals_test.py

Lines changed: 87 additions & 71 deletions
Original file line number | Diff line number | Diff line change
@@ -4,55 +4,68 @@
44
import hail as hl
55

66
from v03_pipeline.lib.model import (
7-
DatasetType,
8-
ReferenceDatasetCollection,
97
ReferenceGenome,
108
)
119
from v03_pipeline.lib.reference_data.compare_globals import (
1210
Globals,
1311
get_datasets_to_update,
1412
)
1513

16-
17-
class CompareGlobalsTest(unittest.TestCase):
18-
@mock.patch.dict(
19-
'v03_pipeline.lib.reference_data.compare_globals.CONFIG',
20-
{
21-
'a': {
22-
'38': {
23-
'custom_import': None,
24-
'source_path': 'a_path', # 'a' has a custom import
25-
'select': {
26-
'test_select': 'info.test_select',
27-
'test_enum': 'test_enum',
28-
},
29-
'version': 'a_version',
30-
'enum_select': {'test_enum': ['A', 'B']},
31-
},
14+
CONFIG = {
15+
'a': {
16+
'38': {
17+
'custom_import': None,
18+
'source_path': 'a_path', # 'a' has a custom import
19+
'select': {
20+
'test_select': 'info.test_select',
21+
'test_enum': 'test_enum',
3222
},
33-
'b': { # b is missing version
34-
'38': {
35-
'path': 'b_path',
36-
'select': {
37-
'test_select': 'info.test_select',
38-
'test_enum': 'test_enum',
39-
},
40-
'enum_select': {'test_enum': ['C', 'D']},
41-
'custom_select': lambda ht: {'field2': ht.info.test_select_2},
42-
},
23+
'version': 'a_version',
24+
'enum_select': {'test_enum': ['A', 'B']},
25+
},
26+
},
27+
'b': { # b is missing version
28+
'38': {
29+
'path': 'b_path',
30+
'select': {
31+
'test_select': 'info.test_select',
32+
'test_enum': 'test_enum',
4333
},
34+
'enum_select': {'test_enum': ['C', 'D']},
35+
'custom_select': lambda ht: {'field2': ht.info.test_select_2},
4436
},
45-
)
37+
},
38+
}
39+
40+
B_TABLE = hl.Table.parallelize(
41+
[],
42+
schema=hl.tstruct(
43+
locus=hl.tlocus('GRCh38'),
44+
alleles=hl.tarray(hl.tstr),
45+
info=hl.tstruct(
46+
test_select=hl.tint,
47+
test_select_2=hl.tint,
48+
),
49+
test_enum=hl.tstr,
50+
),
51+
globals=hl.Struct(
52+
version='b_version',
53+
path='b_path',
54+
enums=hl.Struct(test_enum=['C', 'D']),
55+
),
56+
key=['locus', 'alleles'],
57+
)
58+
59+
60+
class CompareGlobalsTest(unittest.TestCase):
61+
@mock.patch.dict('v03_pipeline.lib.reference_data.compare_globals.CONFIG', CONFIG)
4662
@mock.patch(
4763
'v03_pipeline.lib.reference_data.compare_globals.import_ht_from_config_path',
4864
)
49-
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
50-
def test_create_globals_from_dataset_ht_configs(
65+
def test_create_globals_from_dataset_configs(
5166
self,
52-
mock_rdc_datasets,
5367
mock_import_dataset_ht,
5468
):
55-
mock_rdc_datasets.return_value = ['a', 'b']
5669
mock_import_dataset_ht.side_effect = [
5770
hl.Table.parallelize(
5871
[],
@@ -64,28 +77,18 @@ def test_create_globals_from_dataset_ht_configs(
6477
),
6578
test_enum=hl.tstr,
6679
),
67-
globals=hl.Struct(version='a_version'),
68-
key=['locus', 'alleles'],
69-
),
70-
hl.Table.parallelize(
71-
[],
72-
schema=hl.tstruct(
73-
locus=hl.tlocus('GRCh38'),
74-
alleles=hl.tarray(hl.tstr),
75-
info=hl.tstruct(
76-
test_select=hl.tint,
77-
test_select_2=hl.tint,
78-
),
79-
test_enum=hl.tstr,
80+
globals=hl.Struct(
81+
version='a_version',
82+
path='a_path',
83+
enums=hl.Struct(test_enum=['A', 'B']),
8084
),
81-
globals=hl.Struct(version='b_version'),
8285
key=['locus', 'alleles'],
8386
),
87+
B_TABLE,
8488
]
8589
dataset_config_globals = Globals.from_dataset_configs(
86-
rdc=ReferenceDatasetCollection.INTERVAL,
87-
dataset_type=DatasetType.SNV_INDEL,
8890
reference_genome=ReferenceGenome.GRCh38,
91+
datasets=['a', 'b'],
8992
)
9093
self.assertTrue(
9194
dataset_config_globals.versions == {'a': 'a_version', 'b': 'b_version'},
@@ -105,6 +108,36 @@ def test_create_globals_from_dataset_ht_configs(
105108
},
106109
)
107110

111+
@mock.patch.dict('v03_pipeline.lib.reference_data.compare_globals.CONFIG', CONFIG)
112+
@mock.patch(
113+
'v03_pipeline.lib.reference_data.dataset_table_operations.hl.read_table',
114+
)
115+
def test_create_globals_from_dataset_configs_single_dataset(self, mock_read_table):
116+
# by mocking hl.read_table() (only possible for a dataset without a custom import),
117+
# we can test the code inside import_ht_from_config_path()
118+
mock_read_table.return_value = B_TABLE
119+
120+
dataset_config_globals = Globals.from_dataset_configs(
121+
reference_genome=ReferenceGenome.GRCh38,
122+
datasets=['b'],
123+
)
124+
125+
self.assertTrue(
126+
dataset_config_globals.versions == {'b': 'b_version'},
127+
)
128+
self.assertTrue(
129+
dataset_config_globals.paths == {'b': 'b_path'},
130+
)
131+
self.assertTrue(
132+
dataset_config_globals.enums == {'b': {'test_enum': ['C', 'D']}},
133+
)
134+
self.assertTrue(
135+
dataset_config_globals.selects
136+
== {
137+
'b': {'test_select', 'field2', 'test_enum_id'},
138+
},
139+
)
140+
108141
def test_from_rdc_or_annotations_ht(self):
109142
rdc_ht = hl.Table.parallelize(
110143
[],
@@ -134,8 +167,7 @@ def test_from_rdc_or_annotations_ht(self):
134167
)
135168
rdc_globals = Globals.from_ht(
136169
rdc_ht,
137-
rdc=ReferenceDatasetCollection.INTERVAL,
138-
dataset_type=DatasetType.SNV_INDEL,
170+
['gnomad_non_coding_constraint', 'screen'],
139171
)
140172
self.assertTrue(
141173
rdc_globals.versions
@@ -159,11 +191,8 @@ def test_from_rdc_or_annotations_ht(self):
159191
},
160192
)
161193

162-
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
163-
def test_get_datasets_to_update_version_different(self, mock_rdc_datasets):
164-
mock_rdc_datasets.return_value = ['a', 'b', 'c']
194+
def test_get_datasets_to_update_version_different(self):
165195
result = get_datasets_to_update(
166-
rdc=ReferenceDatasetCollection.INTERVAL,
167196
ht1_globals=Globals(
168197
paths={'a': 'a_path', 'b': 'b_path'},
169198
# 'a' has a different version, 'c' is missing version in ht2_globals
@@ -177,15 +206,11 @@ def test_get_datasets_to_update_version_different(self, mock_rdc_datasets):
177206
enums={'a': {}, 'b': {}},
178207
selects={'a': set(), 'b': set()},
179208
),
180-
dataset_type=DatasetType.SNV_INDEL,
181209
)
182210
self.assertTrue(result == ['a', 'c'])
183211

184-
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
185-
def test_get_datasets_to_update_path_different(self, mock_rdc_datasets):
186-
mock_rdc_datasets.return_value = ['a', 'b', 'c']
212+
def test_get_datasets_to_update_path_different(self):
187213
result = get_datasets_to_update(
188-
rdc=ReferenceDatasetCollection.INTERVAL,
189214
ht1_globals=Globals(
190215
# 'b' has a different path, 'c' is missing path in ht2_globals
191216
paths={'a': 'a_path', 'b': 'old_b_path', 'c': 'extra_c_path'},
@@ -199,15 +224,11 @@ def test_get_datasets_to_update_path_different(self, mock_rdc_datasets):
199224
enums={'a': {}, 'b': {}},
200225
selects={'a': set(), 'b': set()},
201226
),
202-
dataset_type=DatasetType.SNV_INDEL,
203227
)
204228
self.assertTrue(result == ['b', 'c'])
205229

206-
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
207-
def test_get_datasets_to_update_enum_different(self, mock_rdc_datasets):
208-
mock_rdc_datasets.return_value = ['a', 'b', 'c']
230+
def test_get_datasets_to_update_enum_different(self):
209231
result = get_datasets_to_update(
210-
rdc=ReferenceDatasetCollection.INTERVAL,
211232
ht1_globals=Globals(
212233
paths={'a': 'a_path', 'b': 'b_path'},
213234
versions={'a': 'v1', 'b': 'v2'},
@@ -225,15 +246,11 @@ def test_get_datasets_to_update_enum_different(self, mock_rdc_datasets):
225246
enums={'a': {'test_enum': ['C', 'D']}, 'b': {'enum_key_2': []}},
226247
selects={'a': set(), 'b': set()},
227248
),
228-
dataset_type=DatasetType.SNV_INDEL,
229249
)
230250
self.assertTrue(result == ['a', 'b', 'c'])
231251

232-
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
233-
def test_get_datasets_to_update_select_different(self, mock_rdc_datasets):
234-
mock_rdc_datasets.return_value = ['a', 'b', 'c']
252+
def test_get_datasets_to_update_select_different(self):
235253
result = get_datasets_to_update(
236-
rdc=ReferenceDatasetCollection.INTERVAL,
237254
ht1_globals=Globals(
238255
paths={'a': 'a_path', 'b': 'b_path'},
239256
versions={'a': 'v1', 'b': 'v2'},
@@ -251,6 +268,5 @@ def test_get_datasets_to_update_select_different(self, mock_rdc_datasets):
251268
enums={'a': {}, 'b': {}},
252269
selects={'a': {'field1'}, 'b': {'test_select_2'}},
253270
),
254-
dataset_type=DatasetType.SNV_INDEL,
255271
)
256272
self.assertTrue(result == ['a', 'b', 'c'])

0 commit comments

Comments
 (0)