Skip to content

Commit 408a3db

Browse files
authored
Bugfixes for reference data update (#690)
* add logger
* ws
* lint
* quiet loggers
* pyproj
* Fix bug
* Handle enums correctly
* lint
* Improve logging
* another
* account for key in selects compare
* i guess this is fine for now?
* Leave a comment
* Question
* Bugfixes
* almost there
* another batch
* ugh
* lint
* Import enums
* Missed one
* support dropping a dataset
* lint
1 parent 928924e commit 408a3db

14 files changed: +150 -54 lines changed

v03_pipeline/lib/annotations/mito.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -44,7 +44,7 @@ def HL(mt: hl.MatrixTable, **_: Any) -> hl.Expression: # noqa: N802
4444
return hl.if_else(is_called, mt.HL, 0)
4545

4646

47-
def high_constraint_region(
47+
def high_constraint_region_mito(
4848
ht: hl.Table,
4949
interval_ht: hl.Table,
5050
**_: Any,

v03_pipeline/lib/model/dataset_type.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -175,7 +175,7 @@ def formatting_annotation_fns(
175175
DatasetType.MITO: [
176176
mito.common_low_heteroplasmy,
177177
mito.haplogroup,
178-
mito.high_constraint_region,
178+
mito.high_constraint_region_mito,
179179
mito.mitotip,
180180
mito.rsid,
181181
shared.sorted_transcript_consequences,

v03_pipeline/lib/model/reference_dataset_collection.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ def datasets(self, dataset_type: DatasetType) -> list[str]:
5454
(ReferenceDatasetCollection.INTERVAL, DatasetType.MITO): [
5555
'high_constraint_region_mito',
5656
],
57-
}[(self, dataset_type)]
57+
}.get((self, dataset_type), [])
5858

5959
def table_key_type(
6060
self,

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import gzip
2+
import os
23
import subprocess
34
import tempfile
45
import urllib
@@ -7,6 +8,7 @@
78

89
from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES_LOOKUP
910
from v03_pipeline.lib.logger import get_logger
11+
from v03_pipeline.lib.model import Env
1012
from v03_pipeline.lib.model.definitions import ReferenceGenome
1113

1214
CLINVAR_ASSERTIONS = [
@@ -104,7 +106,7 @@ def download_and_import_latest_clinvar_vcf(
104106

105107
with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file:
106108
urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310
107-
gcs_tmp_file_name = f'gs://seqr-scratch-temp{tmp_file.name}'
109+
gcs_tmp_file_name = os.path.join(Env.HAIL_TMPDIR, os.path.basename(tmp_file.name))
108110
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
109111
mt = hl.import_vcf(
110112
gcs_tmp_file_name,

v03_pipeline/lib/reference_data/compare_globals.py

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111
from v03_pipeline.lib.reference_data.config import CONFIG
1212
from v03_pipeline.lib.reference_data.dataset_table_operations import (
1313
get_all_select_fields,
14+
get_enum_select_fields,
1415
get_ht_path,
1516
import_ht_from_config_path,
1617
parse_dataset_version,
@@ -50,9 +51,9 @@ def from_dataset_configs(
5051
),
5152
)
5253
enums[dataset] = dataset_config.get('enum_select', {})
53-
selects[dataset] = set(
54-
get_all_select_fields(dataset_ht, dataset_config).keys(),
55-
)
54+
dataset_ht = dataset_ht.select(**get_all_select_fields(dataset_ht, dataset_config))
55+
dataset_ht = dataset_ht.transmute(**get_enum_select_fields(dataset_ht, dataset_config))
56+
selects[dataset] = set(dataset_ht.row) - set(dataset_ht.key)
5657
return cls(paths, versions, enums, selects)
5758

5859
@classmethod
@@ -65,12 +66,14 @@ def from_ht(
6566
rdc_globals_struct = hl.eval(ht.globals)
6667
paths = dict(rdc_globals_struct.paths)
6768
versions = dict(rdc_globals_struct.versions)
68-
enums = dict(rdc_globals_struct.enums)
69+
# enums are nested structs
70+
enums = {k: dict(v) for k,v in rdc_globals_struct.enums.items()}
6971

7072
selects = {}
7173
for dataset in rdc.datasets(dataset_type):
7274
if dataset in ht.row:
73-
selects[dataset] = set(ht[dataset])
75+
# NB: handle an edge case (mito high constraint) where we annotate a bool from the reference dataset collection
76+
selects[dataset] = set(ht[dataset]) if isinstance(ht[dataset], hl.StructExpression) else set()
7477
return cls(paths, versions, enums, selects)
7578

7679

v03_pipeline/lib/reference_data/compare_globals_test.py

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,15 +22,15 @@ class CompareGlobalsTest(unittest.TestCase):
2222
'38': {
2323
'custom_import': None,
2424
'source_path': 'a_path', # 'a' has a custom import
25-
'select': {'test_select': 'info.test_select'},
25+
'select': {'test_select': 'info.test_select', 'test_enum': 'test_enum'},
2626
'version': 'a_version',
2727
'enum_select': {'test_enum': ['A', 'B']},
2828
},
2929
},
3030
'b': { # b is missing version
3131
'38': {
3232
'path': 'b_path',
33-
'select': {'test_select': 'info.test_select'},
33+
'select': {'test_select': 'info.test_select', 'test_enum': 'test_enum'},
3434
'enum_select': {'test_enum': ['C', 'D']},
3535
'custom_select': lambda ht: {'field2': ht.info.test_select_2},
3636
},
@@ -56,8 +56,10 @@ def test_create_globals_from_dataset_ht_configs(
5656
info=hl.tstruct(
5757
test_select=hl.tint,
5858
),
59+
test_enum=hl.tstr,
5960
),
6061
globals=hl.Struct(version='a_version'),
62+
key=['locus', 'alleles'],
6163
),
6264
hl.Table.parallelize(
6365
[],
@@ -68,8 +70,10 @@ def test_create_globals_from_dataset_ht_configs(
6870
test_select=hl.tint,
6971
test_select_2=hl.tint,
7072
),
73+
test_enum=hl.tstr,
7174
),
7275
globals=hl.Struct(version='b_version'),
76+
key=['locus', 'alleles'],
7377
),
7478
]
7579
dataset_config_globals = Globals.from_dataset_configs(
@@ -89,7 +93,7 @@ def test_create_globals_from_dataset_ht_configs(
8993
)
9094
self.assertTrue(
9195
dataset_config_globals.selects
92-
== {'a': {'test_select'}, 'b': {'test_select', 'field2'}},
96+
== {'a': {'test_select', 'test_enum_id'}, 'b': {'test_select', 'field2', 'test_enum_id'}},
9397
)
9498

9599
def test_from_rdc_or_annotations_ht(self):
@@ -115,7 +119,7 @@ def test_from_rdc_or_annotations_ht(self):
115119
screen='v2',
116120
),
117121
enums=hl.Struct(
118-
screen={'region_type': ['C', 'D']},
122+
screen=hl.Struct(region_type=['C', 'D']),
119123
),
120124
),
121125
)

v03_pipeline/lib/reference_data/dataset_table_operations.py

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -20,11 +20,21 @@ def update_or_create_joined_ht(
2020
joined_ht: hl.Table,
2121
) -> hl.Table:
2222
for dataset in datasets:
23-
dataset_ht = get_dataset_ht(dataset, reference_genome)
24-
23+
# Drop the dataset if it exists.
2524
if dataset in joined_ht.row:
2625
joined_ht = joined_ht.drop(dataset)
26+
joined_ht = joined_ht.annotate_globals(
27+
paths=joined_ht.paths.drop(dataset),
28+
versions=joined_ht.versions.drop(dataset),
29+
enums=joined_ht.enums.drop(dataset),
30+
)
2731

32+
# Handle cases where a dataset has been dropped OR renamed.
33+
if dataset not in CONFIG:
34+
continue
35+
36+
# Join the new one!
37+
dataset_ht = get_dataset_ht(dataset, reference_genome)
2838
joined_ht = joined_ht.join(dataset_ht, 'outer')
2939
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
3040

@@ -51,7 +61,7 @@ def get_dataset_ht(
5161

5262
ht = ht.filter(config['filter'](ht)) if 'filter' in config else ht
5363
ht = ht.select(**get_all_select_fields(ht, config))
54-
ht = ht.transmute(**get_enum_select_fields(config.get('enum_select'), ht))
64+
ht = ht.transmute(**get_enum_select_fields(ht, config))
5565
ht = ht.select_globals(
5666
path=(config['source_path'] if 'custom_import' in config else config['path']),
5767
version=parse_dataset_version(ht, dataset, config),
@@ -129,7 +139,8 @@ def get_all_select_fields(
129139
}
130140

131141

132-
def get_enum_select_fields(enum_selects: dict | None, ht: hl.Table) -> dict:
142+
def get_enum_select_fields(ht: hl.Table, config: dict) -> dict:
143+
enum_selects = config.get('enum_select')
133144
enum_select_fields = {}
134145
if enum_selects is None:
135146
return enum_select_fields

v03_pipeline/lib/reference_data/dataset_table_operations_test.py

Lines changed: 50 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -244,11 +244,13 @@ def test_get_enum_select_fields(self):
244244
),
245245
)
246246
enum_select_fields = get_enum_select_fields(
247+
ht,
247248
{
248-
'variant': ['1', '2', '3', '4'],
249-
'sv_type': ['a', 'b', 'c', 'd'],
249+
'enum_select': {
250+
'variant': ['1', '2', '3', '4'],
251+
'sv_type': ['a', 'b', 'c', 'd'],
252+
},
250253
},
251-
ht,
252254
)
253255
mapped_ht = ht.transmute(**enum_select_fields)
254256
self.assertListEqual(
@@ -262,8 +264,10 @@ def test_get_enum_select_fields(self):
262264
)
263265

264266
enum_select_fields = get_enum_select_fields(
265-
{'sv_type': ['d']},
266267
ht,
268+
{
269+
'enum_select': {'sv_type': ['d']},
270+
},
267271
)
268272
mapped_ht = ht.select(**enum_select_fields)
269273
self.assertRaises(Exception, mapped_ht.collect)
@@ -472,6 +476,7 @@ def test_parse_version(self, mock_read_table):
472476
)
473477
self.assertRaises(Exception, ht.globals.collect)
474478

479+
@mock.patch.dict(f'{PATH_TO_FILE_UNDER_TEST}.CONFIG', MOCK_CONFIG)
475480
@mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.get_dataset_ht')
476481
@mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.datetime', wraps=datetime)
477482
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
@@ -538,3 +543,44 @@ def test_update_or_create_joined_ht_all_datasets(
538543
EXPECTED_JOINED_DATA,
539544
)
540545
self.assertCountEqual(ht.globals.collect(), EXPECTED_GLOBALS)
546+
547+
@mock.patch.dict(f'{PATH_TO_FILE_UNDER_TEST}.CONFIG', MOCK_CONFIG)
548+
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
549+
def test_update_or_create_joined_ht_drop_a_dataset(
550+
self,
551+
mock_reference_dataset_collection_datasets,
552+
):
553+
mock_reference_dataset_collection_datasets.return_value = ['b']
554+
ht = hl.Table.parallelize(
555+
[],
556+
hl.tstruct(
557+
locus=hl.tlocus(ReferenceGenome.GRCh38.value),
558+
alleles=hl.tarray(hl.tstr),
559+
c=hl.tint32,
560+
b=hl.tint32,
561+
),
562+
key=('locus', 'alleles'),
563+
globals=hl.Struct(
564+
paths=hl.Struct(c='abc', b='123'),
565+
versions=hl.Struct(c='def', b='456'),
566+
enums=hl.Struct(c=hl.Struct(d=['a', 'b'])),
567+
),
568+
)
569+
ht = update_or_create_joined_ht(
570+
ReferenceDatasetCollection.COMBINED,
571+
DatasetType.SNV_INDEL,
572+
ReferenceGenome.GRCh38,
573+
datasets=['c'],
574+
joined_ht=ht,
575+
)
576+
self.assertCountEqual(
577+
ht.globals.collect(),
578+
[
579+
hl.Struct(
580+
paths=hl.Struct(b='123'),
581+
versions=hl.Struct(b='456'),
582+
enums=hl.Struct(),
583+
),
584+
],
585+
)
586+

v03_pipeline/lib/tasks/base/base_variant_annotations_table.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
import hail as hl
22
import luigi
33

4+
from v03_pipeline.lib.annotations.enums import annotate_enums
45
from v03_pipeline.lib.model import Env, ReferenceDatasetCollection
56
from v03_pipeline.lib.paths import (
67
valid_reference_dataset_collection_path,
@@ -79,10 +80,15 @@ def initialize_table(self) -> hl.Table:
7980
def update_table(self, ht: hl.Table) -> hl.Table:
8081
return ht
8182

82-
def annotate_reference_dataset_collection_globals(
83+
def annotate_globals(
8384
self,
8485
ht: hl.Table,
8586
) -> hl.Table:
87+
ht = ht.annotate_globals(
88+
paths=hl.Struct(),
89+
versions=hl.Struct(),
90+
enums=hl.Struct(),
91+
)
8692
for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
8793
self.reference_genome,
8894
self.dataset_type,
@@ -104,4 +110,4 @@ def annotate_reference_dataset_collection_globals(
104110
),
105111
updates=ht.globals.updates,
106112
)
107-
return ht
113+
return annotate_enums(ht, self.reference_genome, self.dataset_type)

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py

Lines changed: 22 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import hail as hl
22

3+
from v03_pipeline.lib.annotations.fields import get_fields
34
from v03_pipeline.lib.model import ReferenceDatasetCollection
45
from v03_pipeline.lib.reference_data.compare_globals import (
56
Globals,
@@ -13,18 +14,17 @@
1314
class UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
1415
BaseVariantAnnotationsTableTask,
1516
):
16-
_datasets_to_update: list[str]
17+
18+
def __init__(self, *args, **kwargs):
19+
super().__init__(*args, **kwargs)
20+
self._datasets_to_update = []
1721

1822
@property
1923
def reference_dataset_collections(self) -> list[ReferenceDatasetCollection]:
20-
return [
21-
rdc
22-
for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
23-
self.reference_genome,
24-
self.dataset_type,
25-
)
26-
if not rdc.requires_annotation
27-
]
24+
return ReferenceDatasetCollection.for_reference_genome_dataset_type(
25+
self.reference_genome,
26+
self.dataset_type,
27+
)
2828

2929
def complete(self) -> bool:
3030
self._datasets_to_update = []
@@ -63,10 +63,18 @@ def update_table(self, ht: hl.Table) -> hl.Table:
6363
for dataset in self._datasets_to_update:
6464
rdc = ReferenceDatasetCollection.for_dataset(dataset, self.dataset_type)
6565
rdc_ht = self.rdc_annotation_dependencies[f'{rdc.value}_ht']
66-
6766
if dataset in ht.row:
6867
ht = ht.drop(dataset)
69-
70-
ht = ht.join(rdc_ht.select(dataset), 'left')
71-
72-
return self.annotate_reference_dataset_collection_globals(ht)
68+
if rdc.requires_annotation:
69+
formatting_fn = next(x for x in self.dataset_type.formatting_annotation_fns(self.reference_genome) if x.__name__ == dataset)
70+
ht = ht.annotate(
71+
**get_fields(
72+
ht,
73+
[formatting_fn],
74+
**self.rdc_annotation_dependencies,
75+
**self.param_kwargs,
76+
),
77+
)
78+
else:
79+
ht = ht.join(rdc_ht.select(dataset), 'left')
80+
return self.annotate_globals(ht)

0 commit comments

Comments
 (0)