Skip to content

Commit 631e29b

Browse files
authored
create updated reference dataset collection task (#656)
* first pass at update task * move stuff back to /lib file * add tests for former combine file * add tests for former combine file * comment w/ logic for complete method * task unit test * add missing test cases and default luigi datasets param * delete_reference_data_ht removed * remove test setup tmpdir stuff
1 parent 44e0b9c commit 631e29b

32 files changed

+708
-249
lines changed

download_and_create_reference_datasets/v02/hail_scripts/write_combined_interval_ref_data.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import hail as hl
55

6-
from v03_pipeline.lib.reference_data.combine import join_hts
6+
from v03_pipeline.lib.reference_data.dataset_table_operations import join_hts
77

88
VERSION = '2.0.5'
99
OUTPUT_PATH = "gs://seqr-reference-data/GRCh38/combined_interval_reference_data/combined_interval_reference_data.ht"

download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import hail as hl
55

6-
from v03_pipeline.lib.reference_data.combine import join_hts
6+
from v03_pipeline.lib.reference_data.dataset_table_operations import join_hts
77
from v03_pipeline.lib.reference_data.config import CONFIG
88

99
VERSION = '2.0.4'

download_and_create_reference_datasets/v02/mito/write_combined_mito_reference_data_hts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import hail as hl
66

7-
from v03_pipeline.lib.reference_data.combine import join_hts
7+
from v03_pipeline.lib.reference_data.dataset_table_operations import join_hts
88

99
VERSION = '2.0.4'
1010
OUTPUT_PATH = 'gs://seqr-reference-data/GRCh38/mitochondrial/all_mito_reference_data/combined_reference_data_chrM.ht'

luigi_pipeline/tests/test_hail_tasks.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,12 +40,10 @@ def _hail_matrix_table_task(self):
4040

4141
def _set_validation_configs(self):
4242
global_config = GlobalConfig()
43-
global_config.param_kwargs[
44-
'validation_37_coding_ht'
45-
] = global_config.validation_37_coding_ht = 'tests/data/validation_37_coding.ht'
46-
global_config.param_kwargs[
47-
'validation_37_noncoding_ht'
48-
] = (
43+
global_config.param_kwargs['validation_37_coding_ht'] = (
44+
global_config.validation_37_coding_ht
45+
) = 'tests/data/validation_37_coding.ht'
46+
global_config.param_kwargs['validation_37_noncoding_ht'] = (
4947
global_config.validation_37_noncoding_ht
5048
) = 'tests/data/validation_37_noncoding.ht'
5149

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,3 +67,6 @@ inline-quotes = "single"
6767

6868
[tool.ruff.pylint]
6969
max-args = 6
70+
71+
[tool.ruff.format]
72+
quote-style = "single"

v03_pipeline/bin/write_combined_interval_reference_ht.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,10 @@
1313
ReferenceGenome,
1414
)
1515
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
16-
from v03_pipeline.lib.reference_data.combine import join_hts, update_existing_joined_hts
16+
from v03_pipeline.lib.reference_data.dataset_table_operations import (
17+
join_hts,
18+
update_existing_joined_hts,
19+
)
1720

1821

1922
def run(dataset_type: DatasetType, dataset: str | None):

v03_pipeline/bin/write_combined_reference_ht.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
ReferenceGenome,
1515
)
1616
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
17-
from v03_pipeline.lib.reference_data.combine import join_hts, update_existing_joined_hts
17+
from v03_pipeline.lib.reference_data.dataset_table_operations import (
18+
join_hts,
19+
update_existing_joined_hts,
20+
)
1821

1922

2023
def run(

v03_pipeline/bin/write_hgmd_ht.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
ReferenceGenome,
99
)
1010
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
11-
from v03_pipeline.lib.reference_data.combine import join_hts
11+
from v03_pipeline.lib.reference_data.dataset_table_operations import join_hts
1212

1313

1414
def run(reference_genome: ReferenceGenome):

v03_pipeline/lib/misc/sample_entries.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,7 @@ def globalize_sample_ids(ht: hl.Table) -> hl.Table:
88
# NB: normal python expression here because the row is localized.
99
# I had an easier time with this than hl.agg.take(1), which was an
1010
# alternative implementation.
11-
[e.s for e in row[0].entries]
12-
if len(row) > 0
13-
else hl.empty_array(hl.tstr)
11+
[e.s for e in row[0].entries] if len(row) > 0 else hl.empty_array(hl.tstr)
1412
),
1513
)
1614
return ht.annotate(entries=ht.entries.map(lambda s: s.drop('s')))

v03_pipeline/lib/reference_data/combine.py renamed to v03_pipeline/lib/reference_data/dataset_table_operations.py

Lines changed: 83 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from datetime import datetime
2+
from types import FunctionType
23

34
import hail as hl
45
import pytz
@@ -11,21 +12,75 @@
1112
from v03_pipeline.lib.reference_data.config import CONFIG
1213

1314

14-
def parse_version(ht: hl.Table, dataset: str, config: dict) -> hl.StringExpression:
15-
annotated_version = ht.globals.get('version', hl.missing(hl.tstr))
16-
config_version = config.get('version', hl.missing(hl.tstr))
17-
return (
18-
hl.case()
19-
.when(hl.is_missing(config_version), annotated_version)
20-
.when(hl.is_missing(annotated_version), config_version)
21-
.when(annotated_version == config_version, config_version)
22-
.or_error(
23-
f'found mismatching versions for dataset {dataset}, {config_version}, {hl.eval(annotated_version)}',
15+
def update_or_create_joined_ht(
16+
reference_dataset_collection: ReferenceDatasetCollection,
17+
dataset_type: DatasetType,
18+
reference_genome: ReferenceGenome,
19+
dataset: str | None,
20+
joined_ht: hl.Table,
21+
) -> hl.Table:
22+
datasets = (
23+
[dataset]
24+
if dataset is not None
25+
else reference_dataset_collection.datasets(dataset_type)
26+
)
27+
28+
for dataset in datasets:
29+
dataset_ht = get_dataset_ht(dataset, reference_genome)
30+
31+
if dataset in joined_ht.row:
32+
joined_ht = joined_ht.drop(dataset)
33+
34+
joined_ht = joined_ht.join(dataset_ht, 'outer')
35+
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
36+
37+
return joined_ht.filter(
38+
hl.any(
39+
[
40+
~hl.is_missing(joined_ht[dataset])
41+
for dataset in reference_dataset_collection.datasets(dataset_type)
42+
],
43+
),
44+
)
45+
46+
47+
def get_dataset_ht(
48+
dataset: str,
49+
reference_genome: ReferenceGenome,
50+
) -> hl.Table:
51+
config = CONFIG[dataset][reference_genome.v02_value]
52+
ht = (
53+
config['custom_import'](config['source_path'], reference_genome)
54+
if 'custom_import' in config
55+
else hl.read_table(config['path'])
56+
)
57+
if hasattr(ht, 'locus'):
58+
ht = ht.filter(
59+
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
2460
)
61+
62+
ht = ht.filter(config['filter'](ht)) if 'filter' in config else ht
63+
ht = ht.select(
64+
**{
65+
**get_select_fields(config.get('select'), ht),
66+
**get_custom_select_fields(config.get('custom_select'), ht),
67+
},
2568
)
69+
ht = ht.transmute(**get_enum_select_fields(config.get('enum_select'), ht))
70+
ht = ht.select_globals(
71+
path=(config['source_path'] if 'custom_import' in config else config['path']),
72+
version=parse_dataset_version(ht, dataset, config),
73+
enums=hl.Struct(
74+
**config.get(
75+
'enum_select',
76+
hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
77+
),
78+
),
79+
)
80+
return ht.select(**{dataset: ht.row.drop(*ht.key)}).distinct()
2681

2782

28-
def get_select_fields(selects, base_ht):
83+
def get_select_fields(selects: list | dict | None, base_ht: hl.Table) -> dict:
2984
"""
3085
Generic function that takes in a select config and base_ht and generates a
3186
select dict that is generated from traversing the base_ht and extracting the right
@@ -57,13 +112,13 @@ def get_select_fields(selects, base_ht):
57112
return select_fields
58113

59114

60-
def get_custom_select_fields(custom_select, ht):
115+
def get_custom_select_fields(custom_select: FunctionType | None, ht: hl.Table) -> dict:
61116
if custom_select is None:
62117
return {}
63118
return custom_select(ht)
64119

65120

66-
def get_enum_select_fields(enum_selects, ht):
121+
def get_enum_select_fields(enum_selects: dict | None, ht: hl.Table) -> dict:
67122
enum_select_fields = {}
68123
if enum_selects is None:
69124
return enum_select_fields
@@ -89,40 +144,22 @@ def get_enum_select_fields(enum_selects, ht):
89144
return enum_select_fields
90145

91146

92-
def get_ht(
147+
def parse_dataset_version(
148+
ht: hl.Table,
93149
dataset: str,
94-
reference_genome: ReferenceGenome,
95-
):
96-
config = CONFIG[dataset][reference_genome.v02_value]
97-
ht = (
98-
config['custom_import'](config['source_path'], reference_genome)
99-
if 'custom_import' in config
100-
else hl.read_table(config['path'])
101-
)
102-
if hasattr(ht, 'locus'):
103-
ht = ht.filter(
104-
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
150+
config: dict,
151+
) -> hl.StringExpression:
152+
annotated_version = ht.globals.get('version', hl.missing(hl.tstr))
153+
config_version = config.get('version', hl.missing(hl.tstr))
154+
return (
155+
hl.case()
156+
.when(hl.is_missing(config_version), annotated_version)
157+
.when(hl.is_missing(annotated_version), config_version)
158+
.when(annotated_version == config_version, config_version)
159+
.or_error(
160+
f'found mismatching versions for dataset {dataset}, {config_version}, {hl.eval(annotated_version)}',
105161
)
106-
107-
ht = ht.filter(config['filter'](ht)) if 'filter' in config else ht
108-
ht = ht.select(
109-
**{
110-
**get_select_fields(config.get('select'), ht),
111-
**get_custom_select_fields(config.get('custom_select'), ht),
112-
},
113162
)
114-
ht = ht.transmute(**get_enum_select_fields(config.get('enum_select'), ht))
115-
ht = ht.select_globals(
116-
path=(config['source_path'] if 'custom_import' in config else config['path']),
117-
version=parse_version(ht, dataset, config),
118-
enums=hl.Struct(
119-
**config.get(
120-
'enum_select',
121-
hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
122-
),
123-
),
124-
)
125-
return ht.select(**{dataset: ht.row.drop(*ht.key)}).distinct()
126163

127164

128165
def annotate_dataset_globals(joined_ht: hl.Table, dataset: str, dataset_ht: hl.Table):
@@ -153,7 +190,7 @@ def join_hts(
153190
),
154191
)
155192
for dataset in reference_dataset_collection.datasets(dataset_type):
156-
dataset_ht = get_ht(dataset, reference_genome)
193+
dataset_ht = get_dataset_ht(dataset, reference_genome)
157194
joined_ht = joined_ht.join(dataset_ht, 'outer')
158195
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
159196
return joined_ht
@@ -167,7 +204,7 @@ def update_existing_joined_hts(
167204
reference_dataset_collection: ReferenceDatasetCollection,
168205
):
169206
joined_ht = hl.read_table(destination_path)
170-
dataset_ht = get_ht(dataset, reference_genome)
207+
dataset_ht = get_dataset_ht(dataset, reference_genome)
171208
joined_ht = joined_ht.drop(dataset)
172209
joined_ht = joined_ht.join(dataset_ht, 'outer')
173210
joined_ht = joined_ht.filter(

0 commit comments

Comments
 (0)