Skip to content

Commit c7c6fb9

Browse files
authored
Make dataset collection reference non hardcoded (#956)
1 parent 9df1ce8 commit c7c6fb9

File tree

2 files changed: 16 additions (+16) and 11 deletions (-11).

v03_pipeline/lib/model/cached_reference_dataset_query.py

Lines changed: 9 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,9 @@
55

66
from v03_pipeline.lib.model.dataset_type import DatasetType
77
from v03_pipeline.lib.model.definitions import ReferenceGenome
8+
from v03_pipeline.lib.model.reference_dataset_collection import (
9+
ReferenceDatasetCollection,
10+
)
811
from v03_pipeline.lib.reference_data.queries import (
912
clinvar_path_variants,
1013
gnomad_coding_and_noncoding_variants,
@@ -30,11 +33,13 @@ def dataset(self, dataset_type: DatasetType) -> str | None:
3033
}.get(self)
3134

3235
@property
33-
def query_raw_dataset(self) -> bool:
36+
def reference_dataset_collection(self) -> ReferenceDatasetCollection:
3437
return {
35-
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: True,
36-
CachedReferenceDatasetQuery.GNOMAD_QC: True,
37-
}.get(self, False)
38+
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: ReferenceDatasetCollection.COMBINED,
39+
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: None,
40+
CachedReferenceDatasetQuery.GNOMAD_QC: None,
41+
CachedReferenceDatasetQuery.HIGH_AF_VARIANTS: ReferenceDatasetCollection.COMBINED,
42+
}[self]
3843

3944
@property
4045
def query(self) -> Callable[[hl.Table, ReferenceGenome], hl.Table]:

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def output(self) -> luigi.Target:
7272
)
7373

7474
def requires(self) -> luigi.Task:
75-
if self.crdq.query_raw_dataset:
75+
if not self.crdq.reference_dataset_collection:
7676
return HailTableTask(
7777
get_ht_path(
7878
CONFIG[self.crdq.dataset(self.dataset_type)][
@@ -83,7 +83,7 @@ def requires(self) -> luigi.Task:
8383
# Special nested import to avoid a circular dependency issue
8484
# (ValidateCallset -> this file -> UpdatedReferenceDatasetCollection -> ValidateCallset)
8585
# The specific CRDQ referenced in ValidateCallset will never reach
86-
# this line due to it being a "query_raw_dataset". In theory this
86+
# this line due to it being a raw dataset query. In theory this
8787
# would be fixed by splitting the CRDQ into raw_dataset and non-raw_dataset
8888
# queries.
8989
from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import (
@@ -93,12 +93,12 @@ def requires(self) -> luigi.Task:
9393
return UpdatedReferenceDatasetCollectionTask(
9494
self.reference_genome,
9595
self.dataset_type,
96-
ReferenceDatasetCollection.COMBINED,
96+
self.crdq.reference_dataset_collection,
9797
)
9898

9999
def create_table(self) -> hl.Table:
100100
dataset: str = self.crdq.dataset(self.dataset_type)
101-
if self.crdq.query_raw_dataset:
101+
if not self.crdq.reference_dataset_collection:
102102
query_ht = import_ht_from_config_path(
103103
CONFIG[dataset][self.reference_genome.v02_value],
104104
dataset,
@@ -121,21 +121,21 @@ def create_table(self) -> hl.Table:
121121
paths=hl.Struct(
122122
**{
123123
dataset: query_ht.index_globals().path
124-
if self.crdq.query_raw_dataset
124+
if not self.crdq.reference_dataset_collection
125125
else query_ht.index_globals().paths[dataset],
126126
},
127127
),
128128
versions=hl.Struct(
129129
**{
130130
dataset: query_ht.index_globals().version
131-
if self.crdq.query_raw_dataset
131+
if not self.crdq.reference_dataset_collection
132132
else query_ht.index_globals().versions[dataset],
133133
},
134134
),
135135
enums=hl.Struct(
136136
**{
137137
dataset: query_ht.index_globals().enums
138-
if self.crdq.query_raw_dataset
138+
if not self.crdq.reference_dataset_collection
139139
else query_ht.index_globals().enums[dataset],
140140
},
141141
),

0 commit comments

Comments
 (0)