Skip to content

Commit 4c30439

Browse files
authored
Merge pull request #696 from broadinstitute/benb/refactor_crdq
add UpdatedCachedReferenceDatasetQuery
2 parents a1b38fd + 11a2273 commit 4c30439

25 files changed

+530
-140
lines changed

v03_pipeline/lib/model/cached_reference_dataset_query.py

Lines changed: 15 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -1,127 +1,17 @@
11
from collections.abc import Callable
22
from enum import Enum
3-
from typing import Any
43

54
import hail as hl
65

7-
from v03_pipeline.lib.annotations.enums import (
8-
CLINVAR_PATHOGENICITIES_LOOKUP,
9-
CONSEQUENCE_TERMS,
10-
)
11-
from v03_pipeline.lib.annotations.expression_helpers import (
12-
get_expr_for_vep_sorted_transcript_consequences_array,
13-
get_expr_for_worst_transcript_consequence_annotations_struct,
14-
)
156
from v03_pipeline.lib.model.dataset_type import DatasetType
167
from v03_pipeline.lib.model.definitions import AccessControl, ReferenceGenome
178
from v03_pipeline.lib.model.environment import Env
18-
19-
CLINVAR_PATH_RANGE = ('Pathogenic', 'Pathogenic/Likely_risk_allele')
20-
CLINVAR_LIKELY_PATH_RANGE = ('Pathogenic/Likely_pathogenic', 'Likely_risk_allele')
21-
CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
22-
hl.enumerate(CONSEQUENCE_TERMS, index_first=False),
9+
from v03_pipeline.lib.reference_data.queries import (
10+
clinvar_path_variants,
11+
gnomad_coding_and_noncoding_variants,
12+
gnomad_qc,
13+
high_af_variants,
2314
)
24-
GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD = 0.90
25-
ONE_PERCENT = 0.01
26-
THREE_PERCENT = 0.03
27-
FIVE_PERCENT = 0.05
28-
TEN_PERCENT = 0.10
29-
30-
31-
def clinvar_path_variants(
32-
ht: hl.Table,
33-
dataset_type: DatasetType,
34-
**_: Any,
35-
) -> hl.Table:
36-
clinvar_field = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar'
37-
ht = ht.select_globals()
38-
ht = ht.select(
39-
pathogenic=(
40-
(
41-
ht[clinvar_field].pathogenicity_id
42-
>= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[0]]
43-
)
44-
& (
45-
ht[clinvar_field].pathogenicity_id
46-
<= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[1]]
47-
)
48-
),
49-
likely_pathogenic=(
50-
(
51-
ht[clinvar_field].pathogenicity_id
52-
>= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[0]]
53-
)
54-
& (
55-
ht[clinvar_field].pathogenicity_id
56-
<= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[1]]
57-
)
58-
),
59-
)
60-
return ht.filter(ht.pathogenic | ht.likely_pathogenic)
61-
62-
63-
def gnomad_coding_and_noncoding_variants(
64-
ht: hl.Table,
65-
reference_genome: ReferenceGenome,
66-
**_: Any,
67-
) -> hl.Table:
68-
filtered_contig = 'chr1' if reference_genome == ReferenceGenome.GRCh38 else '1'
69-
ht = hl.filter_intervals(
70-
ht,
71-
[
72-
hl.parse_locus_interval(
73-
filtered_contig,
74-
reference_genome=reference_genome.value,
75-
),
76-
],
77-
)
78-
ht = ht.filter(ht.freq[0].AF > GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD)
79-
ht = ht.annotate(
80-
sorted_transaction_consequences=(
81-
get_expr_for_vep_sorted_transcript_consequences_array(
82-
ht.vep,
83-
omit_consequences=[],
84-
)
85-
),
86-
)
87-
ht = ht.annotate(
88-
main_transcript=(
89-
get_expr_for_worst_transcript_consequence_annotations_struct(
90-
ht.sorted_transaction_consequences,
91-
)
92-
),
93-
)
94-
ht = ht.select(
95-
coding=(
96-
ht.main_transcript.major_consequence_rank
97-
<= CONSEQUENCE_TERM_RANK_LOOKUP['synonymous_variant']
98-
),
99-
noncoding=(
100-
ht.main_transcript.major_consequence_rank
101-
>= CONSEQUENCE_TERM_RANK_LOOKUP['downstream_gene_variant']
102-
),
103-
)
104-
return ht.filter(ht.coding | ht.noncoding)
105-
106-
107-
def high_af_variants(
108-
ht: hl.Table,
109-
**_: Any,
110-
) -> hl.Table:
111-
ht = ht.select_globals()
112-
ht = ht.filter(ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > ONE_PERCENT)
113-
return ht.select(
114-
is_gt_3_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > THREE_PERCENT,
115-
is_gt_5_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > FIVE_PERCENT,
116-
is_gt_10_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > TEN_PERCENT,
117-
)
118-
119-
120-
def gnomad_qc(
121-
ht: hl.Table,
122-
**_: Any,
123-
) -> hl.Table:
124-
return ht.select()
12515

12616

12717
class CachedReferenceDatasetQuery(Enum):
@@ -137,12 +27,21 @@ def access_control(self) -> AccessControl:
13727
return AccessControl.PUBLIC
13828

13929
@property
140-
def reference_dataset(self) -> str | None:
30+
def dataset(self) -> str | None:
14131
return {
32+
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: 'clinvar',
14233
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: 'gnomad_genomes',
14334
CachedReferenceDatasetQuery.GNOMAD_QC: 'gnomad_qc',
35+
CachedReferenceDatasetQuery.HIGH_AF_VARIANTS: 'gnomad_genomes',
14436
}.get(self)
14537

38+
@property
39+
def query_raw_dataset(self) -> bool:
40+
return {
41+
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: True,
42+
CachedReferenceDatasetQuery.GNOMAD_QC: True,
43+
}.get(self, False)
44+
14645
@property
14746
def query(self) -> Callable[[hl.Table, ReferenceGenome], hl.Table]:
14847
return {

v03_pipeline/lib/reference_data/compare_globals.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@
1010
from v03_pipeline.lib.reference_data.dataset_table_operations import (
1111
get_all_select_fields,
1212
get_enum_select_fields,
13-
get_ht_path,
1413
import_ht_from_config_path,
15-
parse_dataset_version,
1614
)
1715

1816
logger = get_logger(__name__)
@@ -37,17 +35,15 @@ def from_dataset_configs(
3735
paths, versions, enums, selects = {}, {}, {}, {}
3836
for dataset in datasets:
3937
dataset_config = CONFIG[dataset][reference_genome.v02_value]
40-
dataset_ht = import_ht_from_config_path(dataset_config, reference_genome)
41-
42-
paths[dataset] = get_ht_path(dataset_config)
43-
versions[dataset] = hl.eval(
44-
parse_dataset_version(
45-
dataset_ht,
46-
dataset,
47-
dataset_config,
48-
),
38+
dataset_ht = import_ht_from_config_path(
39+
dataset_config,
40+
dataset,
41+
reference_genome,
4942
)
50-
enums[dataset] = dataset_config.get('enum_select', {})
43+
dataset_ht_globals = hl.eval(dataset_ht.globals)
44+
paths[dataset] = dataset_ht_globals.path
45+
versions[dataset] = dataset_ht_globals.version
46+
enums[dataset] = dict(dataset_ht_globals.enums)
5147
dataset_ht = dataset_ht.select(
5248
**get_all_select_fields(dataset_ht, dataset_config),
5349
)
@@ -83,10 +79,14 @@ def from_ht(
8379
def get_datasets_to_update(
8480
ht1_globals: Globals,
8581
ht2_globals: Globals,
82+
validate_selects: bool = True,
8683
) -> list[str]:
8784
datasets_to_update = set()
8885

8986
for field in dataclasses.fields(Globals):
87+
if field.name == 'selects' and not validate_selects:
88+
continue
89+
9090
datasets_to_update.update(
9191
ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(),
9292
)

v03_pipeline/lib/reference_data/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,10 +372,12 @@ def custom_mpc_select(ht):
372372
},
373373
'gnomad_qc': {
374374
'37': {
375+
'version': 'v2',
375376
'custom_import': import_matrix_table,
376377
'source_path': 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
377378
},
378379
'38': {
380+
'version': 'v3.1',
379381
'custom_import': import_matrix_table,
380382
'source_path': 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt',
381383
},

v03_pipeline/lib/reference_data/dataset_table_operations.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def get_dataset_ht(
5353
reference_genome: ReferenceGenome,
5454
) -> hl.Table:
5555
config = CONFIG[dataset][reference_genome.v02_value]
56-
ht = import_ht_from_config_path(config, reference_genome)
56+
ht = import_ht_from_config_path(config, dataset, reference_genome)
5757
if hasattr(ht, 'locus'):
5858
ht = ht.filter(
5959
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
@@ -62,16 +62,6 @@ def get_dataset_ht(
6262
ht = ht.filter(config['filter'](ht)) if 'filter' in config else ht
6363
ht = ht.select(**get_all_select_fields(ht, config))
6464
ht = ht.transmute(**get_enum_select_fields(ht, config))
65-
ht = ht.select_globals(
66-
path=(config['source_path'] if 'custom_import' in config else config['path']),
67-
version=parse_dataset_version(ht, dataset, config),
68-
enums=hl.Struct(
69-
**config.get(
70-
'enum_select',
71-
hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
72-
),
73-
),
74-
)
7565
return ht.select(**{dataset: ht.row.drop(*ht.key)}).distinct()
7666

7767

@@ -81,14 +71,25 @@ def get_ht_path(config: dict) -> str:
8171

8272
def import_ht_from_config_path(
8373
config: dict,
74+
dataset: str,
8475
reference_genome: ReferenceGenome,
8576
) -> hl.Table:
8677
path = get_ht_path(config)
87-
return (
78+
ht = (
8879
config['custom_import'](path, reference_genome)
8980
if 'custom_import' in config
9081
else hl.read_table(path)
9182
)
83+
return ht.annotate_globals(
84+
path=path,
85+
version=parse_dataset_version(ht, dataset, config),
86+
enums=hl.Struct(
87+
**config.get(
88+
'enum_select',
89+
hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
90+
),
91+
),
92+
)
9293

9394

9495
def get_select_fields(selects: list | dict | None, base_ht: hl.Table) -> dict:
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
from typing import Any
2+
3+
import hail as hl
4+
5+
from v03_pipeline.lib.annotations.enums import (
6+
CLINVAR_PATHOGENICITIES_LOOKUP,
7+
CONSEQUENCE_TERMS,
8+
)
9+
from v03_pipeline.lib.annotations.expression_helpers import (
10+
get_expr_for_vep_sorted_transcript_consequences_array,
11+
get_expr_for_worst_transcript_consequence_annotations_struct,
12+
)
13+
from v03_pipeline.lib.model.dataset_type import DatasetType
14+
from v03_pipeline.lib.model.definitions import ReferenceGenome
15+
16+
CLINVAR_PATH_RANGE = ('Pathogenic', 'Pathogenic/Likely_risk_allele')
17+
CLINVAR_LIKELY_PATH_RANGE = ('Pathogenic/Likely_pathogenic', 'Likely_risk_allele')
18+
CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
19+
hl.enumerate(CONSEQUENCE_TERMS, index_first=False),
20+
)
21+
GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD = 0.90
22+
ONE_PERCENT = 0.01
23+
THREE_PERCENT = 0.03
24+
FIVE_PERCENT = 0.05
25+
TEN_PERCENT = 0.10
26+
27+
28+
def clinvar_path_variants(
29+
ht: hl.Table,
30+
dataset_type: DatasetType,
31+
**_: Any,
32+
) -> hl.Table:
33+
clinvar_field = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar'
34+
ht = ht.select_globals()
35+
ht = ht.select(
36+
pathogenic=(
37+
(
38+
ht[clinvar_field].pathogenicity_id
39+
>= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[0]]
40+
)
41+
& (
42+
ht[clinvar_field].pathogenicity_id
43+
<= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[1]]
44+
)
45+
),
46+
likely_pathogenic=(
47+
(
48+
ht[clinvar_field].pathogenicity_id
49+
>= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[0]]
50+
)
51+
& (
52+
ht[clinvar_field].pathogenicity_id
53+
<= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[1]]
54+
)
55+
),
56+
)
57+
return ht.filter(ht.pathogenic | ht.likely_pathogenic)
58+
59+
60+
def gnomad_coding_and_noncoding_variants(
61+
ht: hl.Table,
62+
reference_genome: ReferenceGenome,
63+
**_: Any,
64+
) -> hl.Table:
65+
filtered_contig = 'chr1' if reference_genome == ReferenceGenome.GRCh38 else '1'
66+
ht = hl.filter_intervals(
67+
ht,
68+
[
69+
hl.parse_locus_interval(
70+
filtered_contig,
71+
reference_genome=reference_genome.value,
72+
),
73+
],
74+
)
75+
ht = ht.filter(ht.freq[0].AF > GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD)
76+
ht = ht.annotate(
77+
sorted_transaction_consequences=(
78+
get_expr_for_vep_sorted_transcript_consequences_array(
79+
ht.vep,
80+
omit_consequences=[],
81+
)
82+
),
83+
)
84+
ht = ht.annotate(
85+
main_transcript=(
86+
get_expr_for_worst_transcript_consequence_annotations_struct(
87+
ht.sorted_transaction_consequences,
88+
)
89+
),
90+
)
91+
ht = ht.select(
92+
coding=(
93+
ht.main_transcript.major_consequence_rank
94+
<= CONSEQUENCE_TERM_RANK_LOOKUP['synonymous_variant']
95+
),
96+
noncoding=(
97+
ht.main_transcript.major_consequence_rank
98+
>= CONSEQUENCE_TERM_RANK_LOOKUP['downstream_gene_variant']
99+
),
100+
)
101+
return ht.filter(ht.coding | ht.noncoding)
102+
103+
104+
def high_af_variants(
105+
ht: hl.Table,
106+
**_: Any,
107+
) -> hl.Table:
108+
ht = ht.select_globals()
109+
ht = ht.filter(ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > ONE_PERCENT)
110+
return ht.select(
111+
is_gt_3_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > THREE_PERCENT,
112+
is_gt_5_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > FIVE_PERCENT,
113+
is_gt_10_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > TEN_PERCENT,
114+
)
115+
116+
117+
def gnomad_qc(
118+
ht: hl.Table,
119+
**_: Any,
120+
) -> hl.Table:
121+
return ht.select()

0 commit comments

Comments
 (0)