Skip to content

Commit 4ec6830

Browse files
committed
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipelines into benb/lookup_table_refactor
2 parents 6aaf595 + 4c30439 commit 4ec6830

File tree

161 files changed

+1201
-200
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

161 files changed

+1201
-200
lines changed

v03_pipeline/lib/methods/relatedness.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def call_relatedness(
3030
# - brew install lz4
3131
# - CXXFLAGS='-I/opt/homebrew/include/' HAIL_COMPILE_NATIVES=1 make -C hail install
3232
# Hail issue here: https://discuss.hail.is/t/noclassdeffounderror-could-not-initialize-class-is-hail-methods-ibsffi/2453
33-
kin_ht = hl.identity_by_descent(mt, maf=mt.info.AF[1], min=0.10, max=1.0)
33+
kin_ht = hl.identity_by_descent(mt, maf=mt.info.AF[0], min=0.10, max=1.0)
3434
kin_ht = kin_ht.key_by('i', 'j')
3535
return kin_ht.select(
3636
ibd0=kin_ht.ibd.Z0,

v03_pipeline/lib/model/cached_reference_dataset_query.py

Lines changed: 15 additions & 116 deletions
Original file line numberDiff line numberDiff line change
@@ -1,127 +1,17 @@
11
from collections.abc import Callable
22
from enum import Enum
3-
from typing import Any
43

54
import hail as hl
65

7-
from v03_pipeline.lib.annotations.enums import (
8-
CLINVAR_PATHOGENICITIES_LOOKUP,
9-
CONSEQUENCE_TERMS,
10-
)
11-
from v03_pipeline.lib.annotations.expression_helpers import (
12-
get_expr_for_vep_sorted_transcript_consequences_array,
13-
get_expr_for_worst_transcript_consequence_annotations_struct,
14-
)
156
from v03_pipeline.lib.model.dataset_type import DatasetType
167
from v03_pipeline.lib.model.definitions import AccessControl, ReferenceGenome
178
from v03_pipeline.lib.model.environment import Env
18-
19-
CLINVAR_PATH_RANGE = ('Pathogenic', 'Pathogenic/Likely_risk_allele')
20-
CLINVAR_LIKELY_PATH_RANGE = ('Pathogenic/Likely_pathogenic', 'Likely_risk_allele')
21-
CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
22-
hl.enumerate(CONSEQUENCE_TERMS, index_first=False),
9+
from v03_pipeline.lib.reference_data.queries import (
10+
clinvar_path_variants,
11+
gnomad_coding_and_noncoding_variants,
12+
gnomad_qc,
13+
high_af_variants,
2314
)
24-
GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD = 0.90
25-
ONE_PERCENT = 0.01
26-
THREE_PERCENT = 0.03
27-
FIVE_PERCENT = 0.05
28-
TEN_PERCENT = 0.10
29-
30-
31-
def clinvar_path_variants(
32-
ht: hl.Table,
33-
dataset_type: DatasetType,
34-
**_: Any,
35-
) -> hl.Table:
36-
clinvar_field = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar'
37-
ht = ht.select_globals()
38-
ht = ht.select(
39-
pathogenic=(
40-
(
41-
ht[clinvar_field].pathogenicity_id
42-
>= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[0]]
43-
)
44-
& (
45-
ht[clinvar_field].pathogenicity_id
46-
<= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[1]]
47-
)
48-
),
49-
likely_pathogenic=(
50-
(
51-
ht[clinvar_field].pathogenicity_id
52-
>= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[0]]
53-
)
54-
& (
55-
ht[clinvar_field].pathogenicity_id
56-
<= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[1]]
57-
)
58-
),
59-
)
60-
return ht.filter(ht.pathogenic | ht.likely_pathogenic)
61-
62-
63-
def gnomad_coding_and_noncoding_variants(
64-
ht: hl.Table,
65-
reference_genome: ReferenceGenome,
66-
**_: Any,
67-
) -> hl.Table:
68-
filtered_contig = 'chr1' if reference_genome == ReferenceGenome.GRCh38 else '1'
69-
ht = hl.filter_intervals(
70-
ht,
71-
[
72-
hl.parse_locus_interval(
73-
filtered_contig,
74-
reference_genome=reference_genome.value,
75-
),
76-
],
77-
)
78-
ht = ht.filter(ht.freq[0].AF > GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD)
79-
ht = ht.annotate(
80-
sorted_transaction_consequences=(
81-
get_expr_for_vep_sorted_transcript_consequences_array(
82-
ht.vep,
83-
omit_consequences=[],
84-
)
85-
),
86-
)
87-
ht = ht.annotate(
88-
main_transcript=(
89-
get_expr_for_worst_transcript_consequence_annotations_struct(
90-
ht.sorted_transaction_consequences,
91-
)
92-
),
93-
)
94-
ht = ht.select(
95-
coding=(
96-
ht.main_transcript.major_consequence_rank
97-
<= CONSEQUENCE_TERM_RANK_LOOKUP['synonymous_variant']
98-
),
99-
noncoding=(
100-
ht.main_transcript.major_consequence_rank
101-
>= CONSEQUENCE_TERM_RANK_LOOKUP['downstream_gene_variant']
102-
),
103-
)
104-
return ht.filter(ht.coding | ht.noncoding)
105-
106-
107-
def high_af_variants(
108-
ht: hl.Table,
109-
**_: Any,
110-
) -> hl.Table:
111-
ht = ht.select_globals()
112-
ht = ht.filter(ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > ONE_PERCENT)
113-
return ht.select(
114-
is_gt_3_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > THREE_PERCENT,
115-
is_gt_5_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > FIVE_PERCENT,
116-
is_gt_10_percent=ht.gnomad_genomes.AF_POPMAX_OR_GLOBAL > TEN_PERCENT,
117-
)
118-
119-
120-
def gnomad_qc(
121-
ht: hl.Table,
122-
**_: Any,
123-
) -> hl.Table:
124-
return ht.select()
12515

12616

12717
class CachedReferenceDatasetQuery(Enum):
@@ -137,12 +27,21 @@ def access_control(self) -> AccessControl:
13727
return AccessControl.PUBLIC
13828

13929
@property
140-
def reference_dataset(self) -> str | None:
30+
def dataset(self) -> str | None:
14131
return {
32+
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: 'clinvar',
14233
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: 'gnomad_genomes',
14334
CachedReferenceDatasetQuery.GNOMAD_QC: 'gnomad_qc',
35+
CachedReferenceDatasetQuery.HIGH_AF_VARIANTS: 'gnomad_genomes',
14436
}.get(self)
14537

38+
@property
39+
def query_raw_dataset(self) -> bool:
40+
return {
41+
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: True,
42+
CachedReferenceDatasetQuery.GNOMAD_QC: True,
43+
}.get(self, False)
44+
14645
@property
14746
def query(self) -> Callable[[hl.Table, ReferenceGenome], hl.Table]:
14847
return {

v03_pipeline/lib/reference_data/compare_globals.py

Lines changed: 13 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,7 @@
1010
from v03_pipeline.lib.reference_data.dataset_table_operations import (
1111
get_all_select_fields,
1212
get_enum_select_fields,
13-
get_ht_path,
1413
import_ht_from_config_path,
15-
parse_dataset_version,
1614
)
1715

1816
logger = get_logger(__name__)
@@ -37,17 +35,15 @@ def from_dataset_configs(
3735
paths, versions, enums, selects = {}, {}, {}, {}
3836
for dataset in datasets:
3937
dataset_config = CONFIG[dataset][reference_genome.v02_value]
40-
dataset_ht = import_ht_from_config_path(dataset_config, reference_genome)
41-
42-
paths[dataset] = get_ht_path(dataset_config)
43-
versions[dataset] = hl.eval(
44-
parse_dataset_version(
45-
dataset_ht,
46-
dataset,
47-
dataset_config,
48-
),
38+
dataset_ht = import_ht_from_config_path(
39+
dataset_config,
40+
dataset,
41+
reference_genome,
4942
)
50-
enums[dataset] = dataset_config.get('enum_select', {})
43+
dataset_ht_globals = hl.eval(dataset_ht.globals)
44+
paths[dataset] = dataset_ht_globals.path
45+
versions[dataset] = dataset_ht_globals.version
46+
enums[dataset] = dict(dataset_ht_globals.enums)
5147
dataset_ht = dataset_ht.select(
5248
**get_all_select_fields(dataset_ht, dataset_config),
5349
)
@@ -67,13 +63,7 @@ def from_ht(
6763
paths = dict(rdc_globals_struct.paths)
6864
versions = dict(rdc_globals_struct.versions)
6965
# enums are nested structs
70-
enums = {k: dict(v) for k, v in rdc_globals_struct.enums.items()}
71-
72-
for global_dict in [paths, versions, enums]:
73-
for dataset in list(global_dict.keys()):
74-
if dataset not in datasets:
75-
global_dict.pop(dataset)
76-
66+
enums = {k: dict(v) for k, v in rdc_globals_struct.enums.items() if k in paths}
7767
selects = {}
7868
for dataset in datasets:
7969
if dataset in ht.row:
@@ -89,10 +79,14 @@ def from_ht(
8979
def get_datasets_to_update(
9080
ht1_globals: Globals,
9181
ht2_globals: Globals,
82+
validate_selects: bool = True,
9283
) -> list[str]:
9384
datasets_to_update = set()
9485

9586
for field in dataclasses.fields(Globals):
87+
if field.name == 'selects' and not validate_selects:
88+
continue
89+
9690
datasets_to_update.update(
9791
ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(),
9892
)

v03_pipeline/lib/reference_data/config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -372,10 +372,12 @@ def custom_mpc_select(ht):
372372
},
373373
'gnomad_qc': {
374374
'37': {
375+
'version': 'v2',
375376
'custom_import': import_matrix_table,
376377
'source_path': 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
377378
},
378379
'38': {
380+
'version': 'v3.1',
379381
'custom_import': import_matrix_table,
380382
'source_path': 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt',
381383
},

v03_pipeline/lib/reference_data/dataset_table_operations.py

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ def get_dataset_ht(
5353
reference_genome: ReferenceGenome,
5454
) -> hl.Table:
5555
config = CONFIG[dataset][reference_genome.v02_value]
56-
ht = import_ht_from_config_path(config, reference_genome)
56+
ht = import_ht_from_config_path(config, dataset, reference_genome)
5757
if hasattr(ht, 'locus'):
5858
ht = ht.filter(
5959
hl.set(reference_genome.standard_contigs).contains(ht.locus.contig),
@@ -62,16 +62,6 @@ def get_dataset_ht(
6262
ht = ht.filter(config['filter'](ht)) if 'filter' in config else ht
6363
ht = ht.select(**get_all_select_fields(ht, config))
6464
ht = ht.transmute(**get_enum_select_fields(ht, config))
65-
ht = ht.select_globals(
66-
path=(config['source_path'] if 'custom_import' in config else config['path']),
67-
version=parse_dataset_version(ht, dataset, config),
68-
enums=hl.Struct(
69-
**config.get(
70-
'enum_select',
71-
hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
72-
),
73-
),
74-
)
7565
return ht.select(**{dataset: ht.row.drop(*ht.key)}).distinct()
7666

7767

@@ -81,14 +71,25 @@ def get_ht_path(config: dict) -> str:
8171

8272
def import_ht_from_config_path(
8373
config: dict,
74+
dataset: str,
8475
reference_genome: ReferenceGenome,
8576
) -> hl.Table:
8677
path = get_ht_path(config)
87-
return (
78+
ht = (
8879
config['custom_import'](path, reference_genome)
8980
if 'custom_import' in config
9081
else hl.read_table(path)
9182
)
83+
return ht.annotate_globals(
84+
path=path,
85+
version=parse_dataset_version(ht, dataset, config),
86+
enums=hl.Struct(
87+
**config.get(
88+
'enum_select',
89+
hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))),
90+
),
91+
),
92+
)
9293

9394

9495
def get_select_fields(selects: list | dict | None, base_ht: hl.Table) -> dict:

0 commit comments

Comments
 (0)