Skip to content

Commit 5754b81

Browse files
committed
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipelines
2 parents 1bb1d80 + 5f90d87 commit 5754b81

27 files changed

+823
-180
lines changed

v03_pipeline/bin/write_cached_reference_dataset_query_ht.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
valid_reference_dataset_collection_path,
1616
)
1717
from v03_pipeline.lib.reference_data.config import CONFIG
18+
from v03_pipeline.lib.reference_data.dataset_table_operations import (
19+
import_ht_from_config_path,
20+
)
1821

1922

2023
def get_ht(
@@ -25,11 +28,7 @@ def get_ht(
2528
# If the query is defined over an uncombined reference dataset, use the combiner config.
2629
if query.reference_dataset:
2730
config = CONFIG[query.reference_dataset][reference_genome.v02_value]
28-
return (
29-
config['custom_import'](config['source_path'], reference_genome)
30-
if 'custom_import' in config
31-
else hl.read_table(config['path'])
32-
)
31+
return import_ht_from_config_path(config, reference_genome)
3332
return hl.read_table(
3433
valid_reference_dataset_collection_path(
3534
reference_genome,
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
PROJECTS_EXCLUDED_FROM_GT_STATS = {
2+
'R0555_seqr_demo',
3+
'R0607_gregor_training_project_',
4+
'R0608_gregor_training_project_',
5+
}

v03_pipeline/lib/annotations/mito.py

Lines changed: 21 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import hail as hl
44

5+
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
56
from v03_pipeline.lib.annotations.enums import MITOTIP_PATHOGENICITIES
67

78
MITOTIP_PATHOGENICITIES_LOOKUP = hl.dict(
@@ -14,31 +15,6 @@
1415
)
1516

1617

17-
def _AC_het(row: hl.StructExpression) -> hl.Int32Expression: # noqa: N802
18-
return sum(
19-
row.heteroplasmic_samples[project_guid].length()
20-
for project_guid in row.heteroplasmic_samples
21-
)
22-
23-
24-
def _AC_hom(row: hl.StructExpression) -> hl.Int32Expression: # noqa: N802
25-
return sum(
26-
row.homoplasmic_samples[project_guid].length()
27-
for project_guid in row.homoplasmic_samples
28-
)
29-
30-
31-
def _AN(row: hl.StructExpression) -> hl.Int32Expression: # noqa: N802
32-
return sum(
33-
(
34-
row.ref_samples[project_guid].length()
35-
+ row.heteroplasmic_samples[project_guid].length()
36-
+ row.homoplasmic_samples[project_guid].length()
37-
)
38-
for project_guid in row.ref_samples
39-
)
40-
41-
4218
def common_low_heteroplasmy(ht: hl.Table, **_: Any) -> hl.Expression:
4319
return ht.common_low_heteroplasmy
4420

@@ -90,12 +66,26 @@ def rsid(ht: hl.Table, **_: Any) -> hl.Expression:
9066
return ht.rsid.find(lambda x: hl.is_defined(x))
9167

9268

93-
def gt_stats(ht: hl.Table, sample_lookup_ht: hl.Table, **_: Any) -> hl.Expression:
69+
def gt_stats(
70+
ht: hl.Table,
71+
sample_lookup_ht: hl.Table,
72+
**_: Any,
73+
) -> hl.Expression:
9474
row = sample_lookup_ht[ht.key]
75+
AC_het, AC_hom, AN = 0, 0, 0 # noqa: N806
76+
for project_guid in row.ref_samples:
77+
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
78+
continue
79+
ref_samples_length = row.ref_samples[project_guid].length()
80+
heteroplasmic_samples_length = row.heteroplasmic_samples[project_guid].length()
81+
homoplasmic_samples_length = row.homoplasmic_samples[project_guid].length()
82+
AC_het += heteroplasmic_samples_length # noqa: N806
83+
AC_hom += homoplasmic_samples_length # noqa: N806
84+
AN += (ref_samples_length + heteroplasmic_samples_length + homoplasmic_samples_length) # noqa: N806
9585
return hl.Struct(
96-
AC_het=_AC_het(row),
97-
AF_het=hl.float32(_AC_het(row) / _AN(row)),
98-
AC_hom=_AC_hom(row),
99-
AF_hom=hl.float32(_AC_hom(row) / _AN(row)),
100-
AN=_AN(row),
86+
AC_het=AC_het,
87+
AF_het=hl.float32(AC_het / AN),
88+
AC_hom=AC_hom,
89+
AF_hom=hl.float32(AC_hom / AN),
90+
AN=AN,
10191
)

v03_pipeline/lib/annotations/snv_indel.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
import hail as hl
55

6+
from v03_pipeline.lib.annotations.constants import PROJECTS_EXCLUDED_FROM_GT_STATS
7+
68
N_ALT_REF = 0
79
N_ALT_HET = 1
810
N_ALT_HOM = 2
@@ -37,6 +39,8 @@ def gt_stats(
3739
row = sample_lookup_ht[ht.key]
3840
AC, AN, hom = 0, 0, 0
3941
for project_guid in row.ref_samples:
42+
if project_guid in PROJECTS_EXCLUDED_FROM_GT_STATS:
43+
continue
4044
ref_samples_length = row.ref_samples[project_guid].length()
4145
het_samples_length = row.het_samples[project_guid].length()
4246
hom_samples_length = row.hom_samples[project_guid].length()

v03_pipeline/lib/annotations/snv_indel_test.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,33 +25,37 @@ def test_allele_count_annotations(self) -> None:
2525
[
2626
{
2727
'id': 0,
28-
'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set()),
29-
'het_samples': hl.Struct(project_1={'b', 'd'}, project_2=set()),
30-
'hom_samples': hl.Struct(project_1={'e', 'f'}, project_2=set()),
28+
'ref_samples': hl.Struct(project_1={'a', 'c'}, project_2=set(), R0607_gregor_training_project_=set()),
29+
'het_samples': hl.Struct(project_1={'b', 'd'}, project_2=set(), R0607_gregor_training_project_=set()),
30+
'hom_samples': hl.Struct(project_1={'e', 'f'}, project_2=set(), R0607_gregor_training_project_={'l', 'm'}),
3131
},
3232
{
3333
'id': 1,
3434
'ref_samples': hl.Struct(
3535
project_1={'a', 'b', 'c', 'd', 'e', 'f'},
3636
project_2=set(),
37+
R0607_gregor_training_project_={'l', 'm'},
3738
),
38-
'het_samples': hl.Struct(project_1=set(), project_2=set()),
39-
'hom_samples': hl.Struct(project_1=set(), project_2=set()),
39+
'het_samples': hl.Struct(project_1=set(), project_2=set(), R0607_gregor_training_project_=set()),
40+
'hom_samples': hl.Struct(project_1=set(), project_2=set(), R0607_gregor_training_project_=set()),
4041
},
4142
],
4243
hl.tstruct(
4344
id=hl.tint32,
4445
ref_samples=hl.tstruct(
4546
project_1=hl.tset(hl.tstr),
4647
project_2=hl.tset(hl.tstr),
48+
R0607_gregor_training_project_=hl.tset(hl.tstr),
4749
),
4850
het_samples=hl.tstruct(
4951
project_1=hl.tset(hl.tstr),
5052
project_2=hl.tset(hl.tstr),
53+
R0607_gregor_training_project_=hl.tset(hl.tstr),
5154
),
5255
hom_samples=hl.tstruct(
5356
project_1=hl.tset(hl.tstr),
5457
project_2=hl.tset(hl.tstr),
58+
R0607_gregor_training_project_=hl.tset(hl.tstr),
5559
),
5660
),
5761
key='id',
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
import logging
2+
from dataclasses import dataclass
3+
4+
import hail as hl
5+
6+
from v03_pipeline.lib.model import ReferenceGenome
7+
from v03_pipeline.lib.reference_data.config import CONFIG
8+
from v03_pipeline.lib.reference_data.dataset_table_operations import (
9+
get_all_select_fields,
10+
get_ht_path,
11+
import_ht_from_config_path,
12+
parse_dataset_version,
13+
)
14+
15+
logger = logging.getLogger(__name__)
16+
17+
18+
@dataclass
19+
class ReferenceDataGlobals:
20+
paths: dict[str]
21+
versions: dict[str]
22+
enums: dict[str, dict[str, list[str]]]
23+
24+
def __init__(self, globals_struct: hl.Struct):
25+
self.paths = self._struct_to_dict(globals_struct.paths)
26+
self.versions = self._struct_to_dict(globals_struct.versions)
27+
self.enums = self._struct_to_dict(globals_struct.enums)
28+
29+
def _struct_to_dict(self, struct: hl.Struct) -> dict:
30+
result_dict = {}
31+
for field in struct:
32+
if isinstance(struct[field], hl.Struct):
33+
result_dict[field] = self._struct_to_dict(struct[field])
34+
else:
35+
result_dict[field] = struct[field]
36+
return result_dict
37+
38+
39+
def get_datasets_to_update(
40+
joined_ht: hl.Table,
41+
datasets: list[str],
42+
reference_genome: ReferenceGenome,
43+
) -> list[str]:
44+
joined_ht_globals = ReferenceDataGlobals(hl.eval(joined_ht.index_globals()))
45+
datasets_to_update = []
46+
for dataset in datasets:
47+
if dataset not in joined_ht.row:
48+
datasets_to_update.append(dataset)
49+
continue
50+
51+
if not validate_joined_ht_globals_match_config(
52+
joined_ht,
53+
joined_ht_globals,
54+
dataset,
55+
reference_genome,
56+
):
57+
datasets_to_update.append(dataset)
58+
return datasets_to_update
59+
60+
61+
def validate_joined_ht_globals_match_config(
62+
joined_ht: hl.Table,
63+
joined_ht_globals: ReferenceDataGlobals,
64+
dataset: str,
65+
reference_genome: ReferenceGenome,
66+
) -> bool:
67+
dataset_config = CONFIG[dataset][reference_genome.v02_value]
68+
dataset_ht = import_ht_from_config_path(dataset_config, reference_genome)
69+
checks = {
70+
'version': ht_version_matches_config(
71+
joined_ht_globals,
72+
dataset,
73+
dataset_config,
74+
dataset_ht,
75+
),
76+
'path': ht_path_matches_config(joined_ht_globals, dataset, dataset_config),
77+
'enum': ht_enums_match_config(joined_ht_globals, dataset, dataset_config),
78+
'select': ht_selects_match_config(
79+
joined_ht,
80+
dataset,
81+
dataset_config,
82+
dataset_ht,
83+
),
84+
}
85+
86+
results = []
87+
for check, result in checks.items():
88+
if result is False:
89+
logger.info(f'{check} mismatch for {dataset}')
90+
results.append(result)
91+
return all(results)
92+
93+
94+
def ht_version_matches_config(
95+
joined_ht_globals: ReferenceDataGlobals,
96+
dataset: str,
97+
dataset_config: dict,
98+
dataset_ht: hl.Table,
99+
) -> bool:
100+
joined_ht_version = joined_ht_globals.versions.get(dataset)
101+
if joined_ht_version is None:
102+
return False
103+
104+
config_or_dataset_version = hl.eval(
105+
parse_dataset_version(
106+
dataset_ht,
107+
dataset,
108+
dataset_config,
109+
),
110+
)
111+
return joined_ht_version == config_or_dataset_version
112+
113+
114+
def ht_path_matches_config(
115+
joined_ht_globals: ReferenceDataGlobals,
116+
dataset: str,
117+
dataset_config: dict,
118+
) -> bool:
119+
joined_ht_path = joined_ht_globals.paths.get(dataset)
120+
if joined_ht_path is None:
121+
return False
122+
123+
config_path = get_ht_path(dataset_config)
124+
return joined_ht_path == config_path
125+
126+
127+
def ht_enums_match_config(
128+
joined_ht_globals: ReferenceDataGlobals,
129+
dataset: str,
130+
dataset_config: dict,
131+
) -> bool:
132+
joined_ht_enums = joined_ht_globals.enums.get(dataset, {})
133+
config_enums = dataset_config.get('enum_select', {})
134+
return joined_ht_enums == config_enums
135+
136+
137+
def ht_selects_match_config(
138+
joined_ht: hl.Table,
139+
dataset: str,
140+
dataset_config: dict,
141+
dataset_ht: hl.Table,
142+
) -> bool:
143+
joined_ht_selects = set(joined_ht[dataset])
144+
config_selects = set(get_all_select_fields(dataset_ht, dataset_config).keys())
145+
return len(config_selects.symmetric_difference(joined_ht_selects)) == 0

0 commit comments

Comments
 (0)