Skip to content

Commit 2cf5353

Browse files
Dev (#804)
* add missing test file :/
* clear globals on new variants table (#799)
* Import fields with fully qualified names. (#797)
  * first pass
  * i think this works?
  * rename
  * comment and fix logic
  * no longer use optional row fields
  * clean up additional field handling
  * None arg
  * i think this is better?
  * format
  * missed one
  * fix
  * Update write_remapped_and_subsetted_callset.py
* Add Type Assertions to imported fields. (#798)
  * first pass
  * gcnv
  * first pass type assertions
  * Finish
  * add validation test
  * cleanup
  * cleanup
  * add unit test
  * fix svid
* Bump tornado from 6.3.3 to 6.4.1 (#802)
  Bumps [tornado](https://github.com/tornadoweb/tornado) from 6.3.3 to 6.4.1.
  - [Changelog](https://github.com/tornadoweb/tornado/blob/master/docs/releases.rst)
  - [Commits](tornadoweb/tornado@v6.3.3...v6.4.1)
  ---
  updated-dependencies:
  - dependency-name: tornado
    dependency-type: indirect
  ...
  Signed-off-by: dependabot[bot] <support@github.com>
  Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
* add excluded project
---------
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
1 parent 61267b4 commit 2cf5353

16 files changed

+484
-126
lines changed

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ tenacity==8.2.3
347347
# plotly
348348
threadpoolctl==3.2.0
349349
# via scikit-learn
350-
tornado==6.3.3
350+
tornado==6.4.1
351351
# via
352352
# bokeh
353353
# luigi

v03_pipeline/lib/annotations/mito.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def high_constraint_region_mito(
5656

5757

5858
def mito_cn(mt: hl.MatrixTable, **_: Any) -> hl.Expression:
59-
return hl.int(mt.mito_cn)
59+
return hl.int32(mt.mito_cn)
6060

6161

6262
def mitotip(ht: hl.Table, **_: Any) -> hl.Expression:

v03_pipeline/lib/annotations/sv.py

Lines changed: 36 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,21 @@
1010
from v03_pipeline.lib.annotations.shared import add_rg38_liftover
1111
from v03_pipeline.lib.model.definitions import ReferenceGenome
1212

13-
CONSEQ_PREDICTED_PREFIX = 'PREDICTED_'
14-
NON_GENE_PREDICTIONS = {
15-
'PREDICTED_INTERGENIC',
16-
'PREDICTED_NONCODING_BREAKPOINT',
17-
'PREDICTED_NONCODING_SPAN',
13+
CONSEQ_PREDICTED_PREFIX = 'info.PREDICTED_'
14+
CONSEQ_PREDICTED_GENE_COLS = {
15+
'info.PREDICTED_BREAKEND_EXONIC': hl.tarray(hl.tstr),
16+
'info.PREDICTED_COPY_GAIN': hl.tarray(hl.tstr),
17+
'info.PREDICTED_DUP_PARTIAL': hl.tarray(hl.tstr),
18+
'info.PREDICTED_INTRAGENIC_EXON_DUP': hl.tarray(hl.tstr),
19+
'info.PREDICTED_INTRONIC': hl.tarray(hl.tstr),
20+
'info.PREDICTED_INV_SPAN': hl.tarray(hl.tstr),
21+
'info.PREDICTED_LOF': hl.tarray(hl.tstr),
22+
'info.PREDICTED_MSV_EXON_OVERLAP': hl.tarray(hl.tstr),
23+
'info.PREDICTED_NEAREST_TSS': hl.tarray(hl.tstr),
24+
'info.PREDICTED_PARTIAL_EXON_DUP': hl.tarray(hl.tstr),
25+
'info.PREDICTED_PROMOTER': hl.tarray(hl.tstr),
26+
'info.PREDICTED_TSS_DUP': hl.tarray(hl.tstr),
27+
'info.PREDICTED_UTR': hl.tarray(hl.tstr),
1828
}
1929

2030
PREVIOUS_GENOTYPE_N_ALT_ALLELES = hl.dict(
@@ -72,7 +82,7 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression:
7282

7383

7484
def algorithms(ht: hl.Table, **_: Any) -> hl.Expression:
75-
return hl.str(',').join(ht.info.ALGORITHMS)
85+
return hl.str(',').join(ht['info.ALGORITHMS'])
7686

7787

7888
def bothsides_support(ht: hl.Table, **_: Any) -> hl.Expression:
@@ -110,37 +120,40 @@ def cpx_intervals(
110120
**_: Any,
111121
) -> hl.Expression:
112122
return hl.or_missing(
113-
hl.is_defined(ht.info.CPX_INTERVALS),
114-
ht.info.CPX_INTERVALS.map(lambda x: _get_cpx_interval(x, reference_genome)),
123+
hl.is_defined(ht['info.CPX_INTERVALS']),
124+
ht['info.CPX_INTERVALS'].map(lambda x: _get_cpx_interval(x, reference_genome)),
115125
)
116126

117127

118128
def end_locus(ht: hl.Table, **_: Any) -> hl.StructExpression:
119129
rg38_lengths = hl.literal(hl.get_reference(ReferenceGenome.GRCh38.value).lengths)
120130
return hl.if_else(
121-
(hl.is_defined(ht.info.END2) & (rg38_lengths[ht.info.CHR2] >= ht.info.END2)),
122-
hl.locus(ht.info.CHR2, ht.info.END2, ReferenceGenome.GRCh38.value),
131+
(
132+
hl.is_defined(ht['info.END2'])
133+
& (rg38_lengths[ht['info.CHR2']] >= ht['info.END2'])
134+
),
135+
hl.locus(ht['info.CHR2'], ht['info.END2'], ReferenceGenome.GRCh38.value),
123136
hl.or_missing(
124-
(rg38_lengths[ht.locus.contig] >= ht.info.END),
125-
hl.locus(ht.locus.contig, ht.info.END, ReferenceGenome.GRCh38.value),
137+
(rg38_lengths[ht.locus.contig] >= ht['info.END']),
138+
hl.locus(ht.locus.contig, ht['info.END'], ReferenceGenome.GRCh38.value),
126139
),
127140
)
128141

129142

130143
def gnomad_svs(ht: hl.Table, **_: Any) -> hl.Expression:
131144
return hl.or_missing(
132-
hl.is_defined(ht.info.gnomAD_V2_AF),
133-
hl.struct(AF=hl.float32(ht.info.gnomAD_V2_AF), ID=ht.info.gnomAD_V2_SVID),
145+
hl.is_defined(ht['info.gnomAD_V2_AF']),
146+
hl.struct(AF=hl.float32(ht['info.gnomAD_V2_AF']), ID=ht['info.gnomAD_V2_SVID']),
134147
)
135148

136149

137150
def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression:
138151
return hl.struct(
139-
AF=hl.float32(ht.info.AF[0]),
140-
AC=ht.info.AC[0],
141-
AN=ht.info.AN,
142-
Hom=ht.info.N_HOMALT,
143-
Het=ht.info.N_HET,
152+
AF=hl.float32(ht['info.AF'][0]),
153+
AC=ht['info.AC'][0],
154+
AN=ht['info.AN'],
155+
Hom=ht['info.N_HOMALT'],
156+
Het=ht['info.N_HET'],
144157
)
145158

146159

@@ -174,28 +187,22 @@ def sorted_gene_consequences(
174187
**_: Any,
175188
) -> hl.Expression:
176189
# In lieu of sorted_transcript_consequences seen on SNV/MITO.
177-
conseq_predicted_gene_cols = [
178-
gene_col
179-
for gene_col in ht.info
180-
if gene_col.startswith(CONSEQ_PREDICTED_PREFIX)
181-
and gene_col not in NON_GENE_PREDICTIONS
182-
]
183190
mapped_genes = [
184-
ht.info[gene_col].map(
191+
ht[gene_col].map(
185192
lambda gene: hl.struct(
186193
gene_id=gencode_mapping.get(gene),
187194
major_consequence_id=SV_CONSEQUENCE_RANKS_LOOKUP[
188195
gene_col.replace(CONSEQ_PREDICTED_PREFIX, '', 1) # noqa: B023
189196
],
190197
),
191198
)
192-
for gene_col in conseq_predicted_gene_cols
199+
for gene_col in CONSEQ_PREDICTED_GENE_COLS
193200
]
194201
return hl.filter(hl.is_defined, mapped_genes).flatmap(lambda x: x)
195202

196203

197204
def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression:
198-
return hl.struct(score=hl.parse_float32(ht.info.StrVCTVRE))
205+
return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE']))
199206

200207

201208
def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression:
@@ -206,7 +213,7 @@ def sv_type_detail_id(ht: hl.Table, **_: Any) -> hl.Expression:
206213
sv_types = _sv_types(ht)
207214
return hl.if_else(
208215
sv_types[0] == 'CPX',
209-
SV_TYPE_DETAILS_LOOKUP[ht.info.CPX_TYPE],
216+
SV_TYPE_DETAILS_LOOKUP[ht['info.CPX_TYPE']],
210217
hl.or_missing(
211218
(sv_types[0] == 'INS') & (hl.len(sv_types) > 1),
212219
SV_TYPE_DETAILS_LOOKUP[sv_types[1]],

v03_pipeline/lib/misc/callsets.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,6 @@ def get_callset_ht( # noqa: PLR0913
3232
imputed_sex_paths,
3333
)
3434
]
35-
36-
# Drop any fields potentially unshared/unused by the annotations.
37-
for i, callset_ht in enumerate(callset_hts):
38-
for row_field in dataset_type.optional_row_fields:
39-
if hasattr(callset_ht, row_field):
40-
callset_hts[i] = callset_ht.drop(row_field)
41-
4235
callset_ht = functools.reduce(
4336
(lambda ht1, ht2: ht1.union(ht2, unify=True)),
4437
callset_hts,

v03_pipeline/lib/misc/io.py

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import hail as hl
66

77
from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
8+
from v03_pipeline.lib.misc.nested_field import parse_nested_field
89
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex
910

1011
BIALLELIC = 2
@@ -64,17 +65,9 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
6465
ht = hl.import_table(
6566
callset_path,
6667
types={
67-
'start': hl.tint32,
68-
'end': hl.tint32,
69-
'CN': hl.tint32,
70-
'QS': hl.tint32,
71-
'defragmented': hl.tbool,
72-
'sf': hl.tfloat64,
73-
'sc': hl.tint32,
74-
'genes_any_overlap_totalExons': hl.tint32,
75-
'genes_strict_overlap_totalExons': hl.tint32,
76-
'no_ovl': hl.tbool,
77-
'is_latest': hl.tbool,
68+
**DatasetType.GCNV.col_fields,
69+
**DatasetType.GCNV.entries_fields,
70+
**DatasetType.GCNV.row_fields,
7871
},
7972
force=callset_path.endswith('gz'),
8073
)
@@ -147,16 +140,25 @@ def import_callset(
147140
def select_relevant_fields(
148141
mt: hl.MatrixTable,
149142
dataset_type: DatasetType,
143+
additional_row_fields: None | dict[str, hl.expr.types.HailType | set] = None,
150144
) -> hl.MatrixTable:
151145
mt = mt.select_globals()
152-
optional_row_fields = [
153-
row_field
154-
for row_field in dataset_type.optional_row_fields
155-
if hasattr(mt, row_field)
156-
]
157-
mt = mt.select_rows(*dataset_type.row_fields, *optional_row_fields)
158-
mt = mt.select_cols(*dataset_type.col_fields)
159-
return mt.select_entries(*dataset_type.entries_fields)
146+
mt = mt.select_rows(
147+
**{field: parse_nested_field(mt, field) for field in dataset_type.row_fields},
148+
**{
149+
field: parse_nested_field(mt, field)
150+
for field in (additional_row_fields or [])
151+
},
152+
)
153+
mt = mt.select_cols(
154+
**{field: parse_nested_field(mt, field) for field in dataset_type.col_fields},
155+
)
156+
return mt.select_entries(
157+
**{
158+
field: parse_nested_field(mt, field)
159+
for field in dataset_type.entries_fields
160+
},
161+
)
160162

161163

162164
def import_imputed_sex(imputed_sex_path: str) -> hl.Table:

v03_pipeline/lib/misc/nested_field.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
import hail as hl


def parse_nested_field(t: hl.MatrixTable | hl.Table, fields: str):
    """Resolve a dotted field path on *t*, e.g. ``'info.END'`` -> ``t['info']['END']``.

    A path component ending in ``'#'`` indexes into an array field at the
    allele position: it uses ``t.a_index`` (1-based) when the table has one,
    otherwise it falls back to the first alternate allele.
    """
    expr = t
    for part in fields.split('.'):
        if part.endswith('#'):
            # Select from a multi-allelic list; a_index is 1-based, so shift
            # to a 0-based array index. a_index is read from the root table,
            # not from the partially-resolved expression.
            allele_idx = (t.a_index if hasattr(t, 'a_index') else 1) - 1
            expr = expr[part[:-1]][allele_idx]
        else:
            expr = expr[part]
    return expr
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
import unittest

import hail as hl

from v03_pipeline.lib.misc.nested_field import parse_nested_field


class TestParseNestedField(unittest.TestCase):
    # NOTE(review): class was named TestConstrainFunction — an apparent
    # copy-paste from another test module; renamed to match what it tests.
    # unittest discovery is unaffected (matches by the Test* prefix).

    def test_parse_nested_field(self):
        """parse_nested_field resolves dotted paths, '#'-indexed arrays,
        and whole-struct fields on a keyed Hail Table."""
        ht = hl.Table.parallelize(
            [
                {
                    'locus': hl.Locus(
                        contig='chr1',
                        position=1,
                        reference_genome='GRCh38',
                    ),
                    'alleles': ['A', 'C'],
                    'a': hl.Struct(d=1),
                    'b': hl.Struct(e=[2, 9]),
                    'a_index': 1,
                },
                {
                    'locus': hl.Locus(
                        contig='chr1',
                        position=2,
                        reference_genome='GRCh38',
                    ),
                    'alleles': ['A', 'C'],
                    'a': hl.Struct(d=3),
                    'b': hl.Struct(e=[4, 5]),
                    'a_index': 1,
                },
            ],
            hl.tstruct(
                locus=hl.tlocus('GRCh38'),
                alleles=hl.tarray(hl.tstr),
                a=hl.tstruct(d=hl.tint32),
                b=hl.tstruct(e=hl.tarray(hl.tint32)),
                a_index=hl.tint32,
            ),
            key=['locus', 'alleles'],
        )
        ht = ht.select(
            # 'a.d' -> nested struct member; 'b.e#' -> a_index-th array
            # element (1-based, so element 0 here); 'a' -> whole struct.
            d=parse_nested_field(ht, 'a.d'),
            e=parse_nested_field(ht, 'b.e#'),
            f=parse_nested_field(ht, 'a'),
        )
        self.assertListEqual(
            ht.collect(),
            [
                hl.Struct(
                    locus=hl.Locus(
                        contig='chr1',
                        position=1,
                        reference_genome='GRCh38',
                    ),
                    alleles=['A', 'C'],
                    d=1,
                    e=2,
                    f=hl.Struct(d=1),
                ),
                hl.Struct(
                    locus=hl.Locus(
                        contig='chr1',
                        position=2,
                        reference_genome='GRCh38',
                    ),
                    alleles=['A', 'C'],
                    d=3,
                    e=4,
                    f=hl.Struct(d=3),
                ),
            ],
        )

v03_pipeline/lib/misc/validation.py

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import hail as hl
22

3-
from v03_pipeline.lib.model import ReferenceGenome, SampleType, Sex
3+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
44

55
AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
66
MIN_ROWS_PER_CONTIG = 100
@@ -60,6 +60,43 @@ def validate_expected_contig_frequency(
6060
raise SeqrValidationError(msg)
6161

6262

def validate_imported_field_types(
    mt: hl.MatrixTable,
    dataset_type: DatasetType,
    additional_row_fields: dict[str, hl.expr.types.HailType | set],
) -> None:
    """Check that every expected col/entry/row field is present on *mt*
    with an acceptable type, raising SeqrValidationError listing every
    mismatch (``field: missing`` or ``field: <actual dtype>``) otherwise.
    """

    def _validate_field(
        mt_schema: hl.StructExpression,
        field: str,
        dtype: hl.expr.types.HailType,
    ) -> str | None:
        # Returns None when the field is acceptable, else a description
        # of the problem for the error message.
        if field not in mt_schema:
            return f'{field}: missing'
        # A dtype of hl.tstruct accepts any struct-valued field; a set
        # dtype accepts any of its member types; otherwise require an
        # exact dtype match.
        struct_ok = (
            dtype == hl.tstruct
            and type(mt_schema[field])
            == hl.expr.expressions.typed_expressions.StructExpression
        )
        set_ok = isinstance(dtype, set) and mt_schema[field].dtype in dtype
        if struct_ok or set_ok or mt_schema[field].dtype == dtype:
            return None
        return f'{field}: {mt_schema[field].dtype}'

    # Check col, entry, then row fields (rows include any additional
    # caller-supplied fields), preserving that order in the report.
    schema_checks = (
        (mt.col, dataset_type.col_fields),
        (mt.entry, dataset_type.entries_fields),
        (mt.row, {**dataset_type.row_fields, **additional_row_fields}),
    )
    unexpected_field_types = [
        problem
        for schema, expected_fields in schema_checks
        for field, dtype in expected_fields.items()
        if (problem := _validate_field(schema, field, dtype)) is not None
    ]
    if unexpected_field_types:
        msg = f'Found unexpected field types on MatrixTable after import: {unexpected_field_types}'
        raise SeqrValidationError(msg)
98+
99+
63100
def validate_imputed_sex_ploidy(
64101
mt: hl.MatrixTable,
65102
sex_check_ht: hl.Table,

0 commit comments

Comments
 (0)