Skip to content

Dev #804

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 11, 2024
Merged

Dev #804

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ tenacity==8.2.3
# plotly
threadpoolctl==3.2.0
# via scikit-learn
tornado==6.3.3
tornado==6.4.1
# via
# bokeh
# luigi
Expand Down
2 changes: 1 addition & 1 deletion v03_pipeline/lib/annotations/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def high_constraint_region_mito(


def mito_cn(mt: hl.MatrixTable, **_: Any) -> hl.Expression:
return hl.int(mt.mito_cn)
return hl.int32(mt.mito_cn)


def mitotip(ht: hl.Table, **_: Any) -> hl.Expression:
Expand Down
65 changes: 36 additions & 29 deletions v03_pipeline/lib/annotations/sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,21 @@
from v03_pipeline.lib.annotations.shared import add_rg38_liftover
from v03_pipeline.lib.model.definitions import ReferenceGenome

CONSEQ_PREDICTED_PREFIX = 'PREDICTED_'
NON_GENE_PREDICTIONS = {
'PREDICTED_INTERGENIC',
'PREDICTED_NONCODING_BREAKPOINT',
'PREDICTED_NONCODING_SPAN',
CONSEQ_PREDICTED_PREFIX = 'info.PREDICTED_'
CONSEQ_PREDICTED_GENE_COLS = {
'info.PREDICTED_BREAKEND_EXONIC': hl.tarray(hl.tstr),
'info.PREDICTED_COPY_GAIN': hl.tarray(hl.tstr),
'info.PREDICTED_DUP_PARTIAL': hl.tarray(hl.tstr),
'info.PREDICTED_INTRAGENIC_EXON_DUP': hl.tarray(hl.tstr),
'info.PREDICTED_INTRONIC': hl.tarray(hl.tstr),
'info.PREDICTED_INV_SPAN': hl.tarray(hl.tstr),
'info.PREDICTED_LOF': hl.tarray(hl.tstr),
'info.PREDICTED_MSV_EXON_OVERLAP': hl.tarray(hl.tstr),
'info.PREDICTED_NEAREST_TSS': hl.tarray(hl.tstr),
'info.PREDICTED_PARTIAL_EXON_DUP': hl.tarray(hl.tstr),
'info.PREDICTED_PROMOTER': hl.tarray(hl.tstr),
'info.PREDICTED_TSS_DUP': hl.tarray(hl.tstr),
'info.PREDICTED_UTR': hl.tarray(hl.tstr),
}

PREVIOUS_GENOTYPE_N_ALT_ALLELES = hl.dict(
Expand Down Expand Up @@ -72,7 +82,7 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression:


def algorithms(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.str(',').join(ht.info.ALGORITHMS)
return hl.str(',').join(ht['info.ALGORITHMS'])


def bothsides_support(ht: hl.Table, **_: Any) -> hl.Expression:
Expand Down Expand Up @@ -110,37 +120,40 @@ def cpx_intervals(
**_: Any,
) -> hl.Expression:
return hl.or_missing(
hl.is_defined(ht.info.CPX_INTERVALS),
ht.info.CPX_INTERVALS.map(lambda x: _get_cpx_interval(x, reference_genome)),
hl.is_defined(ht['info.CPX_INTERVALS']),
ht['info.CPX_INTERVALS'].map(lambda x: _get_cpx_interval(x, reference_genome)),
)


def end_locus(ht: hl.Table, **_: Any) -> hl.StructExpression:
rg38_lengths = hl.literal(hl.get_reference(ReferenceGenome.GRCh38.value).lengths)
return hl.if_else(
(hl.is_defined(ht.info.END2) & (rg38_lengths[ht.info.CHR2] >= ht.info.END2)),
hl.locus(ht.info.CHR2, ht.info.END2, ReferenceGenome.GRCh38.value),
(
hl.is_defined(ht['info.END2'])
& (rg38_lengths[ht['info.CHR2']] >= ht['info.END2'])
),
hl.locus(ht['info.CHR2'], ht['info.END2'], ReferenceGenome.GRCh38.value),
hl.or_missing(
(rg38_lengths[ht.locus.contig] >= ht.info.END),
hl.locus(ht.locus.contig, ht.info.END, ReferenceGenome.GRCh38.value),
(rg38_lengths[ht.locus.contig] >= ht['info.END']),
hl.locus(ht.locus.contig, ht['info.END'], ReferenceGenome.GRCh38.value),
),
)


def gnomad_svs(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.or_missing(
hl.is_defined(ht.info.gnomAD_V2_AF),
hl.struct(AF=hl.float32(ht.info.gnomAD_V2_AF), ID=ht.info.gnomAD_V2_SVID),
hl.is_defined(ht['info.gnomAD_V2_AF']),
hl.struct(AF=hl.float32(ht['info.gnomAD_V2_AF']), ID=ht['info.gnomAD_V2_SVID']),
)


def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.struct(
AF=hl.float32(ht.info.AF[0]),
AC=ht.info.AC[0],
AN=ht.info.AN,
Hom=ht.info.N_HOMALT,
Het=ht.info.N_HET,
AF=hl.float32(ht['info.AF'][0]),
AC=ht['info.AC'][0],
AN=ht['info.AN'],
Hom=ht['info.N_HOMALT'],
Het=ht['info.N_HET'],
)


Expand Down Expand Up @@ -174,28 +187,22 @@ def sorted_gene_consequences(
**_: Any,
) -> hl.Expression:
# In lieu of sorted_transcript_consequences seen on SNV/MITO.
conseq_predicted_gene_cols = [
gene_col
for gene_col in ht.info
if gene_col.startswith(CONSEQ_PREDICTED_PREFIX)
and gene_col not in NON_GENE_PREDICTIONS
]
mapped_genes = [
ht.info[gene_col].map(
ht[gene_col].map(
lambda gene: hl.struct(
gene_id=gencode_mapping.get(gene),
major_consequence_id=SV_CONSEQUENCE_RANKS_LOOKUP[
gene_col.replace(CONSEQ_PREDICTED_PREFIX, '', 1) # noqa: B023
],
),
)
for gene_col in conseq_predicted_gene_cols
for gene_col in CONSEQ_PREDICTED_GENE_COLS
]
return hl.filter(hl.is_defined, mapped_genes).flatmap(lambda x: x)


def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.struct(score=hl.parse_float32(ht.info.StrVCTVRE))
return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE']))


def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression:
Expand All @@ -206,7 +213,7 @@ def sv_type_detail_id(ht: hl.Table, **_: Any) -> hl.Expression:
sv_types = _sv_types(ht)
return hl.if_else(
sv_types[0] == 'CPX',
SV_TYPE_DETAILS_LOOKUP[ht.info.CPX_TYPE],
SV_TYPE_DETAILS_LOOKUP[ht['info.CPX_TYPE']],
hl.or_missing(
(sv_types[0] == 'INS') & (hl.len(sv_types) > 1),
SV_TYPE_DETAILS_LOOKUP[sv_types[1]],
Expand Down
7 changes: 0 additions & 7 deletions v03_pipeline/lib/misc/callsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ def get_callset_ht( # noqa: PLR0913
imputed_sex_paths,
)
]

# Drop any fields potentially unshared/unused by the annotations.
for i, callset_ht in enumerate(callset_hts):
for row_field in dataset_type.optional_row_fields:
if hasattr(callset_ht, row_field):
callset_hts[i] = callset_ht.drop(row_field)

callset_ht = functools.reduce(
(lambda ht1, ht2: ht1.union(ht2, unify=True)),
callset_hts,
Expand Down
40 changes: 21 additions & 19 deletions v03_pipeline/lib/misc/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hail as hl

from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
from v03_pipeline.lib.misc.nested_field import parse_nested_field
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex

BIALLELIC = 2
Expand Down Expand Up @@ -64,17 +65,9 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
ht = hl.import_table(
callset_path,
types={
'start': hl.tint32,
'end': hl.tint32,
'CN': hl.tint32,
'QS': hl.tint32,
'defragmented': hl.tbool,
'sf': hl.tfloat64,
'sc': hl.tint32,
'genes_any_overlap_totalExons': hl.tint32,
'genes_strict_overlap_totalExons': hl.tint32,
'no_ovl': hl.tbool,
'is_latest': hl.tbool,
**DatasetType.GCNV.col_fields,
**DatasetType.GCNV.entries_fields,
**DatasetType.GCNV.row_fields,
},
force=callset_path.endswith('gz'),
)
Expand Down Expand Up @@ -147,16 +140,25 @@ def import_callset(
def select_relevant_fields(
mt: hl.MatrixTable,
dataset_type: DatasetType,
additional_row_fields: None | dict[str, hl.expr.types.HailType | set] = None,
) -> hl.MatrixTable:
mt = mt.select_globals()
optional_row_fields = [
row_field
for row_field in dataset_type.optional_row_fields
if hasattr(mt, row_field)
]
mt = mt.select_rows(*dataset_type.row_fields, *optional_row_fields)
mt = mt.select_cols(*dataset_type.col_fields)
return mt.select_entries(*dataset_type.entries_fields)
mt = mt.select_rows(
**{field: parse_nested_field(mt, field) for field in dataset_type.row_fields},
**{
field: parse_nested_field(mt, field)
for field in (additional_row_fields or [])
},
)
mt = mt.select_cols(
**{field: parse_nested_field(mt, field) for field in dataset_type.col_fields},
)
return mt.select_entries(
**{
field: parse_nested_field(mt, field)
for field in dataset_type.entries_fields
},
)


def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
Expand Down
15 changes: 15 additions & 0 deletions v03_pipeline/lib/misc/nested_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import hail as hl


def parse_nested_field(t: hl.MatrixTable | hl.Table, fields: str):
    """Resolve a dot-delimited field path (e.g. ``'info.AF'``) against *t*.

    Each component of *fields* is looked up with ``[]`` on the running
    expression.  A component ending in ``'#'`` additionally indexes into
    that array field: when *t* carries an ``a_index`` it is used (1-based,
    presumably the split-multi alt-allele index — confirm with callers),
    otherwise the first element is taken.
    """
    # 0-based array index applied to '#'-marked components.
    allele_idx = (t.a_index if hasattr(t, 'a_index') else 1) - 1
    expr = t
    for component in fields.split('.'):
        if component.endswith('#'):
            # Strip the trailing marker, then index into the array.
            expr = expr[component[:-1]][allele_idx]
        else:
            expr = expr[component]
    return expr
75 changes: 75 additions & 0 deletions v03_pipeline/lib/misc/nested_field_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import unittest

import hail as hl

from v03_pipeline.lib.misc.nested_field import parse_nested_field


class TestConstrainFunction(unittest.TestCase):
    def test_parse_nested_field(self):
        """parse_nested_field resolves dotted paths and '#' array indexing.

        Builds a two-row keyed hail Table with a nested struct ``a.d``, a
        struct-wrapped array ``b.e``, and an ``a_index`` column, then checks:
        - 'a.d'  -> the nested scalar,
        - 'b.e#' -> the array element selected by a_index (1-based, so
          a_index=1 picks element 0),
        - 'a'    -> the whole struct when the path is a single component.
        """
        ht = hl.Table.parallelize(
            [
                {
                    'locus': hl.Locus(
                        contig='chr1',
                        position=1,
                        reference_genome='GRCh38',
                    ),
                    'alleles': ['A', 'C'],
                    'a': hl.Struct(d=1),
                    'b': hl.Struct(e=[2, 9]),
                    'a_index': 1,
                },
                {
                    'locus': hl.Locus(
                        contig='chr1',
                        position=2,
                        reference_genome='GRCh38',
                    ),
                    'alleles': ['A', 'C'],
                    'a': hl.Struct(d=3),
                    'b': hl.Struct(e=[4, 5]),
                    'a_index': 1,
                },
            ],
            hl.tstruct(
                locus=hl.tlocus('GRCh38'),
                alleles=hl.tarray(hl.tstr),
                a=hl.tstruct(d=hl.tint32),
                b=hl.tstruct(e=hl.tarray(hl.tint32)),
                a_index=hl.tint32,
            ),
            key=['locus', 'alleles'],
        )
        # select() keeps the key fields (locus, alleles) alongside d/e/f.
        ht = ht.select(
            d=parse_nested_field(ht, 'a.d'),
            e=parse_nested_field(ht, 'b.e#'),
            f=parse_nested_field(ht, 'a'),
        )
        self.assertListEqual(
            ht.collect(),
            [
                hl.Struct(
                    locus=hl.Locus(
                        contig='chr1',
                        position=1,
                        reference_genome='GRCh38',
                    ),
                    alleles=['A', 'C'],
                    d=1,
                    e=2,  # e=[2, 9] indexed at a_index - 1 == 0
                    f=hl.Struct(d=1),
                ),
                hl.Struct(
                    locus=hl.Locus(
                        contig='chr1',
                        position=2,
                        reference_genome='GRCh38',
                    ),
                    alleles=['A', 'C'],
                    d=3,
                    e=4,  # e=[4, 5] indexed at a_index - 1 == 0
                    f=hl.Struct(d=3),
                ),
            ],
        )
39 changes: 38 additions & 1 deletion v03_pipeline/lib/misc/validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import hail as hl

from v03_pipeline.lib.model import ReferenceGenome, SampleType, Sex
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex

AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
MIN_ROWS_PER_CONTIG = 100
Expand Down Expand Up @@ -60,6 +60,43 @@ def validate_expected_contig_frequency(
raise SeqrValidationError(msg)


def validate_imported_field_types(
    mt: hl.MatrixTable,
    dataset_type: DatasetType,
    additional_row_fields: dict[str, hl.expr.types.HailType | set],
) -> None:
    """Validate that the imported MatrixTable's fields have the expected types.

    Checks the dataset type's column, entry, and row field specs (plus
    *additional_row_fields* merged over the row spec) against the actual
    schema of *mt*.

    Raises:
        SeqrValidationError: listing every missing field and every field
            whose dtype does not match its spec.
    """

    def _field_type_error(
        mt_schema: hl.StructExpression,
        field: str,
        dtype: hl.expr.types.HailType | set,
    ) -> str | None:
        # Returns an error string for a missing/mismatched field, else None.
        if field not in mt_schema:
            return f'{field}: missing'
        # A field is accepted when the spec is the bare hl.tstruct class and
        # the field is any struct, when the spec is a set of acceptable
        # dtypes containing the actual dtype, or on an exact dtype match.
        if (
            (
                dtype == hl.tstruct
                and isinstance(  # isinstance, not type(...) == (ruff E721)
                    mt_schema[field],
                    hl.expr.expressions.typed_expressions.StructExpression,
                )
            )
            or (isinstance(dtype, set) and mt_schema[field].dtype in dtype)
            or (mt_schema[field].dtype == dtype)
        ):
            return None
        return f'{field}: {mt_schema[field].dtype}'

    # (schema, spec) pairs checked in col -> entry -> row order so the
    # error message ordering matches the original implementation.
    schema_specs = [
        (mt.col, dataset_type.col_fields),
        (mt.entry, dataset_type.entries_fields),
        (mt.row, {**dataset_type.row_fields, **additional_row_fields}),
    ]
    unexpected_field_types = [
        error
        for schema, spec in schema_specs
        for field, dtype in spec.items()
        if (error := _field_type_error(schema, field, dtype)) is not None
    ]
    if unexpected_field_types:
        msg = f'Found unexpected field types on MatrixTable after import: {unexpected_field_types}'
        raise SeqrValidationError(msg)


def validate_imputed_sex_ploidy(
mt: hl.MatrixTable,
sex_check_ht: hl.Table,
Expand Down
Loading
Loading