Skip to content

Dev #804

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 11, 2024
Merged

Dev #804

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ tenacity==8.2.3
# plotly
threadpoolctl==3.2.0
# via scikit-learn
tornado==6.3.3
tornado==6.4.1
# via
# bokeh
# luigi
Expand Down
2 changes: 1 addition & 1 deletion v03_pipeline/lib/annotations/mito.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def high_constraint_region_mito(


def mito_cn(mt: hl.MatrixTable, **_: Any) -> hl.Expression:
return hl.int(mt.mito_cn)
return hl.int32(mt.mito_cn)


def mitotip(ht: hl.Table, **_: Any) -> hl.Expression:
Expand Down
65 changes: 36 additions & 29 deletions v03_pipeline/lib/annotations/sv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,21 @@
from v03_pipeline.lib.annotations.shared import add_rg38_liftover
from v03_pipeline.lib.model.definitions import ReferenceGenome

CONSEQ_PREDICTED_PREFIX = 'PREDICTED_'
NON_GENE_PREDICTIONS = {
'PREDICTED_INTERGENIC',
'PREDICTED_NONCODING_BREAKPOINT',
'PREDICTED_NONCODING_SPAN',
CONSEQ_PREDICTED_PREFIX = 'info.PREDICTED_'
CONSEQ_PREDICTED_GENE_COLS = {
'info.PREDICTED_BREAKEND_EXONIC': hl.tarray(hl.tstr),
'info.PREDICTED_COPY_GAIN': hl.tarray(hl.tstr),
'info.PREDICTED_DUP_PARTIAL': hl.tarray(hl.tstr),
'info.PREDICTED_INTRAGENIC_EXON_DUP': hl.tarray(hl.tstr),
'info.PREDICTED_INTRONIC': hl.tarray(hl.tstr),
'info.PREDICTED_INV_SPAN': hl.tarray(hl.tstr),
'info.PREDICTED_LOF': hl.tarray(hl.tstr),
'info.PREDICTED_MSV_EXON_OVERLAP': hl.tarray(hl.tstr),
'info.PREDICTED_NEAREST_TSS': hl.tarray(hl.tstr),
'info.PREDICTED_PARTIAL_EXON_DUP': hl.tarray(hl.tstr),
'info.PREDICTED_PROMOTER': hl.tarray(hl.tstr),
'info.PREDICTED_TSS_DUP': hl.tarray(hl.tstr),
'info.PREDICTED_UTR': hl.tarray(hl.tstr),
}

PREVIOUS_GENOTYPE_N_ALT_ALLELES = hl.dict(
Expand Down Expand Up @@ -72,7 +82,7 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression:


def algorithms(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.str(',').join(ht.info.ALGORITHMS)
return hl.str(',').join(ht['info.ALGORITHMS'])


def bothsides_support(ht: hl.Table, **_: Any) -> hl.Expression:
Expand Down Expand Up @@ -110,37 +120,40 @@ def cpx_intervals(
**_: Any,
) -> hl.Expression:
return hl.or_missing(
hl.is_defined(ht.info.CPX_INTERVALS),
ht.info.CPX_INTERVALS.map(lambda x: _get_cpx_interval(x, reference_genome)),
hl.is_defined(ht['info.CPX_INTERVALS']),
ht['info.CPX_INTERVALS'].map(lambda x: _get_cpx_interval(x, reference_genome)),
)


def end_locus(ht: hl.Table, **_: Any) -> hl.StructExpression:
rg38_lengths = hl.literal(hl.get_reference(ReferenceGenome.GRCh38.value).lengths)
return hl.if_else(
(hl.is_defined(ht.info.END2) & (rg38_lengths[ht.info.CHR2] >= ht.info.END2)),
hl.locus(ht.info.CHR2, ht.info.END2, ReferenceGenome.GRCh38.value),
(
hl.is_defined(ht['info.END2'])
& (rg38_lengths[ht['info.CHR2']] >= ht['info.END2'])
),
hl.locus(ht['info.CHR2'], ht['info.END2'], ReferenceGenome.GRCh38.value),
hl.or_missing(
(rg38_lengths[ht.locus.contig] >= ht.info.END),
hl.locus(ht.locus.contig, ht.info.END, ReferenceGenome.GRCh38.value),
(rg38_lengths[ht.locus.contig] >= ht['info.END']),
hl.locus(ht.locus.contig, ht['info.END'], ReferenceGenome.GRCh38.value),
),
)


def gnomad_svs(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.or_missing(
hl.is_defined(ht.info.gnomAD_V2_AF),
hl.struct(AF=hl.float32(ht.info.gnomAD_V2_AF), ID=ht.info.gnomAD_V2_SVID),
hl.is_defined(ht['info.gnomAD_V2_AF']),
hl.struct(AF=hl.float32(ht['info.gnomAD_V2_AF']), ID=ht['info.gnomAD_V2_SVID']),
)


def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.struct(
AF=hl.float32(ht.info.AF[0]),
AC=ht.info.AC[0],
AN=ht.info.AN,
Hom=ht.info.N_HOMALT,
Het=ht.info.N_HET,
AF=hl.float32(ht['info.AF'][0]),
AC=ht['info.AC'][0],
AN=ht['info.AN'],
Hom=ht['info.N_HOMALT'],
Het=ht['info.N_HET'],
)


Expand Down Expand Up @@ -174,28 +187,22 @@ def sorted_gene_consequences(
**_: Any,
) -> hl.Expression:
# In lieu of sorted_transcript_consequences seen on SNV/MITO.
conseq_predicted_gene_cols = [
gene_col
for gene_col in ht.info
if gene_col.startswith(CONSEQ_PREDICTED_PREFIX)
and gene_col not in NON_GENE_PREDICTIONS
]
mapped_genes = [
ht.info[gene_col].map(
ht[gene_col].map(
lambda gene: hl.struct(
gene_id=gencode_mapping.get(gene),
major_consequence_id=SV_CONSEQUENCE_RANKS_LOOKUP[
gene_col.replace(CONSEQ_PREDICTED_PREFIX, '', 1) # noqa: B023
],
),
)
for gene_col in conseq_predicted_gene_cols
for gene_col in CONSEQ_PREDICTED_GENE_COLS
]
return hl.filter(hl.is_defined, mapped_genes).flatmap(lambda x: x)


def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression:
return hl.struct(score=hl.parse_float32(ht.info.StrVCTVRE))
return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE']))


def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression:
Expand All @@ -206,7 +213,7 @@ def sv_type_detail_id(ht: hl.Table, **_: Any) -> hl.Expression:
sv_types = _sv_types(ht)
return hl.if_else(
sv_types[0] == 'CPX',
SV_TYPE_DETAILS_LOOKUP[ht.info.CPX_TYPE],
SV_TYPE_DETAILS_LOOKUP[ht['info.CPX_TYPE']],
hl.or_missing(
(sv_types[0] == 'INS') & (hl.len(sv_types) > 1),
SV_TYPE_DETAILS_LOOKUP[sv_types[1]],
Expand Down
7 changes: 0 additions & 7 deletions v03_pipeline/lib/misc/callsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ def get_callset_ht( # noqa: PLR0913
imputed_sex_paths,
)
]

# Drop any fields potentially unshared/unused by the annotations.
for i, callset_ht in enumerate(callset_hts):
for row_field in dataset_type.optional_row_fields:
if hasattr(callset_ht, row_field):
callset_hts[i] = callset_ht.drop(row_field)

callset_ht = functools.reduce(
(lambda ht1, ht2: ht1.union(ht2, unify=True)),
callset_hts,
Expand Down
40 changes: 21 additions & 19 deletions v03_pipeline/lib/misc/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import hail as hl

from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
from v03_pipeline.lib.misc.nested_field import parse_nested_field
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex

BIALLELIC = 2
Expand Down Expand Up @@ -64,17 +65,9 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
ht = hl.import_table(
callset_path,
types={
'start': hl.tint32,
'end': hl.tint32,
'CN': hl.tint32,
'QS': hl.tint32,
'defragmented': hl.tbool,
'sf': hl.tfloat64,
'sc': hl.tint32,
'genes_any_overlap_totalExons': hl.tint32,
'genes_strict_overlap_totalExons': hl.tint32,
'no_ovl': hl.tbool,
'is_latest': hl.tbool,
**DatasetType.GCNV.col_fields,
**DatasetType.GCNV.entries_fields,
**DatasetType.GCNV.row_fields,
},
force=callset_path.endswith('gz'),
)
Expand Down Expand Up @@ -147,16 +140,25 @@ def import_callset(
def select_relevant_fields(
mt: hl.MatrixTable,
dataset_type: DatasetType,
additional_row_fields: None | dict[str, hl.expr.types.HailType | set] = None,
) -> hl.MatrixTable:
mt = mt.select_globals()
optional_row_fields = [
row_field
for row_field in dataset_type.optional_row_fields
if hasattr(mt, row_field)
]
mt = mt.select_rows(*dataset_type.row_fields, *optional_row_fields)
mt = mt.select_cols(*dataset_type.col_fields)
return mt.select_entries(*dataset_type.entries_fields)
mt = mt.select_rows(
**{field: parse_nested_field(mt, field) for field in dataset_type.row_fields},
**{
field: parse_nested_field(mt, field)
for field in (additional_row_fields or [])
},
)
mt = mt.select_cols(
**{field: parse_nested_field(mt, field) for field in dataset_type.col_fields},
)
return mt.select_entries(
**{
field: parse_nested_field(mt, field)
for field in dataset_type.entries_fields
},
)


def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
Expand Down
15 changes: 15 additions & 0 deletions v03_pipeline/lib/misc/nested_field.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import hail as hl


def parse_nested_field(t: hl.MatrixTable | hl.Table, fields: str):
    """Resolve a dot-delimited field path (e.g. ``'info.AF'``) against *t*.

    Each component of *fields* is looked up with ``[]`` on the running
    expression.  A component ending in ``'#'`` additionally indexes into
    that array field: when *t* carries an ``a_index`` it is used (1-based,
    presumably the split-multi alt-allele index — confirm with callers),
    otherwise the first element is taken.
    """
    # 0-based array index applied to '#'-marked components.
    allele_idx = (t.a_index if hasattr(t, 'a_index') else 1) - 1
    expr = t
    for component in fields.split('.'):
        if component.endswith('#'):
            # Strip the trailing marker, then index into the array.
            expr = expr[component[:-1]][allele_idx]
        else:
            expr = expr[component]
    return expr
75 changes: 75 additions & 0 deletions v03_pipeline/lib/misc/nested_field_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import unittest

import hail as hl

from v03_pipeline.lib.misc.nested_field import parse_nested_field


class TestConstrainFunction(unittest.TestCase):
    def test_parse_nested_field(self):
        """parse_nested_field resolves dotted paths and '#' array indexing.

        Builds a two-row keyed hail Table with a nested struct ``a.d``, a
        struct-wrapped array ``b.e``, and an ``a_index`` column, then checks:
        - 'a.d'  -> the nested scalar,
        - 'b.e#' -> the array element selected by a_index (1-based, so
          a_index=1 picks element 0),
        - 'a'    -> the whole struct when the path is a single component.
        """
        ht = hl.Table.parallelize(
            [
                {
                    'locus': hl.Locus(
                        contig='chr1',
                        position=1,
                        reference_genome='GRCh38',
                    ),
                    'alleles': ['A', 'C'],
                    'a': hl.Struct(d=1),
                    'b': hl.Struct(e=[2, 9]),
                    'a_index': 1,
                },
                {
                    'locus': hl.Locus(
                        contig='chr1',
                        position=2,
                        reference_genome='GRCh38',
                    ),
                    'alleles': ['A', 'C'],
                    'a': hl.Struct(d=3),
                    'b': hl.Struct(e=[4, 5]),
                    'a_index': 1,
                },
            ],
            hl.tstruct(
                locus=hl.tlocus('GRCh38'),
                alleles=hl.tarray(hl.tstr),
                a=hl.tstruct(d=hl.tint32),
                b=hl.tstruct(e=hl.tarray(hl.tint32)),
                a_index=hl.tint32,
            ),
            key=['locus', 'alleles'],
        )
        # select() keeps the key fields (locus, alleles) alongside d/e/f.
        ht = ht.select(
            d=parse_nested_field(ht, 'a.d'),
            e=parse_nested_field(ht, 'b.e#'),
            f=parse_nested_field(ht, 'a'),
        )
        self.assertListEqual(
            ht.collect(),
            [
                hl.Struct(
                    locus=hl.Locus(
                        contig='chr1',
                        position=1,
                        reference_genome='GRCh38',
                    ),
                    alleles=['A', 'C'],
                    d=1,
                    e=2,  # e=[2, 9] indexed at a_index - 1 == 0
                    f=hl.Struct(d=1),
                ),
                hl.Struct(
                    locus=hl.Locus(
                        contig='chr1',
                        position=2,
                        reference_genome='GRCh38',
                    ),
                    alleles=['A', 'C'],
                    d=3,
                    e=4,  # e=[4, 5] indexed at a_index - 1 == 0
                    f=hl.Struct(d=3),
                ),
            ],
        )
39 changes: 38 additions & 1 deletion v03_pipeline/lib/misc/validation.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import hail as hl

from v03_pipeline.lib.model import ReferenceGenome, SampleType, Sex
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex

AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
MIN_ROWS_PER_CONTIG = 100
Expand Down Expand Up @@ -60,6 +60,43 @@ def validate_expected_contig_frequency(
raise SeqrValidationError(msg)


def validate_imported_field_types(
    mt: hl.MatrixTable,
    dataset_type: DatasetType,
    additional_row_fields: dict[str, hl.expr.types.HailType | set],
) -> None:
    """Validate that the imported MatrixTable's fields have the expected types.

    Checks the dataset type's column, entry, and row field specs (plus
    *additional_row_fields* merged over the row spec) against the actual
    schema of *mt*.

    Raises:
        SeqrValidationError: listing every missing field and every field
            whose dtype does not match its spec.
    """

    def _field_type_error(
        mt_schema: hl.StructExpression,
        field: str,
        dtype: hl.expr.types.HailType | set,
    ) -> str | None:
        # Returns an error string for a missing/mismatched field, else None.
        if field not in mt_schema:
            return f'{field}: missing'
        # A field is accepted when the spec is the bare hl.tstruct class and
        # the field is any struct, when the spec is a set of acceptable
        # dtypes containing the actual dtype, or on an exact dtype match.
        if (
            (
                dtype == hl.tstruct
                and isinstance(  # isinstance, not type(...) == (ruff E721)
                    mt_schema[field],
                    hl.expr.expressions.typed_expressions.StructExpression,
                )
            )
            or (isinstance(dtype, set) and mt_schema[field].dtype in dtype)
            or (mt_schema[field].dtype == dtype)
        ):
            return None
        return f'{field}: {mt_schema[field].dtype}'

    # (schema, spec) pairs checked in col -> entry -> row order so the
    # error message ordering matches the original implementation.
    schema_specs = [
        (mt.col, dataset_type.col_fields),
        (mt.entry, dataset_type.entries_fields),
        (mt.row, {**dataset_type.row_fields, **additional_row_fields}),
    ]
    unexpected_field_types = [
        error
        for schema, spec in schema_specs
        for field, dtype in spec.items()
        if (error := _field_type_error(schema, field, dtype)) is not None
    ]
    if unexpected_field_types:
        msg = f'Found unexpected field types on MatrixTable after import: {unexpected_field_types}'
        raise SeqrValidationError(msg)


def validate_imputed_sex_ploidy(
mt: hl.MatrixTable,
sex_check_ht: hl.Table,
Expand Down
Loading
Loading