diff --git a/requirements.txt b/requirements.txt
index b9a3cc37b..9dafbd4e5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -347,7 +347,7 @@ tenacity==8.2.3
     #   plotly
 threadpoolctl==3.2.0
     # via scikit-learn
-tornado==6.3.3
+tornado==6.4.1
     # via
     #   bokeh
     #   luigi
diff --git a/v03_pipeline/lib/annotations/mito.py b/v03_pipeline/lib/annotations/mito.py
index eac1829e6..49ed0c108 100644
--- a/v03_pipeline/lib/annotations/mito.py
+++ b/v03_pipeline/lib/annotations/mito.py
@@ -56,7 +56,7 @@ def high_constraint_region_mito(
 
 
 def mito_cn(mt: hl.MatrixTable, **_: Any) -> hl.Expression:
-    return hl.int(mt.mito_cn)
+    return hl.int32(mt.mito_cn)
 
 
 def mitotip(ht: hl.Table, **_: Any) -> hl.Expression:
diff --git a/v03_pipeline/lib/annotations/sv.py b/v03_pipeline/lib/annotations/sv.py
index 4893e2a2d..39e1856d4 100644
--- a/v03_pipeline/lib/annotations/sv.py
+++ b/v03_pipeline/lib/annotations/sv.py
@@ -10,11 +10,21 @@
 from v03_pipeline.lib.annotations.shared import add_rg38_liftover
 from v03_pipeline.lib.model.definitions import ReferenceGenome
 
-CONSEQ_PREDICTED_PREFIX = 'PREDICTED_'
-NON_GENE_PREDICTIONS = {
-    'PREDICTED_INTERGENIC',
-    'PREDICTED_NONCODING_BREAKPOINT',
-    'PREDICTED_NONCODING_SPAN',
+CONSEQ_PREDICTED_PREFIX = 'info.PREDICTED_'
+CONSEQ_PREDICTED_GENE_COLS = {
+    'info.PREDICTED_BREAKEND_EXONIC': hl.tarray(hl.tstr),
+    'info.PREDICTED_COPY_GAIN': hl.tarray(hl.tstr),
+    'info.PREDICTED_DUP_PARTIAL': hl.tarray(hl.tstr),
+    'info.PREDICTED_INTRAGENIC_EXON_DUP': hl.tarray(hl.tstr),
+    'info.PREDICTED_INTRONIC': hl.tarray(hl.tstr),
+    'info.PREDICTED_INV_SPAN': hl.tarray(hl.tstr),
+    'info.PREDICTED_LOF': hl.tarray(hl.tstr),
+    'info.PREDICTED_MSV_EXON_OVERLAP': hl.tarray(hl.tstr),
+    'info.PREDICTED_NEAREST_TSS': hl.tarray(hl.tstr),
+    'info.PREDICTED_PARTIAL_EXON_DUP': hl.tarray(hl.tstr),
+    'info.PREDICTED_PROMOTER': hl.tarray(hl.tstr),
+    'info.PREDICTED_TSS_DUP': hl.tarray(hl.tstr),
+    'info.PREDICTED_UTR': hl.tarray(hl.tstr),
 }
 
 PREVIOUS_GENOTYPE_N_ALT_ALLELES = hl.dict(
@@ -72,7 +82,7 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression:
 
 
 def algorithms(ht: hl.Table, **_: Any) -> hl.Expression:
-    return hl.str(',').join(ht.info.ALGORITHMS)
+    return hl.str(',').join(ht['info.ALGORITHMS'])
 
 
 def bothsides_support(ht: hl.Table, **_: Any) -> hl.Expression:
@@ -110,37 +120,40 @@ def cpx_intervals(
     **_: Any,
 ) -> hl.Expression:
     return hl.or_missing(
-        hl.is_defined(ht.info.CPX_INTERVALS),
-        ht.info.CPX_INTERVALS.map(lambda x: _get_cpx_interval(x, reference_genome)),
+        hl.is_defined(ht['info.CPX_INTERVALS']),
+        ht['info.CPX_INTERVALS'].map(lambda x: _get_cpx_interval(x, reference_genome)),
     )
 
 
 def end_locus(ht: hl.Table, **_: Any) -> hl.StructExpression:
     rg38_lengths = hl.literal(hl.get_reference(ReferenceGenome.GRCh38.value).lengths)
     return hl.if_else(
-        (hl.is_defined(ht.info.END2) & (rg38_lengths[ht.info.CHR2] >= ht.info.END2)),
-        hl.locus(ht.info.CHR2, ht.info.END2, ReferenceGenome.GRCh38.value),
+        (
+            hl.is_defined(ht['info.END2'])
+            & (rg38_lengths[ht['info.CHR2']] >= ht['info.END2'])
+        ),
+        hl.locus(ht['info.CHR2'], ht['info.END2'], ReferenceGenome.GRCh38.value),
         hl.or_missing(
-            (rg38_lengths[ht.locus.contig] >= ht.info.END),
-            hl.locus(ht.locus.contig, ht.info.END, ReferenceGenome.GRCh38.value),
+            (rg38_lengths[ht.locus.contig] >= ht['info.END']),
+            hl.locus(ht.locus.contig, ht['info.END'], ReferenceGenome.GRCh38.value),
         ),
     )
 
 
 def gnomad_svs(ht: hl.Table, **_: Any) -> hl.Expression:
     return hl.or_missing(
-        hl.is_defined(ht.info.gnomAD_V2_AF),
-        hl.struct(AF=hl.float32(ht.info.gnomAD_V2_AF), ID=ht.info.gnomAD_V2_SVID),
+        hl.is_defined(ht['info.gnomAD_V2_AF']),
+        hl.struct(AF=hl.float32(ht['info.gnomAD_V2_AF']), ID=ht['info.gnomAD_V2_SVID']),
     )
 
 
 def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression:
     return hl.struct(
-        AF=hl.float32(ht.info.AF[0]),
-        AC=ht.info.AC[0],
-        AN=ht.info.AN,
-        Hom=ht.info.N_HOMALT,
-        Het=ht.info.N_HET,
+        AF=hl.float32(ht['info.AF'][0]),
+        AC=ht['info.AC'][0],
+        AN=ht['info.AN'],
+        Hom=ht['info.N_HOMALT'],
+        Het=ht['info.N_HET'],
     )
 
 
@@ -174,14 +187,8 @@ def sorted_gene_consequences(
     **_: Any,
 ) -> hl.Expression:
     # In lieu of sorted_transcript_consequences seen on SNV/MITO.
-    conseq_predicted_gene_cols = [
-        gene_col
-        for gene_col in ht.info
-        if gene_col.startswith(CONSEQ_PREDICTED_PREFIX)
-        and gene_col not in NON_GENE_PREDICTIONS
-    ]
     mapped_genes = [
-        ht.info[gene_col].map(
+        ht[gene_col].map(
             lambda gene: hl.struct(
                 gene_id=gencode_mapping.get(gene),
                 major_consequence_id=SV_CONSEQUENCE_RANKS_LOOKUP[
@@ -189,13 +196,13 @@ def sorted_gene_consequences(
                 ],
             ),
         )
-        for gene_col in conseq_predicted_gene_cols
+        for gene_col in CONSEQ_PREDICTED_GENE_COLS
     ]
     return hl.filter(hl.is_defined, mapped_genes).flatmap(lambda x: x)
 
 
 def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression:
-    return hl.struct(score=hl.parse_float32(ht.info.StrVCTVRE))
+    return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE']))
 
 
 def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression:
@@ -206,7 +213,7 @@ def sv_type_detail_id(ht: hl.Table, **_: Any) -> hl.Expression:
     sv_types = _sv_types(ht)
     return hl.if_else(
         sv_types[0] == 'CPX',
-        SV_TYPE_DETAILS_LOOKUP[ht.info.CPX_TYPE],
+        SV_TYPE_DETAILS_LOOKUP[ht['info.CPX_TYPE']],
         hl.or_missing(
             (sv_types[0] == 'INS') & (hl.len(sv_types) > 1),
             SV_TYPE_DETAILS_LOOKUP[sv_types[1]],
diff --git a/v03_pipeline/lib/misc/callsets.py b/v03_pipeline/lib/misc/callsets.py
index 3757b4da5..e65e9bc61 100644
--- a/v03_pipeline/lib/misc/callsets.py
+++ b/v03_pipeline/lib/misc/callsets.py
@@ -32,13 +32,6 @@ def get_callset_ht(  # noqa: PLR0913
             imputed_sex_paths,
         )
     ]
-
-    # Drop any fields potentially unshared/unused by the annotations.
-    for i, callset_ht in enumerate(callset_hts):
-        for row_field in dataset_type.optional_row_fields:
-            if hasattr(callset_ht, row_field):
-                callset_hts[i] = callset_ht.drop(row_field)
-
     callset_ht = functools.reduce(
         (lambda ht1, ht2: ht1.union(ht2, unify=True)),
         callset_hts,
diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py
index 1548814ff..71572dd62 100644
--- a/v03_pipeline/lib/misc/io.py
+++ b/v03_pipeline/lib/misc/io.py
@@ -5,6 +5,7 @@
 import hail as hl
 
 from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
+from v03_pipeline.lib.misc.nested_field import parse_nested_field
 from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex
 
 BIALLELIC = 2
@@ -64,17 +65,9 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable:
     ht = hl.import_table(
         callset_path,
         types={
-            'start': hl.tint32,
-            'end': hl.tint32,
-            'CN': hl.tint32,
-            'QS': hl.tint32,
-            'defragmented': hl.tbool,
-            'sf': hl.tfloat64,
-            'sc': hl.tint32,
-            'genes_any_overlap_totalExons': hl.tint32,
-            'genes_strict_overlap_totalExons': hl.tint32,
-            'no_ovl': hl.tbool,
-            'is_latest': hl.tbool,
+            **DatasetType.GCNV.col_fields,
+            **DatasetType.GCNV.entries_fields,
+            **DatasetType.GCNV.row_fields,
         },
         force=callset_path.endswith('gz'),
     )
@@ -147,16 +140,25 @@ def import_callset(
 def select_relevant_fields(
     mt: hl.MatrixTable,
     dataset_type: DatasetType,
+    additional_row_fields: None | dict[str, hl.expr.types.HailType | set] = None,
 ) -> hl.MatrixTable:
     mt = mt.select_globals()
-    optional_row_fields = [
-        row_field
-        for row_field in dataset_type.optional_row_fields
-        if hasattr(mt, row_field)
-    ]
-    mt = mt.select_rows(*dataset_type.row_fields, *optional_row_fields)
-    mt = mt.select_cols(*dataset_type.col_fields)
-    return mt.select_entries(*dataset_type.entries_fields)
+    mt = mt.select_rows(
+        **{field: parse_nested_field(mt, field) for field in dataset_type.row_fields},
+        **{
+            field: parse_nested_field(mt, field)
+            for field in (additional_row_fields or [])
+        },
+    )
+    mt = mt.select_cols(
+        **{field: parse_nested_field(mt, field) for field in dataset_type.col_fields},
+    )
+    return mt.select_entries(
+        **{
+            field: parse_nested_field(mt, field)
+            for field in dataset_type.entries_fields
+        },
+    )
 
 
 def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
diff --git a/v03_pipeline/lib/misc/nested_field.py b/v03_pipeline/lib/misc/nested_field.py
new file mode 100644
index 000000000..02d1b9c48
--- /dev/null
+++ b/v03_pipeline/lib/misc/nested_field.py
@@ -0,0 +1,15 @@
+import hail as hl
+
+
+def parse_nested_field(t: hl.MatrixTable | hl.Table, fields: str):
+    # Grab the field and continually select it from the hail table.
+    expression = t
+    for field in fields.split('.'):
+        # Select from multi-allelic list.
+        if field.endswith('#'):
+            expression = expression[field[:-1]][
+                (t.a_index if hasattr(t, 'a_index') else 1) - 1
+            ]
+        else:
+            expression = expression[field]
+    return expression
diff --git a/v03_pipeline/lib/misc/nested_field_test.py b/v03_pipeline/lib/misc/nested_field_test.py
new file mode 100644
index 000000000..6e0bd5d95
--- /dev/null
+++ b/v03_pipeline/lib/misc/nested_field_test.py
@@ -0,0 +1,75 @@
+import unittest
+
+import hail as hl
+
+from v03_pipeline.lib.misc.nested_field import parse_nested_field
+
+
+class TestConstrainFunction(unittest.TestCase):
+    def test_parse_nested_field(self):
+        ht = hl.Table.parallelize(
+            [
+                {
+                    'locus': hl.Locus(
+                        contig='chr1',
+                        position=1,
+                        reference_genome='GRCh38',
+                    ),
+                    'alleles': ['A', 'C'],
+                    'a': hl.Struct(d=1),
+                    'b': hl.Struct(e=[2, 9]),
+                    'a_index': 1,
+                },
+                {
+                    'locus': hl.Locus(
+                        contig='chr1',
+                        position=2,
+                        reference_genome='GRCh38',
+                    ),
+                    'alleles': ['A', 'C'],
+                    'a': hl.Struct(d=3),
+                    'b': hl.Struct(e=[4, 5]),
+                    'a_index': 1,
+                },
+            ],
+            hl.tstruct(
+                locus=hl.tlocus('GRCh38'),
+                alleles=hl.tarray(hl.tstr),
+                a=hl.tstruct(d=hl.tint32),
+                b=hl.tstruct(e=hl.tarray(hl.tint32)),
+                a_index=hl.tint32,
+            ),
+            key=['locus', 'alleles'],
+        )
+        ht = ht.select(
+            d=parse_nested_field(ht, 'a.d'),
+            e=parse_nested_field(ht, 'b.e#'),
+            f=parse_nested_field(ht, 'a'),
+        )
+        self.assertListEqual(
+            ht.collect(),
+            [
+                hl.Struct(
+                    locus=hl.Locus(
+                        contig='chr1',
+                        position=1,
+                        reference_genome='GRCh38',
+                    ),
+                    alleles=['A', 'C'],
+                    d=1,
+                    e=2,
+                    f=hl.Struct(d=1),
+                ),
+                hl.Struct(
+                    locus=hl.Locus(
+                        contig='chr1',
+                        position=2,
+                        reference_genome='GRCh38',
+                    ),
+                    alleles=['A', 'C'],
+                    d=3,
+                    e=4,
+                    f=hl.Struct(d=3),
+                ),
+            ],
+        )
diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
index 86b672ddc..f3abae409 100644
--- a/v03_pipeline/lib/misc/validation.py
+++ b/v03_pipeline/lib/misc/validation.py
@@ -1,6 +1,6 @@
 import hail as hl
 
-from v03_pipeline.lib.model import ReferenceGenome, SampleType, Sex
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
 
 AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
 MIN_ROWS_PER_CONTIG = 100
@@ -60,6 +60,43 @@ def validate_expected_contig_frequency(
             raise SeqrValidationError(msg)
 
 
+def validate_imported_field_types(
+    mt: hl.MatrixTable,
+    dataset_type: DatasetType,
+    additional_row_fields: dict[str, hl.expr.types.HailType | set],
+) -> None:
+    def _validate_field(
+        mt_schema: hl.StructExpression,
+        field: str,
+        dtype: hl.expr.types.HailType,
+    ) -> str | None:
+        if field not in mt_schema:
+            return f'{field}: missing'
+        if (
+            (
+                dtype == hl.tstruct
+                and type(mt_schema[field])
+                == hl.expr.expressions.typed_expressions.StructExpression
+            )
+            or (isinstance(dtype, set) and mt_schema[field].dtype in dtype)
+            or (mt_schema[field].dtype == dtype)
+        ):
+            return None
+        return f'{field}: {mt_schema[field].dtype}'
+
+    unexpected_field_types = []
+    for field, dtype in dataset_type.col_fields.items():
+        unexpected_field_types.append(_validate_field(mt.col, field, dtype))
+    for field, dtype in dataset_type.entries_fields.items():
+        unexpected_field_types.append(_validate_field(mt.entry, field, dtype))
+    for field, dtype in {**dataset_type.row_fields, **additional_row_fields}.items():
+        unexpected_field_types.append(_validate_field(mt.row, field, dtype))
+    unexpected_field_types = [x for x in unexpected_field_types if x is not None]
+    if unexpected_field_types:
+        msg = f'Found unexpected field types on MatrixTable after import: {unexpected_field_types}'
+        raise SeqrValidationError(msg)
+
+
 def validate_imputed_sex_ploidy(
     mt: hl.MatrixTable,
     sex_check_ht: hl.Table,
diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py
index 0512d9284..d7cd7c7fc 100644
--- a/v03_pipeline/lib/misc/validation_test.py
+++ b/v03_pipeline/lib/misc/validation_test.py
@@ -6,13 +6,15 @@
     SeqrValidationError,
     validate_allele_type,
     validate_expected_contig_frequency,
+    validate_imported_field_types,
     validate_imputed_sex_ploidy,
     validate_no_duplicate_variants,
     validate_sample_type,
 )
-from v03_pipeline.lib.model import ReferenceGenome, SampleType
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
 
 TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
+TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
 
 
 def _mt_from_contigs(contigs):
@@ -122,6 +124,21 @@ def test_validate_imputed_sex_ploidy(self) -> None:
             sex_check_ht,
         )
 
+    def test_validate_imported_field_types(self) -> None:
+        mt = hl.read_matrix_table(TEST_MITO_MT)
+        validate_imported_field_types(mt, DatasetType.MITO, {})
+        mt = mt.annotate_cols(contamination=hl.int32(mt.contamination))
+        mt = mt.annotate_entries(DP=hl.float32(mt.DP))
+        mt = mt.annotate_rows(vep=hl.dict({'t': '1'}))
+        self.assertRaisesRegex(
+            SeqrValidationError,
+            "Found unexpected field types on MatrixTable after import: \\['contamination: int32', 'DP: float32', 'vep: dict<str, str>', 'tester: missing'\\]",
+            validate_imported_field_types,
+            mt,
+            DatasetType.MITO,
+            {'tester': hl.tfloat32},
+        )
+
     def test_validate_no_duplicate_variants(self) -> None:
         mt = hl.MatrixTable.from_parts(
             rows={
diff --git a/v03_pipeline/lib/model/constants.py b/v03_pipeline/lib/model/constants.py
index b406b8798..7c485c6f3 100644
--- a/v03_pipeline/lib/model/constants.py
+++ b/v03_pipeline/lib/model/constants.py
@@ -2,4 +2,5 @@
     'R0555_seqr_demo',
     'R0607_gregor_training_project_',
     'R0608_gregor_training_project_',
+    'R0801_gregor_training_project_',
 }
diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py
index cc9123848..5731a1529 100644
--- a/v03_pipeline/lib/model/dataset_type.py
+++ b/v03_pipeline/lib/model/dataset_type.py
@@ -34,10 +34,13 @@ def col_fields(
         self,
     ) -> list[str]:
         return {
-            DatasetType.SNV_INDEL: [],
-            DatasetType.MITO: ['contamination', 'mito_cn'],
-            DatasetType.SV: [],
-            DatasetType.GCNV: [],
+            DatasetType.SNV_INDEL: {},
+            DatasetType.MITO: {
+                'contamination': {hl.tstr, hl.tfloat64},
+                'mito_cn': hl.tfloat64,
+            },
+            DatasetType.SV: {},
+            DatasetType.GCNV: {},
         }[self]
 
     @property
@@ -45,34 +48,37 @@ def entries_fields(
         self,
     ) -> list[str]:
         return {
-            DatasetType.SNV_INDEL: ['GT', 'AD', 'GQ'],
-            DatasetType.MITO: ['GT', 'DP', 'MQ', 'HL'],
-            DatasetType.SV: ['GT', 'CONC_ST', 'GQ', 'RD_CN'],
-            DatasetType.GCNV: [
-                'any_ovl',
-                'defragmented',
-                'genes_any_overlap_Ensemble_ID',
-                'genes_any_overlap_totalExons',
-                'identical_ovl',
-                'is_latest',
-                'no_ovl',
-                'sample_start',
-                'sample_end',
-                'CN',
-                'GT',
-                'QS',
-            ],
-        }[self]
-
-    @property
-    def optional_row_fields(
-        self,
-    ) -> list[str]:
-        return {
-            DatasetType.SNV_INDEL: ['info'],
-            DatasetType.MITO: [],
-            DatasetType.SV: [],
-            DatasetType.GCNV: [],
+            DatasetType.SNV_INDEL: {
+                'GT': hl.tcall,
+                'AD': hl.tarray(hl.tint32),
+                'GQ': hl.tint32,
+            },
+            DatasetType.MITO: {
+                'GT': hl.tcall,
+                'DP': hl.tint32,
+                'MQ': {hl.tfloat64, hl.tint32},
+                'HL': hl.tfloat64,
+            },
+            DatasetType.SV: {
+                'GT': hl.tcall,
+                'CONC_ST': hl.tarray(hl.tstr),
+                'GQ': hl.tint32,
+                'RD_CN': hl.tint32,
+            },
+            DatasetType.GCNV: {
+                'any_ovl': hl.tstr,
+                'defragmented': hl.tbool,
+                'genes_any_overlap_Ensemble_ID': hl.tstr,
+                'genes_any_overlap_totalExons': hl.tint32,
+                'identical_ovl': hl.tstr,
+                'is_latest': hl.tbool,
+                'no_ovl': hl.tbool,
+                'sample_start': hl.tint32,
+                'sample_end': hl.tint32,
+                'CN': hl.tint32,
+                'GT': hl.tstr,
+                'QS': hl.tint32,
+            },
         }[self]
 
     @property
@@ -80,30 +86,52 @@ def row_fields(
         self,
     ) -> list[str]:
         return {
-            DatasetType.SNV_INDEL: ['rsid', 'filters'],
-            DatasetType.MITO: [
-                'rsid',
-                'filters',
-                'common_low_heteroplasmy',
-                'hap_defining_variant',
-                'mitotip_trna_prediction',
-                'vep',
-            ],
-            DatasetType.SV: ['locus', 'alleles', 'filters', 'info'],
-            DatasetType.GCNV: [
-                'cg_genes',
-                'chr',
-                'end',
-                'filters',
-                'gene_ids',
-                'lof_genes',
-                'num_exon',
-                'sc',
-                'sf',
-                'start',
-                'strvctvre_score',
-                'svtype',
-            ],
+            DatasetType.SNV_INDEL: {
+                'rsid': hl.tstr,
+                'filters': hl.tset(hl.tstr),
+            },
+            DatasetType.MITO: {
+                'rsid': hl.tset(hl.tstr),
+                'filters': hl.tset(hl.tstr),
+                'common_low_heteroplasmy': hl.tbool,
+                'hap_defining_variant': hl.tbool,
+                'mitotip_trna_prediction': hl.tstr,
+                'vep': hl.tstruct,
+            },
+            DatasetType.SV: {
+                'locus': hl.tlocus(ReferenceGenome.GRCh38.value),
+                'alleles': hl.tarray(hl.tstr),
+                'filters': hl.tset(hl.tstr),
+                'info.AC': hl.tarray(hl.tint32),
+                'info.AF': hl.tarray(hl.tfloat64),
+                'info.ALGORITHMS': hl.tarray(hl.tstr),
+                'info.AN': hl.tint32,
+                'info.CHR2': hl.tstr,
+                'info.CPX_INTERVALS': hl.tarray(hl.tstr),
+                'info.CPX_TYPE': hl.tstr,
+                'info.END': hl.tint32,
+                'info.END2': hl.tint32,
+                'info.gnomAD_V2_AF': hl.tfloat64,
+                'info.gnomAD_V2_SVID': hl.tstr,
+                'info.N_HET': hl.tint32,
+                'info.N_HOMALT': hl.tint32,
+                'info.StrVCTVRE': hl.tstr,
+                **sv.CONSEQ_PREDICTED_GENE_COLS,
+            },
+            DatasetType.GCNV: {
+                'cg_genes': hl.tset(hl.tstr),
+                'chr': hl.tstr,
+                'end': hl.tint32,
+                'filters': hl.tset(hl.tstr),
+                'gene_ids': hl.tset(hl.tstr),
+                'lof_genes': hl.tset(hl.tstr),
+                'num_exon': hl.tint32,
+                'sc': hl.tint32,
+                'sf': hl.tfloat64,
+                'start': hl.tint32,
+                'strvctvre_score': hl.tstr,
+                'svtype': hl.tstr,
+            },
         }[self]
 
     @property
diff --git a/v03_pipeline/lib/reference_data/dataset_table_operations.py b/v03_pipeline/lib/reference_data/dataset_table_operations.py
index 2a979ce02..7d4d44b67 100644
--- a/v03_pipeline/lib/reference_data/dataset_table_operations.py
+++ b/v03_pipeline/lib/reference_data/dataset_table_operations.py
@@ -4,6 +4,7 @@
 import hail as hl
 import pytz
 
+from v03_pipeline.lib.misc.nested_field import parse_nested_field
 from v03_pipeline.lib.model import (
     DatasetType,
     ReferenceDatasetCollection,
@@ -109,14 +110,7 @@ def get_select_fields(selects: list | dict | None, base_ht: hl.Table) -> dict:
         select_fields = {selection: base_ht[selection] for selection in selects}
     elif isinstance(selects, dict):
         for key, val in selects.items():
-            # Grab the field and continually select it from the hail table.
-            expression = base_ht
-            for attr in val.split('.'):
-                # Select from multi-allelic list.
-                if attr.endswith('#'):
-                    expression = expression[attr[:-1]][base_ht.a_index - 1]
-                else:
-                    expression = expression[attr]
+            expression = parse_nested_field(base_ht, val)
             # Parse float64s into float32s to save space!
             if expression.dtype == hl.tfloat64:
                 expression = hl.float32(expression)
diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py
new file mode 100644
index 000000000..04cd92b02
--- /dev/null
+++ b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py
@@ -0,0 +1,150 @@
+import unittest
+from unittest import mock
+
+from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import load_gencode
+
+DOWNLOAD_PATH = 'test/path'
+GS_DOWNLOAD_PATH ='gs://test-bucket/test/path'
+DOWNLOAD_FILE = 'test/path/gencode.v29.annotation.gtf.gz'
+PICKLE_FILE = 'test/path/gencode.v29.annotation.gtf.pickle'
+PICKLE_FILE_HANDLE = 'handle'
+GTF_DATA = [
+    '#description: evidence-based annotation of the human genome, version 31 (Ensembl 97), mapped to GRCh37 with gencode-backmap\n',
+    'chr1	HAVANA	gene	11869	14409	.	+	.	gene_id "ENSG00000223972.5_2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n',
+    'chr1	HAVANA	gene	621059	622053	.	-	.	gene_id "ENSG00000284662.1_2"; gene_type "protein_coding"; gene_name "OR4F16"; level 2; hgnc_id "HGNC:15079"; havana_gene "OTTHUMG00000002581.3_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n',
+    'GL000193.1	HAVANA	gene	77815	78162	.	+	.	gene_id "ENSG00000279783.1_5"; gene_type "processed_pseudogene"; gene_name "AC018692.2"; level 2; havana_gene "OTTHUMG00000189459.1_5"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "new";\n',
+]
+GENE_ID_MAPPING = {"DDX11L1": "ENSG00000223972", "OR4F16": "ENSG00000284662", "AC018692.2": "ENSG00000279783"}
+
+
+class LoadGencodeTestCase(unittest.TestCase):
+
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.open')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip.open')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.download_file')
+    def test_load_gencode_local(self, mock_download_file, mock_file_writer, mock_gopen, mock_open, mock_pickle,
+                                mock_path_exists, mock_logger):
+        # test using saved file
+        mock_path_exists.side_effect = [True]
+        mock_pickle.load.return_value = GENE_ID_MAPPING
+        gene_id_mapping = load_gencode(23, download_path=DOWNLOAD_PATH)
+        mock_file_writer.assert_not_called()
+        mock_download_file.assert_not_called()
+        mock_gopen.assert_not_called()
+        mock_open.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle', 'rb')
+        mock_pickle.load.assert_called_with(mock_open.return_value.__enter__.return_value)
+        mock_path_exists.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle')
+        mock_logger.info.assert_has_calls([
+            mock.call('Use the existing pickle file test/path/gencode.v23.annotation.gtf.pickle.\nIf you want to reload the data, please delete it and re-run the data loading.'),
+            mock.call('Got 3 gene id mapping records'),
+        ])
+        self.assertEqual(gene_id_mapping, GENE_ID_MAPPING)
+
+        # test downloading and parsing gtf data
+        mock_path_exists.reset_mock()
+        mock_logger.reset_mock()
+        mock_pickle.reset_mock()
+        mock_open.reset_mock()
+        mock_path_exists.side_effect = [False, False]
+        mock_download_file.return_value = 'test/path/gencode.v24.annotation.gtf.gz'
+        mock_gopen.return_value.__iter__.return_value = GTF_DATA
+        mock_f = mock.MagicMock()
+        mock_file_writer.return_value.__enter__.return_value = mock_f, None
+        gene_id_mapping = load_gencode(24, download_path=DOWNLOAD_PATH)
+        self.assertEqual(gene_id_mapping, GENE_ID_MAPPING)
+        mock_path_exists.assert_has_calls([
+            mock.call('test/path/gencode.v24.annotation.gtf.pickle'),
+            mock.call('test/path/gencode.v24.annotation.gtf.gz'),
+        ])
+        mock_download_file.assert_called_with(
+            'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_24/gencode.v24.annotation.gtf.gz',
+            to_dir='test/path',
+        )
+        mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle')
+        mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY)
+        mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt')
+        mock_open.assert_not_called()
+        mock_logger.info.assert_has_calls([
+            mock.call('Downloaded to test/path/gencode.v24.annotation.gtf.gz'),
+            mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'),
+            mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'),
+            mock.call('Got 3 gene id mapping records')
+        ])
+        mock_pickle.load.assert_not_called()
+
+        # test using downloaded file
+        mock_path_exists.reset_mock()
+        mock_logger.reset_mock()
+        mock_download_file.reset_mock()
+        mock_pickle.reset_mock()
+        mock_path_exists.side_effect = [False, True]
+        mock_gopen.return_value.__iter__.return_value = GTF_DATA
+        gene_id_mapping = load_gencode(24, download_path=DOWNLOAD_PATH)
+        self.assertEqual(gene_id_mapping, GENE_ID_MAPPING)
+        mock_path_exists.assert_has_calls([
+            mock.call('test/path/gencode.v24.annotation.gtf.pickle'),
+            mock.call('test/path/gencode.v24.annotation.gtf.gz'),
+        ])
+        mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt')
+        mock_download_file.assert_not_called()
+        mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle')
+        mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY)
+        mock_open.assert_not_called()
+        mock_logger.info.assert_has_calls([
+            mock.call('Use the existing downloaded file test/path/gencode.v24.annotation.gtf.gz.\nIf you want to re-download it, please delete the file and re-run the pipeline.'),
+            mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'),
+            mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'),
+            mock.call('Got 3 gene id mapping records')
+        ])
+        mock_pickle.load.assert_not_called()
+
+        # bad gtf data test
+        mock_path_exists.side_effect = [False, False]
+        mock_gopen.return_value.__iter__.return_value = ['bad data']
+        with self.assertRaises(ValueError) as ve:
+            _ = load_gencode(24, download_path=DOWNLOAD_PATH)
+        self.assertEqual(str(ve.exception), "Unexpected number of fields on line #0: ['bad data']")
+
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.stream_gs_file')
+    @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer')
+    def test_load_gencode_using_gs(self, mock_file_writer, mock_stream_gs_file, mock_pickle, mock_path_exists,
+                                   mock_logger, mock_gzip):
+
+        # test using saved file.
+        mock_path_exists.side_effect = [True]
+        mock_pickle.loads.return_value = GENE_ID_MAPPING
+        gene_id_mapping = load_gencode(25, download_path=GS_DOWNLOAD_PATH)
+        self.assertEqual(gene_id_mapping, GENE_ID_MAPPING)
+        mock_path_exists.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle')
+        mock_logger.info.assert_has_calls([
+            mock.call('Use the existing pickle file gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle.\n'
+                      'If you want to reload the data, please delete it and re-run the data loading.'),
+            mock.call('Got 3 gene id mapping records')
+        ])
+        mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle')
+        mock_pickle.dump.assert_not_called()
+        mock_file_writer.assert_not_called()
+
+        # test using downloaded file.
+        mock_path_exists.side_effect = [False, True]
+        mock_gzip.decompress.return_value = ''.join(GTF_DATA).encode()
+        mock_f = mock.MagicMock()
+        mock_file_writer.return_value.__enter__.return_value = mock_f, None
+        gene_id_mapping = load_gencode(25, download_path=GS_DOWNLOAD_PATH)
+        self.assertEqual(gene_id_mapping, GENE_ID_MAPPING)
+        mock_path_exists.assert_has_calls([
+            mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle'),
+            mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz'),
+        ])
+        mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz', raw_download=True)
+        mock_gzip.decompress.assert_called_with(mock_stream_gs_file.return_value)
+        mock_file_writer.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle')
+        mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY)
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
index c8b795821..345af90e2 100644
--- a/v03_pipeline/lib/tasks/write_imported_callset.py
+++ b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -9,6 +9,7 @@
 from v03_pipeline.lib.misc.validation import (
     validate_allele_type,
     validate_expected_contig_frequency,
+    validate_imported_field_types,
     validate_imputed_sex_ploidy,
     validate_no_duplicate_variants,
     validate_sample_type,
@@ -112,6 +113,24 @@ def requires(self) -> list[luigi.Task]:
             CallsetTask(self.callset_path),
         ]
 
+    def additional_row_fields(self, mt):
+        return {
+            **(
+                {'info.AF': hl.tarray(hl.tfloat64)}
+                if self.check_sex_and_relatedness
+                and self.dataset_type.check_sex_and_relatedness
+                else {}
+            ),
+            # this field is never required, the pipeline
+            # will run smoothly even in its absence, but
+            # will trigger special handling if it is present.
+            **(
+                {'info.CALIBRATION_SENSITIVITY': hl.tarray(hl.tstr)}
+                if hasattr(mt, 'info') and hasattr(mt.info, 'CALIBRATION_SENSITIVITY')
+                else {}
+            ),
+        }
+
     def create_table(self) -> hl.MatrixTable:
         mt = import_callset(
             self.callset_path,
@@ -119,7 +138,18 @@ def create_table(self) -> hl.MatrixTable:
             self.dataset_type,
             self.filters_path,
         )
-        mt = select_relevant_fields(mt, self.dataset_type)
+        mt = select_relevant_fields(
+            mt,
+            self.dataset_type,
+            self.additional_row_fields(mt),
+        )
+        # This validation isn't override-able.  If a field is the wrong
+        # type, the pipeline will likely hard-fail downstream.
+        validate_imported_field_types(
+            mt,
+            self.dataset_type,
+            self.additional_row_fields(mt),
+        )
         if self.dataset_type.has_multi_allelic_variants:
             mt = split_multi_hts(mt)
         # Special handling of variant-level filter annotation for VETs filters.
diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py
index 34ea5ee6a..5e183b05a 100644
--- a/v03_pipeline/lib/tasks/write_new_variants_table.py
+++ b/v03_pipeline/lib/tasks/write_new_variants_table.py
@@ -265,7 +265,7 @@ def create_table(self) -> hl.Table:
                 ar_ht = ar_ht.union(ar_ht_chunk)
             new_variants_ht = new_variants_ht.join(ar_ht, 'left')
 
-        return new_variants_ht.annotate_globals(
+        return new_variants_ht.select_globals(
             updates={
                 hl.Struct(callset=callset_path, project_guid=project_guid)
                 for (
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
index 6b5aeced6..75b6bf7b3 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py
@@ -7,7 +7,11 @@
     get_families_failed_relatedness_check,
     get_families_failed_sex_check,
 )
-from v03_pipeline.lib.misc.io import does_file_exist, import_pedigree, import_remap
+from v03_pipeline.lib.misc.io import (
+    does_file_exist,
+    import_pedigree,
+    import_remap,
+)
 from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
 from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples
 from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path
@@ -176,6 +180,11 @@ def create_table(self) -> hl.MatrixTable:
             ),
             self.ignore_missing_samples_when_subsetting,
         )
+        # Drop additional fields imported onto the intermediate callsets but
+        # not used when creating the downstream optimized tables.
+        for field in mt.row_value:
+            if field not in self.dataset_type.row_fields:
+                mt = mt.drop(field)
         return mt.select_globals(
             family_samples=(
                 {