diff --git a/requirements.txt b/requirements.txt index b9a3cc37b..9dafbd4e5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -347,7 +347,7 @@ tenacity==8.2.3 # plotly threadpoolctl==3.2.0 # via scikit-learn -tornado==6.3.3 +tornado==6.4.1 # via # bokeh # luigi diff --git a/v03_pipeline/lib/annotations/mito.py b/v03_pipeline/lib/annotations/mito.py index eac1829e6..49ed0c108 100644 --- a/v03_pipeline/lib/annotations/mito.py +++ b/v03_pipeline/lib/annotations/mito.py @@ -56,7 +56,7 @@ def high_constraint_region_mito( def mito_cn(mt: hl.MatrixTable, **_: Any) -> hl.Expression: - return hl.int(mt.mito_cn) + return hl.int32(mt.mito_cn) def mitotip(ht: hl.Table, **_: Any) -> hl.Expression: diff --git a/v03_pipeline/lib/annotations/sv.py b/v03_pipeline/lib/annotations/sv.py index 4893e2a2d..39e1856d4 100644 --- a/v03_pipeline/lib/annotations/sv.py +++ b/v03_pipeline/lib/annotations/sv.py @@ -10,11 +10,21 @@ from v03_pipeline.lib.annotations.shared import add_rg38_liftover from v03_pipeline.lib.model.definitions import ReferenceGenome -CONSEQ_PREDICTED_PREFIX = 'PREDICTED_' -NON_GENE_PREDICTIONS = { - 'PREDICTED_INTERGENIC', - 'PREDICTED_NONCODING_BREAKPOINT', - 'PREDICTED_NONCODING_SPAN', +CONSEQ_PREDICTED_PREFIX = 'info.PREDICTED_' +CONSEQ_PREDICTED_GENE_COLS = { + 'info.PREDICTED_BREAKEND_EXONIC': hl.tarray(hl.tstr), + 'info.PREDICTED_COPY_GAIN': hl.tarray(hl.tstr), + 'info.PREDICTED_DUP_PARTIAL': hl.tarray(hl.tstr), + 'info.PREDICTED_INTRAGENIC_EXON_DUP': hl.tarray(hl.tstr), + 'info.PREDICTED_INTRONIC': hl.tarray(hl.tstr), + 'info.PREDICTED_INV_SPAN': hl.tarray(hl.tstr), + 'info.PREDICTED_LOF': hl.tarray(hl.tstr), + 'info.PREDICTED_MSV_EXON_OVERLAP': hl.tarray(hl.tstr), + 'info.PREDICTED_NEAREST_TSS': hl.tarray(hl.tstr), + 'info.PREDICTED_PARTIAL_EXON_DUP': hl.tarray(hl.tstr), + 'info.PREDICTED_PROMOTER': hl.tarray(hl.tstr), + 'info.PREDICTED_TSS_DUP': hl.tarray(hl.tstr), + 'info.PREDICTED_UTR': hl.tarray(hl.tstr), } PREVIOUS_GENOTYPE_N_ALT_ALLELES = hl.dict( @@ -72,7 +82,7 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression: def algorithms(ht: hl.Table, **_: Any) -> hl.Expression: - return hl.str(',').join(ht.info.ALGORITHMS) + return hl.str(',').join(ht['info.ALGORITHMS']) def bothsides_support(ht: hl.Table, **_: Any) -> hl.Expression: @@ -110,37 +120,40 @@ def cpx_intervals( **_: Any, ) -> hl.Expression: return hl.or_missing( - hl.is_defined(ht.info.CPX_INTERVALS), - ht.info.CPX_INTERVALS.map(lambda x: _get_cpx_interval(x, reference_genome)), + hl.is_defined(ht['info.CPX_INTERVALS']), + ht['info.CPX_INTERVALS'].map(lambda x: _get_cpx_interval(x, reference_genome)), ) def end_locus(ht: hl.Table, **_: Any) -> hl.StructExpression: rg38_lengths = hl.literal(hl.get_reference(ReferenceGenome.GRCh38.value).lengths) return hl.if_else( - (hl.is_defined(ht.info.END2) & (rg38_lengths[ht.info.CHR2] >= ht.info.END2)), - hl.locus(ht.info.CHR2, ht.info.END2, ReferenceGenome.GRCh38.value), + ( + hl.is_defined(ht['info.END2']) + & (rg38_lengths[ht['info.CHR2']] >= ht['info.END2']) + ), + hl.locus(ht['info.CHR2'], ht['info.END2'], ReferenceGenome.GRCh38.value), hl.or_missing( - (rg38_lengths[ht.locus.contig] >= ht.info.END), - hl.locus(ht.locus.contig, ht.info.END, ReferenceGenome.GRCh38.value), + (rg38_lengths[ht.locus.contig] >= ht['info.END']), + hl.locus(ht.locus.contig, ht['info.END'], ReferenceGenome.GRCh38.value), ), ) def gnomad_svs(ht: hl.Table, **_: Any) -> hl.Expression: return hl.or_missing( - hl.is_defined(ht.info.gnomAD_V2_AF), - hl.struct(AF=hl.float32(ht.info.gnomAD_V2_AF), ID=ht.info.gnomAD_V2_SVID), + hl.is_defined(ht['info.gnomAD_V2_AF']), + hl.struct(AF=hl.float32(ht['info.gnomAD_V2_AF']), ID=ht['info.gnomAD_V2_SVID']), ) def gt_stats(ht: hl.Table, **_: Any) -> hl.Expression: return hl.struct( - AF=hl.float32(ht.info.AF[0]), - AC=ht.info.AC[0], - AN=ht.info.AN, - Hom=ht.info.N_HOMALT, - Het=ht.info.N_HET, + AF=hl.float32(ht['info.AF'][0]), + AC=ht['info.AC'][0], + AN=ht['info.AN'], + Hom=ht['info.N_HOMALT'], + Het=ht['info.N_HET'], ) @@ -174,14 +187,8 @@ def sorted_gene_consequences( **_: Any, ) -> hl.Expression: # In lieu of sorted_transcript_consequences seen on SNV/MITO. - conseq_predicted_gene_cols = [ - gene_col - for gene_col in ht.info - if gene_col.startswith(CONSEQ_PREDICTED_PREFIX) - and gene_col not in NON_GENE_PREDICTIONS - ] mapped_genes = [ - ht.info[gene_col].map( + ht[gene_col].map( lambda gene: hl.struct( gene_id=gencode_mapping.get(gene), major_consequence_id=SV_CONSEQUENCE_RANKS_LOOKUP[ @@ -189,13 +196,13 @@ def sorted_gene_consequences( ], ), ) - for gene_col in conseq_predicted_gene_cols + for gene_col in CONSEQ_PREDICTED_GENE_COLS ] return hl.filter(hl.is_defined, mapped_genes).flatmap(lambda x: x) def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression: - return hl.struct(score=hl.parse_float32(ht.info.StrVCTVRE)) + return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE'])) def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression: @@ -206,7 +213,7 @@ def sv_type_detail_id(ht: hl.Table, **_: Any) -> hl.Expression: sv_types = _sv_types(ht) return hl.if_else( sv_types[0] == 'CPX', - SV_TYPE_DETAILS_LOOKUP[ht.info.CPX_TYPE], + SV_TYPE_DETAILS_LOOKUP[ht['info.CPX_TYPE']], hl.or_missing( (sv_types[0] == 'INS') & (hl.len(sv_types) > 1), SV_TYPE_DETAILS_LOOKUP[sv_types[1]], diff --git a/v03_pipeline/lib/misc/callsets.py b/v03_pipeline/lib/misc/callsets.py index 3757b4da5..e65e9bc61 100644 --- a/v03_pipeline/lib/misc/callsets.py +++ b/v03_pipeline/lib/misc/callsets.py @@ -32,13 +32,6 @@ def get_callset_ht( # noqa: PLR0913 imputed_sex_paths, ) ] - - # Drop any fields potentially unshared/unused by the annotations. - for i, callset_ht in enumerate(callset_hts): - for row_field in dataset_type.optional_row_fields: - if hasattr(callset_ht, row_field): - callset_hts[i] = callset_ht.drop(row_field) - callset_ht = functools.reduce( (lambda ht1, ht2: ht1.union(ht2, unify=True)), callset_hts, diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index 1548814ff..71572dd62 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -5,6 +5,7 @@ import hail as hl from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes +from v03_pipeline.lib.misc.nested_field import parse_nested_field from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex BIALLELIC = 2 @@ -64,17 +65,9 @@ def import_gcnv_bed_file(callset_path: str) -> hl.MatrixTable: ht = hl.import_table( callset_path, types={ - 'start': hl.tint32, - 'end': hl.tint32, - 'CN': hl.tint32, - 'QS': hl.tint32, - 'defragmented': hl.tbool, - 'sf': hl.tfloat64, - 'sc': hl.tint32, - 'genes_any_overlap_totalExons': hl.tint32, - 'genes_strict_overlap_totalExons': hl.tint32, - 'no_ovl': hl.tbool, - 'is_latest': hl.tbool, + **DatasetType.GCNV.col_fields, + **DatasetType.GCNV.entries_fields, + **DatasetType.GCNV.row_fields, }, force=callset_path.endswith('gz'), ) @@ -147,16 +140,25 @@ def import_callset( def select_relevant_fields( mt: hl.MatrixTable, dataset_type: DatasetType, + additional_row_fields: None | dict[str, hl.expr.types.HailType | set] = None, ) -> hl.MatrixTable: mt = mt.select_globals() - optional_row_fields = [ - row_field - for row_field in dataset_type.optional_row_fields - if hasattr(mt, row_field) - ] - mt = mt.select_rows(*dataset_type.row_fields, *optional_row_fields) - mt = mt.select_cols(*dataset_type.col_fields) - return mt.select_entries(*dataset_type.entries_fields) + mt = mt.select_rows( + **{field: parse_nested_field(mt, field) for field in dataset_type.row_fields}, + **{ + field: parse_nested_field(mt, field) + for field in (additional_row_fields or []) + }, + ) + mt = mt.select_cols( + **{field: parse_nested_field(mt, field) for field in dataset_type.col_fields}, + ) + return mt.select_entries( + **{ + field: parse_nested_field(mt, field) + for field in dataset_type.entries_fields + }, + ) def import_imputed_sex(imputed_sex_path: str) -> hl.Table: diff --git a/v03_pipeline/lib/misc/nested_field.py b/v03_pipeline/lib/misc/nested_field.py new file mode 100644 index 000000000..02d1b9c48 --- /dev/null +++ b/v03_pipeline/lib/misc/nested_field.py @@ -0,0 +1,15 @@ +import hail as hl + + +def parse_nested_field(t: hl.MatrixTable | hl.Table, fields: str): + # Grab the field and continually select it from the hail table. + expression = t + for field in fields.split('.'): + # Select from multi-allelic list. + if field.endswith('#'): + expression = expression[field[:-1]][ + (t.a_index if hasattr(t, 'a_index') else 1) - 1 + ] + else: + expression = expression[field] + return expression diff --git a/v03_pipeline/lib/misc/nested_field_test.py b/v03_pipeline/lib/misc/nested_field_test.py new file mode 100644 index 000000000..6e0bd5d95 --- /dev/null +++ b/v03_pipeline/lib/misc/nested_field_test.py @@ -0,0 +1,75 @@ +import unittest + +import hail as hl + +from v03_pipeline.lib.misc.nested_field import parse_nested_field + + +class TestConstrainFunction(unittest.TestCase): + def test_parse_nested_field(self): + ht = hl.Table.parallelize( + [ + { + 'locus': hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + 'alleles': ['A', 'C'], + 'a': hl.Struct(d=1), + 'b': hl.Struct(e=[2, 9]), + 'a_index': 1, + }, + { + 'locus': hl.Locus( + contig='chr1', + position=2, + reference_genome='GRCh38', + ), + 'alleles': ['A', 'C'], + 'a': hl.Struct(d=3), + 'b': hl.Struct(e=[4, 5]), + 'a_index': 1, + }, + ], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + a=hl.tstruct(d=hl.tint32), + b=hl.tstruct(e=hl.tarray(hl.tint32)), + a_index=hl.tint32, + ), + key=['locus', 'alleles'], + ) + ht = ht.select( + d=parse_nested_field(ht, 'a.d'), + e=parse_nested_field(ht, 'b.e#'), + f=parse_nested_field(ht, 'a'), + ) + self.assertListEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + alleles=['A', 'C'], + d=1, + e=2, + f=hl.Struct(d=1), + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=2, + reference_genome='GRCh38', + ), + alleles=['A', 'C'], + d=3, + e=4, + f=hl.Struct(d=3), + ), + ], + ) diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 86b672ddc..f3abae409 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -1,6 +1,6 @@ import hail as hl -from v03_pipeline.lib.model import ReferenceGenome, SampleType, Sex +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. MIN_ROWS_PER_CONTIG = 100 @@ -60,6 +60,43 @@ def validate_expected_contig_frequency( raise SeqrValidationError(msg) +def validate_imported_field_types( + mt: hl.MatrixTable, + dataset_type: DatasetType, + additional_row_fields: dict[str, hl.expr.types.HailType | set], +) -> None: + def _validate_field( + mt_schema: hl.StructExpression, + field: str, + dtype: hl.expr.types.HailType, + ) -> str | None: + if field not in mt_schema: + return f'{field}: missing' + if ( + ( + dtype == hl.tstruct + and type(mt_schema[field]) + == hl.expr.expressions.typed_expressions.StructExpression + ) + or (isinstance(dtype, set) and mt_schema[field].dtype in dtype) + or (mt_schema[field].dtype == dtype) + ): + return None + return f'{field}: {mt_schema[field].dtype}' + + unexpected_field_types = [] + for field, dtype in dataset_type.col_fields.items(): + unexpected_field_types.append(_validate_field(mt.col, field, dtype)) + for field, dtype in dataset_type.entries_fields.items(): + unexpected_field_types.append(_validate_field(mt.entry, field, dtype)) + for field, dtype in {**dataset_type.row_fields, **additional_row_fields}.items(): + unexpected_field_types.append(_validate_field(mt.row, field, dtype)) + unexpected_field_types = [x for x in unexpected_field_types if x is not None] + if unexpected_field_types: + msg = f'Found unexpected field types on MatrixTable after import: {unexpected_field_types}' + raise SeqrValidationError(msg) + + def validate_imputed_sex_ploidy( mt: hl.MatrixTable, sex_check_ht: hl.Table, diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py index 0512d9284..d7cd7c7fc 100644 --- a/v03_pipeline/lib/misc/validation_test.py +++ b/v03_pipeline/lib/misc/validation_test.py @@ -6,13 +6,15 @@ SeqrValidationError, validate_allele_type, validate_expected_contig_frequency, + validate_imported_field_types, validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model import ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' +TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' def _mt_from_contigs(contigs): @@ -122,6 +124,21 @@ def test_validate_imputed_sex_ploidy(self) -> None: sex_check_ht, ) + def test_validate_imported_field_types(self) -> None: + mt = hl.read_matrix_table(TEST_MITO_MT) + validate_imported_field_types(mt, DatasetType.MITO, {}) + mt = mt.annotate_cols(contamination=hl.int32(mt.contamination)) + mt = mt.annotate_entries(DP=hl.float32(mt.DP)) + mt = mt.annotate_rows(vep=hl.dict({'t': '1'})) + self.assertRaisesRegex( + SeqrValidationError, + "Found unexpected field types on MatrixTable after import: \\['contamination: int32', 'DP: float32', 'vep: dict', 'tester: missing'\\]", + validate_imported_field_types, + mt, + DatasetType.MITO, + {'tester': hl.tfloat32}, + ) + def test_validate_no_duplicate_variants(self) -> None: mt = hl.MatrixTable.from_parts( rows={ diff --git a/v03_pipeline/lib/model/constants.py b/v03_pipeline/lib/model/constants.py index b406b8798..7c485c6f3 100644 --- a/v03_pipeline/lib/model/constants.py +++ b/v03_pipeline/lib/model/constants.py @@ -2,4 +2,5 @@ 'R0555_seqr_demo', 'R0607_gregor_training_project_', 'R0608_gregor_training_project_', + 'R0801_gregor_training_project_', } diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index cc9123848..5731a1529 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -34,10 +34,13 @@ def col_fields( self, ) -> list[str]: return { - DatasetType.SNV_INDEL: [], - DatasetType.MITO: ['contamination', 'mito_cn'], - DatasetType.SV: [], - DatasetType.GCNV: [], + DatasetType.SNV_INDEL: {}, + DatasetType.MITO: { + 'contamination': {hl.tstr, hl.tfloat64}, + 'mito_cn': hl.tfloat64, + }, + DatasetType.SV: {}, + DatasetType.GCNV: {}, }[self] @property @@ -45,34 +48,37 @@ def entries_fields( self, ) -> list[str]: return { - DatasetType.SNV_INDEL: ['GT', 'AD', 'GQ'], - DatasetType.MITO: ['GT', 'DP', 'MQ', 'HL'], - DatasetType.SV: ['GT', 'CONC_ST', 'GQ', 'RD_CN'], - DatasetType.GCNV: [ - 'any_ovl', - 'defragmented', - 'genes_any_overlap_Ensemble_ID', - 'genes_any_overlap_totalExons', - 'identical_ovl', - 'is_latest', - 'no_ovl', - 'sample_start', - 'sample_end', - 'CN', - 'GT', - 'QS', - ], - }[self] - - @property - def optional_row_fields( - self, - ) -> list[str]: - return { - DatasetType.SNV_INDEL: ['info'], - DatasetType.MITO: [], - DatasetType.SV: [], - DatasetType.GCNV: [], + DatasetType.SNV_INDEL: { + 'GT': hl.tcall, + 'AD': hl.tarray(hl.tint32), + 'GQ': hl.tint32, + }, + DatasetType.MITO: { + 'GT': hl.tcall, + 'DP': hl.tint32, + 'MQ': {hl.tfloat64, hl.tint32}, + 'HL': hl.tfloat64, + }, + DatasetType.SV: { + 'GT': hl.tcall, + 'CONC_ST': hl.tarray(hl.tstr), + 'GQ': hl.tint32, + 'RD_CN': hl.tint32, + }, + DatasetType.GCNV: { + 'any_ovl': hl.tstr, + 'defragmented': hl.tbool, + 'genes_any_overlap_Ensemble_ID': hl.tstr, + 'genes_any_overlap_totalExons': hl.tint32, + 'identical_ovl': hl.tstr, + 'is_latest': hl.tbool, + 'no_ovl': hl.tbool, + 'sample_start': hl.tint32, + 'sample_end': hl.tint32, + 'CN': hl.tint32, + 'GT': hl.tstr, + 'QS': hl.tint32, + }, }[self] @property @@ -80,30 +86,52 @@ def row_fields( self, ) -> list[str]: return { - DatasetType.SNV_INDEL: ['rsid', 'filters'], - DatasetType.MITO: [ - 'rsid', - 'filters', - 'common_low_heteroplasmy', - 'hap_defining_variant', - 'mitotip_trna_prediction', - 'vep', - ], - DatasetType.SV: ['locus', 'alleles', 'filters', 'info'], - DatasetType.GCNV: [ - 'cg_genes', - 'chr', - 'end', - 'filters', - 'gene_ids', - 'lof_genes', - 'num_exon', - 'sc', - 'sf', - 'start', - 'strvctvre_score', - 'svtype', - ], + DatasetType.SNV_INDEL: { + 'rsid': hl.tstr, + 'filters': hl.tset(hl.tstr), + }, + DatasetType.MITO: { + 'rsid': hl.tset(hl.tstr), + 'filters': hl.tset(hl.tstr), + 'common_low_heteroplasmy': hl.tbool, + 'hap_defining_variant': hl.tbool, + 'mitotip_trna_prediction': hl.tstr, + 'vep': hl.tstruct, + }, + DatasetType.SV: { + 'locus': hl.tlocus(ReferenceGenome.GRCh38.value), + 'alleles': hl.tarray(hl.tstr), + 'filters': hl.tset(hl.tstr), + 'info.AC': hl.tarray(hl.tint32), + 'info.AF': hl.tarray(hl.tfloat64), + 'info.ALGORITHMS': hl.tarray(hl.tstr), + 'info.AN': hl.tint32, + 'info.CHR2': hl.tstr, + 'info.CPX_INTERVALS': hl.tarray(hl.tstr), + 'info.CPX_TYPE': hl.tstr, + 'info.END': hl.tint32, + 'info.END2': hl.tint32, + 'info.gnomAD_V2_AF': hl.tfloat64, + 'info.gnomAD_V2_SVID': hl.tstr, + 'info.N_HET': hl.tint32, + 'info.N_HOMALT': hl.tint32, + 'info.StrVCTVRE': hl.tstr, + **sv.CONSEQ_PREDICTED_GENE_COLS, + }, + DatasetType.GCNV: { + 'cg_genes': hl.tset(hl.tstr), + 'chr': hl.tstr, + 'end': hl.tint32, + 'filters': hl.tset(hl.tstr), + 'gene_ids': hl.tset(hl.tstr), + 'lof_genes': hl.tset(hl.tstr), + 'num_exon': hl.tint32, + 'sc': hl.tint32, + 'sf': hl.tfloat64, + 'start': hl.tint32, + 'strvctvre_score': hl.tstr, + 'svtype': hl.tstr, + }, }[self] @property diff --git a/v03_pipeline/lib/reference_data/dataset_table_operations.py b/v03_pipeline/lib/reference_data/dataset_table_operations.py index 2a979ce02..7d4d44b67 100644 --- a/v03_pipeline/lib/reference_data/dataset_table_operations.py +++ b/v03_pipeline/lib/reference_data/dataset_table_operations.py @@ -4,6 +4,7 @@ import hail as hl import pytz +from v03_pipeline.lib.misc.nested_field import parse_nested_field from v03_pipeline.lib.model import ( DatasetType, ReferenceDatasetCollection, @@ -109,14 +110,7 @@ def get_select_fields(selects: list | dict | None, base_ht: hl.Table) -> dict: select_fields = {selection: base_ht[selection] for selection in selects} elif isinstance(selects, dict): for key, val in selects.items(): - # Grab the field and continually select it from the hail table. - expression = base_ht - for attr in val.split('.'): - # Select from multi-allelic list. - if attr.endswith('#'): - expression = expression[attr[:-1]][base_ht.a_index - 1] - else: - expression = expression[attr] + expression = parse_nested_field(base_ht, val) # Parse float64s into float32s to save space! if expression.dtype == hl.tfloat64: expression = hl.float32(expression) diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py new file mode 100644 index 000000000..04cd92b02 --- /dev/null +++ b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py @@ -0,0 +1,150 @@ +import unittest +from unittest import mock + +from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import load_gencode + +DOWNLOAD_PATH = 'test/path' +GS_DOWNLOAD_PATH ='gs://test-bucket/test/path' +DOWNLOAD_FILE = 'test/path/gencode.v29.annotation.gtf.gz' +PICKLE_FILE = 'test/path/gencode.v29.annotation.gtf.pickle' +PICKLE_FILE_HANDLE = 'handle' +GTF_DATA = [ + '#description: evidence-based annotation of the human genome, version 31 (Ensembl 97), mapped to GRCh37 with gencode-backmap\n', + 'chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5_2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n', + 'chr1 HAVANA gene 621059 622053 . - . gene_id "ENSG00000284662.1_2"; gene_type "protein_coding"; gene_name "OR4F16"; level 2; hgnc_id "HGNC:15079"; havana_gene "OTTHUMG00000002581.3_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n', + 'GL000193.1 HAVANA gene 77815 78162 . + . gene_id "ENSG00000279783.1_5"; gene_type "processed_pseudogene"; gene_name "AC018692.2"; level 2; havana_gene "OTTHUMG00000189459.1_5"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "new";\n', +] +GENE_ID_MAPPING = {"DDX11L1": "ENSG00000223972", "OR4F16": "ENSG00000284662", "AC018692.2": "ENSG00000279783"} + + +class LoadGencodeTestCase(unittest.TestCase): + + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.open') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip.open') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.download_file') + def test_load_gencode_local(self, mock_download_file, mock_file_writer, mock_gopen, mock_open, mock_pickle, + mock_path_exists, mock_logger): + # test using saved file + mock_path_exists.side_effect = [True] + mock_pickle.load.return_value = GENE_ID_MAPPING + gene_id_mapping = load_gencode(23, download_path=DOWNLOAD_PATH) + mock_file_writer.assert_not_called() + mock_download_file.assert_not_called() + mock_gopen.assert_not_called() + mock_open.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle', 'rb') + mock_pickle.load.assert_called_with(mock_open.return_value.__enter__.return_value) + mock_path_exists.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle') + mock_logger.info.assert_has_calls([ + mock.call('Use the existing pickle file test/path/gencode.v23.annotation.gtf.pickle.\nIf you want to reload the data, please delete it and re-run the data loading.'), + mock.call('Got 3 gene id mapping records'), + ]) + self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) + + # test downloading and parsing gtf data + mock_path_exists.reset_mock() + mock_logger.reset_mock() + mock_pickle.reset_mock() + mock_open.reset_mock() + mock_path_exists.side_effect = [False, False] + mock_download_file.return_value = 'test/path/gencode.v24.annotation.gtf.gz' + mock_gopen.return_value.__iter__.return_value = GTF_DATA + mock_f = mock.MagicMock() + mock_file_writer.return_value.__enter__.return_value = mock_f, None + gene_id_mapping = load_gencode(24, download_path=DOWNLOAD_PATH) + self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) + mock_path_exists.assert_has_calls([ + mock.call('test/path/gencode.v24.annotation.gtf.pickle'), + mock.call('test/path/gencode.v24.annotation.gtf.gz'), + ]) + mock_download_file.assert_called_with( + 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_24/gencode.v24.annotation.gtf.gz', + to_dir='test/path', + ) + mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle') + mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) + mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt') + mock_open.assert_not_called() + mock_logger.info.assert_has_calls([ + mock.call('Downloaded to test/path/gencode.v24.annotation.gtf.gz'), + mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'), + mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'), + mock.call('Got 3 gene id mapping records') + ]) + mock_pickle.load.assert_not_called() + + # test using downloaded file + mock_path_exists.reset_mock() + mock_logger.reset_mock() + mock_download_file.reset_mock() + mock_pickle.reset_mock() + mock_path_exists.side_effect = [False, True] + mock_gopen.return_value.__iter__.return_value = GTF_DATA + gene_id_mapping = load_gencode(24, download_path=DOWNLOAD_PATH) + self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) + mock_path_exists.assert_has_calls([ + mock.call('test/path/gencode.v24.annotation.gtf.pickle'), + mock.call('test/path/gencode.v24.annotation.gtf.gz'), + ]) + mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt') + mock_download_file.assert_not_called() + mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle') + mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) + mock_open.assert_not_called() + mock_logger.info.assert_has_calls([ + mock.call('Use the existing downloaded file test/path/gencode.v24.annotation.gtf.gz.\nIf you want to re-download it, please delete the file and re-run the pipeline.'), + mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'), + mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'), + mock.call('Got 3 gene id mapping records') + ]) + mock_pickle.load.assert_not_called() + + # bad gtf data test + mock_path_exists.side_effect = [False, False] + mock_gopen.return_value.__iter__.return_value = ['bad data'] + with self.assertRaises(ValueError) as ve: + _ = load_gencode(24, download_path=DOWNLOAD_PATH) + self.assertEqual(str(ve.exception), "Unexpected number of fields on line #0: ['bad data']") + + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.stream_gs_file') + @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer') + def test_load_gencode_using_gs(self, mock_file_writer, mock_stream_gs_file, mock_pickle, mock_path_exists, + mock_logger, mock_gzip): + + # test using saved file. + mock_path_exists.side_effect = [True] + mock_pickle.loads.return_value = GENE_ID_MAPPING + gene_id_mapping = load_gencode(25, download_path=GS_DOWNLOAD_PATH) + self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) + mock_path_exists.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') + mock_logger.info.assert_has_calls([ + mock.call('Use the existing pickle file gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle.\n' + 'If you want to reload the data, please delete it and re-run the data loading.'), + mock.call('Got 3 gene id mapping records') + ]) + mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') + mock_pickle.dump.assert_not_called() + mock_file_writer.assert_not_called() + + # test using downloaded file. + mock_path_exists.side_effect = [False, True] + mock_gzip.decompress.return_value = ''.join(GTF_DATA).encode() + mock_f = mock.MagicMock() + mock_file_writer.return_value.__enter__.return_value = mock_f, None + gene_id_mapping = load_gencode(25, download_path=GS_DOWNLOAD_PATH) + self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) + mock_path_exists.assert_has_calls([ + mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle'), + mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz'), + ]) + mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz', raw_download=True) + mock_gzip.decompress.assert_called_with(mock_stream_gs_file.return_value) + mock_file_writer.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') + mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index c8b795821..345af90e2 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -9,6 +9,7 @@ from v03_pipeline.lib.misc.validation import ( validate_allele_type, validate_expected_contig_frequency, + validate_imported_field_types, validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_sample_type, @@ -112,6 +113,24 @@ def requires(self) -> list[luigi.Task]: CallsetTask(self.callset_path), ] + def additional_row_fields(self, mt): + return { + **( + {'info.AF': hl.tarray(hl.tfloat64)} + if self.check_sex_and_relatedness + and self.dataset_type.check_sex_and_relatedness + else {} + ), + # this field is never required, the pipeline + # will run smoothly even in its absence, but + # will trigger special handling if it is present. + **( + {'info.CALIBRATION_SENSITIVITY': hl.tarray(hl.tstr)} + if hasattr(mt, 'info') and hasattr(mt.info, 'CALIBRATION_SENSITIVITY') + else {} + ), + } + def create_table(self) -> hl.MatrixTable: mt = import_callset( self.callset_path, @@ -119,7 +138,18 @@ def create_table(self) -> hl.MatrixTable: self.dataset_type, self.filters_path, ) - mt = select_relevant_fields(mt, self.dataset_type) + mt = select_relevant_fields( + mt, + self.dataset_type, + self.additional_row_fields(mt), + ) + # This validation isn't override-able. If a field is the wrong + # type, the pipeline will likely hard-fail downstream. + validate_imported_field_types( + mt, + self.dataset_type, + self.additional_row_fields(mt), + ) if self.dataset_type.has_multi_allelic_variants: mt = split_multi_hts(mt) # Special handling of variant-level filter annotation for VETs filters. diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 34ea5ee6a..5e183b05a 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -265,7 +265,7 @@ def create_table(self) -> hl.Table: ar_ht = ar_ht.union(ar_ht_chunk) new_variants_ht = new_variants_ht.join(ar_ht, 'left') - return new_variants_ht.annotate_globals( + return new_variants_ht.select_globals( updates={ hl.Struct(callset=callset_path, project_guid=project_guid) for ( diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 6b5aeced6..75b6bf7b3 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -7,7 +7,11 @@ get_families_failed_relatedness_check, get_families_failed_sex_check, ) -from v03_pipeline.lib.misc.io import does_file_exist, import_pedigree, import_remap +from v03_pipeline.lib.misc.io import ( + does_file_exist, + import_pedigree, + import_remap, +) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path @@ -176,6 +180,11 @@ def create_table(self) -> hl.MatrixTable: ), self.ignore_missing_samples_when_subsetting, ) + # Drop additional fields imported onto the intermediate callsets but + # not used when creating the downstream optimized tables. + for field in mt.row_value: + if field not in self.dataset_type.row_fields: + mt = mt.drop(field) return mt.select_globals( family_samples=( {