From aba2b942e0ff5fbdb6e6e25417ed022bf188e4c1 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Wed, 12 Mar 2025 22:59:39 -0400 Subject: [PATCH 01/10] only hard fail on samples in pedigree --- v03_pipeline/lib/misc/validation.py | 13 ++++++++++-- v03_pipeline/lib/misc/validation_test.py | 24 +++++++++++++++++++--- v03_pipeline/lib/tasks/validate_callset.py | 9 +++++++- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 063312f47..25c3404e6 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -2,6 +2,7 @@ import hail as hl +from v03_pipeline.lib.misc.pedigree import Family from v03_pipeline.lib.model import ( DatasetType, ReferenceGenome, @@ -132,6 +133,7 @@ def validate_imputed_sex_ploidy( mt: hl.MatrixTable, # NB: sex_check_ht will be undefined if sex checking is disabled for the run sex_check_ht: hl.Table | None = None, + pedigree_families: set[Family] | None = None, **_: Any, ) -> None: if not sex_check_ht: @@ -161,8 +163,15 @@ def validate_imputed_sex_ploidy( discrepant_samples = mt.aggregate_cols( hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)), ) - if discrepant_samples: - sorted_discrepant_samples = sorted(discrepant_samples) + loading_samples = ( + {sample_id for family in pedigree_families for sample_id in family.samples} + if pedigree_families + else set() + ) + discrepant_loading_samples = discrepant_samples & loading_samples + + if discrepant_loading_samples: + sorted_discrepant_samples = sorted(discrepant_loading_samples) msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}' raise SeqrValidationError( msg, diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py index b1057e1e2..eafa1e5ac 100644 --- a/v03_pipeline/lib/misc/validation_test.py +++ b/v03_pipeline/lib/misc/validation_test.py @@ -2,6 +2,7 @@ import hail as hl +from v03_pipeline.lib.misc.pedigree import Family, Sample from v03_pipeline.lib.misc.validation import ( SeqrValidationError, validate_allele_type, @@ -11,7 +12,7 @@ validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' @@ -171,7 +172,20 @@ def test_validate_imputed_sex_ploidy(self) -> None: .key_rows_by('locus') .key_cols_by('s') ) - validate_imputed_sex_ploidy(mt, sex_check_ht) + pedigree_families = { + Family( + family_guid='', + samples={ + female_sample: Sample(female_sample, Sex.FEMALE), + male_sample_1: Sample(male_sample_1, Sex.MALE), + x0_sample: Sample(x0_sample, Sex.X0), + xxy_sample: Sample(xxy_sample, Sex.XXY), + xyy_sample: Sample(xyy_sample, Sex.XYY), + xxx_sample: Sample(xxx_sample, Sex.XXX), + }, + ), + } + validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families) # All calls on Y chromosome are valid mt = ( @@ -211,7 +225,7 @@ def test_validate_imputed_sex_ploidy(self) -> None: .key_rows_by('locus') .key_cols_by('s') ) - validate_imputed_sex_ploidy(mt, sex_check_ht) + validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families) # Invalid X chromosome case mt = ( @@ -259,8 +273,12 @@ def test_validate_imputed_sex_ploidy(self) -> None: validate_imputed_sex_ploidy, mt, sex_check_ht, + pedigree_families, ) + # Invalid X chromosome case, but invalid samples are missing from pedigree + validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families=set()) + def test_validate_imported_field_types(self) -> None: mt = hl.read_matrix_table(TEST_MITO_MT) validate_imported_field_types(mt, DatasetType.MITO, {}) diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index 7d0818240..6f9f69eeb 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -2,6 +2,8 @@ import luigi import luigi.util +from v03_pipeline.lib.misc.io import import_pedigree +from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.validation import ( SeqrValidationError, validate_allele_type, @@ -19,7 +21,7 @@ from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask -from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget +from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, RawFileTask from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import ( UpdatedReferenceDatasetTask, ) @@ -52,6 +54,10 @@ def get_validation_dependencies(self) -> dict[str, hl.Table]: self.callset_path, ), ) + deps['pedigree_families'] = parse_pedigree_ht_to_families( + import_pedigree(self.input()[1].path), + ) + return deps def complete(self) -> luigi.Target: @@ -74,6 +80,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [ self.clone(WriteImportedCallsetTask), + RawFileTask(self.project_pedigree_paths[self.project_i]), ] if not self.skip_validation and self.dataset_type.can_run_validation: requirements = [ From 2615bb95369518b26bf8a3b297cafdc934412b2f Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Thu, 13 Mar 2025 16:10:13 -0400 Subject: [PATCH 02/10] make ploidy part of family validation --- .../lib/misc/family_loading_failures.py | 41 ++++ .../lib/misc/family_loading_failures_test.py | 188 ++++++++++++++++++ v03_pipeline/lib/misc/validation.py | 52 ----- v03_pipeline/lib/misc/validation_test.py | 161 +-------------- v03_pipeline/lib/tasks/validate_callset.py | 48 +---- .../write_remapped_and_subsetted_callset.py | 18 ++ ...ite_remapped_and_subsetted_callset_test.py | 31 ++- 7 files changed, 281 insertions(+), 258 deletions(-) diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index 438b1706e..a43500fbf 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ -174,3 +174,44 @@ def get_families_failed_sex_check( f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}', ) return dict(failed_families) + + +def get_families_failed_imputed_sex_ploidy( + mt: hl.MatrixTable, + sex_check_ht: hl.Table, + families: set[Family], +) -> dict[Family, str]: + mt = mt.select_cols( + discrepant=( + ( + # All calls are diploid or missing but the sex is Male + hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)) + & (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value) + ) + | ( + # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX + hl.agg.any(~mt.GT.is_diploid()) + & hl.literal( + { + Sex.FEMALE.value, + Sex.X0.value, + Sex.XYY.value, + Sex.XXY.value, + Sex.XXX.value, + }, + ).contains(sex_check_ht[mt.s].predicted_sex) + ) + ), + ) + discrepant_samples = mt.aggregate_cols( + hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)), + ) + failed_families = {} + for family in families: + discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples + if discrepant_loadable_samples: + sorted_discrepant_samples = sorted(discrepant_loadable_samples) + failed_families[ + family + ] = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}' + return failed_families diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index c9b1858b2..6e6c1c923 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -6,12 +6,14 @@ all_relatedness_checks, build_relatedness_check_lookup, build_sex_check_lookup, + get_families_failed_imputed_sex_ploidy, get_families_failed_sex_check, ) from v03_pipeline.lib.misc.io import import_pedigree from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families from v03_pipeline.lib.model import Sex +TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv' @@ -250,3 +252,189 @@ def test_get_families_failed_sex_check(self): ], ], ) + + def test_get_families_failed_imputed_sex_ploidy(self) -> None: + female_sample = 'HG00731_1' + male_sample_1 = 'HG00732_1' + male_sample_2 = 'HG00732_1' + x0_sample = 'NA20899_1' + xxy_sample = 'NA20889_1' + xyy_sample = 'NA20891_1' + xxx_sample = 'NA20892_1' + + sex_check_ht = hl.read_table(TEST_SEX_CHECK_1) + families = { + Family( + family_guid='', + samples={ + female_sample: Sample(female_sample, Sex.FEMALE), + male_sample_1: Sample(male_sample_1, Sex.MALE), + male_sample_2: Sample(male_sample_2, Sex.MALE), + x0_sample: Sample(x0_sample, Sex.X0), + xxy_sample: Sample(xxy_sample, Sex.XXY), + xyy_sample: Sample(xyy_sample, Sex.XYY), + xxx_sample: Sample(xxx_sample, Sex.XXX), + }, + ), + } + + # All calls on X chromosome are valid + mt = ( + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chrX', + position=1, + reference_genome='GRCh38', + ), + ], + }, + cols={ + 's': [ + female_sample, + male_sample_1, + x0_sample, + xxy_sample, + xyy_sample, + xxx_sample, + ], + }, + entries={ + 'GT': [ + [ + hl.Call(alleles=[0, 0], phased=False), + hl.Call(alleles=[0], phased=False), + hl.Call(alleles=[0, 0], phased=False), # X0 + hl.Call(alleles=[0, 0], phased=False), # XXY + hl.Call(alleles=[0, 0], phased=False), # XYY + hl.Call(alleles=[0, 0], phased=False), # XXX + ], + ], + }, + ) + .key_rows_by('locus') + .key_cols_by('s') + ) + failed_families = get_families_failed_imputed_sex_ploidy( + mt, + sex_check_ht, + families, + ) + self.assertDictEqual(failed_families, {}) + + # All calls on Y chromosome are valid + mt = ( + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chrY', + position=1, + reference_genome='GRCh38', + ), + ], + }, + cols={ + 's': [ + female_sample, + male_sample_1, + x0_sample, + xxy_sample, + xyy_sample, + xxx_sample, + ], + }, + entries={ + 'GT': [ + [ + hl.missing(hl.tcall), + hl.Call(alleles=[0], phased=False), + hl.missing(hl.tcall), # X0 + hl.Call(alleles=[0, 0], phased=False), # XXY + hl.Call(alleles=[0, 0], phased=False), # XYY + hl.missing(hl.tcall), # XXX + ], + ], + }, + ) + .key_rows_by('locus') + .key_cols_by('s') + ) + failed_families = get_families_failed_imputed_sex_ploidy( + mt, + sex_check_ht, + families, + ) + self.assertDictEqual(failed_families, {}) + + # Invalid X chromosome case + mt = ( + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chrX', + position=1, + reference_genome='GRCh38', + ), + ], + }, + cols={ + 's': [ + female_sample, + male_sample_1, + male_sample_2, + x0_sample, + xxy_sample, + xyy_sample, + xxx_sample, + ], + }, + entries={ + 'GT': [ + [ + hl.Call(alleles=[0], phased=False), # invalid Female call + hl.Call(alleles=[0], phased=False), # valid Male call + hl.missing(hl.tcall), # invalid Male call + hl.Call(alleles=[0], phased=False), # invalid X0 call + hl.Call(alleles=[0], phased=False), # invalid XXY call + hl.missing(hl.tcall), # valid XYY call + hl.Call(alleles=[0, 0], phased=False), # valid XXX call + ], + ], + }, + ) + .key_rows_by('locus') + .key_cols_by('s') + ) + failed_families = get_families_failed_imputed_sex_ploidy( + mt, + sex_check_ht, + families, + ) + self.assertCountEqual( + failed_families.values(), + [ + "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']", + ], + ) + + # Invalid X chromosome case, but only discrepant family samples are reported + families = { + Family( + family_guid='', + samples={female_sample: Sample(female_sample, Sex.FEMALE)}, + ), + } + failed_families = get_families_failed_imputed_sex_ploidy( + mt, + sex_check_ht, + families, + ) + self.assertCountEqual( + failed_families.values(), + [ + "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1']", + ], + ) diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 25c3404e6..4bc735c23 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -2,12 +2,10 @@ import hail as hl -from v03_pipeline.lib.misc.pedigree import Family from v03_pipeline.lib.model import ( DatasetType, ReferenceGenome, SampleType, - Sex, ) AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. @@ -129,56 +127,6 @@ def _validate_field( raise SeqrValidationError(msg) -def validate_imputed_sex_ploidy( - mt: hl.MatrixTable, - # NB: sex_check_ht will be undefined if sex checking is disabled for the run - sex_check_ht: hl.Table | None = None, - pedigree_families: set[Family] | None = None, - **_: Any, -) -> None: - if not sex_check_ht: - return - mt = mt.select_cols( - discrepant=( - ( - # All calls are diploid or missing but the sex is Male - hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)) - & (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value) - ) - | ( - # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX - hl.agg.any(~mt.GT.is_diploid()) - & hl.literal( - { - Sex.FEMALE.value, - Sex.X0.value, - Sex.XYY.value, - Sex.XXY.value, - Sex.XXX.value, - }, - ).contains(sex_check_ht[mt.s].predicted_sex) - ) - ), - ) - discrepant_samples = mt.aggregate_cols( - hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)), - ) - loading_samples = ( - {sample_id for family in pedigree_families for sample_id in family.samples} - if pedigree_families - else set() - ) - discrepant_loading_samples = discrepant_samples & loading_samples - - if discrepant_loading_samples: - sorted_discrepant_samples = sorted(discrepant_loading_samples) - msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}' - raise SeqrValidationError( - msg, - {'imputed_sex_ploidy_failures': sorted_discrepant_samples}, - ) - - def validate_sample_type( mt: hl.MatrixTable, coding_and_noncoding_variants_ht: hl.Table, diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py index eafa1e5ac..e9f5a1a84 100644 --- a/v03_pipeline/lib/misc/validation_test.py +++ b/v03_pipeline/lib/misc/validation_test.py @@ -2,19 +2,16 @@ import hail as hl -from v03_pipeline.lib.misc.pedigree import Family, Sample from v03_pipeline.lib.misc.validation import ( SeqrValidationError, validate_allele_type, validate_expected_contig_frequency, validate_imported_field_types, - validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType -TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' @@ -123,162 +120,6 @@ def test_validate_allele_type(self) -> None: DatasetType.SNV_INDEL, ) - def test_validate_imputed_sex_ploidy(self) -> None: - female_sample = 'HG00731_1' - male_sample_1 = 'HG00732_1' - male_sample_2 = 'HG00732_1' - x0_sample = 'NA20899_1' - xxy_sample = 'NA20889_1' - xyy_sample = 'NA20891_1' - xxx_sample = 'NA20892_1' - - sex_check_ht = hl.read_table(TEST_SEX_CHECK_1) - - # All calls on X chromosome are valid - mt = ( - hl.MatrixTable.from_parts( - rows={ - 'locus': [ - hl.Locus( - contig='chrX', - position=1, - reference_genome='GRCh38', - ), - ], - }, - cols={ - 's': [ - female_sample, - male_sample_1, - x0_sample, - xxy_sample, - xyy_sample, - xxx_sample, - ], - }, - entries={ - 'GT': [ - [ - hl.Call(alleles=[0, 0], phased=False), - hl.Call(alleles=[0], phased=False), - hl.Call(alleles=[0, 0], phased=False), # X0 - hl.Call(alleles=[0, 0], phased=False), # XXY - hl.Call(alleles=[0, 0], phased=False), # XYY - hl.Call(alleles=[0, 0], phased=False), # XXX - ], - ], - }, - ) - .key_rows_by('locus') - .key_cols_by('s') - ) - pedigree_families = { - Family( - family_guid='', - samples={ - female_sample: Sample(female_sample, Sex.FEMALE), - male_sample_1: Sample(male_sample_1, Sex.MALE), - x0_sample: Sample(x0_sample, Sex.X0), - xxy_sample: Sample(xxy_sample, Sex.XXY), - xyy_sample: Sample(xyy_sample, Sex.XYY), - xxx_sample: Sample(xxx_sample, Sex.XXX), - }, - ), - } - validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families) - - # All calls on Y chromosome are valid - mt = ( - hl.MatrixTable.from_parts( - rows={ - 'locus': [ - hl.Locus( - contig='chrY', - position=1, - reference_genome='GRCh38', - ), - ], - }, - cols={ - 's': [ - female_sample, - male_sample_1, - x0_sample, - xxy_sample, - xyy_sample, - xxx_sample, - ], - }, - entries={ - 'GT': [ - [ - hl.missing(hl.tcall), - hl.Call(alleles=[0], phased=False), - hl.missing(hl.tcall), # X0 - hl.Call(alleles=[0, 0], phased=False), # XXY - hl.Call(alleles=[0, 0], phased=False), # XYY - hl.missing(hl.tcall), # XXX - ], - ], - }, - ) - .key_rows_by('locus') - .key_cols_by('s') - ) - validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families) - - # Invalid X chromosome case - mt = ( - hl.MatrixTable.from_parts( - rows={ - 'locus': [ - hl.Locus( - contig='chrX', - position=1, - reference_genome='GRCh38', - ), - ], - }, - cols={ - 's': [ - female_sample, - male_sample_1, - male_sample_2, - x0_sample, - xxy_sample, - xyy_sample, - xxx_sample, - ], - }, - entries={ - 'GT': [ - [ - hl.Call(alleles=[0], phased=False), # invalid Female call - hl.Call(alleles=[0], phased=False), # valid Male call - hl.missing(hl.tcall), # invalid Male call - hl.Call(alleles=[0], phased=False), # invalid X0 call - hl.Call(alleles=[0], phased=False), # invalid XXY call - hl.missing(hl.tcall), # valid XYY call - hl.Call(alleles=[0, 0], phased=False), # valid XXX call - ], - ], - }, - ) - .key_rows_by('locus') - .key_cols_by('s') - ) - self.assertRaisesRegex( - SeqrValidationError, - "Found samples with misaligned ploidy with their provided imputed sex \\(first 10, if applicable\\) : \\['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1'\\].*", - validate_imputed_sex_ploidy, - mt, - sex_check_ht, - pedigree_families, - ) - - # Invalid X chromosome case, but invalid samples are missing from pedigree - validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families=set()) - def test_validate_imported_field_types(self) -> None: mt = hl.read_matrix_table(TEST_MITO_MT) validate_imported_field_types(mt, DatasetType.MITO, {}) diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index 6f9f69eeb..cc9cb7dd1 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -2,31 +2,25 @@ import luigi import luigi.util -from v03_pipeline.lib.misc.io import import_pedigree -from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.validation import ( SeqrValidationError, validate_allele_type, validate_expected_contig_frequency, - validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( imported_callset_path, - sex_check_table_path, valid_reference_dataset_path, ) from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask -from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, RawFileTask +from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import ( UpdatedReferenceDatasetTask, ) from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask -from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( WriteValidationErrorsForRunTask, ) @@ -42,22 +36,6 @@ def get_validation_dependencies(self) -> dict[str, hl.Table]: ReferenceDataset.gnomad_coding_and_noncoding, ), ) - if ( - FeatureFlag.CHECK_SEX_AND_RELATEDNESS - and self.dataset_type.check_sex_and_relatedness - and not self.skip_check_sex_and_relatedness - ): - deps['sex_check_ht'] = hl.read_table( - sex_check_table_path( - self.reference_genome, - self.dataset_type, - self.callset_path, - ), - ) - deps['pedigree_families'] = parse_pedigree_ht_to_families( - import_pedigree(self.input()[1].path), - ) - return deps def complete(self) -> luigi.Target: @@ -78,10 +56,7 @@ def output(self) -> luigi.Target: ) def requires(self) -> list[luigi.Task]: - requirements = [ - self.clone(WriteImportedCallsetTask), - RawFileTask(self.project_pedigree_paths[self.project_i]), - ] + requirements = [self.clone(WriteImportedCallsetTask)] if not self.skip_validation and self.dataset_type.can_run_validation: requirements = [ *requirements, @@ -92,15 +67,6 @@ def requires(self) -> list[luigi.Task]: ) ), ] - if ( - FeatureFlag.CHECK_SEX_AND_RELATEDNESS - and self.dataset_type.check_sex_and_relatedness - and not self.skip_check_sex_and_relatedness - ): - requirements = [ - *requirements, - self.clone(WriteSexCheckTableTask), - ] return [ *requirements, CallsetTask(self.callset_path), @@ -128,10 +94,14 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: callset_path=self.callset_path, validated_sample_type=self.sample_type.value, ) - validation_dependencies = self.get_validation_dependencies() + coding_and_noncoding_variants_ht = hl.read_table( + valid_reference_dataset_path( + self.reference_genome, + ReferenceDataset.gnomad_coding_and_noncoding, + ), + ) for validation_f in [ validate_allele_type, - validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_expected_contig_frequency, validate_sample_type, @@ -139,8 +109,8 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: try: validation_f( mt, + coding_and_noncoding_variants_ht=coding_and_noncoding_variants_ht, **self.param_kwargs, - **validation_dependencies, ) except SeqrValidationError as e: validation_exceptions.append(e) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 2ef35f3a1..479621601 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -3,6 +3,7 @@ import luigi.util from v03_pipeline.lib.misc.family_loading_failures import ( + get_families_failed_imputed_sex_ploidy, get_families_failed_missing_samples, get_families_failed_relatedness_check, get_families_failed_sex_check, @@ -109,6 +110,7 @@ def create_table(self) -> hl.MatrixTable: ) families_failed_relatedness_check = {} families_failed_sex_check = {} + families_failed_imputed_sex_ploidy = {} if ( FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness @@ -134,12 +136,21 @@ def create_table(self) -> hl.MatrixTable: sex_check_ht, remap_lookup, ) + families_failed_imputed_sex_ploidy = get_families_failed_imputed_sex_ploidy( + callset_mt, + sex_check_ht, + families + - families_failed_missing_samples.keys() + - families_failed_relatedness_check.keys() + - families_failed_sex_check.keys(), + ) loadable_families = ( families - families_failed_missing_samples.keys() - families_failed_relatedness_check.keys() - families_failed_sex_check.keys() + - families_failed_imputed_sex_ploidy.keys() ) if not len(loadable_families): msg = 'All families failed validation checks' @@ -154,6 +165,9 @@ def create_table(self) -> hl.MatrixTable: families_failed_relatedness_check, ), 'sex_check': format_failures(families_failed_sex_check), + 'ploidy_check': format_failures( + families_failed_imputed_sex_ploidy, + ), }, }, ) @@ -203,5 +217,9 @@ def create_table(self) -> hl.MatrixTable: format_failures(families_failed_sex_check) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), + ploidy_check=( + format_failures(families_failed_imputed_sex_ploidy) + or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) + ), ), ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index a5ed24799..afac86773 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -114,6 +114,7 @@ def test_write_remapped_and_subsetted_callset_task( missing_samples={}, relatedness_check={}, sex_check={}, + ploidy_check={}, ), family_samples={'abc_1': ['HG00731_1', 'HG00732_1', 'HG00733_1']}, ), @@ -121,7 +122,7 @@ def test_write_remapped_and_subsetted_callset_task( ) @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.FeatureFlag') - def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( + def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks( self, mock_ff: Mock, ) -> None: @@ -143,8 +144,9 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( worker.run() self.assertTrue(wrsc_task.complete()) mt = hl.read_matrix_table(wrsc_task.output().path) - # NB: one "family"/"sample" has been removed because of a failed sex check! - self.assertEqual(mt.count(), (30, 12)) + # NB: one "family"/"sample" has been removed because of a failed sex check, + # and 4 removed because of a failed ploidy check! + self.assertEqual(mt.count(), (30, 8)) self.assertEqual( mt.globals.collect(), [ @@ -157,16 +159,12 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( ), family_samples={ '123_1': ['NA19675_1'], - '234_1': ['NA19678_1'], '345_1': ['NA19679_1'], '456_1': ['NA20870_1'], - '567_1': ['NA20872_1'], '678_1': ['NA20874_1'], '789_1': ['NA20875_1'], '890_1': ['NA20876_1'], '901_1': ['NA20877_1'], - 'bcd_1': ['NA20878_1'], - 'cde_1': ['NA20881_1'], 'efg_1': ['NA20888_1'], }, failed_family_samples=hl.Struct( @@ -180,6 +178,24 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( 'samples': ['NA20885_1'], }, }, + ploidy_check={ + '234_1': hl.Struct( + samples=['NA19678_1'], + reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']", + ), + '567_1': hl.Struct( + samples=['NA20872_1'], + reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']", + ), + 'bcd_1': hl.Struct( + samples=['NA20878_1'], + reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']", + ), + 'cde_1': hl.Struct( + samples=['NA20881_1'], + reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']", + ), + }, ), ), ], @@ -337,6 +353,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( ], }, }, + 'ploidy_check': {}, }, }, ) From 3493dcfe80cbe48a0fce4b425089e7c080d3c71a Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Thu, 13 Mar 2025 16:14:01 -0400 Subject: [PATCH 03/10] remove get_validation_dependencies --- v03_pipeline/lib/tasks/validate_callset.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index cc9cb7dd1..cf1443097 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -28,16 +28,6 @@ @luigi.util.inherits(BaseLoadingRunParams) class ValidateCallsetTask(BaseUpdateTask): - def get_validation_dependencies(self) -> dict[str, hl.Table]: - deps = {} - deps['coding_and_noncoding_variants_ht'] = hl.read_table( - valid_reference_dataset_path( - self.reference_genome, - ReferenceDataset.gnomad_coding_and_noncoding, - ), - ) - return deps - def complete(self) -> luigi.Target: if super().complete(): mt = hl.read_matrix_table(self.output().path) From 2983f79e10323f2c9aec0890f21bf3b46a311f86 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Thu, 13 Mar 2025 18:24:00 -0400 Subject: [PATCH 04/10] one failure --- v03_pipeline/lib/tasks/write_project_family_tables_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index dd535f988..e93d4fada 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -137,6 +137,7 @@ def test_snv_write_project_family_tables_task(self) -> None: }, relatedness_check={}, sex_check={}, + ploidy_check={}, ), ) # Project table still contains all family guids From 049c2084a26d7745545a4c08ab52612151efb731 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Mon, 17 Mar 2025 14:37:42 -0400 Subject: [PATCH 05/10] comments --- .../lib/misc/family_loading_failures.py | 2 +- .../lib/misc/family_loading_failures_test.py | 8 +- .../write_remapped_and_subsetted_callset.py | 12 +- ...ite_remapped_and_subsetted_callset_test.py | 103 ++++++------------ 4 files changed, 43 insertions(+), 82 deletions(-) diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index a43500fbf..aa8d988ac 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ -177,9 +177,9 @@ def get_families_failed_sex_check( def get_families_failed_imputed_sex_ploidy( + families: set[Family], mt: hl.MatrixTable, sex_check_ht: hl.Table, - families: set[Family], ) -> dict[Family, str]: mt = mt.select_cols( discrepant=( diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index 6e6c1c923..58564d4b2 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -317,9 +317,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: .key_cols_by('s') ) failed_families = get_families_failed_imputed_sex_ploidy( + families, mt, sex_check_ht, - families, ) self.assertDictEqual(failed_families, {}) @@ -362,9 +362,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: .key_cols_by('s') ) failed_families = get_families_failed_imputed_sex_ploidy( + families, mt, sex_check_ht, - families, ) self.assertDictEqual(failed_families, {}) @@ -409,9 +409,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: .key_cols_by('s') ) failed_families = get_families_failed_imputed_sex_ploidy( + families, mt, sex_check_ht, - families, ) self.assertCountEqual( failed_families.values(), @@ -428,9 +428,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: ), } failed_families = get_families_failed_imputed_sex_ploidy( + families, mt, sex_check_ht, - families, ) self.assertCountEqual( failed_families.values(), diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 479621601..a4388910e 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -129,20 +129,20 @@ def create_table(self) -> hl.MatrixTable: relatedness_check_ht, remap_lookup, ) - families_failed_sex_check = get_families_failed_sex_check( + families_failed_imputed_sex_ploidy = get_families_failed_imputed_sex_ploidy( families - families_failed_missing_samples.keys() - families_failed_relatedness_check.keys(), - sex_check_ht, - remap_lookup, - ) - families_failed_imputed_sex_ploidy = get_families_failed_imputed_sex_ploidy( callset_mt, sex_check_ht, + ) + families_failed_sex_check = get_families_failed_sex_check( families - families_failed_missing_samples.keys() - families_failed_relatedness_check.keys() - - families_failed_sex_check.keys(), + - families_failed_imputed_sex_ploidy.keys(), + sex_check_ht, + remap_lookup, ) loadable_families = ( diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index afac86773..9471aafa3 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -237,123 +237,84 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( self.assertDictEqual( json.load(f), { - 'project_guids': [ - 'R0114_project4', - ], - 'error_messages': [ - 'All families failed validation checks', - ], + 'project_guids': ['R0114_project4'], + 'error_messages': ['All families failed validation checks'], 'failed_family_samples': { 'missing_samples': { 'efg_1': { - 'samples': [ - 'NA99999_1', - ], - 'reasons': [ - "Missing samples: {'NA99999_1'}", - ], + 'samples': ['NA99999_1'], + 'reasons': ["Missing samples: {'NA99999_1'}"], }, }, 'relatedness_check': {}, 'sex_check': { - '789_1': { - 'samples': [ - 'NA20875_1', - ], + '890_1': { + 'samples': ['NA20876_1'], 'reasons': [ - 'Sample NA20875_1 has pedigree sex M but imputed sex F', + 'Sample NA20876_1 has pedigree sex M but imputed sex F', ], }, '456_1': { - 'samples': [ - 'NA20870_1', - ], + 'samples': ['NA20870_1'], 'reasons': [ 'Sample NA20870_1 has pedigree sex M but imputed sex F', ], }, '123_1': { - 'samples': [ - 'NA19675_1', - ], + 'samples': ['NA19675_1'], 'reasons': [ 'Sample NA19675_1 has pedigree sex M but imputed sex F', ], }, - 'cde_1': { - 'samples': [ - 'NA20881_1', - ], + '678_1': { + 'samples': ['NA20874_1'], 'reasons': [ - 'Sample NA20881_1 has pedigree sex F but imputed sex M', + 'Sample NA20874_1 has pedigree sex M but imputed sex F', ], }, - '901_1': { - 'samples': [ - 'NA20877_1', - ], + '789_1': { + 'samples': ['NA20875_1'], 'reasons': [ - 'Sample NA20877_1 has pedigree sex M but imputed sex F', + 'Sample NA20875_1 has pedigree sex M but imputed sex F', ], }, - '678_1': { - 'samples': [ - 'NA20874_1', - ], + '901_1': { + 'samples': ['NA20877_1'], 'reasons': [ - 'Sample NA20874_1 has pedigree sex M but imputed sex F', + 'Sample NA20877_1 has pedigree sex M but imputed sex F', ], }, '345_1': { - 'samples': [ - 'NA19679_1', - ], + 'samples': ['NA19679_1'], 'reasons': [ 'Sample NA19679_1 has pedigree sex M but imputed sex F', ], }, - '890_1': { - 'samples': [ - 'NA20876_1', - ], - 'reasons': [ - 'Sample NA20876_1 has pedigree sex M but imputed sex F', - ], - }, 'def_1': { - 'samples': [ - 'NA20885_1', - ], + 'samples': ['NA20885_1'], 'reasons': [ 'Sample NA20885_1 has pedigree sex M but imputed sex F', ], }, + }, + 'ploidy_check': { + 'cde_1': { + 'samples': ['NA20881_1'], + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']", + }, '234_1': { - 'samples': [ - 'NA19678_1', - ], - 'reasons': [ - 'Sample NA19678_1 has pedigree sex F but imputed sex M', - ], + 'samples': ['NA19678_1'], + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']", }, 'bcd_1': { - 'samples': [ - 'NA20878_1', - ], - 'reasons': [ - 'Sample NA20878_1 has pedigree sex F but imputed sex M', - ], + 'samples': ['NA20878_1'], + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']", }, '567_1': { - 'samples': [ - 'NA20872_1', - ], - 'reasons': [ - 'Sample NA20872_1 has pedigree sex F but imputed sex M', - ], + 'samples': ['NA20872_1'], + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']", }, }, - 'ploidy_check': {}, }, }, ) From 3d48d8a481c83fef2b83643fea51284824ccd92e Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Mon, 17 Mar 2025 17:03:05 -0400 Subject: [PATCH 06/10] add to metadata json --- v03_pipeline/lib/tasks/write_metadata_for_run.py | 8 +++++++- v03_pipeline/lib/tasks/write_metadata_for_run_test.py | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index cc012a926..71b115c2a 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -48,6 +48,7 @@ def run(self) -> None: 'missing_samples': {}, 'relatedness_check': {}, 'sex_check': {}, + 'ploidy_check': {}, }, 'relatedness_check_file_path': relatedness_check_tsv_path( self.reference_genome, @@ -62,7 +63,12 @@ def run(self) -> None: **collected_globals['family_samples'], **metadata_json['family_samples'], } - for key in ['missing_samples', 'relatedness_check', 'sex_check']: + for key in [ + 'missing_samples', + 'relatedness_check', + 'sex_check', + 'ploidy_check', + ]: metadata_json['failed_family_samples'][key] = { **collected_globals['failed_family_samples'][key], **metadata_json['failed_family_samples'][key], diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index dc007fbcb..471bb61c5 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -50,6 +50,7 @@ def test_write_metadata_for_run_task(self) -> None: }, 'relatedness_check': {}, 'sex_check': {}, + 'ploidy_check': {}, }, 'family_samples': { 'abc_1': [ From b583a26656a2a70720f9f5a4e5a1a72201cfc186 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Tue, 18 Mar 2025 16:32:32 -0400 Subject: [PATCH 07/10] not applicable --- v03_pipeline/lib/misc/family_loading_failures.py | 2 +- .../lib/misc/family_loading_failures_test.py | 4 ++-- .../write_remapped_and_subsetted_callset_test.py | 16 ++++++++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index aa8d988ac..8c3ce310a 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ -213,5 +213,5 @@ def get_families_failed_imputed_sex_ploidy( sorted_discrepant_samples = sorted(discrepant_loadable_samples) failed_families[ family - ] = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}' + ] = f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}' return failed_families diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index 58564d4b2..a7e4ec03b 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -416,7 +416,7 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: self.assertCountEqual( failed_families.values(), [ - "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']", + "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']", ], ) @@ -435,6 +435,6 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: self.assertCountEqual( failed_families.values(), [ - "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['HG00731_1']", + "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']", ], ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 9471aafa3..e9fb4c763 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -181,19 +181,19 @@ def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks( ploidy_check={ '234_1': hl.Struct( samples=['NA19678_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']", + reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", ), '567_1': hl.Struct( samples=['NA20872_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']", + reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", ), 'bcd_1': hl.Struct( samples=['NA20878_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']", + reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", ), 'cde_1': hl.Struct( samples=['NA20881_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']", + reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", ), }, ), @@ -300,19 +300,19 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( 'ploidy_check': { 'cde_1': { 'samples': ['NA20881_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20881_1']", + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", }, '234_1': { 'samples': ['NA19678_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA19678_1']", + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", }, 'bcd_1': { 'samples': ['NA20878_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20878_1']", + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", }, '567_1': { 'samples': ['NA20872_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : ['NA20872_1']", + 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", }, }, }, From 3a6c725ff46f785d79eaf9ab2fe94e8e7de1c230 Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Tue, 18 Mar 2025 17:08:36 -0400 Subject: [PATCH 08/10] formatting --- .../lib/misc/family_loading_failures.py | 8 +-- .../lib/misc/family_loading_failures_test.py | 8 ++- ...ite_remapped_and_subsetted_callset_test.py | 64 ++++++++++++------- 3 files changed, 50 insertions(+), 30 deletions(-) diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index 8c3ce310a..f047e3f50 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ -206,12 +206,12 @@ def get_families_failed_imputed_sex_ploidy( discrepant_samples = mt.aggregate_cols( hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)), ) - failed_families = {} + failed_families = defaultdict(list) for family in families: discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples if discrepant_loadable_samples: sorted_discrepant_samples = sorted(discrepant_loadable_samples) - failed_families[ - family - ] = f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}' + failed_families[family].append( + f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}', + ) return failed_families diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index a7e4ec03b..2352542e0 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -416,7 +416,9 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: self.assertCountEqual( failed_families.values(), [ - "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']", + [ + "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']", + ], ], ) @@ -435,6 +437,8 @@ def test_get_families_failed_imputed_sex_ploidy(self) -> None: self.assertCountEqual( failed_families.values(), [ - "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']", + [ + "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']", + ], ], ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index e9fb4c763..564eb2eed 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -179,22 +179,30 @@ def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks( }, }, ploidy_check={ - '234_1': hl.Struct( - samples=['NA19678_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", - ), - '567_1': hl.Struct( - samples=['NA20872_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", - ), - 'bcd_1': hl.Struct( - samples=['NA20878_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", - ), - 'cde_1': hl.Struct( - samples=['NA20881_1'], - reasons="Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", - ), + '234_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", + ], + 'samples': ['NA19678_1'], + }, + '567_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", + ], + 'samples': ['NA20872_1'], + }, + 'bcd_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", + ], + 'samples': ['NA20878_1'], + }, + 'cde_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", + ], + 'samples': ['NA20881_1'], + }, }, ), ), @@ -298,21 +306,29 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( }, }, 'ploidy_check': { - 'cde_1': { - 'samples': ['NA20881_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", + '567_1': { + 'samples': ['NA20872_1'], + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", + ], }, '234_1': { 'samples': ['NA19678_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", + ], }, 'bcd_1': { 'samples': ['NA20878_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", + ], }, - '567_1': { - 'samples': ['NA20872_1'], - 'reasons': "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", + 'cde_1': { + 'samples': ['NA20881_1'], + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", + ], }, }, }, From 36b8f5e129e1f927876257332479e880c1a611d9 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Mar 2025 18:33:26 +0800 Subject: [PATCH 09/10] bugfix: handle missing metrics in tdr (#1060) * handle missing metrics in tdr * ruff --- .../lib/misc/terra_data_repository.py | 17 ++++++++++++---- .../lib/misc/terra_data_repository_test.py | 20 ++++++++++++++++++- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/lib/misc/terra_data_repository.py b/v03_pipeline/lib/misc/terra_data_repository.py index 2428a429c..8e4cd975a 100644 --- a/v03_pipeline/lib/misc/terra_data_repository.py +++ b/v03_pipeline/lib/misc/terra_data_repository.py @@ -10,10 +10,10 @@ from v03_pipeline.lib.misc.requests import requests_retry_session BIGQUERY_METRICS = [ - 'collaborator_sample_id', 'predicted_sex', 'contamination_rate', 'percent_bases_at_20x', + 'collaborator_sample_id', 'mean_coverage', ] BIGQUERY_RESOURCE = 'bigquery' @@ -63,9 +63,18 @@ def bq_metrics_query(bq_table_name: str) -> google.cloud.bigquery.table.RowItera msg = f'{bq_table_name} does not match expected pattern' raise ValueError(msg) client = bigquery.Client() + + # not all columns are guaranteed to be present, coalesce if missing + table_ddl = next( + client.query_and_wait( + f""" + SELECT ddl FROM `{bq_table_name}`.INFORMATION_SCHEMA.TABLES where table_name='sample'; + """, # noqa: S608 + ), + )[0] + metrics = [(m if m in table_ddl else f'NULL AS {m}') for m in BIGQUERY_METRICS] return client.query_and_wait( f""" - SELECT {','.join(BIGQUERY_METRICS)} - FROM `{bq_table_name}.sample` - """, # noqa: S608 + SELECT {','.join(metrics)} FROM `{bq_table_name}.sample`; + """, # noqa: S608 ) diff --git a/v03_pipeline/lib/misc/terra_data_repository_test.py b/v03_pipeline/lib/misc/terra_data_repository_test.py index 70f26c81a..7084bcd8b 100644 --- a/v03_pipeline/lib/misc/terra_data_repository_test.py +++ b/v03_pipeline/lib/misc/terra_data_repository_test.py @@ -2,13 +2,14 @@ import os import unittest from types import SimpleNamespace -from unittest.mock import Mock, patch +from unittest.mock import Mock, call, patch import responses from v03_pipeline.lib.misc.terra_data_repository import ( TDR_ROOT_URL, _get_dataset_ids, + bq_metrics_query, gen_bq_table_names, ) @@ -301,3 +302,20 @@ def test_gen_bq_table_names(self, _: Mock) -> None: 'datarepo-aada2e3b.datarepo_RP_3059', ], ) + + @patch('v03_pipeline.lib.misc.terra_data_repository.bigquery.Client') + def test_bq_metrics_query_missing_metrics( + self, + mock_bq_client: Mock, + _: Mock, + ) -> None: + mock_bq_client.return_value.query_and_wait.return_value = iter( + [['predicted_sex,contamination_rate,percent_bases_at_20x']], + ) + bq_metrics_query('datarepo-7242affb.datarepo_RP_3053') + self.assertEqual( + mock_bq_client.return_value.query_and_wait.mock_calls[1], + call( + '\n SELECT predicted_sex,contamination_rate,percent_bases_at_20x,NULL AS collaborator_sample_id,NULL AS mean_coverage FROM `datarepo-7242affb.datarepo_RP_3053.sample`;\n ', + ), + ) From a7701e68e1ff05268c994405185312f7227cf7b8 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 20 Mar 2025 11:24:43 -0400 Subject: [PATCH 10/10] Re-enable hl.impute_sex for WES samples --- v03_pipeline/lib/methods/sex_check.py | 50 +++++++++++++ v03_pipeline/lib/methods/sex_check_test.py | 55 ++++++++++++++ v03_pipeline/lib/model/definitions.py | 4 + .../lib/tasks/write_imported_callset.py | 12 --- .../lib/tasks/write_metadata_for_run_test.py | 2 +- .../lib/tasks/write_sex_check_table.py | 62 +++++++++++++--- .../lib/tasks/write_sex_check_table_test.py | 74 ++++++++++++++++++- 7 files changed, 235 insertions(+), 24 deletions(-) create mode 100644 v03_pipeline/lib/methods/sex_check.py create mode 100644 v03_pipeline/lib/methods/sex_check_test.py diff --git a/v03_pipeline/lib/methods/sex_check.py b/v03_pipeline/lib/methods/sex_check.py new file mode 100644 index 000000000..0d9aced28 --- /dev/null +++ b/v03_pipeline/lib/methods/sex_check.py @@ -0,0 +1,50 @@ +import hail as hl + +from v03_pipeline.lib.model import Sex + +AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. +AAF_THRESHOLD: float = 0.05 # Alternate allele frequency threshold for `hl.impute_sex`. +BIALLELIC: int = 2 +XX_FSTAT_THRESHOLD: float = ( + 0.5 # F-stat threshold below which a sample will be called XX +) +XY_FSTAT_THRESHOLD: float = ( + 0.75 # F-stat threshold above which a sample will be called XY. +) + + +def compute_sex_check_ht(mt: hl.MatrixTable) -> hl.Table: + # Filter to SNVs and biallelics + # NB: We should already have filtered biallelics, but just in case. + mt = mt.filter_rows( + (hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]), + ) + mt = mt.filter_cols(hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))) + + # Filter to PASS variants only (variants with empty or missing filter set) + mt = mt.filter_rows( + hl.is_missing(mt.filters) | (mt.filters.length() == 0), + keep=True, + ) + impute_sex_ht = hl.impute_sex( + mt.GT, + male_threshold=XY_FSTAT_THRESHOLD, + female_threshold=XX_FSTAT_THRESHOLD, + aaf_threshold=AAF_THRESHOLD, + ) + ht = mt.annotate_cols(**impute_sex_ht[mt.col_key]).cols() + ht = ht.select( + predicted_sex=( + hl.case() + .when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value) + .when(ht.is_female, Sex.FEMALE.value) + .default(Sex.MALE.value) + ), + ) + ambiguous_perc = ht.aggregate( + hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value), + ) + if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC: + msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous. Please contact the methods team to investigate the callset.' + raise ValueError(msg) + return ht diff --git a/v03_pipeline/lib/methods/sex_check_test.py b/v03_pipeline/lib/methods/sex_check_test.py new file mode 100644 index 000000000..f87c07012 --- /dev/null +++ b/v03_pipeline/lib/methods/sex_check_test.py @@ -0,0 +1,55 @@ +import unittest +from unittest.mock import patch + +import hail as hl + +from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht + +TEST_SEX_AND_RELATEDNESS_CALLSET_MT = ( + 'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt' +) +TEST_PEDIGREE = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv' + + +class SexCheckTest(unittest.TestCase): + def test_compute_sex_check_ht(self): + mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT) + ht = compute_sex_check_ht(mt) + self.assertCountEqual( + ht.collect(), + [ + hl.Struct( + s='ROS_006_18Y03226_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03227_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03228_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05919_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05939_D1', + predicted_sex='F', + ), + hl.Struct( + s='ROS_007_19Y05987_D1', + predicted_sex='M', + ), + ], + ) + + def test_compute_sex_check_ht_ambiguous(self): + mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT) + with patch('v03_pipeline.lib.methods.sex_check.XY_FSTAT_THRESHOLD', 0.95): + self.assertRaises( + ValueError, + compute_sex_check_ht, + mt, + ) diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index 171686e29..216a37c4f 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -165,3 +165,7 @@ def allele_registry_gnomad_id(self) -> str: class SampleType(StrEnum): WES = 'WES' WGS = 'WGS' + + @property + def predicted_sex_from_tdr(self): + return self == SampleType.WGS diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 75c62aeff..e547217d9 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -21,7 +21,6 @@ from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget -from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( with_persisted_validation_errors, ) @@ -60,17 +59,6 @@ def requires(self) -> list[luigi.Task]: ), ), ] - if ( - FeatureFlag.EXPECT_TDR_METRICS - and not self.skip_expect_tdr_metrics - and self.dataset_type.expect_tdr_metrics( - self.reference_genome, - ) - ): - requirements = [ - *requirements, - self.clone(WriteTDRMetricsFilesTask), - ] return [ *requirements, CallsetTask(self.callset_path), diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index d1c9b1cf1..41e8e1c30 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -24,7 +24,7 @@ class WriteMetadataForRunTaskTest(MockedDatarootTestCase): ) @mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag') @mock.patch( - 'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask', + 'v03_pipeline.lib.tasks.write_sex_check_table.WriteTDRMetricsFilesTask', ) def test_write_metadata_for_run_task( self, diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py index f87233507..5a9ed6531 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table.py @@ -2,16 +2,37 @@ import hailtop.fs as hfs import luigi +from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht from v03_pipeline.lib.misc.io import import_imputed_sex -from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_dir +from v03_pipeline.lib.model.feature_flag import FeatureFlag +from v03_pipeline.lib.paths import ( + imported_callset_path, + sex_check_table_path, + tdr_metrics_dir, +) +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask +@luigi.util.inherits(BaseLoadingRunParams) class WriteSexCheckTableTask(BaseWriteTask): callset_path = luigi.Parameter() + @property + def predicted_sex_from_tdr(self): + # complicated enough to need a helper :/ + return ( + FeatureFlag.EXPECT_TDR_METRICS + and not self.skip_expect_tdr_metrics + and self.dataset_type.expect_tdr_metrics( + self.reference_genome, + ) + and self.sample_type.predicted_sex_from_tdr + ) + def output(self) -> luigi.Target: return GCSorLocalTarget( sex_check_table_path( @@ -21,16 +42,37 @@ def output(self) -> luigi.Target: ), ) - def requires(self) -> luigi.Task: - return self.clone(WriteTDRMetricsFilesTask) + def requires(self) -> list[luigi.Task]: + requirements = [] + if self.predicted_sex_from_tdr: + requirements = [ + *requirements, + self.clone(WriteTDRMetricsFilesTask), + ] + else: + requirements = [ + *requirements, + self.clone(WriteImportedCallsetTask), + ] + return requirements def create_table(self) -> hl.Table: ht = None - for tdr_metrics_file in hfs.ls( - tdr_metrics_dir(self.reference_genome, self.dataset_type), - ): - if not ht: - ht = import_imputed_sex(tdr_metrics_file.path) - continue - ht = ht.union(import_imputed_sex(tdr_metrics_file.path)) + if self.predicted_sex_from_tdr: + for tdr_metrics_file in hfs.ls( + tdr_metrics_dir(self.reference_genome, self.dataset_type), + ): + if not ht: + ht = import_imputed_sex(tdr_metrics_file.path) + continue + ht = ht.union(import_imputed_sex(tdr_metrics_file.path)) + else: + mt = hl.read_matrix_table( + imported_callset_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) + ht = compute_sex_check_ht(mt) return ht diff --git a/v03_pipeline/lib/tasks/write_sex_check_table_test.py b/v03_pipeline/lib/tasks/write_sex_check_table_test.py index fdf72d335..3b3e7bd4d 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table_test.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table_test.py @@ -5,22 +5,31 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_path from v03_pipeline.lib.tasks.write_sex_check_table import ( WriteSexCheckTableTask, ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +TEST_SEX_AND_RELATEDNESS_CALLSET_MT = ( + 'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt' +) + class WriteSexCheckTableTaskTest(MockedDatarootTestCase): @patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names') @patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query') + @patch( + 'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag', + ) def test_snv_sex_check_table_task( self, + mock_ff: Mock, mock_bq_metrics_query: Mock, mock_gen_bq_table_names: Mock, ) -> None: + mock_ff.EXPECT_TDR_METRICS = True mock_gen_bq_table_names.return_value = [ 'datarepo-7242affb.datarepo_RP_3053', 'datarepo-5a72e31b.datarepo_RP_3056', @@ -111,7 +120,12 @@ def test_snv_sex_check_table_task( write_sex_check_table = WriteSexCheckTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, callset_path='na', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-03', ) worker.add(write_sex_check_table) worker.run() @@ -143,3 +157,61 @@ def test_snv_sex_check_table_task( ), ) as f: self.assertTrue('collaborator_sample_id' in f.read()) + + @patch( + 'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag', + ) + def test_snv_wes_sex_check_table_task( + self, + mock_ff: Mock, + ) -> None: + mock_ff.EXPECT_TDR_METRICS = True + worker = luigi.worker.Worker() + write_sex_check_table = WriteSexCheckTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WES, + callset_path=TEST_SEX_AND_RELATEDNESS_CALLSET_MT, + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-04', + ) + worker.add(write_sex_check_table) + worker.run() + sex_check_ht = hl.read_table( + sex_check_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_SEX_AND_RELATEDNESS_CALLSET_MT, + ), + ) + self.assertCountEqual( + sex_check_ht.collect(), + [ + hl.Struct( + s='ROS_006_18Y03226_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03227_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03228_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05919_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05939_D1', + predicted_sex='F', + ), + hl.Struct( + s='ROS_007_19Y05987_D1', + predicted_sex='M', + ), + ], + )