diff --git a/v03_pipeline/lib/methods/sex_check.py b/v03_pipeline/lib/methods/sex_check.py new file mode 100644 index 000000000..0d9aced28 --- /dev/null +++ b/v03_pipeline/lib/methods/sex_check.py @@ -0,0 +1,50 @@ +import hail as hl + +from v03_pipeline.lib.model import Sex + +AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. +AAF_THRESHOLD: float = 0.05 # Alternate allele frequency threshold for `hl.impute_sex`. +BIALLELIC: int = 2 +XX_FSTAT_THRESHOLD: float = ( + 0.5 # F-stat threshold below which a sample will be called XX +) +XY_FSTAT_THRESHOLD: float = ( + 0.75 # F-stat threshold above which a sample will be called XY. +) + + +def compute_sex_check_ht(mt: hl.MatrixTable) -> hl.Table: + # Filter to SNVs and biallelics + # NB: We should already have filtered biallelics, but just in case. + mt = mt.filter_rows( + (hl.len(mt.alleles) == BIALLELIC) & hl.is_snp(mt.alleles[0], mt.alleles[1]), + ) + mt = mt.filter_cols(hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT))) + + # Filter to PASS variants only (variants with empty or missing filter set) + mt = mt.filter_rows( + hl.is_missing(mt.filters) | (mt.filters.length() == 0), + keep=True, + ) + impute_sex_ht = hl.impute_sex( + mt.GT, + male_threshold=XY_FSTAT_THRESHOLD, + female_threshold=XX_FSTAT_THRESHOLD, + aaf_threshold=AAF_THRESHOLD, + ) + ht = mt.annotate_cols(**impute_sex_ht[mt.col_key]).cols() + ht = ht.select( + predicted_sex=( + hl.case() + .when(hl.is_missing(ht.is_female), Sex.UNKNOWN.value) + .when(ht.is_female, Sex.FEMALE.value) + .default(Sex.MALE.value) + ), + ) + ambiguous_perc = ht.aggregate( + hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value), + ) + if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC: + msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous. Please contact the methods team to investigate the callset.' + raise ValueError(msg) + return ht diff --git a/v03_pipeline/lib/methods/sex_check_test.py b/v03_pipeline/lib/methods/sex_check_test.py new file mode 100644 index 000000000..f87c07012 --- /dev/null +++ b/v03_pipeline/lib/methods/sex_check_test.py @@ -0,0 +1,55 @@ +import unittest +from unittest.mock import patch + +import hail as hl + +from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht + +TEST_SEX_AND_RELATEDNESS_CALLSET_MT = ( + 'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt' +) +TEST_PEDIGREE = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv' + + +class SexCheckTest(unittest.TestCase): + def test_compute_sex_check_ht(self): + mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT) + ht = compute_sex_check_ht(mt) + self.assertCountEqual( + ht.collect(), + [ + hl.Struct( + s='ROS_006_18Y03226_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03227_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03228_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05919_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05939_D1', + predicted_sex='F', + ), + hl.Struct( + s='ROS_007_19Y05987_D1', + predicted_sex='M', + ), + ], + ) + + def test_compute_sex_check_ht_ambiguous(self): + mt = hl.read_matrix_table(TEST_SEX_AND_RELATEDNESS_CALLSET_MT) + with patch('v03_pipeline.lib.methods.sex_check.XY_FSTAT_THRESHOLD', 0.95): + self.assertRaises( + ValueError, + compute_sex_check_ht, + mt, + ) diff --git a/v03_pipeline/lib/misc/family_loading_failures.py b/v03_pipeline/lib/misc/family_loading_failures.py index 438b1706e..f047e3f50 100644 --- a/v03_pipeline/lib/misc/family_loading_failures.py +++ b/v03_pipeline/lib/misc/family_loading_failures.py @@ -174,3 +174,44 @@ def get_families_failed_sex_check( f'Sample {sample_id} has pedigree sex {family.samples[sample_id].sex.value} but imputed sex {sex_check_lookup[sample_id].value}', ) return dict(failed_families) + + +def get_families_failed_imputed_sex_ploidy( + families: set[Family], + mt: hl.MatrixTable, + sex_check_ht: hl.Table, +) -> dict[Family, str]: + mt = mt.select_cols( + discrepant=( + ( + # All calls are diploid or missing but the sex is Male + hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)) + & (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value) + ) + | ( + # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX + hl.agg.any(~mt.GT.is_diploid()) + & hl.literal( + { + Sex.FEMALE.value, + Sex.X0.value, + Sex.XYY.value, + Sex.XXY.value, + Sex.XXX.value, + }, + ).contains(sex_check_ht[mt.s].predicted_sex) + ) + ), + ) + discrepant_samples = mt.aggregate_cols( + hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)), + ) + failed_families = defaultdict(list) + for family in families: + discrepant_loadable_samples = set(family.samples.keys()) & discrepant_samples + if discrepant_loadable_samples: + sorted_discrepant_samples = sorted(discrepant_loadable_samples) + failed_families[family].append( + f'Found samples with misaligned ploidy with their provided imputed sex: {sorted_discrepant_samples}', + ) + return failed_families diff --git a/v03_pipeline/lib/misc/family_loading_failures_test.py b/v03_pipeline/lib/misc/family_loading_failures_test.py index c9b1858b2..2352542e0 100644 --- a/v03_pipeline/lib/misc/family_loading_failures_test.py +++ b/v03_pipeline/lib/misc/family_loading_failures_test.py @@ -6,12 +6,14 @@ all_relatedness_checks, build_relatedness_check_lookup, build_sex_check_lookup, + get_families_failed_imputed_sex_ploidy, get_families_failed_sex_check, ) from v03_pipeline.lib.misc.io import import_pedigree from v03_pipeline.lib.misc.pedigree import Family, Sample, parse_pedigree_ht_to_families from v03_pipeline.lib.model import Sex +TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' TEST_PEDIGREE_6 = 'v03_pipeline/var/test/pedigrees/test_pedigree_6.tsv' @@ -250,3 +252,193 @@ def test_get_families_failed_sex_check(self): ], ], ) + + def test_get_families_failed_imputed_sex_ploidy(self) -> None: + female_sample = 'HG00731_1' + male_sample_1 = 'HG00732_1' + male_sample_2 = 'HG00732_1' + x0_sample = 'NA20899_1' + xxy_sample = 'NA20889_1' + xyy_sample = 'NA20891_1' + xxx_sample = 'NA20892_1' + + sex_check_ht = hl.read_table(TEST_SEX_CHECK_1) + families = { + Family( + family_guid='', + samples={ + female_sample: Sample(female_sample, Sex.FEMALE), + male_sample_1: Sample(male_sample_1, Sex.MALE), + male_sample_2: Sample(male_sample_2, Sex.MALE), + x0_sample: Sample(x0_sample, Sex.X0), + xxy_sample: Sample(xxy_sample, Sex.XXY), + xyy_sample: Sample(xyy_sample, Sex.XYY), + xxx_sample: Sample(xxx_sample, Sex.XXX), + }, + ), + } + + # All calls on X chromosome are valid + mt = ( + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chrX', + position=1, + reference_genome='GRCh38', + ), + ], + }, + cols={ + 's': [ + female_sample, + male_sample_1, + x0_sample, + xxy_sample, + xyy_sample, + xxx_sample, + ], + }, + entries={ + 'GT': [ + [ + hl.Call(alleles=[0, 0], phased=False), + hl.Call(alleles=[0], phased=False), + hl.Call(alleles=[0, 0], phased=False), # X0 + hl.Call(alleles=[0, 0], phased=False), # XXY + hl.Call(alleles=[0, 0], phased=False), # XYY + hl.Call(alleles=[0, 0], phased=False), # XXX + ], + ], + }, + ) + .key_rows_by('locus') + .key_cols_by('s') + ) + failed_families = get_families_failed_imputed_sex_ploidy( + families, + mt, + sex_check_ht, + ) + self.assertDictEqual(failed_families, {}) + + # All calls on Y chromosome are valid + mt = ( + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chrY', + position=1, + reference_genome='GRCh38', + ), + ], + }, + cols={ + 's': [ + female_sample, + male_sample_1, + x0_sample, + xxy_sample, + xyy_sample, + xxx_sample, + ], + }, + entries={ + 'GT': [ + [ + hl.missing(hl.tcall), + hl.Call(alleles=[0], phased=False), + hl.missing(hl.tcall), # X0 + hl.Call(alleles=[0, 0], phased=False), # XXY + hl.Call(alleles=[0, 0], phased=False), # XYY + hl.missing(hl.tcall), # XXX + ], + ], + }, + ) + .key_rows_by('locus') + .key_cols_by('s') + ) + failed_families = get_families_failed_imputed_sex_ploidy( + families, + mt, + sex_check_ht, + ) + self.assertDictEqual(failed_families, {}) + + # Invalid X chromosome case + mt = ( + hl.MatrixTable.from_parts( + rows={ + 'locus': [ + hl.Locus( + contig='chrX', + position=1, + reference_genome='GRCh38', + ), + ], + }, + cols={ + 's': [ + female_sample, + male_sample_1, + male_sample_2, + x0_sample, + xxy_sample, + xyy_sample, + xxx_sample, + ], + }, + entries={ + 'GT': [ + [ + hl.Call(alleles=[0], phased=False), # invalid Female call + hl.Call(alleles=[0], phased=False), # valid Male call + hl.missing(hl.tcall), # invalid Male call + hl.Call(alleles=[0], phased=False), # invalid X0 call + hl.Call(alleles=[0], phased=False), # invalid XXY call + hl.missing(hl.tcall), # valid XYY call + hl.Call(alleles=[0, 0], phased=False), # valid XXX call + ], + ], + }, + ) + .key_rows_by('locus') + .key_cols_by('s') + ) + failed_families = get_families_failed_imputed_sex_ploidy( + families, + mt, + sex_check_ht, + ) + self.assertCountEqual( + failed_families.values(), + [ + [ + "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1']", + ], + ], + ) + + # Invalid X chromosome case, but only discrepant family samples are reported + families = { + Family( + family_guid='', + samples={female_sample: Sample(female_sample, Sex.FEMALE)}, + ), + } + failed_families = get_families_failed_imputed_sex_ploidy( + families, + mt, + sex_check_ht, + ) + self.assertCountEqual( + failed_families.values(), + [ + [ + "Found samples with misaligned ploidy with their provided imputed sex: ['HG00731_1']", + ], + ], + ) diff --git a/v03_pipeline/lib/misc/terra_data_repository.py b/v03_pipeline/lib/misc/terra_data_repository.py index 2428a429c..8e4cd975a 100644 --- a/v03_pipeline/lib/misc/terra_data_repository.py +++ b/v03_pipeline/lib/misc/terra_data_repository.py @@ -10,10 +10,10 @@ from v03_pipeline.lib.misc.requests import requests_retry_session BIGQUERY_METRICS = [ - 'collaborator_sample_id', 'predicted_sex', 'contamination_rate', 'percent_bases_at_20x', + 'collaborator_sample_id', 'mean_coverage', ] BIGQUERY_RESOURCE = 'bigquery' @@ -63,9 +63,18 @@ def bq_metrics_query(bq_table_name: str) -> google.cloud.bigquery.table.RowItera msg = f'{bq_table_name} does not match expected pattern' raise ValueError(msg) client = bigquery.Client() + + # not all columns are guaranteed to be present, coalesce if missing + table_ddl = next( + client.query_and_wait( + f""" + SELECT ddl FROM `{bq_table_name}`.INFORMATION_SCHEMA.TABLES where table_name='sample'; + """, # noqa: S608 + ), + )[0] + metrics = [(m if m in table_ddl else f'NULL AS {m}') for m in BIGQUERY_METRICS] return client.query_and_wait( f""" - SELECT {','.join(BIGQUERY_METRICS)} - FROM `{bq_table_name}.sample` - """, # noqa: S608 + SELECT {','.join(metrics)} FROM `{bq_table_name}.sample`; + """, # noqa: S608 ) diff --git a/v03_pipeline/lib/misc/terra_data_repository_test.py b/v03_pipeline/lib/misc/terra_data_repository_test.py index 70f26c81a..7084bcd8b 100644 --- a/v03_pipeline/lib/misc/terra_data_repository_test.py +++ b/v03_pipeline/lib/misc/terra_data_repository_test.py @@ -2,13 +2,14 @@ import os import unittest from types import SimpleNamespace -from unittest.mock import Mock, patch +from unittest.mock import Mock, call, patch import responses from v03_pipeline.lib.misc.terra_data_repository import ( TDR_ROOT_URL, _get_dataset_ids, + bq_metrics_query, gen_bq_table_names, ) @@ -301,3 +302,20 @@ def test_gen_bq_table_names(self, _: Mock) -> None: 'datarepo-aada2e3b.datarepo_RP_3059', ], ) + + @patch('v03_pipeline.lib.misc.terra_data_repository.bigquery.Client') + def test_bq_metrics_query_missing_metrics( + self, + mock_bq_client: Mock, + _: Mock, + ) -> None: + mock_bq_client.return_value.query_and_wait.return_value = iter( + [['predicted_sex,contamination_rate,percent_bases_at_20x']], + ) + bq_metrics_query('datarepo-7242affb.datarepo_RP_3053') + self.assertEqual( + mock_bq_client.return_value.query_and_wait.mock_calls[1], + call( + '\n SELECT predicted_sex,contamination_rate,percent_bases_at_20x,NULL AS collaborator_sample_id,NULL AS mean_coverage FROM `datarepo-7242affb.datarepo_RP_3053.sample`;\n ', + ), + ) diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 58a9c681c..377bd7f88 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -6,7 +6,6 @@ DatasetType, ReferenceGenome, SampleType, - Sex, ) AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. @@ -148,48 +147,6 @@ def _validate_field( raise SeqrValidationError(msg) -def validate_imputed_sex_ploidy( - mt: hl.MatrixTable, - # NB: sex_check_ht will be undefined if sex checking is disabled for the run - sex_check_ht: hl.Table | None = None, - **_: Any, -) -> None: - if not sex_check_ht: - return - mt = mt.select_cols( - discrepant=( - ( - # All calls are diploid or missing but the sex is Male - hl.agg.all(mt.GT.is_diploid() | hl.is_missing(mt.GT)) - & (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value) - ) - | ( - # At least one call is haploid but the sex is Female, X0, XXY, XYY, or XXX - hl.agg.any(~mt.GT.is_diploid()) - & hl.literal( - { - Sex.FEMALE.value, - Sex.X0.value, - Sex.XYY.value, - Sex.XXY.value, - Sex.XXX.value, - }, - ).contains(sex_check_ht[mt.s].predicted_sex) - ) - ), - ) - discrepant_samples = mt.aggregate_cols( - hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)), - ) - if discrepant_samples: - sorted_discrepant_samples = sorted(discrepant_samples) - msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}' - raise SeqrValidationError( - msg, - {'imputed_sex_ploidy_failures': sorted_discrepant_samples}, - ) - - def validate_sample_type( mt: hl.MatrixTable, coding_and_noncoding_variants_ht: hl.Table, diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py index 1a9807b77..4a4b745a6 100644 --- a/v03_pipeline/lib/misc/validation_test.py +++ b/v03_pipeline/lib/misc/validation_test.py @@ -8,13 +8,11 @@ validate_allele_type, validate_expected_contig_frequency, validate_imported_field_types, - validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_sample_type, ) from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType -TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' @@ -180,145 +178,6 @@ def test_validate_allele_depth_length(self) -> None: DatasetType.SNV_INDEL, ) - def test_validate_imputed_sex_ploidy(self) -> None: - female_sample = 'HG00731_1' - male_sample_1 = 'HG00732_1' - male_sample_2 = 'HG00732_1' - x0_sample = 'NA20899_1' - xxy_sample = 'NA20889_1' - xyy_sample = 'NA20891_1' - xxx_sample = 'NA20892_1' - - sex_check_ht = hl.read_table(TEST_SEX_CHECK_1) - - # All calls on X chromosome are valid - mt = ( - hl.MatrixTable.from_parts( - rows={ - 'locus': [ - hl.Locus( - contig='chrX', - position=1, - reference_genome='GRCh38', - ), - ], - }, - cols={ - 's': [ - female_sample, - male_sample_1, - x0_sample, - xxy_sample, - xyy_sample, - xxx_sample, - ], - }, - entries={ - 'GT': [ - [ - hl.Call(alleles=[0, 0], phased=False), - hl.Call(alleles=[0], phased=False), - hl.Call(alleles=[0, 0], phased=False), # X0 - hl.Call(alleles=[0, 0], phased=False), # XXY - hl.Call(alleles=[0, 0], phased=False), # XYY - hl.Call(alleles=[0, 0], phased=False), # XXX - ], - ], - }, - ) - .key_rows_by('locus') - .key_cols_by('s') - ) - validate_imputed_sex_ploidy(mt, sex_check_ht) - - # All calls on Y chromosome are valid - mt = ( - hl.MatrixTable.from_parts( - rows={ - 'locus': [ - hl.Locus( - contig='chrY', - position=1, - reference_genome='GRCh38', - ), - ], - }, - cols={ - 's': [ - female_sample, - male_sample_1, - x0_sample, - xxy_sample, - xyy_sample, - xxx_sample, - ], - }, - entries={ - 'GT': [ - [ - hl.missing(hl.tcall), - hl.Call(alleles=[0], phased=False), - hl.missing(hl.tcall), # X0 - hl.Call(alleles=[0, 0], phased=False), # XXY - hl.Call(alleles=[0, 0], phased=False), # XYY - hl.missing(hl.tcall), # XXX - ], - ], - }, - ) - .key_rows_by('locus') - .key_cols_by('s') - ) - validate_imputed_sex_ploidy(mt, sex_check_ht) - - # Invalid X chromosome case - mt = ( - hl.MatrixTable.from_parts( - rows={ - 'locus': [ - hl.Locus( - contig='chrX', - position=1, - reference_genome='GRCh38', - ), - ], - }, - cols={ - 's': [ - female_sample, - male_sample_1, - male_sample_2, - x0_sample, - xxy_sample, - xyy_sample, - xxx_sample, - ], - }, - entries={ - 'GT': [ - [ - hl.Call(alleles=[0], phased=False), # invalid Female call - hl.Call(alleles=[0], phased=False), # valid Male call - hl.missing(hl.tcall), # invalid Male call - hl.Call(alleles=[0], phased=False), # invalid X0 call - hl.Call(alleles=[0], phased=False), # invalid XXY call - hl.missing(hl.tcall), # valid XYY call - hl.Call(alleles=[0, 0], phased=False), # valid XXX call - ], - ], - }, - ) - .key_rows_by('locus') - .key_cols_by('s') - ) - self.assertRaisesRegex( - SeqrValidationError, - "Found samples with misaligned ploidy with their provided imputed sex \\(first 10, if applicable\\) : \\['HG00731_1', 'HG00732_1', 'NA20889_1', 'NA20899_1'\\].*", - validate_imputed_sex_ploidy, - mt, - sex_check_ht, - ) - def test_validate_imported_field_types(self) -> None: mt = hl.read_matrix_table(TEST_MITO_MT) validate_imported_field_types(mt, DatasetType.MITO, {}) diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index 171686e29..216a37c4f 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -165,3 +165,7 @@ def allele_registry_gnomad_id(self) -> str: class SampleType(StrEnum): WES = 'WES' WGS = 'WGS' + + @property + def predicted_sex_from_tdr(self): + return self == SampleType.WGS diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index a93f7fb9b..8d4ff674c 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -7,14 +7,11 @@ validate_allele_depth_length, validate_allele_type, validate_expected_contig_frequency, - validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model.feature_flag import FeatureFlag from v03_pipeline.lib.paths import ( imported_callset_path, - sex_check_table_path, valid_reference_dataset_path, ) from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset @@ -25,7 +22,6 @@ UpdatedReferenceDatasetTask, ) from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask -from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( WriteValidationErrorsForRunTask, ) @@ -33,28 +29,6 @@ @luigi.util.inherits(BaseLoadingRunParams) class ValidateCallsetTask(BaseUpdateTask): - def get_validation_dependencies(self) -> dict[str, hl.Table]: - deps = {} - deps['coding_and_noncoding_variants_ht'] = hl.read_table( - valid_reference_dataset_path( - self.reference_genome, - ReferenceDataset.gnomad_coding_and_noncoding, - ), - ) - if ( - FeatureFlag.CHECK_SEX_AND_RELATEDNESS - and self.dataset_type.check_sex_and_relatedness - and not self.skip_check_sex_and_relatedness - ): - deps['sex_check_ht'] = hl.read_table( - sex_check_table_path( - self.reference_genome, - self.dataset_type, - self.callset_path, - ), - ) - return deps - def complete(self) -> luigi.Target: if super().complete(): mt = hl.read_matrix_table(self.output().path) @@ -73,9 +47,7 @@ def output(self) -> luigi.Target: ) def requires(self) -> list[luigi.Task]: - requirements = [ - self.clone(WriteImportedCallsetTask), - ] + requirements = [self.clone(WriteImportedCallsetTask)] if not self.skip_validation and self.dataset_type.can_run_validation: requirements = [ *requirements, @@ -86,15 +58,6 @@ def requires(self) -> list[luigi.Task]: ) ), ] - if ( - FeatureFlag.CHECK_SEX_AND_RELATEDNESS - and self.dataset_type.check_sex_and_relatedness - and not self.skip_check_sex_and_relatedness - ): - requirements = [ - *requirements, - self.clone(WriteSexCheckTableTask), - ] return [ *requirements, CallsetTask(self.callset_path), @@ -122,11 +85,15 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: callset_path=self.callset_path, validated_sample_type=self.sample_type.value, ) - validation_dependencies = self.get_validation_dependencies() + coding_and_noncoding_variants_ht = hl.read_table( + valid_reference_dataset_path( + self.reference_genome, + ReferenceDataset.gnomad_coding_and_noncoding, + ), + ) for validation_f in [ validate_allele_depth_length, validate_allele_type, - validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_expected_contig_frequency, validate_sample_type, @@ -134,8 +101,8 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: try: validation_f( mt, + coding_and_noncoding_variants_ht=coding_and_noncoding_variants_ht, **self.param_kwargs, - **validation_dependencies, ) except SeqrValidationError as e: validation_exceptions.append(e) diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index 75c62aeff..e547217d9 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -21,7 +21,6 @@ from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget -from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( with_persisted_validation_errors, ) @@ -60,17 +59,6 @@ def requires(self) -> list[luigi.Task]: ), ), ] - if ( - FeatureFlag.EXPECT_TDR_METRICS - and not self.skip_expect_tdr_metrics - and self.dataset_type.expect_tdr_metrics( - self.reference_genome, - ) - ): - requirements = [ - *requirements, - self.clone(WriteTDRMetricsFilesTask), - ] return [ *requirements, CallsetTask(self.callset_path), diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index 513422ac8..6a5969f36 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -50,6 +50,7 @@ def run(self) -> None: 'missing_samples': {}, 'relatedness_check': {}, 'sex_check': {}, + 'ploidy_check': {}, }, 'relatedness_check_file_path': relatedness_check_tsv_path( self.reference_genome, @@ -65,7 +66,12 @@ def run(self) -> None: **collected_globals['family_samples'], **metadata_json['family_samples'], } - for key in ['missing_samples', 'relatedness_check', 'sex_check']: + for key in [ + 'missing_samples', + 'relatedness_check', + 'sex_check', + 'ploidy_check', + ]: metadata_json['failed_family_samples'][key] = { **collected_globals['failed_family_samples'][key], **metadata_json['failed_family_samples'][key], diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index 72c4f6fb3..41e8e1c30 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -24,7 +24,7 @@ class WriteMetadataForRunTaskTest(MockedDatarootTestCase): ) @mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag') @mock.patch( - 'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask', + 'v03_pipeline.lib.tasks.write_sex_check_table.WriteTDRMetricsFilesTask', ) def test_write_metadata_for_run_task( self, @@ -68,6 +68,7 @@ def test_write_metadata_for_run_task( }, 'relatedness_check': {}, 'sex_check': {}, + 'ploidy_check': {}, }, 'family_samples': { 'abc_1': [ diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index dd535f988..e93d4fada 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -137,6 +137,7 @@ def test_snv_write_project_family_tables_task(self) -> None: }, relatedness_check={}, sex_check={}, + ploidy_check={}, ), ) # Project table still contains all family guids diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index f689e61eb..85642c1ed 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -3,6 +3,7 @@ import luigi.util from v03_pipeline.lib.misc.family_loading_failures import ( + get_families_failed_imputed_sex_ploidy, get_families_failed_missing_samples, get_families_failed_relatedness_check, get_families_failed_sex_check, @@ -121,6 +122,7 @@ def create_table(self) -> hl.MatrixTable: ) families_failed_relatedness_check = {} families_failed_sex_check = {} + families_failed_imputed_sex_ploidy = {} if ( FeatureFlag.CHECK_SEX_AND_RELATEDNESS and self.dataset_type.check_sex_and_relatedness @@ -139,10 +141,18 @@ def create_table(self) -> hl.MatrixTable: relatedness_check_ht, remap_lookup, ) - families_failed_sex_check = get_families_failed_sex_check( + families_failed_imputed_sex_ploidy = get_families_failed_imputed_sex_ploidy( families - families_failed_missing_samples.keys() - families_failed_relatedness_check.keys(), + callset_mt, + sex_check_ht, + ) + families_failed_sex_check = get_families_failed_sex_check( + families + - families_failed_missing_samples.keys() + - families_failed_relatedness_check.keys() + - families_failed_imputed_sex_ploidy.keys(), sex_check_ht, remap_lookup, ) @@ -152,6 +162,7 @@ def create_table(self) -> hl.MatrixTable: - families_failed_missing_samples.keys() - families_failed_relatedness_check.keys() - families_failed_sex_check.keys() + - families_failed_imputed_sex_ploidy.keys() ) if not len(loadable_families): msg = 'All families failed validation checks' @@ -166,6 +177,9 @@ def create_table(self) -> hl.MatrixTable: families_failed_relatedness_check, ), 'sex_check': format_failures(families_failed_sex_check), + 'ploidy_check': format_failures( + families_failed_imputed_sex_ploidy, + ), }, }, ) @@ -215,5 +229,9 @@ def create_table(self) -> hl.MatrixTable: format_failures(families_failed_sex_check) or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) ), + ploidy_check=( + format_failures(families_failed_imputed_sex_ploidy) + or hl.empty_dict(hl.tstr, hl.tdict(hl.tstr, hl.tarray(hl.tstr))) + ), ), ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 96e16da78..e0187d756 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -115,6 +115,7 @@ def test_write_remapped_and_subsetted_callset_task( missing_samples={}, relatedness_check={}, sex_check={}, + ploidy_check={}, ), family_samples={'abc_1': ['HG00731_1', 'HG00732_1', 'HG00733_1']}, ), @@ -122,7 +123,7 @@ def test_write_remapped_and_subsetted_callset_task( ) @patch('v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset.FeatureFlag') - def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( + def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks( self, mock_ff: Mock, ) -> None: @@ -145,8 +146,9 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( worker.run() self.assertTrue(wrsc_task.complete()) mt = hl.read_matrix_table(wrsc_task.output().path) - # NB: one "family"/"sample" has been removed because of a failed sex check! - self.assertEqual(mt.count(), (30, 12)) + # NB: one "family"/"sample" has been removed because of a failed sex check, + # and 4 removed because of a failed ploidy check! + self.assertEqual(mt.count(), (30, 8)) self.assertEqual( mt.globals.collect(), [ @@ -159,16 +161,12 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( ), family_samples={ '123_1': ['NA19675_1'], - '234_1': ['NA19678_1'], '345_1': ['NA19679_1'], '456_1': ['NA20870_1'], - '567_1': ['NA20872_1'], '678_1': ['NA20874_1'], '789_1': ['NA20875_1'], '890_1': ['NA20876_1'], '901_1': ['NA20877_1'], - 'bcd_1': ['NA20878_1'], - 'cde_1': ['NA20881_1'], 'efg_1': ['NA20888_1'], }, failed_family_samples=hl.Struct( @@ -182,6 +180,32 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( 'samples': ['NA20885_1'], }, }, + ploidy_check={ + '234_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", + ], + 'samples': ['NA19678_1'], + }, + '567_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", + ], + 'samples': ['NA20872_1'], + }, + 'bcd_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", + ], + 'samples': ['NA20878_1'], + }, + 'cde_1': { + 'reasons': [ + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", + ], + 'samples': ['NA20881_1'], + }, + }, ), ), ], @@ -224,119 +248,89 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( self.assertDictEqual( json.load(f), { - 'project_guids': [ - 'R0114_project4', - ], - 'error_messages': [ - 'All families failed validation checks', - ], + 'project_guids': ['R0114_project4'], + 'error_messages': ['All families failed validation checks'], 'failed_family_samples': { 'missing_samples': { 'efg_1': { - 'samples': [ - 'NA99999_1', - ], - 'reasons': [ - "Missing samples: {'NA99999_1'}", - ], + 'samples': ['NA99999_1'], + 'reasons': ["Missing samples: {'NA99999_1'}"], }, }, 'relatedness_check': {}, 'sex_check': { - '789_1': { - 'samples': [ - 'NA20875_1', - ], + '890_1': { + 'samples': ['NA20876_1'], 'reasons': [ - 'Sample NA20875_1 has pedigree sex M but imputed sex F', + 'Sample NA20876_1 has pedigree sex M but imputed sex F', ], }, '456_1': { - 'samples': [ - 'NA20870_1', - ], + 'samples': ['NA20870_1'], 'reasons': [ 'Sample NA20870_1 has pedigree sex M but imputed sex F', ], }, '123_1': { - 'samples': [ - 'NA19675_1', - ], + 'samples': ['NA19675_1'], 'reasons': [ 'Sample NA19675_1 has pedigree sex M but imputed sex F', ], }, - 'cde_1': { - 'samples': [ - 'NA20881_1', - ], + '678_1': { + 'samples': ['NA20874_1'], 'reasons': [ - 'Sample NA20881_1 has pedigree sex F but imputed sex M', + 'Sample NA20874_1 has pedigree sex M but imputed sex F', ], }, - '901_1': { - 'samples': [ - 'NA20877_1', - ], + '789_1': { + 'samples': ['NA20875_1'], 'reasons': [ - 'Sample NA20877_1 has pedigree sex M but imputed sex F', + 'Sample NA20875_1 has pedigree sex M but imputed sex F', ], }, - '678_1': { - 'samples': [ - 'NA20874_1', - ], + '901_1': { + 'samples': ['NA20877_1'], 'reasons': [ - 'Sample NA20874_1 has pedigree sex M but imputed sex F', + 'Sample NA20877_1 has pedigree sex M but imputed sex F', ], }, '345_1': { - 'samples': [ - 'NA19679_1', - ], + 'samples': ['NA19679_1'], 'reasons': [ 'Sample NA19679_1 has pedigree sex M but imputed sex F', ], }, - '890_1': { - 'samples': [ - 'NA20876_1', - ], + 'def_1': { + 'samples': ['NA20885_1'], 'reasons': [ - 'Sample NA20876_1 has pedigree sex M but imputed sex F', + 'Sample NA20885_1 has pedigree sex M but imputed sex F', ], }, - 'def_1': { - 'samples': [ - 'NA20885_1', - ], + }, + 'ploidy_check': { + '567_1': { + 'samples': ['NA20872_1'], 'reasons': [ - 'Sample NA20885_1 has pedigree sex M but imputed sex F', + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20872_1']", ], }, '234_1': { - 'samples': [ - 'NA19678_1', - ], + 'samples': ['NA19678_1'], 'reasons': [ - 'Sample NA19678_1 has pedigree sex F but imputed sex M', + "Found samples with misaligned ploidy with their provided imputed sex: ['NA19678_1']", ], }, 'bcd_1': { - 'samples': [ - 'NA20878_1', - ], + 'samples': ['NA20878_1'], 'reasons': [ - 'Sample NA20878_1 has pedigree sex F but imputed sex M', + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20878_1']", ], }, - '567_1': { - 'samples': [ - 'NA20872_1', - ], + 'cde_1': { + 'samples': ['NA20881_1'], 'reasons': [ - 'Sample NA20872_1 has pedigree sex F but imputed sex M', + "Found samples with misaligned ploidy with their provided imputed sex: ['NA20881_1']", ], }, }, diff --git a/v03_pipeline/lib/tasks/write_sex_check_table.py b/v03_pipeline/lib/tasks/write_sex_check_table.py index f87233507..5a9ed6531 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table.py @@ -2,16 +2,37 @@ import hailtop.fs as hfs import luigi +from v03_pipeline.lib.methods.sex_check import compute_sex_check_ht from v03_pipeline.lib.misc.io import import_imputed_sex -from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_dir +from v03_pipeline.lib.model.feature_flag import FeatureFlag +from v03_pipeline.lib.paths import ( + imported_callset_path, + sex_check_table_path, + tdr_metrics_dir, +) +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask +@luigi.util.inherits(BaseLoadingRunParams) class WriteSexCheckTableTask(BaseWriteTask): callset_path = luigi.Parameter() + @property + def predicted_sex_from_tdr(self): + # complicated enough to need a helper :/ + return ( + FeatureFlag.EXPECT_TDR_METRICS + and not self.skip_expect_tdr_metrics + and self.dataset_type.expect_tdr_metrics( + self.reference_genome, + ) + and self.sample_type.predicted_sex_from_tdr + ) + def output(self) -> luigi.Target: return GCSorLocalTarget( sex_check_table_path( @@ -21,16 +42,37 @@ def output(self) -> luigi.Target: ), ) - def requires(self) -> luigi.Task: - return self.clone(WriteTDRMetricsFilesTask) + def requires(self) -> list[luigi.Task]: + requirements = [] + if self.predicted_sex_from_tdr: + requirements = [ + *requirements, + self.clone(WriteTDRMetricsFilesTask), + ] + else: + requirements = [ + *requirements, + self.clone(WriteImportedCallsetTask), + ] + return requirements def create_table(self) -> hl.Table: ht = None - for tdr_metrics_file in hfs.ls( - tdr_metrics_dir(self.reference_genome, self.dataset_type), - ): - if not ht: - ht = import_imputed_sex(tdr_metrics_file.path) - continue - ht = ht.union(import_imputed_sex(tdr_metrics_file.path)) + if self.predicted_sex_from_tdr: + for tdr_metrics_file in hfs.ls( + tdr_metrics_dir(self.reference_genome, self.dataset_type), + ): + if not ht: + ht = import_imputed_sex(tdr_metrics_file.path) + continue + ht = ht.union(import_imputed_sex(tdr_metrics_file.path)) + else: + mt = hl.read_matrix_table( + imported_callset_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) + ht = compute_sex_check_ht(mt) return ht diff --git a/v03_pipeline/lib/tasks/write_sex_check_table_test.py b/v03_pipeline/lib/tasks/write_sex_check_table_test.py index fdf72d335..3b3e7bd4d 100644 --- a/v03_pipeline/lib/tasks/write_sex_check_table_test.py +++ b/v03_pipeline/lib/tasks/write_sex_check_table_test.py @@ -5,22 +5,31 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import sex_check_table_path, tdr_metrics_path from v03_pipeline.lib.tasks.write_sex_check_table import ( WriteSexCheckTableTask, ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +TEST_SEX_AND_RELATEDNESS_CALLSET_MT = ( + 'v03_pipeline/var/test/callsets/sex_and_relatedness_1.mt' +) + class WriteSexCheckTableTaskTest(MockedDatarootTestCase): @patch('v03_pipeline.lib.tasks.write_tdr_metrics_files.gen_bq_table_names') @patch('v03_pipeline.lib.tasks.write_tdr_metrics_file.bq_metrics_query') + @patch( + 'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag', + ) def test_snv_sex_check_table_task( self, + mock_ff: Mock, mock_bq_metrics_query: Mock, mock_gen_bq_table_names: Mock, ) -> None: + mock_ff.EXPECT_TDR_METRICS = True mock_gen_bq_table_names.return_value = [ 'datarepo-7242affb.datarepo_RP_3053', 'datarepo-5a72e31b.datarepo_RP_3056', @@ -111,7 +120,12 @@ def test_snv_sex_check_table_task( write_sex_check_table = WriteSexCheckTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, callset_path='na', + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-03', ) worker.add(write_sex_check_table) worker.run() @@ -143,3 +157,61 @@ def test_snv_sex_check_table_task( ), ) as f: self.assertTrue('collaborator_sample_id' in f.read()) + + @patch( + 'v03_pipeline.lib.tasks.write_sex_check_table.FeatureFlag', + ) + def test_snv_wes_sex_check_table_task( + self, + mock_ff: Mock, + ) -> None: + mock_ff.EXPECT_TDR_METRICS = True + worker = luigi.worker.Worker() + write_sex_check_table = WriteSexCheckTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WES, + callset_path=TEST_SEX_AND_RELATEDNESS_CALLSET_MT, + project_guids=['R0113_test_project'], + project_remap_paths=['test_remap'], + project_pedigree_paths=['test_pedigree'], + run_id='manual__2024-04-04', + ) + worker.add(write_sex_check_table) + worker.run() + sex_check_ht = hl.read_table( + sex_check_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_SEX_AND_RELATEDNESS_CALLSET_MT, + ), + ) + self.assertCountEqual( + sex_check_ht.collect(), + [ + hl.Struct( + s='ROS_006_18Y03226_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03227_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_006_18Y03228_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05919_D1', + predicted_sex='M', + ), + hl.Struct( + s='ROS_007_19Y05939_D1', + predicted_sex='F', + ), + hl.Struct( + s='ROS_007_19Y05987_D1', + predicted_sex='M', + ), + ], + )