Skip to content

Commit 53f152d

Browse files
bpblankenmatren395
andauthored
Validate imputed sex results against the callset. (#780)
* scaffold * Add validation of imputed sex file * add args * ruff * Update v03_pipeline/lib/misc/validation_test.py Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com> * Update v03_pipeline/lib/misc/validation_test.py Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com> * Removing support for Unknown * More explicit * format * lint --------- Co-authored-by: Daniel Marten <78616802+matren395@users.noreply.github.com>
1 parent f78238d commit 53f152d

File tree

8 files changed

+144
-22
lines changed

8 files changed

+144
-22
lines changed

v03_pipeline/lib/misc/io.py

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import hail as hl
66

77
from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes
8-
from v03_pipeline.lib.misc.validation import validate_ambiguous_sex
98
from v03_pipeline.lib.model import DatasetType, Env, ReferenceGenome, Sex
109

1110
BIALLELIC = 2
@@ -168,12 +167,15 @@ def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
168167
hl.case()
169168
.when(ht.predicted_sex == FEMALE, Sex.FEMALE.value)
170169
.when(ht.predicted_sex == MALE, Sex.MALE.value)
171-
.default(Sex.UNKNOWN.value)
170+
.or_error(
171+
hl.format(
172+
'Found unexpected value %s in imputed sex file',
173+
ht.predicted_sex,
174+
),
175+
)
172176
),
173177
)
174-
ht = ht.key_by(ht.s)
175-
validate_ambiguous_sex(ht)
176-
return ht
178+
return ht.key_by(ht.s)
177179

178180

179181
def import_remap(remap_path: str) -> hl.Table:

v03_pipeline/lib/misc/io_test.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
)
1010

1111
TEST_IMPUTED_SEX = 'v03_pipeline/var/test/sex_check/test_imputed_sex.tsv'
12+
TEST_IMPUTED_SEX_UNEXPECTED_VALUE = (
13+
'v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv'
14+
)
1215
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
1316
TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf'
1417

@@ -35,3 +38,11 @@ def test_import_imputed_sex(self) -> None:
3538
hl.Struct(s='abc_3', predicted_sex='M'),
3639
],
3740
)
41+
42+
def test_import_imputed_sex_unexpected_value(self) -> None:
43+
ht = import_imputed_sex(TEST_IMPUTED_SEX_UNEXPECTED_VALUE)
44+
self.assertRaisesRegex(
45+
hl.utils.java.HailUserError,
46+
'Found unexpected value Unknown in imputed sex file',
47+
ht.collect,
48+
)

v03_pipeline/lib/misc/validation.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,6 @@ class SeqrValidationError(Exception):
1111
pass
1212

1313

14-
def validate_ambiguous_sex(ht: hl.Table) -> None:
15-
ambiguous_perc = ht.aggregate(
16-
hl.agg.fraction(ht.predicted_sex == Sex.UNKNOWN.value),
17-
)
18-
if ambiguous_perc > AMBIGUOUS_THRESHOLD_PERC:
19-
msg = f'{ambiguous_perc:.2%} of samples identified as ambiguous. Please contact the methods team to investigate the callset.'
20-
raise ValueError(msg)
21-
22-
2314
def validate_no_duplicate_variants(
2415
mt: hl.MatrixTable,
2516
) -> None:
@@ -56,6 +47,30 @@ def validate_expected_contig_frequency(
5647
raise SeqrValidationError(msg)
5748

5849

50+
def validate_imputed_sex_ploidy(
51+
mt: hl.MatrixTable,
52+
sex_check_ht: hl.Table,
53+
) -> None:
54+
mt = mt.select_cols(
55+
discrepant=(
56+
(
57+
# All calls are diploid but the sex is Male
58+
hl.agg.all(mt.GT.is_diploid())
59+
& (sex_check_ht[mt.s].predicted_sex == Sex.MALE.value)
60+
)
61+
| (
62+
# At least one call is haploid but the sex is Female
63+
hl.agg.any(~mt.GT.is_diploid())
64+
& (sex_check_ht[mt.s].predicted_sex == Sex.FEMALE.value)
65+
)
66+
),
67+
)
68+
discrepant_rate = mt.aggregate_cols(hl.agg.fraction(mt.discrepant))
69+
if discrepant_rate:
70+
msg = f'{discrepant_rate:.2%} of samples have misaligned ploidy with their provided imputed sex.'
71+
raise SeqrValidationError(msg)
72+
73+
5974
def validate_sample_type(
6075
mt: hl.MatrixTable,
6176
coding_and_noncoding_variants_ht: hl.Table,

v03_pipeline/lib/misc/validation_test.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,14 @@
55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
77
validate_expected_contig_frequency,
8+
validate_imputed_sex_ploidy,
89
validate_no_duplicate_variants,
910
validate_sample_type,
1011
)
1112
from v03_pipeline.lib.model import ReferenceGenome, SampleType
1213

14+
TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
15+
1316

1417
def _mt_from_contigs(contigs):
1518
return hl.MatrixTable.from_parts(
@@ -29,6 +32,57 @@ def _mt_from_contigs(contigs):
2932

3033

3134
class ValidationTest(unittest.TestCase):
35+
def test_validate_imputed_sex_ploidy(self) -> None:
36+
sex_check_ht = hl.read_table(TEST_SEX_CHECK_1)
37+
mt = hl.MatrixTable.from_parts(
38+
rows={
39+
'locus': [
40+
hl.Locus(
41+
contig='chrX',
42+
position=1,
43+
reference_genome='GRCh38',
44+
),
45+
],
46+
},
47+
cols={'s': ['HG00731_1', 'HG00732_1']},
48+
entries={
49+
'GT': [
50+
[
51+
hl.Call(alleles=[0, 0], phased=False),
52+
hl.Call(alleles=[0], phased=False),
53+
],
54+
],
55+
},
56+
).key_rows_by('locus')
57+
validate_imputed_sex_ploidy(mt, sex_check_ht)
58+
mt = hl.MatrixTable.from_parts(
59+
rows={
60+
'locus': [
61+
hl.Locus(
62+
contig='chrX',
63+
position=1,
64+
reference_genome='GRCh38',
65+
),
66+
],
67+
},
68+
cols={'s': ['HG00731_1', 'HG00732_1']},
69+
entries={
70+
'GT': [
71+
[
72+
hl.Call(alleles=[0], phased=False),
73+
hl.Call(alleles=[0], phased=False),
74+
],
75+
],
76+
},
77+
).key_rows_by('locus')
78+
self.assertRaisesRegex(
79+
SeqrValidationError,
80+
'50.00% of samples have misaligned ploidy',
81+
validate_imputed_sex_ploidy,
82+
mt,
83+
sex_check_ht,
84+
)
85+
3286
def test_validate_no_duplicate_variants(self) -> None:
3387
mt = hl.MatrixTable.from_parts(
3488
rows={

v03_pipeline/lib/model/definitions.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ class AccessControl(Enum):
1111
class Sex(Enum):
1212
FEMALE = 'F'
1313
MALE = 'M'
14-
UNKNOWN = 'U'
1514

1615

1716
class PipelineVersion(Enum):

v03_pipeline/lib/tasks/write_imported_callset.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,25 @@
88
)
99
from v03_pipeline.lib.misc.validation import (
1010
validate_expected_contig_frequency,
11+
validate_imputed_sex_ploidy,
1112
validate_no_duplicate_variants,
1213
validate_sample_type,
1314
)
1415
from v03_pipeline.lib.misc.vets import annotate_vets
1516
from v03_pipeline.lib.model import CachedReferenceDatasetQuery
1617
from v03_pipeline.lib.paths import (
1718
imported_callset_path,
19+
sex_check_table_path,
1820
valid_cached_reference_dataset_query_path,
1921
)
2022
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
2123
from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask
24+
from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
2225

2326

2427
class WriteImportedCallsetTask(BaseWriteTask):
2528
callset_path = luigi.Parameter()
29+
imputed_sex_path = luigi.Parameter(default=None)
2630
filters_path = luigi.OptionalParameter(
2731
default=None,
2832
description='Optional path to part two outputs from callset (VCF shards containing filter information)',
@@ -35,6 +39,10 @@ class WriteImportedCallsetTask(BaseWriteTask):
3539
default=False,
3640
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
3741
)
42+
check_sex_and_relatedness = luigi.BoolParameter(
43+
default=False,
44+
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
45+
)
3846

3947
def complete(self) -> luigi.Target:
4048
if not self.force and super().complete():
@@ -71,6 +79,20 @@ def requires(self) -> list[luigi.Task]:
7179
),
7280
),
7381
]
82+
if (
83+
self.check_sex_and_relatedness
84+
and self.dataset_type.check_sex_and_relatedness
85+
):
86+
requirements = [
87+
*requirements,
88+
WriteSexCheckTableTask(
89+
self.reference_genome,
90+
self.dataset_type,
91+
self.sample_type,
92+
self.callset_path,
93+
self.imputed_sex_path,
94+
),
95+
]
7496
return [
7597
*requirements,
7698
CallsetTask(self.callset_path),
@@ -114,6 +136,21 @@ def create_table(self) -> hl.MatrixTable:
114136
self.reference_genome,
115137
self.sample_type,
116138
)
139+
if (
140+
self.check_sex_and_relatedness
141+
and self.dataset_type.check_sex_and_relatedness
142+
):
143+
sex_check_ht = hl.read_table(
144+
sex_check_table_path(
145+
self.reference_genome,
146+
self.dataset_type,
147+
self.callset_path,
148+
),
149+
)
150+
validate_imputed_sex_ploidy(
151+
mt,
152+
sex_check_ht,
153+
)
117154
return mt.annotate_globals(
118155
callset_path=self.callset_path,
119156
filters_path=self.filters_path or hl.missing(hl.tstr),

v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,18 @@ def output(self) -> luigi.Target:
6565
def requires(self) -> list[luigi.Task]:
6666
requirements = [
6767
WriteImportedCallsetTask(
68-
self.reference_genome,
69-
self.dataset_type,
70-
self.sample_type,
71-
self.callset_path,
68+
reference_genome=self.reference_genome,
69+
dataset_type=self.dataset_type,
70+
sample_type=self.sample_type,
71+
callset_path=self.callset_path,
72+
imputed_sex_path=self.imputed_sex_path,
7273
# NB: filters_path is explicitly passed as None here
7374
# to avoid carrying it throughout the rest of the pipeline.
7475
# Only the primary import task itself should be aware of it.
75-
None,
76-
self.validate,
77-
False,
76+
filters_path=None,
77+
validate=self.validate,
78+
force=False,
79+
check_sex_and_relatedness=self.check_sex_and_relatedness,
7880
),
7981
RawFileTask(self.project_pedigree_path),
8082
]
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
entity:sample_id collaborator_participant_id collaborator_sample_id contamination_rate coverage_region_1_metrics_file crai_path cram_md5_path cram_path datarepo_row_id dragen_version import:snapshot_id import:timestamp mapped_percentage mapping_metrics_file material_type mean_coverage original_material_type participant_id pass_fail_value pdo percent_bases_at_20x percent_callability predicted_sex product receipt_date reported_sex research_project single_sample_vcf_index_path single_sample_vcf_md5_path single_sample_vcf_path total_bases variant_calling_metrics_file
2+
SM-DM69X abc_2 abc_2 0E+00 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c4c07edf-7735-4aa7-9283-7cb2607b60a2/GLE-5774-3-3.qc-coverage-region-1_coverage_metrics.csv gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/dcd4c271-0249-47f1-8e91-81f74735c5a1/GLE-5774-3-3.cram.crai gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/ec41ec06-673f-4fe2-a063-23dc5fe1dcce/GLE-5774-3-3.cram.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/aad0e270-2ad5-4f39-b968-9b4beafeb5cc/GLE-5774-3-3.cram a4b04a39-9234-4028-a155-442c4acf12a0 07.021.604.3.7.8 ce74d94c-c33d-49d7-85c9-5f3cbd08aff7 2024-04-17T15:02:46 99.800000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c3a9e6f2-4c68-410b-823d-46ca406e5061/GLE-5774-3-3.mapping_metrics.csv DNA:DNA Genomic 35.300000000 Whole Blood:Whole Blood PT-24OHM Pass PDO-32755 96.320000000 97.340000000 Unknown P-WG-0139 2017-04-12 04:00:00 Unknown RP-3061 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/c71cd2a1-c789-4715-9ebc-dbfc40d9f2e2/GLE-5774-3-3.vcf.gz.tbi gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/957a99cb-c9a9-4fc5-a0ec-53f9e461469e/GLE-5774-3-3.vcf.gz.md5sum gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/df520949-5f2b-4976-9d46-80d1cc299813/GLE-5774-3-3.vcf.gz 133253714921.000000000 gs://datarepo-556a9c15-bucket/2a4202b0-93f5-4ebe-8d2b-fd4cfb2b881d/2e98e51b-9394-4e64-977f-e9010a4e16dc/GLE-5774-3-3.vc_metrics.csv

0 commit comments

Comments
 (0)