Skip to content

Commit aba2b94

Browse files
committed
only hard fail on samples in pedigree
1 parent aa0027b commit aba2b94

File tree

3 files changed

+40
-6
lines changed

3 files changed

+40
-6
lines changed

v03_pipeline/lib/misc/validation.py

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import hail as hl
44

5+
from v03_pipeline.lib.misc.pedigree import Family
56
from v03_pipeline.lib.model import (
67
DatasetType,
78
ReferenceGenome,
@@ -132,6 +133,7 @@ def validate_imputed_sex_ploidy(
132133
mt: hl.MatrixTable,
133134
# NB: sex_check_ht will be undefined if sex checking is disabled for the run
134135
sex_check_ht: hl.Table | None = None,
136+
pedigree_families: set[Family] | None = None,
135137
**_: Any,
136138
) -> None:
137139
if not sex_check_ht:
@@ -161,8 +163,15 @@ def validate_imputed_sex_ploidy(
161163
discrepant_samples = mt.aggregate_cols(
162164
hl.agg.filter(mt.discrepant, hl.agg.collect_as_set(mt.s)),
163165
)
164-
if discrepant_samples:
165-
sorted_discrepant_samples = sorted(discrepant_samples)
166+
loading_samples = (
167+
{sample_id for family in pedigree_families for sample_id in family.samples}
168+
if pedigree_families
169+
else set()
170+
)
171+
discrepant_loading_samples = discrepant_samples & loading_samples
172+
173+
if discrepant_loading_samples:
174+
sorted_discrepant_samples = sorted(discrepant_loading_samples)
166175
msg = f'Found samples with misaligned ploidy with their provided imputed sex (first 10, if applicable) : {sorted_discrepant_samples[:10]}'
167176
raise SeqrValidationError(
168177
msg,

v03_pipeline/lib/misc/validation_test.py

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,7 @@
22

33
import hail as hl
44

5+
from v03_pipeline.lib.misc.pedigree import Family, Sample
56
from v03_pipeline.lib.misc.validation import (
67
SeqrValidationError,
78
validate_allele_type,
@@ -11,7 +12,7 @@
1112
validate_no_duplicate_variants,
1213
validate_sample_type,
1314
)
14-
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
15+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType, Sex
1516

1617
TEST_SEX_CHECK_1 = 'v03_pipeline/var/test/sex_check/test_sex_check_1.ht'
1718
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
@@ -171,7 +172,20 @@ def test_validate_imputed_sex_ploidy(self) -> None:
171172
.key_rows_by('locus')
172173
.key_cols_by('s')
173174
)
174-
validate_imputed_sex_ploidy(mt, sex_check_ht)
175+
pedigree_families = {
176+
Family(
177+
family_guid='',
178+
samples={
179+
female_sample: Sample(female_sample, Sex.FEMALE),
180+
male_sample_1: Sample(male_sample_1, Sex.MALE),
181+
x0_sample: Sample(x0_sample, Sex.X0),
182+
xxy_sample: Sample(xxy_sample, Sex.XXY),
183+
xyy_sample: Sample(xyy_sample, Sex.XYY),
184+
xxx_sample: Sample(xxx_sample, Sex.XXX),
185+
},
186+
),
187+
}
188+
validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families)
175189

176190
# All calls on Y chromosome are valid
177191
mt = (
@@ -211,7 +225,7 @@ def test_validate_imputed_sex_ploidy(self) -> None:
211225
.key_rows_by('locus')
212226
.key_cols_by('s')
213227
)
214-
validate_imputed_sex_ploidy(mt, sex_check_ht)
228+
validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families)
215229

216230
# Invalid X chromosome case
217231
mt = (
@@ -259,8 +273,12 @@ def test_validate_imputed_sex_ploidy(self) -> None:
259273
validate_imputed_sex_ploidy,
260274
mt,
261275
sex_check_ht,
276+
pedigree_families,
262277
)
263278

279+
# Invalid X chromosome case, but invalid samples are missing from pedigree
280+
validate_imputed_sex_ploidy(mt, sex_check_ht, pedigree_families=set())
281+
264282
def test_validate_imported_field_types(self) -> None:
265283
mt = hl.read_matrix_table(TEST_MITO_MT)
266284
validate_imported_field_types(mt, DatasetType.MITO, {})

v03_pipeline/lib/tasks/validate_callset.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,8 @@
22
import luigi
33
import luigi.util
44

5+
from v03_pipeline.lib.misc.io import import_pedigree
6+
from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families
57
from v03_pipeline.lib.misc.validation import (
68
SeqrValidationError,
79
validate_allele_type,
@@ -19,7 +21,7 @@
1921
from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset
2022
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
2123
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
22-
from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget
24+
from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, RawFileTask
2325
from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import (
2426
UpdatedReferenceDatasetTask,
2527
)
@@ -52,6 +54,10 @@ def get_validation_dependencies(self) -> dict[str, hl.Table]:
5254
self.callset_path,
5355
),
5456
)
57+
deps['pedigree_families'] = parse_pedigree_ht_to_families(
58+
import_pedigree(self.input()[1].path),
59+
)
60+
5561
return deps
5662

5763
def complete(self) -> luigi.Target:
@@ -74,6 +80,7 @@ def output(self) -> luigi.Target:
7480
def requires(self) -> list[luigi.Task]:
7581
requirements = [
7682
self.clone(WriteImportedCallsetTask),
83+
RawFileTask(self.project_pedigree_paths[self.project_i]),
7784
]
7885
if not self.skip_validation and self.dataset_type.can_run_validation:
7986
requirements = [

0 commit comments

Comments
 (0)