Skip to content

Commit d86baba

Browse files
committed
Merge branch 'dev' of github.com:broadinstitute/seqr-loading-pipelines
2 parents 9eb916e + 48537ac commit d86baba

16 files changed

+393
-3
lines changed

v03_pipeline/lib/methods/sample_qc.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import hail as hl
2+
from gnomad.sample_qc.pipeline import filter_rows_for_qc
3+
4+
from v03_pipeline.lib.model import SampleType
5+
6+
GNOMAD_FILTER_MIN_AF = 0.001
7+
GNOMAD_FILTER_MIN_CALLRATE = 0.99
8+
9+
CALLRATE_LOW_THRESHOLD = 0.85
10+
CONTAMINATION_UPPER_THRESHOLD = 5
11+
WES_COVERAGE_LOW_THRESHOLD = 85
12+
WGS_CALLRATE_LOW_THRESHOLD = 30
13+
14+
15+
def call_sample_qc(
16+
mt: hl.MatrixTable,
17+
tdr_metrics_ht: hl.Table,
18+
sample_type: SampleType,
19+
):
20+
mt = mt.annotate_entries(
21+
GT=hl.case()
22+
.when(mt.GT.is_diploid(), hl.call(mt.GT[0], mt.GT[1], phased=False))
23+
.when(mt.GT.is_haploid(), hl.call(mt.GT[0], phased=False))
24+
.default(hl.missing(hl.tcall)),
25+
)
26+
mt = annotate_filtered_callrate(mt)
27+
return annotate_filter_flags(mt, tdr_metrics_ht, sample_type)
28+
29+
30+
def annotate_filtered_callrate(mt: hl.MatrixTable) -> hl.MatrixTable:
31+
filtered_mt = filter_rows_for_qc(
32+
mt,
33+
min_af=GNOMAD_FILTER_MIN_AF,
34+
min_callrate=GNOMAD_FILTER_MIN_CALLRATE,
35+
bi_allelic_only=True,
36+
snv_only=True,
37+
apply_hard_filters=False,
38+
min_inbreeding_coeff_threshold=None,
39+
min_hardy_weinberg_threshold=None,
40+
)
41+
callrate_ht = filtered_mt.select_cols(
42+
filtered_callrate=hl.agg.fraction(hl.is_defined(filtered_mt.GT)),
43+
).cols()
44+
return mt.annotate_cols(**callrate_ht[mt.col_key])
45+
46+
47+
def annotate_filter_flags(
48+
mt: hl.MatrixTable,
49+
tdr_metrics_ht: hl.Table,
50+
sample_type: SampleType,
51+
) -> hl.MatrixTable:
52+
mt = mt.annotate_cols(**tdr_metrics_ht[mt.col_key])
53+
flags = {
54+
'callrate': mt.filtered_callrate < CALLRATE_LOW_THRESHOLD,
55+
'contamination': mt.contamination_rate > CONTAMINATION_UPPER_THRESHOLD,
56+
}
57+
if sample_type == SampleType.WES:
58+
flags['coverage'] = mt.percent_bases_at_20x < WES_COVERAGE_LOW_THRESHOLD
59+
else:
60+
flags['coverage'] = mt.mean_coverage < WGS_CALLRATE_LOW_THRESHOLD
61+
62+
return mt.annotate_cols(
63+
filter_flags=hl.array(
64+
[hl.or_missing(filter_cond, name) for name, filter_cond in flags.items()],
65+
).filter(hl.is_defined),
66+
)

v03_pipeline/lib/misc/io.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,24 @@ def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
244244
return ht.key_by(ht.s)
245245

246246

247+
def import_tdr_qc_metrics(file_path: str) -> hl.Table:
248+
ht = hl.import_table(
249+
file_path,
250+
types={
251+
'contamination_rate': hl.tfloat32,
252+
'percent_bases_at_20x': hl.tfloat32,
253+
'mean_coverage': hl.tfloat32,
254+
},
255+
)
256+
ht = ht.select(
257+
s=ht.collaborator_sample_id,
258+
contamination_rate=ht.contamination_rate,
259+
percent_bases_at_20x=ht.percent_bases_at_20x,
260+
mean_coverage=ht.mean_coverage,
261+
)
262+
return ht.key_by(ht.s)
263+
264+
247265
def import_remap(remap_path: str) -> hl.Table:
248266
ht = hl.import_table(remap_path)
249267
ht = ht.select(

v03_pipeline/lib/misc/validation.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,26 @@ def validate_allele_type(
4848
raise SeqrValidationError(msg)
4949

5050

51+
def validate_allele_depth_length(
52+
mt: hl.MatrixTable,
53+
reference_genome: ReferenceGenome,
54+
dataset_type: DatasetType,
55+
**_: Any,
56+
) -> None:
57+
ht = mt.select_rows(
58+
found_ad_lengths=hl.agg.collect_as_set(hl.len(mt.AD)).remove(
59+
hl.missing(hl.tint32),
60+
),
61+
).rows()
62+
ht = ht.filter(
63+
hl.len(ht.found_ad_lengths) > 1,
64+
)
65+
if ht.count() > 0:
66+
variant_format = dataset_type.table_key_format_fn(reference_genome)
67+
msg = f'Found variants with unequal Allele Depth array lengths over samples (first 10, if applicable): {({variant_format(v): v.found_ad_lengths for v in ht.take(10)})}'
68+
raise SeqrValidationError(msg)
69+
70+
5171
def validate_no_duplicate_variants(
5272
t: hl.Table | hl.MatrixTable,
5373
reference_genome: ReferenceGenome,

v03_pipeline/lib/misc/validation_test.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
7+
validate_allele_depth_length,
78
validate_allele_type,
89
validate_expected_contig_frequency,
910
validate_imported_field_types,
@@ -122,6 +123,63 @@ def test_validate_allele_type(self) -> None:
122123
DatasetType.SNV_INDEL,
123124
)
124125

126+
def test_validate_allele_depth_length(self) -> None:
127+
mt = (
128+
hl.MatrixTable.from_parts(
129+
rows={
130+
'locus': [
131+
hl.Locus(
132+
contig='chr1',
133+
position=1,
134+
reference_genome='GRCh38',
135+
),
136+
hl.Locus(
137+
contig='chr1',
138+
position=2,
139+
reference_genome='GRCh38',
140+
),
141+
hl.Locus(
142+
contig='chr1',
143+
position=3,
144+
reference_genome='GRCh38',
145+
),
146+
hl.Locus(
147+
contig='chr1',
148+
position=4,
149+
reference_genome='GRCh38',
150+
),
151+
],
152+
'alleles': [
153+
['A', 'T'],
154+
# NB: star alleles should pass through this validation just fine,
155+
# but are eventually filtered out upstream.
156+
['A', 'TC', 'TG'],
157+
['A', 'TTT'],
158+
['A', 'CCC'],
159+
],
160+
},
161+
cols={'s': ['sample_1', 'sample_2']},
162+
entries={
163+
'AD': [
164+
[[1, 0], [1, 0]],
165+
[[1], [1, 0, 1]],
166+
[[1, 0], [1]],
167+
[[1, 0], [1, 0]],
168+
],
169+
},
170+
)
171+
.key_rows_by('locus', 'alleles')
172+
.key_cols_by('s')
173+
)
174+
self.assertRaisesRegex(
175+
SeqrValidationError,
176+
"Found variants with unequal Allele Depth array lengths over samples \\(first 10, if applicable\\): \\{'1-2-A-TC-TG': \\{1, 3\\}, '1-3-A-TTT': \\{1, 2\\}\\}",
177+
validate_allele_depth_length,
178+
mt,
179+
ReferenceGenome.GRCh38,
180+
DatasetType.SNV_INDEL,
181+
)
182+
125183
def test_validate_imputed_sex_ploidy(self) -> None:
126184
female_sample = 'HG00731_1'
127185
male_sample_1 = 'HG00732_1'

v03_pipeline/lib/model/dataset_type.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def table_key_format_fn(
3636
if self in {DatasetType.GCNV, DatasetType.SV}:
3737
return lambda s: s.variant_id
3838
return (
39-
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{s.alleles[0]}-{s.alleles[1]}'
39+
lambda s: f'{s.locus.contig if reference_genome == ReferenceGenome.GRCh37 else s.locus.contig.replace("chr", "")}-{s.locus.position}-{"-".join(s.alleles)}'
4040
)
4141

4242
@property

v03_pipeline/lib/paths.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,22 @@ def relatedness_check_tsv_path(
219219
)
220220

221221

222+
def sample_qc_json_path(
223+
reference_genome: ReferenceGenome,
224+
dataset_type: DatasetType,
225+
callset_path: str,
226+
) -> str:
227+
return os.path.join(
228+
pipeline_prefix(
229+
Env.LOADING_DATASETS_DIR,
230+
reference_genome,
231+
dataset_type,
232+
),
233+
'sample_qc',
234+
f'{_callset_path_hash(callset_path)}.json',
235+
)
236+
237+
222238
def remapped_and_subsetted_callset_path(
223239
reference_genome: ReferenceGenome,
224240
dataset_type: DatasetType,

v03_pipeline/lib/tasks/validate_callset.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from v03_pipeline.lib.misc.validation import (
66
SeqrValidationError,
7+
validate_allele_depth_length,
78
validate_allele_type,
89
validate_expected_contig_frequency,
910
validate_imputed_sex_ploidy,
@@ -123,6 +124,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
123124
)
124125
validation_dependencies = self.get_validation_dependencies()
125126
for validation_f in [
127+
validate_allele_depth_length,
126128
validate_allele_type,
127129
validate_imputed_sex_ploidy,
128130
validate_no_duplicate_variants,

v03_pipeline/lib/tasks/write_metadata_for_run.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
import luigi
55
import luigi.util
66

7+
from v03_pipeline.lib.model import FeatureFlag
78
from v03_pipeline.lib.paths import (
89
metadata_for_run_path,
910
relatedness_check_tsv_path,
11+
sample_qc_json_path,
1012
)
1113
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
1214
BaseLoadingRunParams,
@@ -54,6 +56,7 @@ def run(self) -> None:
5456
self.dataset_type,
5557
self.callset_path,
5658
),
59+
'sample_qc': {},
5760
}
5861
for remapped_and_subsetted_callset in self.input():
5962
callset_mt = hl.read_matrix_table(remapped_and_subsetted_callset.path)
@@ -67,6 +70,20 @@ def run(self) -> None:
6770
**collected_globals['failed_family_samples'][key],
6871
**metadata_json['failed_family_samples'][key],
6972
}
70-
73+
if (
74+
FeatureFlag.EXPECT_TDR_METRICS
75+
and not self.skip_expect_tdr_metrics
76+
and self.dataset_type.expect_tdr_metrics(
77+
self.reference_genome,
78+
)
79+
):
80+
with open(
81+
sample_qc_json_path(
82+
self.reference_genome,
83+
self.dataset_type,
84+
self.callset_path,
85+
),
86+
) as f:
87+
metadata_json['sample_qc'] = json.load(f)
7188
with self.output().open('w') as f:
7289
json.dump(metadata_json, f)

v03_pipeline/lib/tasks/write_metadata_for_run_test.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,38 @@
11
import json
2+
from unittest import mock
3+
from unittest.mock import Mock
24

35
import luigi.worker
46

57
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
68
from v03_pipeline.lib.paths import relatedness_check_tsv_path
79
from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
10+
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
811
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
912

1013
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
1114
TEST_REMAP_2 = 'v03_pipeline/var/test/remaps/test_remap_2.tsv'
1215
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
1316
TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv'
17+
TEST_SAMPLE_QC_JSON = 'v03_pipeline/var/test/sample_qc_1.json'
1418

1519

1620
class WriteMetadataForRunTaskTest(MockedDatarootTestCase):
17-
def test_write_metadata_for_run_task(self) -> None:
21+
@mock.patch(
22+
'v03_pipeline.lib.tasks.write_metadata_for_run.sample_qc_json_path',
23+
lambda *_: TEST_SAMPLE_QC_JSON,
24+
)
25+
@mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')
26+
@mock.patch(
27+
'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask',
28+
)
29+
def test_write_metadata_for_run_task(
30+
self,
31+
write_tdr_metrics_task: Mock,
32+
mock_ff: Mock,
33+
) -> None:
34+
mock_ff.EXPECT_TDR_METRICS = True
35+
write_tdr_metrics_task.return_value = MockCompleteTask()
1836
worker = luigi.worker.Worker()
1937
write_metadata_for_run_task = WriteMetadataForRunTask(
2038
reference_genome=ReferenceGenome.GRCh38,
@@ -77,5 +95,11 @@ def test_write_metadata_for_run_task(self) -> None:
7795
DatasetType.SNV_INDEL,
7896
TEST_VCF,
7997
),
98+
'sample_qc': {
99+
'HG00731': {'filter_flags': ['coverage', 'contamination']},
100+
'HG00732': {'filter_flags': ['coverage']},
101+
'HG00733': {'filter_flags': ['contamination']},
102+
'NA19675': {'filter_flags': []},
103+
},
80104
},
81105
)

v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from v03_pipeline.lib.tasks.write_relatedness_check_tsv import (
3030
WriteRelatednessCheckTsvTask,
3131
)
32+
from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask
3233
from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
3334
from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
3435
with_persisted_validation_errors,
@@ -83,6 +84,17 @@ def requires(self) -> list[luigi.Task]:
8384
self.clone(WriteRelatednessCheckTsvTask),
8485
self.clone(WriteSexCheckTableTask),
8586
]
87+
if (
88+
FeatureFlag.EXPECT_TDR_METRICS
89+
and not self.skip_expect_tdr_metrics
90+
and self.dataset_type.expect_tdr_metrics(
91+
self.reference_genome,
92+
)
93+
):
94+
requirements = [
95+
*requirements,
96+
self.clone(WriteSampleQCJsonTask),
97+
]
8698
return requirements
8799

88100
@with_persisted_validation_errors

v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def test_write_remapped_and_subsetted_callset_task(
9494
project_pedigree_paths=[TEST_PEDIGREE_3],
9595
project_i=0,
9696
skip_validation=True,
97+
skip_expect_tdr_metrics=True,
9798
)
9899
worker.add(wrsc_task)
99100
worker.run()
@@ -138,6 +139,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family(
138139
project_pedigree_paths=[TEST_PEDIGREE_4],
139140
project_i=0,
140141
skip_validation=True,
142+
skip_expect_tdr_metrics=True,
141143
)
142144
worker.add(wrsc_task)
143145
worker.run()
@@ -203,6 +205,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
203205
project_pedigree_paths=[TEST_PEDIGREE_7],
204206
project_i=0,
205207
skip_validation=True,
208+
skip_expect_tdr_metrics=True,
206209
)
207210
worker.add(wrsc_task)
208211
worker.run()

0 commit comments

Comments
 (0)