Skip to content

Commit 48537ac

Browse files
authored
Merge pull request #1034 from broadinstitute/sample-qc-filtered-callrate
add sample qc task and filter_flags
2 parents 56ced7a + 0aac554 commit 48537ac

12 files changed

+312
-2
lines changed

v03_pipeline/lib/methods/sample_qc.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
import hail as hl
2+
from gnomad.sample_qc.pipeline import filter_rows_for_qc
3+
4+
from v03_pipeline.lib.model import SampleType
5+
6+
GNOMAD_FILTER_MIN_AF = 0.001
7+
GNOMAD_FILTER_MIN_CALLRATE = 0.99
8+
9+
CALLRATE_LOW_THRESHOLD = 0.85
10+
CONTAMINATION_UPPER_THRESHOLD = 5
11+
WES_COVERAGE_LOW_THRESHOLD = 85
12+
WGS_CALLRATE_LOW_THRESHOLD = 30
13+
14+
15+
def call_sample_qc(
16+
mt: hl.MatrixTable,
17+
tdr_metrics_ht: hl.Table,
18+
sample_type: SampleType,
19+
):
20+
mt = mt.annotate_entries(
21+
GT=hl.case()
22+
.when(mt.GT.is_diploid(), hl.call(mt.GT[0], mt.GT[1], phased=False))
23+
.when(mt.GT.is_haploid(), hl.call(mt.GT[0], phased=False))
24+
.default(hl.missing(hl.tcall)),
25+
)
26+
mt = annotate_filtered_callrate(mt)
27+
return annotate_filter_flags(mt, tdr_metrics_ht, sample_type)
28+
29+
30+
def annotate_filtered_callrate(mt: hl.MatrixTable) -> hl.MatrixTable:
31+
filtered_mt = filter_rows_for_qc(
32+
mt,
33+
min_af=GNOMAD_FILTER_MIN_AF,
34+
min_callrate=GNOMAD_FILTER_MIN_CALLRATE,
35+
bi_allelic_only=True,
36+
snv_only=True,
37+
apply_hard_filters=False,
38+
min_inbreeding_coeff_threshold=None,
39+
min_hardy_weinberg_threshold=None,
40+
)
41+
callrate_ht = filtered_mt.select_cols(
42+
filtered_callrate=hl.agg.fraction(hl.is_defined(filtered_mt.GT)),
43+
).cols()
44+
return mt.annotate_cols(**callrate_ht[mt.col_key])
45+
46+
47+
def annotate_filter_flags(
48+
mt: hl.MatrixTable,
49+
tdr_metrics_ht: hl.Table,
50+
sample_type: SampleType,
51+
) -> hl.MatrixTable:
52+
mt = mt.annotate_cols(**tdr_metrics_ht[mt.col_key])
53+
flags = {
54+
'callrate': mt.filtered_callrate < CALLRATE_LOW_THRESHOLD,
55+
'contamination': mt.contamination_rate > CONTAMINATION_UPPER_THRESHOLD,
56+
}
57+
if sample_type == SampleType.WES:
58+
flags['coverage'] = mt.percent_bases_at_20x < WES_COVERAGE_LOW_THRESHOLD
59+
else:
60+
flags['coverage'] = mt.mean_coverage < WGS_CALLRATE_LOW_THRESHOLD
61+
62+
return mt.annotate_cols(
63+
filter_flags=hl.array(
64+
[hl.or_missing(filter_cond, name) for name, filter_cond in flags.items()],
65+
).filter(hl.is_defined),
66+
)

v03_pipeline/lib/misc/io.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,24 @@ def import_imputed_sex(imputed_sex_path: str) -> hl.Table:
244244
return ht.key_by(ht.s)
245245

246246

247+
def import_tdr_qc_metrics(file_path: str) -> hl.Table:
248+
ht = hl.import_table(
249+
file_path,
250+
types={
251+
'contamination_rate': hl.tfloat32,
252+
'percent_bases_at_20x': hl.tfloat32,
253+
'mean_coverage': hl.tfloat32,
254+
},
255+
)
256+
ht = ht.select(
257+
s=ht.collaborator_sample_id,
258+
contamination_rate=ht.contamination_rate,
259+
percent_bases_at_20x=ht.percent_bases_at_20x,
260+
mean_coverage=ht.mean_coverage,
261+
)
262+
return ht.key_by(ht.s)
263+
264+
247265
def import_remap(remap_path: str) -> hl.Table:
248266
ht = hl.import_table(remap_path)
249267
ht = ht.select(

v03_pipeline/lib/paths.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,22 @@ def relatedness_check_tsv_path(
219219
)
220220

221221

222+
def sample_qc_json_path(
223+
reference_genome: ReferenceGenome,
224+
dataset_type: DatasetType,
225+
callset_path: str,
226+
) -> str:
227+
return os.path.join(
228+
pipeline_prefix(
229+
Env.LOADING_DATASETS_DIR,
230+
reference_genome,
231+
dataset_type,
232+
),
233+
'sample_qc',
234+
f'{_callset_path_hash(callset_path)}.json',
235+
)
236+
237+
222238
def remapped_and_subsetted_callset_path(
223239
reference_genome: ReferenceGenome,
224240
dataset_type: DatasetType,

v03_pipeline/lib/tasks/write_metadata_for_run.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,11 @@
44
import luigi
55
import luigi.util
66

7+
from v03_pipeline.lib.model import FeatureFlag
78
from v03_pipeline.lib.paths import (
89
metadata_for_run_path,
910
relatedness_check_tsv_path,
11+
sample_qc_json_path,
1012
)
1113
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
1214
BaseLoadingRunParams,
@@ -54,6 +56,7 @@ def run(self) -> None:
5456
self.dataset_type,
5557
self.callset_path,
5658
),
59+
'sample_qc': {},
5760
}
5861
for remapped_and_subsetted_callset in self.input():
5962
callset_mt = hl.read_matrix_table(remapped_and_subsetted_callset.path)
@@ -67,6 +70,20 @@ def run(self) -> None:
6770
**collected_globals['failed_family_samples'][key],
6871
**metadata_json['failed_family_samples'][key],
6972
}
70-
73+
if (
74+
FeatureFlag.EXPECT_TDR_METRICS
75+
and not self.skip_expect_tdr_metrics
76+
and self.dataset_type.expect_tdr_metrics(
77+
self.reference_genome,
78+
)
79+
):
80+
with open(
81+
sample_qc_json_path(
82+
self.reference_genome,
83+
self.dataset_type,
84+
self.callset_path,
85+
),
86+
) as f:
87+
metadata_json['sample_qc'] = json.load(f)
7188
with self.output().open('w') as f:
7289
json.dump(metadata_json, f)

v03_pipeline/lib/tasks/write_metadata_for_run_test.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,38 @@
11
import json
2+
from unittest import mock
3+
from unittest.mock import Mock
24

35
import luigi.worker
46

57
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
68
from v03_pipeline.lib.paths import relatedness_check_tsv_path
79
from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
10+
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
811
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
912

1013
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
1114
TEST_REMAP_2 = 'v03_pipeline/var/test/remaps/test_remap_2.tsv'
1215
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
1316
TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv'
17+
TEST_SAMPLE_QC_JSON = 'v03_pipeline/var/test/sample_qc_1.json'
1418

1519

1620
class WriteMetadataForRunTaskTest(MockedDatarootTestCase):
17-
def test_write_metadata_for_run_task(self) -> None:
21+
@mock.patch(
22+
'v03_pipeline.lib.tasks.write_metadata_for_run.sample_qc_json_path',
23+
lambda *_: TEST_SAMPLE_QC_JSON,
24+
)
25+
@mock.patch('v03_pipeline.lib.tasks.write_metadata_for_run.FeatureFlag')
26+
@mock.patch(
27+
'v03_pipeline.lib.tasks.write_imported_callset.WriteTDRMetricsFilesTask',
28+
)
29+
def test_write_metadata_for_run_task(
30+
self,
31+
write_tdr_metrics_task: Mock,
32+
mock_ff: Mock,
33+
) -> None:
34+
mock_ff.EXPECT_TDR_METRICS = True
35+
write_tdr_metrics_task.return_value = MockCompleteTask()
1836
worker = luigi.worker.Worker()
1937
write_metadata_for_run_task = WriteMetadataForRunTask(
2038
reference_genome=ReferenceGenome.GRCh38,
@@ -77,5 +95,11 @@ def test_write_metadata_for_run_task(self) -> None:
7795
DatasetType.SNV_INDEL,
7896
TEST_VCF,
7997
),
98+
'sample_qc': {
99+
'HG00731': {'filter_flags': ['coverage', 'contamination']},
100+
'HG00732': {'filter_flags': ['coverage']},
101+
'HG00733': {'filter_flags': ['contamination']},
102+
'NA19675': {'filter_flags': []},
103+
},
80104
},
81105
)

v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from v03_pipeline.lib.tasks.write_relatedness_check_tsv import (
3030
WriteRelatednessCheckTsvTask,
3131
)
32+
from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask
3233
from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
3334
from v03_pipeline.lib.tasks.write_validation_errors_for_run import (
3435
with_persisted_validation_errors,
@@ -83,6 +84,17 @@ def requires(self) -> list[luigi.Task]:
8384
self.clone(WriteRelatednessCheckTsvTask),
8485
self.clone(WriteSexCheckTableTask),
8586
]
87+
if (
88+
FeatureFlag.EXPECT_TDR_METRICS
89+
and not self.skip_expect_tdr_metrics
90+
and self.dataset_type.expect_tdr_metrics(
91+
self.reference_genome,
92+
)
93+
):
94+
requirements = [
95+
*requirements,
96+
self.clone(WriteSampleQCJsonTask),
97+
]
8698
return requirements
8799

88100
@with_persisted_validation_errors

v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ def test_write_remapped_and_subsetted_callset_task(
9494
project_pedigree_paths=[TEST_PEDIGREE_3],
9595
project_i=0,
9696
skip_validation=True,
97+
skip_expect_tdr_metrics=True,
9798
)
9899
worker.add(wrsc_task)
99100
worker.run()
@@ -138,6 +139,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family(
138139
project_pedigree_paths=[TEST_PEDIGREE_4],
139140
project_i=0,
140141
skip_validation=True,
142+
skip_expect_tdr_metrics=True,
141143
)
142144
worker.add(wrsc_task)
143145
worker.run()
@@ -203,6 +205,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
203205
project_pedigree_paths=[TEST_PEDIGREE_7],
204206
project_i=0,
205207
skip_validation=True,
208+
skip_expect_tdr_metrics=True,
206209
)
207210
worker.add(wrsc_task)
208211
worker.run()
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
import json
2+
from collections import defaultdict
3+
4+
import hail as hl
5+
import hailtop.fs as hfs
6+
import luigi
7+
import luigi.util
8+
9+
from v03_pipeline.lib.methods.sample_qc import call_sample_qc
10+
from v03_pipeline.lib.misc.io import import_tdr_qc_metrics
11+
from v03_pipeline.lib.paths import sample_qc_json_path, tdr_metrics_dir
12+
from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams
13+
from v03_pipeline.lib.tasks.files import GCSorLocalTarget
14+
from v03_pipeline.lib.tasks.validate_callset import ValidateCallsetTask
15+
from v03_pipeline.lib.tasks.write_tdr_metrics_files import WriteTDRMetricsFilesTask
16+
17+
18+
@luigi.util.inherits(BaseLoadingRunParams)
19+
class WriteSampleQCJsonTask(luigi.Task):
20+
def output(self) -> luigi.Target:
21+
return GCSorLocalTarget(
22+
sample_qc_json_path(
23+
self.reference_genome,
24+
self.dataset_type,
25+
self.callset_path,
26+
),
27+
)
28+
29+
def requires(self):
30+
return [self.clone(ValidateCallsetTask), self.clone(WriteTDRMetricsFilesTask)]
31+
32+
def run(self):
33+
callset_mt = hl.read_matrix_table(self.input()[0].path)
34+
35+
tdr_metrics_ht = None
36+
for tdr_metrics_file in hfs.ls(
37+
tdr_metrics_dir(self.reference_genome, self.dataset_type),
38+
):
39+
if not tdr_metrics_ht:
40+
tdr_metrics_ht = import_tdr_qc_metrics(tdr_metrics_file.path)
41+
continue
42+
tdr_metrics_ht = tdr_metrics_ht.union(
43+
import_tdr_qc_metrics(tdr_metrics_file.path),
44+
)
45+
46+
callset_mt = call_sample_qc(
47+
callset_mt,
48+
tdr_metrics_ht,
49+
self.sample_type,
50+
)
51+
ht = callset_mt.cols()
52+
sample_qc_dict = defaultdict(dict)
53+
for row in ht.flatten().collect():
54+
r = dict(row)
55+
sample_id = r.pop('s')
56+
for field, value in r.items():
57+
sample_qc_dict[sample_id][field] = value
58+
59+
with self.output().open('w') as f:
60+
json.dump(sample_qc_dict, f)

0 commit comments

Comments
 (0)