Skip to content

Commit 1fea1f7

Browse files
authored
Dependency reordering so that ValidateCallsetTask runs before updating the reference data. (#950)
* Parse clinvar version from header
* Dependency reordering for reference data updates and validation
* ruff
* missed one
* Revert relatedness changes
* push
* Fix import issue
* Fix sample type
* ruff
* Fix import mocking
* imports
* responses activate
* fix test
* Tweaks
* comment
1 parent 2debf88 commit 1fea1f7

10 files changed

+159
-74
lines changed

v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py

Lines changed: 4 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -36,16 +36,12 @@ def output(self) -> luigi.Target:
3636

3737
def requires(self) -> list[luigi.Task]:
3838
requirements = [
39-
UpdateCachedReferenceDatasetQueries(
40-
reference_genome=self.reference_genome,
41-
dataset_type=self.dataset_type,
42-
),
39+
self.clone(UpdateCachedReferenceDatasetQueries),
4340
]
4441
requirements.extend(
45-
UpdatedReferenceDatasetCollectionTask(
46-
self.reference_genome,
47-
self.dataset_type,
48-
rdc,
42+
self.clone(
43+
UpdatedReferenceDatasetCollectionTask,
44+
reference_dataset_collection=rdc,
4945
)
5046
for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type(
5147
self.reference_genome,

v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,15 @@
44
from v03_pipeline.lib.model import (
55
CachedReferenceDatasetQuery,
66
)
7-
from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import (
8-
BaseLoadingPipelineParams,
7+
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
8+
BaseLoadingRunParams,
99
)
1010
from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import (
1111
UpdatedCachedReferenceDatasetQuery,
1212
)
1313

1414

15-
@luigi.util.inherits(BaseLoadingPipelineParams)
15+
@luigi.util.inherits(BaseLoadingRunParams)
1616
class UpdateCachedReferenceDatasetQueries(luigi.Task):
1717
def __init__(self, *args, **kwargs):
1818
super().__init__(*args, **kwargs)

v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py

Lines changed: 56 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
CachedReferenceDatasetQuery,
88
DatasetType,
99
ReferenceGenome,
10+
SampleType,
1011
)
1112
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
1213
UpdateCachedReferenceDatasetQueries,
@@ -21,99 +22,100 @@ class UpdateCachedReferenceDatasetQueriesTest(unittest.TestCase):
2122
def test_37_snv_indel(self, mock_crdq_task):
2223
mock_crdq_task.return_value = MockCompleteTask()
2324
worker = luigi.worker.Worker()
25+
kwargs = {
26+
'sample_type': SampleType.WGS,
27+
'callset_path': '',
28+
'project_guids': [],
29+
'project_remap_paths': [],
30+
'project_pedigree_paths': [],
31+
'skip_validation': True,
32+
'run_id': '1',
33+
}
2434
task = UpdateCachedReferenceDatasetQueries(
2535
reference_genome=ReferenceGenome.GRCh37,
2636
dataset_type=DatasetType.SNV_INDEL,
37+
**kwargs,
2738
)
2839
worker.add(task)
2940
worker.run()
3041
self.assertTrue(task.complete())
31-
mock_crdq_task.assert_has_calls(
32-
[
33-
mock.call(
34-
reference_genome=ReferenceGenome.GRCh37,
35-
dataset_type=DatasetType.SNV_INDEL,
36-
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
37-
),
38-
mock.call(
39-
reference_genome=ReferenceGenome.GRCh37,
40-
dataset_type=DatasetType.SNV_INDEL,
41-
crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
42-
),
43-
mock.call(
44-
reference_genome=ReferenceGenome.GRCh37,
45-
dataset_type=DatasetType.SNV_INDEL,
46-
crdq=CachedReferenceDatasetQuery.GNOMAD_QC,
47-
),
48-
mock.call(
49-
reference_genome=ReferenceGenome.GRCh37,
50-
dataset_type=DatasetType.SNV_INDEL,
51-
crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS,
52-
),
53-
],
42+
call_args_list = mock_crdq_task.call_args_list
43+
self.assertEqual(len(call_args_list), 4)
44+
self.assertEqual(
45+
[x.kwargs['crdq'] for x in call_args_list],
46+
list(CachedReferenceDatasetQuery),
5447
)
5548

5649
def test_38_snv_indel(self, mock_crdq_task):
5750
mock_crdq_task.return_value = MockCompleteTask()
5851
worker = luigi.worker.Worker()
52+
kwargs = {
53+
'sample_type': SampleType.WGS,
54+
'callset_path': '',
55+
'project_guids': [],
56+
'project_remap_paths': [],
57+
'project_pedigree_paths': [],
58+
'skip_validation': True,
59+
'run_id': '2',
60+
}
5961
task = UpdateCachedReferenceDatasetQueries(
6062
reference_genome=ReferenceGenome.GRCh38,
6163
dataset_type=DatasetType.SNV_INDEL,
64+
**kwargs,
6265
)
6366
worker.add(task)
6467
worker.run()
6568
self.assertTrue(task.complete())
66-
mock_crdq_task.assert_has_calls(
67-
[
68-
mock.call(
69-
reference_genome=ReferenceGenome.GRCh38,
70-
dataset_type=DatasetType.SNV_INDEL,
71-
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
72-
),
73-
mock.call(
74-
reference_genome=ReferenceGenome.GRCh38,
75-
dataset_type=DatasetType.SNV_INDEL,
76-
crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
77-
),
78-
mock.call(
79-
reference_genome=ReferenceGenome.GRCh38,
80-
dataset_type=DatasetType.SNV_INDEL,
81-
crdq=CachedReferenceDatasetQuery.GNOMAD_QC,
82-
),
83-
mock.call(
84-
reference_genome=ReferenceGenome.GRCh38,
85-
dataset_type=DatasetType.SNV_INDEL,
86-
crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS,
87-
),
88-
],
69+
call_args_list = mock_crdq_task.call_args_list
70+
self.assertEqual(len(call_args_list), 4)
71+
self.assertEqual(
72+
[x.kwargs['crdq'] for x in call_args_list],
73+
list(CachedReferenceDatasetQuery),
8974
)
9075

9176
def test_38_mito(self, mock_crdq_task):
9277
mock_crdq_task.return_value = MockCompleteTask()
9378
worker = luigi.worker.Worker()
79+
kwargs = {
80+
'sample_type': SampleType.WGS,
81+
'callset_path': '',
82+
'project_guids': [],
83+
'project_remap_paths': [],
84+
'project_pedigree_paths': [],
85+
'skip_validation': True,
86+
'run_id': '3',
87+
}
9488
task = UpdateCachedReferenceDatasetQueries(
9589
reference_genome=ReferenceGenome.GRCh38,
9690
dataset_type=DatasetType.MITO,
91+
**kwargs,
9792
)
9893
worker.add(task)
9994
worker.run()
10095
self.assertTrue(task.complete())
101-
mock_crdq_task.assert_has_calls(
102-
[
103-
mock.call(
104-
reference_genome=ReferenceGenome.GRCh38,
105-
dataset_type=DatasetType.MITO,
106-
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
107-
),
108-
],
96+
call_args_list = mock_crdq_task.call_args_list
97+
self.assertEqual(len(call_args_list), 1)
98+
self.assertEqual(
99+
next(x.kwargs['crdq'] for x in call_args_list),
100+
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
109101
)
110102

111103
def test_38_sv(self, mock_crdq_task):
112104
mock_crdq_task.return_value = MockCompleteTask()
113105
worker = luigi.worker.Worker()
106+
kwargs = {
107+
'sample_type': SampleType.WGS,
108+
'callset_path': '',
109+
'project_guids': [],
110+
'project_remap_paths': [],
111+
'project_pedigree_paths': [],
112+
'skip_validation': True,
113+
'run_id': '4',
114+
}
114115
task = UpdateCachedReferenceDatasetQueries(
115116
reference_genome=ReferenceGenome.GRCh38,
116117
dataset_type=DatasetType.SV,
118+
**kwargs,
117119
)
118120
worker.add(task)
119121
worker.run()

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import hail as hl
2+
import luigi
23

34
from v03_pipeline.lib.annotations.fields import get_fields
45
from v03_pipeline.lib.logger import get_logger
@@ -8,13 +9,17 @@
89
get_datasets_to_update,
910
)
1011
from v03_pipeline.lib.reference_data.config import CONFIG
12+
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
13+
BaseLoadingRunParams,
14+
)
1115
from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import (
1216
BaseUpdateVariantAnnotationsTableTask,
1317
)
1418

1519
logger = get_logger(__name__)
1620

1721

22+
@luigi.util.inherits(BaseLoadingRunParams)
1823
class UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
1924
BaseUpdateVariantAnnotationsTableTask,
2025
):

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
DatasetType,
2020
ReferenceDatasetCollection,
2121
ReferenceGenome,
22+
SampleType,
2223
)
2324
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
2425
from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS
@@ -37,6 +38,8 @@
3738
TEST_INTERVAL_MITO_1 = 'v03_pipeline/var/test/reference_data/test_interval_mito_1.ht'
3839
TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht'
3940
TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht'
41+
TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
42+
TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt'
4043

4144

4245
MOCK_CADD_CONFIG = {
@@ -754,6 +757,13 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
754757
task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
755758
reference_genome=ReferenceGenome.GRCh38,
756759
dataset_type=DatasetType.SNV_INDEL,
760+
sample_type=SampleType.WGS,
761+
callset_path=TEST_SNV_INDEL_VCF,
762+
project_guids=[],
763+
project_remap_paths=[],
764+
project_pedigree_paths=[],
765+
skip_validation=True,
766+
run_id='3',
757767
)
758768
worker = luigi.worker.Worker()
759769
worker.add(task)
@@ -964,6 +974,13 @@ def test_update_vat_with_updated_rdc_mito_38(
964974
task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
965975
reference_genome=ReferenceGenome.GRCh38,
966976
dataset_type=DatasetType.MITO,
977+
sample_type=SampleType.WGS,
978+
callset_path=TEST_MITO_MT,
979+
project_guids=[],
980+
project_remap_paths=[],
981+
project_pedigree_paths=[],
982+
skip_validation=True,
983+
run_id='1',
967984
)
968985
worker = luigi.worker.Worker()
969986
worker.add(task)
@@ -1114,6 +1131,13 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
11141131
task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset(
11151132
reference_genome=ReferenceGenome.GRCh37,
11161133
dataset_type=DatasetType.SNV_INDEL,
1134+
sample_type=SampleType.WGS,
1135+
callset_path=TEST_SNV_INDEL_VCF,
1136+
project_guids=[],
1137+
project_remap_paths=[],
1138+
project_pedigree_paths=[],
1139+
skip_validation=True,
1140+
run_id='2',
11171141
)
11181142
worker = luigi.worker.Worker()
11191143
worker.add(task)

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -19,15 +19,16 @@
1919
get_ht_path,
2020
import_ht_from_config_path,
2121
)
22+
from v03_pipeline.lib.tasks.base.base_loading_run_params import (
23+
BaseLoadingRunParams,
24+
)
2225
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
2326
from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask
24-
from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import (
25-
UpdatedReferenceDatasetCollectionTask,
26-
)
2727

2828
logger = get_logger(__name__)
2929

3030

31+
@luigi.util.inherits(BaseLoadingRunParams)
3132
class UpdatedCachedReferenceDatasetQuery(BaseWriteTask):
3233
crdq = luigi.EnumParameter(enum=CachedReferenceDatasetQuery)
3334

@@ -71,6 +72,16 @@ def requires(self) -> luigi.Task:
7172
],
7273
),
7374
)
75+
# Special nested import to avoid a circular dependency issue
76+
# (ValidateCallset -> this file -> UpdatedReferenceDatasetCollection -> ValidateCallset)
77+
# The specific CRDQ referenced in ValidateCallset will never reach
78+
# this line due to it being a "query_raw_dataset". In theory this
79+
# would be fixed by splitting the CRDQ into raw_dataset and non-raw_dataset
80+
# queries.
81+
from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import (
82+
UpdatedReferenceDatasetCollectionTask,
83+
)
84+
7485
return UpdatedReferenceDatasetCollectionTask(
7586
self.reference_genome,
7687
self.dataset_type,

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5,12 +5,14 @@
55
import hail as hl
66
import luigi
77

8+
import v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection
89
from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES
910
from v03_pipeline.lib.model import (
1011
CachedReferenceDatasetQuery,
1112
DatasetType,
1213
ReferenceDatasetCollection,
1314
ReferenceGenome,
15+
SampleType,
1416
)
1517
from v03_pipeline.lib.paths import (
1618
cached_reference_dataset_query_path,
@@ -28,6 +30,7 @@
2830
CLINVAR_CRDQ_PATH = (
2931
'v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht'
3032
)
33+
TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
3134

3235
MOCK_CONFIG = {
3336
'gnomad_qc': {
@@ -109,6 +112,13 @@ def test_gnomad_qc(
109112
reference_genome=ReferenceGenome.GRCh38,
110113
dataset_type=DatasetType.SNV_INDEL,
111114
crdq=CachedReferenceDatasetQuery.GNOMAD_QC,
115+
sample_type=SampleType.WGS,
116+
callset_path=TEST_SNV_INDEL_VCF,
117+
project_guids=[],
118+
project_remap_paths=[],
119+
project_pedigree_paths=[],
120+
skip_validation=True,
121+
run_id='1',
112122
)
113123
worker.add(task)
114124
worker.run()
@@ -143,8 +153,9 @@ def test_gnomad_qc(
143153
'v03_pipeline.lib.reference_data.compare_globals.CONFIG',
144154
MOCK_CONFIG,
145155
)
146-
@mock.patch(
147-
'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.UpdatedReferenceDatasetCollectionTask',
156+
@mock.patch.object(
157+
v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection,
158+
'UpdatedReferenceDatasetCollectionTask',
148159
)
149160
@mock.patch(
150161
'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CachedReferenceDatasetQuery.query',
@@ -198,6 +209,13 @@ def _clinvar_path_variants(table, **_: Any):
198209
reference_genome=ReferenceGenome.GRCh38,
199210
dataset_type=DatasetType.SNV_INDEL,
200211
crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
212+
sample_type=SampleType.WGS,
213+
callset_path=TEST_SNV_INDEL_VCF,
214+
project_guids=[],
215+
project_remap_paths=[],
216+
project_pedigree_paths=[],
217+
skip_validation=True,
218+
run_id='2',
201219
)
202220
worker.add(task)
203221
worker.run()

0 commit comments

Comments
 (0)