Skip to content

Commit 7710b1d

Browse files
authored
Bugfix to ensure the validation/gnomad_qc table is updated before being used, + a unit test! (#783)
* add unit test for crdq updating
* closer
* fix the mock
* lint
* mock the task
* Lint
* rename test
1 parent 975ee0e commit 7710b1d

29 files changed

+184
-28
lines changed

luigi_pipeline/tests/test_seqr_loading_tasks.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ def setUp(self):
1414
# Import the 1KG test VCF as a Hail MatrixTable
1515
self.test_mt = hl.import_vcf(TEST_DATA_MT_1KG)
1616

17-
def _sample_type_stats_return_value( # noqa: PLR0913
17+
def _sample_type_stats_return_value(
1818
self,
1919
nc_match_count,
2020
nc_total_count,

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ inline-quotes = "single"
6363
'*test*' = [
6464
'ARG002', # allow unused method arguments
6565
'SLF001', # allow private access
66+
'PLR0913', # allow high arity functions
6667
]
6768

6869
[tool.ruff.pylint]

v03_pipeline/lib/model/cached_reference_dataset_query.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,6 @@ class CachedReferenceDatasetQuery(Enum):
2222

2323
@property
2424
def access_control(self) -> AccessControl:
25-
if self == CachedReferenceDatasetQuery.GNOMAD_QC:
26-
return AccessControl.PRIVATE
2725
return AccessControl.PUBLIC
2826

2927
def dataset(self, dataset_type: DatasetType) -> str | None:

v03_pipeline/lib/reference_data/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -386,8 +386,8 @@ def custom_mpc_select(ht):
386386
'source_path': 'gs://seqr-reference-data/gnomad_qc/GRCh37/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt',
387387
},
388388
'38': {
389-
'version': 'v4.0',
390-
'source_path': 'gs://gcp-public-data--gnomad/release/4.0/pca/gnomad.v4.0.pca_loadings.ht',
389+
'version': '4.0',
390+
'path': 'gs://gcp-public-data--gnomad/release/4.0/pca/gnomad.v4.0.pca_loadings.ht',
391391
},
392392
},
393393
'exac': {

v03_pipeline/lib/tasks/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
from v03_pipeline.lib.tasks.reference_data.write_cached_reference_dataset_query import (
2-
WriteCachedReferenceDatasetQuery,
1+
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
2+
UpdateCachedReferenceDatasetQueries,
33
)
44
from v03_pipeline.lib.tasks.update_lookup_table import (
55
UpdateLookupTableTask,
@@ -33,7 +33,7 @@
3333
'UpdateVariantAnnotationsTableWithNewSamplesTask',
3434
'UpdateVariantAnnotationsTableWithDeletedProjectTask',
3535
'UpdateVariantAnnotationsTableWithDeletedFamiliesTask',
36-
'WriteCachedReferenceDatasetQuery',
36+
'UpdateCachedReferenceDatasetQueries',
3737
'WriteMetadataForRunTask',
3838
'WriteProjectFamilyTablesTask',
3939
]

v03_pipeline/lib/tasks/reference_data/write_cached_reference_dataset_query.py renamed to v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
)
1212

1313

14-
class WriteCachedReferenceDatasetQuery(luigi.Task):
14+
class UpdateCachedReferenceDatasetQueries(luigi.Task):
1515
reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
1616
dataset_type = luigi.EnumParameter(enum=DatasetType)
1717
sample_type = luigi.EnumParameter(enum=SampleType)

v03_pipeline/lib/tasks/reference_data/write_cached_reference_dataset_query_test.py renamed to v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,20 +9,20 @@
99
ReferenceGenome,
1010
SampleType,
1111
)
12-
from v03_pipeline.lib.tasks.reference_data.write_cached_reference_dataset_query import (
13-
WriteCachedReferenceDatasetQuery,
12+
from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import (
13+
UpdateCachedReferenceDatasetQueries,
1414
)
1515
from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
1616

1717

1818
@mock.patch(
19-
'v03_pipeline.lib.tasks.reference_data.write_cached_reference_dataset_query.UpdatedCachedReferenceDatasetQuery',
19+
'v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries.UpdatedCachedReferenceDatasetQuery',
2020
)
21-
class WriteCachedReferenceDatasetQueryTest(unittest.TestCase):
21+
class UpdateCachedReferenceDatasetQueriesTest(unittest.TestCase):
2222
def test_37_snv_indel(self, mock_crdq_task):
2323
mock_crdq_task.return_value = MockCompleteTask()
2424
worker = luigi.worker.Worker()
25-
task = WriteCachedReferenceDatasetQuery(
25+
task = UpdateCachedReferenceDatasetQueries(
2626
reference_genome=ReferenceGenome.GRCh37,
2727
dataset_type=DatasetType.SNV_INDEL,
2828
sample_type=SampleType.WGS,
@@ -62,7 +62,7 @@ def test_37_snv_indel(self, mock_crdq_task):
6262
def test_38_snv_indel(self, mock_crdq_task):
6363
mock_crdq_task.return_value = MockCompleteTask()
6464
worker = luigi.worker.Worker()
65-
task = WriteCachedReferenceDatasetQuery(
65+
task = UpdateCachedReferenceDatasetQueries(
6666
reference_genome=ReferenceGenome.GRCh38,
6767
dataset_type=DatasetType.SNV_INDEL,
6868
sample_type=SampleType.WGS,
@@ -102,7 +102,7 @@ def test_38_snv_indel(self, mock_crdq_task):
102102
def test_38_mito(self, mock_crdq_task):
103103
mock_crdq_task.return_value = MockCompleteTask()
104104
worker = luigi.worker.Worker()
105-
task = WriteCachedReferenceDatasetQuery(
105+
task = UpdateCachedReferenceDatasetQueries(
106106
reference_genome=ReferenceGenome.GRCh38,
107107
dataset_type=DatasetType.MITO,
108108
sample_type=SampleType.WGS,
@@ -124,7 +124,7 @@ def test_38_mito(self, mock_crdq_task):
124124
def test_38_sv(self, mock_crdq_task):
125125
mock_crdq_task.return_value = MockCompleteTask()
126126
worker = luigi.worker.Worker()
127-
task = WriteCachedReferenceDatasetQuery(
127+
task = UpdateCachedReferenceDatasetQueries(
128128
reference_genome=ReferenceGenome.GRCh38,
129129
dataset_type=DatasetType.SV,
130130
sample_type=SampleType.WGS,

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,9 @@ def test_missing_interval_reference(
202202
worker.run()
203203
self.assertFalse(uvatwns_task.complete())
204204

205+
@patch(
206+
'v03_pipeline.lib.tasks.write_imported_callset.UpdatedCachedReferenceDatasetQuery',
207+
)
205208
@patch(
206209
'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset',
207210
)
@@ -219,7 +222,9 @@ def test_multiple_update_vat(
219222
mock_standard_contigs: Mock,
220223
mock_update_vat_with_rdc_task: Mock,
221224
mock_update_rdc_task: Mock,
225+
mock_updated_cached_reference_dataset_query,
222226
) -> None:
227+
mock_updated_cached_reference_dataset_query.return_value = MockCompleteTask()
223228
mock_update_rdc_task.return_value = MockCompleteTask()
224229
mock_update_vat_with_rdc_task.return_value = (
225230
BaseUpdateVariantAnnotationsTableTask(
@@ -233,6 +238,7 @@ def test_multiple_update_vat(
233238
mock_standard_contigs.return_value = {'chr1'}
234239
# This creates a mock validation table with 1 coding and 1 non-coding variant
235240
# explicitly chosen from the VCF.
241+
# NB: This is the one and only place validation is enabled in the task tests!
236242
coding_and_noncoding_variants_ht = hl.Table.parallelize(
237243
[
238244
{
@@ -260,6 +266,17 @@ def test_multiple_update_vat(
260266
noncoding=hl.tbool,
261267
),
262268
key='locus',
269+
globals=hl.Struct(
270+
paths=hl.Struct(
271+
gnomad_genomes='gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht',
272+
),
273+
versions=hl.Struct(
274+
gnomad_genomes='4.1',
275+
),
276+
enums=hl.Struct(
277+
gnomad_genomes=hl.Struct(),
278+
),
279+
),
263280
)
264281
coding_and_noncoding_variants_ht.write(
265282
valid_cached_reference_dataset_query_path(

v03_pipeline/lib/tasks/write_imported_callset.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,17 @@
1414
)
1515
from v03_pipeline.lib.misc.vets import annotate_vets
1616
from v03_pipeline.lib.model import CachedReferenceDatasetQuery
17+
from v03_pipeline.lib.model.environment import Env
1718
from v03_pipeline.lib.paths import (
1819
imported_callset_path,
1920
sex_check_table_path,
2021
valid_cached_reference_dataset_query_path,
2122
)
2223
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
2324
from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask
25+
from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import (
26+
UpdatedCachedReferenceDatasetQuery,
27+
)
2428
from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask
2529

2630

@@ -71,11 +75,20 @@ def requires(self) -> list[luigi.Task]:
7175
if self.validate and self.dataset_type.can_run_validation:
7276
requirements = [
7377
*requirements,
74-
HailTableTask(
75-
valid_cached_reference_dataset_query_path(
76-
self.reference_genome,
77-
self.dataset_type,
78-
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
78+
(
79+
UpdatedCachedReferenceDatasetQuery(
80+
reference_genome=self.reference_genome,
81+
dataset_type=self.dataset_type,
82+
sample_type=self.sample_type,
83+
crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
84+
)
85+
if Env.REFERENCE_DATA_AUTO_UPDATE
86+
else HailTableTask(
87+
valid_cached_reference_dataset_query_path(
88+
self.reference_genome,
89+
self.dataset_type,
90+
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
91+
),
7992
),
8093
),
8194
]

v03_pipeline/lib/tasks/write_relatedness_check_table.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
)
1010
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
1111
from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask
12+
from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import (
13+
UpdatedCachedReferenceDatasetQuery,
14+
)
1215
from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask
1316

1417

@@ -36,12 +39,21 @@ def requires(self) -> luigi.Task:
3639
if Env.ACCESS_PRIVATE_REFERENCE_DATASETS:
3740
requirements = [
3841
*requirements,
39-
HailTableTask(
40-
valid_cached_reference_dataset_query_path(
41-
self.reference_genome,
42-
self.dataset_type,
43-
CachedReferenceDatasetQuery.GNOMAD_QC,
44-
),
42+
(
43+
UpdatedCachedReferenceDatasetQuery(
44+
reference_genome=self.reference_genome,
45+
dataset_type=self.dataset_type,
46+
sample_type=self.sample_type,
47+
crdq=CachedReferenceDatasetQuery.GNOMAD_QC,
48+
)
49+
if Env.REFERENCE_DATA_AUTO_UPDATE
50+
else HailTableTask(
51+
valid_cached_reference_dataset_query_path(
52+
self.reference_genome,
53+
self.dataset_type,
54+
CachedReferenceDatasetQuery.GNOMAD_QC,
55+
),
56+
)
4557
),
4658
]
4759
return requirements
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
import shutil
2+
from unittest import mock
3+
4+
import hail as hl
5+
import luigi.worker
6+
7+
from v03_pipeline.lib.misc.io import import_vcf
8+
from v03_pipeline.lib.model import (
9+
CachedReferenceDatasetQuery,
10+
DatasetType,
11+
ReferenceGenome,
12+
SampleType,
13+
)
14+
from v03_pipeline.lib.paths import (
15+
imported_callset_path,
16+
relatedness_check_table_path,
17+
valid_cached_reference_dataset_query_path,
18+
)
19+
from v03_pipeline.lib.tasks.write_relatedness_check_table import (
20+
WriteRelatednessCheckTableTask,
21+
)
22+
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
23+
24+
TEST_GNOMAD_QC_HT = 'v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht'
25+
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
26+
27+
MOCK_CONFIG = {
28+
'gnomad_qc': {
29+
'38': {
30+
'version': '4.0',
31+
'source_path': TEST_GNOMAD_QC_HT,
32+
'custom_import': lambda *_: hl.Table.parallelize(
33+
[],
34+
hl.tstruct(
35+
locus=hl.tlocus('GRCh38'),
36+
alleles=hl.tarray(hl.tstr),
37+
),
38+
key=['locus', 'alleles'],
39+
),
40+
},
41+
},
42+
}
43+
44+
45+
class WriteRelatednessCheckTableTaskTest(MockedDatarootTestCase):
46+
def setUp(self) -> None:
47+
super().setUp()
48+
self.gnomad_qc_path = valid_cached_reference_dataset_query_path(
49+
ReferenceGenome.GRCh38,
50+
DatasetType.SNV_INDEL,
51+
CachedReferenceDatasetQuery.GNOMAD_QC,
52+
)
53+
shutil.copytree(
54+
TEST_GNOMAD_QC_HT,
55+
self.gnomad_qc_path,
56+
)
57+
58+
# Write the imported callset table up front so it is treated as already complete
59+
ht = import_vcf(TEST_VCF, ReferenceGenome.GRCh38)
60+
ht = ht.annotate_globals(sample_type=SampleType.WGS.value)
61+
ht.write(
62+
imported_callset_path(
63+
ReferenceGenome.GRCh38,
64+
DatasetType.SNV_INDEL,
65+
TEST_VCF,
66+
),
67+
)
68+
69+
@mock.patch.dict(
70+
'v03_pipeline.lib.reference_data.compare_globals.CONFIG',
71+
MOCK_CONFIG,
72+
)
73+
@mock.patch.dict(
74+
'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CONFIG',
75+
MOCK_CONFIG,
76+
)
77+
def test_relatedness_check_table_task_gnomad_qc_updated(
78+
self,
79+
) -> None:
80+
ht = hl.read_table(
81+
self.gnomad_qc_path,
82+
)
83+
self.assertEqual(
84+
hl.eval(ht.versions.gnomad_qc),
85+
'v3.1',
86+
)
87+
worker = luigi.worker.Worker()
88+
task = WriteRelatednessCheckTableTask(
89+
reference_genome=ReferenceGenome.GRCh38,
90+
dataset_type=DatasetType.SNV_INDEL,
91+
sample_type=SampleType.WGS,
92+
callset_path=TEST_VCF,
93+
)
94+
worker.add(task)
95+
worker.run()
96+
self.assertTrue(task.complete())
97+
ht = hl.read_table(self.gnomad_qc_path)
98+
self.assertEqual(
99+
hl.eval(ht.versions.gnomad_qc),
100+
'4.0',
101+
)
102+
ht = hl.read_table(
103+
relatedness_check_table_path(
104+
ReferenceGenome.GRCh38,
105+
DatasetType.SNV_INDEL,
106+
TEST_VCF,
107+
),
108+
)
109+
self.assertEqual(
110+
ht.collect(),
111+
[],
112+
)
Binary file not shown.
Binary file not shown.
Binary file not shown.
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
This folder comprises a Hail (www.hail.is) native Table or MatrixTable.
2+
Written with version 0.2.128-eead8100a1c1
3+
Created at 2024/05/10 15:28:58

v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/_SUCCESS

Whitespace-only changes.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)