Skip to content

Commit d81cc10

Browse files
authored
Benb/check parsed clinvar version in complete (#951)
* Parse clinvar version from header
* First pass
* Bump hail tables to https
* correct dataset/dataset types
* Fix clinvar mito
* Fix combined
* Dependency reordering for reference data updates and validation
* ruff
* missed one
* Revert relatedness changes
* push
* Fix import issue
* Fix sample type
* ruff
* Fix import mocking
* imports
* Missed one
* First mocking pass
* Finish mocks in reference data
* responses activate
* ruff
* commas
* fix test
* Update compare_globals.py
* import
1 parent 1fea1f7 commit d81cc10

File tree

84 files changed

+91
-20
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

84 files changed

+91
-20
lines changed

v03_pipeline/lib/reference_data/compare_globals.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44

55
from v03_pipeline.lib.logger import get_logger
66
from v03_pipeline.lib.model import (
7+
DatasetType,
78
ReferenceGenome,
89
)
10+
from v03_pipeline.lib.reference_data.clinvar import parse_clinvar_release_date
911
from v03_pipeline.lib.reference_data.config import CONFIG
1012
from v03_pipeline.lib.reference_data.dataset_table_operations import (
1113
get_all_select_fields,
@@ -16,6 +18,17 @@
1618
logger = get_logger(__name__)
1719

1820

21+
def clinvar_versions_equal(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
) -> bool:
    """Report whether the ClinVar version stored on ``ht`` matches the config.

    The dataset key is ``'clinvar_mito'`` when ``dataset_type`` is
    ``DatasetType.MITO`` and ``'clinvar'`` otherwise. The value stored under
    that key in the table's ``versions`` global is compared for equality with
    the result of ``parse_clinvar_release_date`` applied to the dataset's
    ``CONFIG`` entry for ``reference_genome.v02_value``.
    """
    if dataset_type == DatasetType.MITO:
        dataset = 'clinvar_mito'
    else:
        dataset = 'clinvar'

    # Version recorded on the existing table's globals.
    stored_version = hl.eval(ht.globals.versions[dataset])
    # Version derived from the currently configured ClinVar source.
    configured_version = parse_clinvar_release_date(
        CONFIG[dataset][reference_genome.v02_value],
    )
    return stored_version == configured_version
30+
31+
1932
@dataclasses.dataclass
2033
class Globals:
2134
paths: dict[str, str]

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from v03_pipeline.lib.model import ReferenceDatasetCollection
77
from v03_pipeline.lib.reference_data.compare_globals import (
88
Globals,
9+
clinvar_versions_equal,
910
get_datasets_to_update,
1011
)
1112
from v03_pipeline.lib.reference_data.config import CONFIG
@@ -54,6 +55,17 @@ def complete(self) -> bool:
5455
for rdc in self.reference_dataset_collections
5556
for dataset in rdc.datasets(self.dataset_type)
5657
]
58+
59+
if any(
60+
'clinvar' in d for d in datasets_to_check
61+
) and not clinvar_versions_equal(
62+
hl.read_table(self.output().path),
63+
self.reference_genome,
64+
self.dataset_type,
65+
):
66+
datasets_to_check.remove('clinvar')
67+
self._datasets_to_update.add('clinvar')
68+
5769
annotations_ht_globals = Globals.from_ht(
5870
hl.read_table(self.output().path),
5971
datasets_to_check,

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@
6161
}
6262
MOCK_CLINVAR_CONFIG = {
6363
**CONFIG['clinvar']['38'],
64-
'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
64+
'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
6565
'custom_import': lambda *_: hl.Table.parallelize(
6666
[],
6767
hl.tstruct(
@@ -486,7 +486,7 @@
486486
'clinvar_mito': {
487487
'38': {
488488
**CONFIG['clinvar_mito']['38'],
489-
'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
489+
'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
490490
'custom_import': lambda *_: hl.Table.parallelize(
491491
[],
492492
hl.tstruct(
@@ -722,12 +722,17 @@ def setUp(self) -> None:
722722
'v03_pipeline.lib.reference_data.compare_globals.CONFIG',
723723
MOCK_CONFIG,
724724
)
725+
@mock.patch(
726+
'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal',
727+
)
725728
def test_update_vat_with_updated_rdc_snv_indel_38(
726729
self,
730+
mock_clinvar_versions_equal,
727731
mock_initialize_table,
728732
mock_update_crdqs_task,
729733
mock_update_rdc_task,
730734
):
735+
mock_clinvar_versions_equal.return_value = True
731736
mock_update_rdc_task.return_value = MockCompleteTask()
732737
mock_update_crdqs_task.return_value = MockCompleteTask()
733738
mock_initialize_table.return_value = hl.Table.parallelize(
@@ -840,7 +845,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
840845
hl.Struct(
841846
paths=hl.Struct(
842847
cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht',
843-
clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
848+
clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
844849
dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht',
845850
eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht',
846851
exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht',
@@ -939,12 +944,17 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
939944
'v03_pipeline.lib.reference_data.compare_globals.CONFIG',
940945
MOCK_CONFIG_MITO,
941946
)
947+
@mock.patch(
948+
'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal',
949+
)
942950
def test_update_vat_with_updated_rdc_mito_38(
943951
self,
952+
mock_clinvar_versions_equal,
944953
mock_initialize_table,
945954
mock_update_crdqs_task,
946955
mock_update_rdc_task,
947956
):
957+
mock_clinvar_versions_equal.return_value = (True,)
948958
mock_update_rdc_task.return_value = MockCompleteTask()
949959
mock_update_crdqs_task.return_value = MockCompleteTask()
950960
mock_initialize_table.return_value = hl.Table.parallelize(
@@ -999,7 +1009,7 @@ def test_update_vat_with_updated_rdc_mito_38(
9991009
hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht',
10001010
mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht',
10011011
mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht',
1002-
clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
1012+
clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
10031013
dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht',
10041014
high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv',
10051015
local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv',
@@ -1096,12 +1106,17 @@ def test_update_vat_with_updated_rdc_mito_38(
10961106
'v03_pipeline.lib.reference_data.compare_globals.CONFIG',
10971107
MOCK_CONFIG,
10981108
)
1109+
@mock.patch(
1110+
'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal',
1111+
)
10991112
def test_update_vat_with_updated_rdc_snv_indel_37(
11001113
self,
1114+
mock_clinvar_versions_equal,
11011115
mock_initialize_table,
11021116
mock_update_crdqs_task,
11031117
mock_update_rdc_task,
11041118
):
1119+
mock_clinvar_versions_equal.return_value = True
11051120
mock_update_rdc_task.return_value = MockCompleteTask()
11061121
mock_update_crdqs_task.return_value = MockCompleteTask()
11071122
mock_initialize_table.return_value = hl.Table.parallelize(
@@ -1152,7 +1167,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
11521167
hl.Struct(
11531168
paths=hl.Struct(
11541169
cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht',
1155-
clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
1170+
clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
11561171
dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht',
11571172
eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht',
11581173
exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht',

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
)
1313
from v03_pipeline.lib.reference_data.compare_globals import (
1414
Globals,
15+
clinvar_versions_equal,
1516
get_datasets_to_update,
1617
)
1718
from v03_pipeline.lib.reference_data.config import CONFIG
@@ -39,14 +40,21 @@ def complete(self) -> bool:
3940
)
4041
return False
4142

42-
datasets_to_check = [self.crdq.dataset(self.dataset_type)]
43+
dataset = self.crdq.dataset(self.dataset_type)
44+
if 'clinvar' in dataset and not clinvar_versions_equal(
45+
hl.read_table(self.output().path),
46+
self.reference_genome,
47+
self.dataset_type,
48+
):
49+
return False
50+
4351
crdq_globals = Globals.from_ht(
4452
hl.read_table(self.output().path),
45-
datasets_to_check,
53+
[dataset],
4654
)
4755
dataset_config_globals = Globals.from_dataset_configs(
4856
self.reference_genome,
49-
datasets_to_check,
57+
[dataset],
5058
)
5159
return not get_datasets_to_update(
5260
crdq_globals,

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
'clinvar': {
6161
'38': {
6262
**CONFIG['clinvar']['38'],
63-
'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
63+
'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
6464
'custom_import': lambda *_: hl.Table.parallelize(
6565
[],
6666
hl.tstruct(
@@ -160,15 +160,21 @@ def test_gnomad_qc(
160160
@mock.patch(
161161
'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CachedReferenceDatasetQuery.query',
162162
)
163+
@mock.patch(
164+
'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.clinvar_versions_equal',
165+
)
163166
def test_clinvar(
164167
self,
168+
mock_clinvar_versions_equal,
165169
mock_crdq_query,
166170
mock_updated_rdc_task,
167171
) -> None:
168172
"""
169173
Given a crdq task where there exists a clinvar crdq table and a clinvar rdc table,
170174
expect task to replace the clinvar crdq table with new version.
171175
"""
176+
mock_clinvar_versions_equal.return_value = True
177+
172178
# rdc dependency exists
173179
mock_updated_rdc_task.return_value = MockCompleteTask()
174180

v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from v03_pipeline.lib.paths import valid_reference_dataset_collection_path
77
from v03_pipeline.lib.reference_data.compare_globals import (
88
Globals,
9+
clinvar_versions_equal,
910
get_datasets_to_update,
1011
)
1112
from v03_pipeline.lib.reference_data.dataset_table_operations import (
@@ -53,6 +54,14 @@ def complete(self) -> bool:
5354
)
5455
return False
5556

57+
if any('clinvar' in d for d in datasets) and not clinvar_versions_equal(
58+
hl.read_table(self.output().path),
59+
self.reference_genome,
60+
self.dataset_type,
61+
):
62+
datasets.remove('clinvar')
63+
self._datasets_to_update.add('clinvar')
64+
5665
joined_ht_globals = Globals.from_ht(
5766
hl.read_table(self.output().path),
5867
datasets,

v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,14 +158,19 @@ class UpdatedReferenceDatasetCollectionTaskTest(MockedDatarootTestCase):
158158
MOCK_CONFIG,
159159
)
160160
@mock.patch.object(ReferenceDatasetCollection, 'datasets')
161+
@mock.patch(
162+
'v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection.clinvar_versions_equal',
163+
)
161164
def test_update_task_with_empty_reference_data_table(
162165
self,
166+
mock_clinvar_versions_equal,
163167
mock_rdc_datasets,
164168
) -> None:
165169
"""
166170
Given a new task with no existing reference dataset collection table,
167171
expect the task to create a new reference dataset collection table for all datasets in the collection.
168172
"""
173+
mock_clinvar_versions_equal.return_value = True
169174
mock_rdc_datasets.return_value = ['cadd', 'primate_ai', 'clinvar']
170175
worker = luigi.worker.Worker()
171176
task = UpdatedReferenceDatasetCollectionTask(

v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,7 @@ def test_multiple_update_vat(
585585
},
586586
paths=hl.Struct(
587587
cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht',
588-
clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
588+
clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
589589
dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht',
590590
eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht',
591591
exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht',
@@ -724,7 +724,7 @@ def test_update_vat_grch37(
724724
[
725725
hl.Struct(
726726
cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht',
727-
clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
727+
clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz',
728728
dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht',
729729
eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht',
730730
exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht',
@@ -965,7 +965,7 @@ def test_mito_update_vat(
965965
hl.Struct(
966966
paths=hl.Struct(
967967
high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv',
968-
clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
968+
clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz',
969969
dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht',
970970
gnomad_mito='gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht',
971971
helix_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht',

0 commit comments

Comments
 (0)