Skip to content

Commit 256c124

Browse files
authored
clinvar submitters and conditions (#739)
* still developing * more hail stuff * 2 tables * finish custom_select for clinver * oop * remove * comment * mock_join_clinvar_to_submission_summary * crdq test * list instead of set * partitions * a few changes * test tables * fix failing test and add test for no empty submitters * fix failing test and add test for no empty submitters * remove todo
1 parent 5dc7972 commit 256c124

File tree

53 files changed

+317
-12
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

53 files changed

+317
-12
lines changed

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@
3838
'practice_guideline': 4,
3939
},
4040
)
41-
41+
CLINVAR_SUBMISSION_SUMMARY_URL = (
42+
'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz'
43+
)
4244
logger = get_logger(__name__)
4345

4446

@@ -129,7 +131,7 @@ def download_and_import_latest_clinvar_vcf(
129131
force_bgz=True,
130132
)
131133
mt = mt.annotate_globals(version=_parse_clinvar_release_date(tmp_file.name))
132-
return mt.rows()
134+
return join_to_submission_summary_ht(mt.rows())
133135

134136

135137
def _parse_clinvar_release_date(local_vcf_path: str) -> str:
@@ -150,3 +152,40 @@ def _parse_clinvar_release_date(local_vcf_path: str) -> str:
150152
return None
151153

152154
return None
155+
156+
157+
def join_to_submission_summary_ht(vcf_ht: hl.Table) -> hl.Table:
158+
# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/README - submission_summary.txt
159+
logger.info('Getting clinvar submission summary')
160+
ht = download_and_import_clinvar_submission_summary()
161+
ht = ht.rename({'#VariationID': 'VariationID'})
162+
ht = ht.select('VariationID', 'Submitter', 'ReportedPhenotypeInfo')
163+
ht = ht.group_by('VariationID').aggregate(
164+
Submitters=hl.agg.collect(ht.Submitter),
165+
Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo),
166+
)
167+
ht = ht.key_by('VariationID')
168+
return vcf_ht.annotate(
169+
submitters=ht[vcf_ht.rsid].Submitters,
170+
conditions=ht[vcf_ht.rsid].Conditions,
171+
)
172+
173+
174+
def download_and_import_clinvar_submission_summary() -> hl.Table:
175+
with tempfile.NamedTemporaryFile(
176+
suffix='.txt.gz',
177+
delete=False,
178+
) as tmp_file:
179+
urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, tmp_file.name) # noqa: S310
180+
return hl.import_table(
181+
tmp_file.name,
182+
force=True,
183+
filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line
184+
types={
185+
'#VariationID': hl.tstr,
186+
'Submitter': hl.tstr,
187+
'ReportedPhenotypeInfo': hl.tstr,
188+
},
189+
missing='-',
190+
min_partitions=3, # recommended 2-4 partitions per core
191+
)

v03_pipeline/lib/reference_data/clinvar_test.py

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import unittest
2+
from unittest import mock
23

34
import hail as hl
45

56
from v03_pipeline.lib.reference_data.clinvar import (
7+
join_to_submission_summary_ht,
68
parsed_and_mapped_clnsigconf,
79
parsed_clnsig,
810
)
@@ -83,3 +85,130 @@ def test_parsed_and_mapped_clnsigconf(self):
8385
],
8486
],
8587
)
88+
89+
@mock.patch(
90+
'v03_pipeline.lib.reference_data.clinvar.download_and_import_clinvar_submission_summary',
91+
)
92+
def test_join_to_submission_summary_ht(self, mock_download):
93+
clinvar_enums_struct = hl.Struct(
94+
CLNSIG=[
95+
'Pathogenic/Likely_pathogenic/Pathogenic',
96+
'_low_penetrance',
97+
],
98+
CLNSIGCONF=[
99+
'Pathogenic(8)|Likely_pathogenic(2)|Pathogenic',
100+
'_low_penetrance(1)|Uncertain_significance(1)',
101+
],
102+
CLNREVSTAT=['no_classifications_from_unflagged_records'],
103+
)
104+
vcf_ht = hl.Table.parallelize(
105+
[
106+
{
107+
'locus': hl.Locus(
108+
contig='chr1',
109+
position=871269,
110+
reference_genome='GRCh38',
111+
),
112+
'alleles': ['A', 'C'],
113+
'rsid': '5',
114+
'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
115+
},
116+
{
117+
'locus': hl.Locus(
118+
contig='chr1',
119+
position=871269,
120+
reference_genome='GRCh38',
121+
),
122+
'alleles': ['A', 'AC'],
123+
'rsid': '7',
124+
'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
125+
},
126+
],
127+
hl.tstruct(
128+
locus=hl.tlocus('GRCh38'),
129+
alleles=hl.tarray(hl.tstr),
130+
rsid=hl.tstr,
131+
info=hl.tstruct(
132+
ALLELEID=hl.tint32,
133+
CLNSIG=hl.tarray(hl.tstr),
134+
CLNSIGCONF=hl.tarray(hl.tstr),
135+
CLNREVSTAT=hl.tarray(hl.tstr),
136+
),
137+
),
138+
)
139+
mock_download.return_value = hl.Table.parallelize(
140+
[
141+
{
142+
'#VariationID': '5',
143+
'Submitter': 'OMIM',
144+
'ReportedPhenotypeInfo': 'C3661900:not provided',
145+
},
146+
{
147+
'#VariationID': '5',
148+
'Submitter': 'Broad Institute Rare Disease Group, Broad Institute',
149+
'ReportedPhenotypeInfo': 'C0023264:Leigh syndrome',
150+
},
151+
{
152+
'#VariationID': '5',
153+
'Submitter': 'PreventionGenetics, part of Exact Sciences',
154+
'ReportedPhenotypeInfo': 'na:FOXRED1-related condition',
155+
},
156+
{
157+
'#VariationID': '5',
158+
'Submitter': 'Invitae',
159+
'ReportedPhenotypeInfo': 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
160+
},
161+
{
162+
'#VariationID': '6',
163+
'Submitter': 'A',
164+
'ReportedPhenotypeInfo': 'na:B',
165+
},
166+
],
167+
hl.tstruct(
168+
**{
169+
'#VariationID': hl.tstr,
170+
'Submitter': hl.tstr,
171+
'ReportedPhenotypeInfo': hl.tstr,
172+
},
173+
),
174+
)
175+
ht = join_to_submission_summary_ht(vcf_ht)
176+
self.assertEqual(
177+
ht.collect(),
178+
[
179+
hl.Struct(
180+
locus=hl.Locus(
181+
contig='chr1',
182+
position=871269,
183+
reference_genome='GRCh38',
184+
),
185+
alleles=['A', 'C'],
186+
rsid='5',
187+
info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
188+
submitters=[
189+
'OMIM',
190+
'Broad Institute Rare Disease Group, Broad Institute',
191+
'PreventionGenetics, part of Exact Sciences',
192+
'Invitae',
193+
],
194+
conditions=[
195+
'C3661900:not provided',
196+
'C0023264:Leigh syndrome',
197+
'na:FOXRED1-related condition',
198+
'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
199+
],
200+
),
201+
hl.Struct(
202+
locus=hl.Locus(
203+
contig='chr1',
204+
position=871269,
205+
reference_genome='GRCh38',
206+
),
207+
alleles=['A', 'AC'],
208+
rsid='7',
209+
info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
210+
submitters=None,
211+
conditions=None,
212+
),
213+
],
214+
)

v03_pipeline/lib/reference_data/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ def clinvar_custom_select(ht):
5353
# so there's a hidden enum-mapping inside this clinvar function.
5454
selects['conflictingPathogenicities'] = parsed_and_mapped_clnsigconf(ht)
5555
selects['goldStars'] = CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT))
56+
selects['submitters'] = ht.submitters
57+
selects['conditions'] = hl.map(
58+
lambda p: p.split(r':')[1],
59+
ht.conditions,
60+
) # assumes the format 'MedGen#:condition', e.g.'C0023264:Leigh syndrome'
5661
return selects
5762

5863

v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht'
3636
TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht'
3737

38+
3839
MOCK_CADD_CONFIG = {
3940
'version': 'v1.6',
4041
'select': ['PHRED'],
@@ -66,6 +67,8 @@
6667
CLNSIGCONF=hl.tarray(hl.tstr),
6768
CLNREVSTAT=hl.tarray(hl.tstr),
6869
),
70+
submitters=hl.tarray(hl.tstr),
71+
conditions=hl.tarray(hl.tstr),
6972
),
7073
key=['locus', 'alleles'],
7174
globals=hl.Struct(
@@ -456,6 +459,8 @@
456459
CLNSIGCONF=hl.tarray(hl.tstr),
457460
CLNREVSTAT=hl.tarray(hl.tstr),
458461
),
462+
submitters=hl.tarray(hl.tstr),
463+
conditions=hl.tarray(hl.tstr),
459464
),
460465
key=['locus', 'alleles'],
461466
globals=hl.Struct(
@@ -712,7 +717,15 @@ def test_update_vat_with_updated_rdc_snv_indel_38(
712717
),
713718
alleles=['A', 'C'],
714719
cadd=hl.Struct(PHRED=2),
715-
clinvar=None,
720+
clinvar=hl.Struct(
721+
alleleId=None,
722+
conflictingPathogenicities=None,
723+
goldStars=None,
724+
pathogenicity_id=None,
725+
assertion_ids=None,
726+
submitters=None,
727+
conditions=None,
728+
),
716729
dbnsfp=hl.Struct(
717730
REVEL_score=0.043,
718731
SIFT_score=None,
@@ -949,7 +962,15 @@ def test_update_vat_with_updated_rdc_mito_38(
949962
reference_genome='GRCh38',
950963
),
951964
alleles=['A', 'C'],
952-
clinvar_mito=None,
965+
clinvar_mito=hl.Struct(
966+
alleleId=None,
967+
conflictingPathogenicities=None,
968+
goldStars=None,
969+
pathogenicity_id=None,
970+
assertion_ids=None,
971+
submitters=None,
972+
conditions=None,
973+
),
953974
dbnsfp_mito=hl.Struct(
954975
SIFT_score=None,
955976
MutationTaster_pred_id=2,
@@ -1093,7 +1114,6 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
10931114
),
10941115
],
10951116
)
1096-
10971117
self.assertCountEqual(
10981118
ht.collect(),
10991119
[
@@ -1105,7 +1125,15 @@ def test_update_vat_with_updated_rdc_snv_indel_37(
11051125
),
11061126
alleles=['A', 'C'],
11071127
cadd=hl.Struct(PHRED=9.699999809265137),
1108-
clinvar=None,
1128+
clinvar=hl.Struct(
1129+
alleleId=None,
1130+
conflictingPathogenicities=None,
1131+
goldStars=None,
1132+
pathogenicity_id=None,
1133+
assertion_ids=None,
1134+
submitters=None,
1135+
conditions=None,
1136+
),
11091137
dbnsfp=hl.Struct(
11101138
REVEL_score=0.043,
11111139
SIFT_score=None,

v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,8 @@
7070
CLNSIGCONF=hl.tarray(hl.tstr),
7171
CLNREVSTAT=hl.tarray(hl.tstr),
7272
),
73+
submitters=hl.tarray(hl.tstr),
74+
conditions=hl.tarray(hl.tstr),
7375
),
7476
key=['locus', 'alleles'],
7577
globals=hl.Struct(

0 commit comments

Comments
 (0)