Skip to content

Commit 13655e1

Browse files
authored
Merge pull request #839 from broadinstitute/dev
Dev
2 parents cefeef9 + 17ed92c commit 13655e1

File tree

9 files changed

+256
-111
lines changed

9 files changed

+256
-111
lines changed

v03_pipeline/lib/misc/io.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -214,10 +214,12 @@ def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable,
214214
def write(
215215
t: hl.Table | hl.MatrixTable,
216216
destination_path: str,
217+
repartition: bool = True,
217218
) -> hl.Table | hl.MatrixTable:
218219
t, path = checkpoint(t)
219-
t = t.repartition(
220-
compute_hail_n_partitions(file_size_bytes(path)),
221-
shuffle=False,
222-
)
220+
if repartition:
221+
t = t.repartition(
222+
compute_hail_n_partitions(file_size_bytes(path)),
223+
shuffle=False,
224+
)
223225
return t.write(destination_path, overwrite=True)

v03_pipeline/lib/paths.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -282,3 +282,10 @@ def new_variants_table_path(
282282
run_id,
283283
'new_variants.ht',
284284
)
285+
286+
287+
def clinvar_dataset_path(reference_genome: ReferenceGenome, etag: str) -> str:
288+
return os.path.join(
289+
Env.HAIL_TMPDIR,
290+
f'clinvar-{reference_genome.value}-{etag}.ht',
291+
)

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 43 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,15 @@
66
import urllib
77

88
import hail as hl
9+
import hailtop.fs as hfs
10+
import requests
911

1012
from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES_LOOKUP
1113
from v03_pipeline.lib.logger import get_logger
14+
from v03_pipeline.lib.misc.io import write
1215
from v03_pipeline.lib.model import Env
1316
from v03_pipeline.lib.model.definitions import ReferenceGenome
17+
from v03_pipeline.lib.paths import clinvar_dataset_path
1418

1519
CLINVAR_ASSERTIONS = [
1620
'Affects',
@@ -110,12 +114,26 @@ def parsed_and_mapped_clnsigconf(ht: hl.Table):
110114
)
111115

112116

117+
def get_clinvar_ht(
118+
clinvar_url: str,
119+
reference_genome: ReferenceGenome,
120+
):
121+
etag = requests.head(clinvar_url, timeout=10).headers.get('ETag').strip('"')
122+
clinvar_ht_path = clinvar_dataset_path(reference_genome, etag)
123+
if hfs.exists(clinvar_ht_path):
124+
logger.info(f'Try using cached clinvar ht with etag {etag}')
125+
ht = hl.read_table(clinvar_ht_path)
126+
else:
127+
logger.info('Cached clinvar ht not found, downloading latest clinvar vcf')
128+
ht = download_and_import_latest_clinvar_vcf(clinvar_url, reference_genome)
129+
write(ht, clinvar_ht_path, repartition=False)
130+
return ht
131+
132+
113133
def download_and_import_latest_clinvar_vcf(
114134
clinvar_url: str,
115135
reference_genome: ReferenceGenome,
116136
) -> hl.Table:
117-
"""Downloads the latest clinvar VCF from the NCBI FTP server, imports it to a MT and returns that."""
118-
119137
with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file:
120138
urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310
121139
gcs_tmp_file_name = os.path.join(
@@ -158,14 +176,8 @@ def _parse_clinvar_release_date(local_vcf_path: str) -> str:
158176

159177
def join_to_submission_summary_ht(vcf_ht: hl.Table) -> hl.Table:
160178
# https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/README - submission_summary.txt
161-
logger.info('Getting clinvar submission summary')
179+
logger.info('Getting clinvar submission summary from NCBI FTP server')
162180
ht = download_and_import_clinvar_submission_summary()
163-
ht = ht.rename({'#VariationID': 'VariationID'})
164-
ht = ht.select('VariationID', 'Submitter', 'ReportedPhenotypeInfo')
165-
ht = ht.group_by('VariationID').aggregate(
166-
Submitters=hl.agg.collect(ht.Submitter),
167-
Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo),
168-
)
169181
return vcf_ht.annotate(
170182
submitters=ht[vcf_ht.rsid].Submitters,
171183
conditions=ht[vcf_ht.rsid].Conditions,
@@ -193,15 +205,25 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
193205
os.path.basename(unzipped_tmp_file.name),
194206
)
195207
safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name)
196-
return hl.import_table(
197-
gcs_tmp_file_name,
198-
force=True,
199-
filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line
200-
types={
201-
'#VariationID': hl.tstr,
202-
'Submitter': hl.tstr,
203-
'ReportedPhenotypeInfo': hl.tstr,
204-
},
205-
missing='-',
206-
min_partitions=MIN_HT_PARTITIONS,
207-
)
208+
return import_submission_table(gcs_tmp_file_name)
209+
210+
211+
def import_submission_table(file_name: str) -> hl.Table:
212+
ht = hl.import_table(
213+
file_name,
214+
force=True,
215+
filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line
216+
types={
217+
'#VariationID': hl.tstr,
218+
'Submitter': hl.tstr,
219+
'ReportedPhenotypeInfo': hl.tstr,
220+
},
221+
missing='-',
222+
min_partitions=MIN_HT_PARTITIONS,
223+
)
224+
ht = ht.rename({'#VariationID': 'VariationID'})
225+
ht = ht.select('VariationID', 'Submitter', 'ReportedPhenotypeInfo')
226+
return ht.group_by('VariationID').aggregate(
227+
Submitters=hl.agg.collect(ht.Submitter),
228+
Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo),
229+
)

v03_pipeline/lib/reference_data/clinvar_test.py

Lines changed: 115 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import hail as hl
55

66
from v03_pipeline.lib.reference_data.clinvar import (
7+
import_submission_table,
78
join_to_submission_summary_ht,
89
parsed_and_mapped_clnsigconf,
910
parsed_clnsig,
@@ -87,56 +88,10 @@ def test_parsed_and_mapped_clnsigconf(self):
8788
)
8889

8990
@mock.patch(
90-
'v03_pipeline.lib.reference_data.clinvar.download_and_import_clinvar_submission_summary',
91+
'v03_pipeline.lib.reference_data.clinvar.hl.import_table',
9192
)
92-
def test_join_to_submission_summary_ht(self, mock_download):
93-
clinvar_enums_struct = hl.Struct(
94-
CLNSIG=[
95-
'Pathogenic/Likely_pathogenic/Pathogenic',
96-
'_low_penetrance',
97-
],
98-
CLNSIGCONF=[
99-
'Pathogenic(8)|Likely_pathogenic(2)|Pathogenic',
100-
'_low_penetrance(1)|Uncertain_significance(1)',
101-
],
102-
CLNREVSTAT=['no_classifications_from_unflagged_records'],
103-
)
104-
vcf_ht = hl.Table.parallelize(
105-
[
106-
{
107-
'locus': hl.Locus(
108-
contig='chr1',
109-
position=871269,
110-
reference_genome='GRCh38',
111-
),
112-
'alleles': ['A', 'C'],
113-
'rsid': '5',
114-
'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
115-
},
116-
{
117-
'locus': hl.Locus(
118-
contig='chr1',
119-
position=871269,
120-
reference_genome='GRCh38',
121-
),
122-
'alleles': ['A', 'AC'],
123-
'rsid': '7',
124-
'info': hl.Struct(ALLELEID=1, **clinvar_enums_struct),
125-
},
126-
],
127-
hl.tstruct(
128-
locus=hl.tlocus('GRCh38'),
129-
alleles=hl.tarray(hl.tstr),
130-
rsid=hl.tstr,
131-
info=hl.tstruct(
132-
ALLELEID=hl.tint32,
133-
CLNSIG=hl.tarray(hl.tstr),
134-
CLNSIGCONF=hl.tarray(hl.tstr),
135-
CLNREVSTAT=hl.tarray(hl.tstr),
136-
),
137-
),
138-
)
139-
mock_download.return_value = hl.Table.parallelize(
93+
def test_import_submission_table(self, mock_import_table):
94+
mock_import_table.return_value = hl.Table.parallelize(
14095
[
14196
{
14297
'#VariationID': '5',
@@ -164,51 +119,137 @@ def test_join_to_submission_summary_ht(self, mock_download):
164119
'ReportedPhenotypeInfo': 'na:B',
165120
},
166121
],
167-
hl.tstruct(
168-
**{
169-
'#VariationID': hl.tstr,
170-
'Submitter': hl.tstr,
171-
'ReportedPhenotypeInfo': hl.tstr,
172-
},
173-
),
174122
)
175-
ht = join_to_submission_summary_ht(vcf_ht)
123+
ht = import_submission_table('mock_file_name')
176124
self.assertEqual(
177125
ht.collect(),
178126
[
179127
hl.Struct(
180-
locus=hl.Locus(
181-
contig='chr1',
182-
position=871269,
183-
reference_genome='GRCh38',
184-
),
185-
alleles=['A', 'C'],
186-
rsid='5',
187-
info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
188-
submitters=[
128+
VariationID='5',
129+
Submitters=[
189130
'OMIM',
190131
'Broad Institute Rare Disease Group, Broad Institute',
191132
'PreventionGenetics, part of Exact Sciences',
192133
'Invitae',
193134
],
194-
conditions=[
135+
Conditions=[
195136
'C3661900:not provided',
196137
'C0023264:Leigh syndrome',
197138
'na:FOXRED1-related condition',
198139
'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
199140
],
200141
),
201142
hl.Struct(
202-
locus=hl.Locus(
143+
VariationID='6',
144+
Submitters=['A'],
145+
Conditions=['na:B'],
146+
),
147+
],
148+
)
149+
150+
@mock.patch(
151+
'v03_pipeline.lib.reference_data.clinvar.download_and_import_clinvar_submission_summary',
152+
)
153+
def test_join_to_submission_summary_ht(
154+
self,
155+
mock_download,
156+
):
157+
vcf_ht = hl.Table.parallelize(
158+
[
159+
{
160+
'locus': hl.Locus(
203161
contig='chr1',
204162
position=871269,
205163
reference_genome='GRCh38',
206164
),
207-
alleles=['A', 'AC'],
208-
rsid='7',
209-
info=hl.Struct(ALLELEID=1, **clinvar_enums_struct),
210-
submitters=None,
211-
conditions=None,
212-
),
165+
'alleles': ['A', 'C'],
166+
'rsid': '5',
167+
'info': hl.Struct(ALLELEID=1),
168+
},
169+
{
170+
'locus': hl.Locus(
171+
contig='chr1',
172+
position=871269,
173+
reference_genome='GRCh38',
174+
),
175+
'alleles': ['A', 'AC'],
176+
'rsid': '7',
177+
'info': hl.Struct(ALLELEID=1),
178+
},
179+
],
180+
hl.tstruct(
181+
locus=hl.tlocus('GRCh38'),
182+
alleles=hl.tarray(hl.tstr),
183+
rsid=hl.tstr,
184+
info=hl.tstruct(ALLELEID=hl.tint32),
185+
),
186+
)
187+
submitters_ht = hl.Table.parallelize(
188+
[
189+
{
190+
'VariationID': '5',
191+
'Submitters': [
192+
'OMIM',
193+
'Broad Institute Rare Disease Group, Broad Institute',
194+
'PreventionGenetics, part of Exact Sciences',
195+
'Invitae',
196+
],
197+
'Conditions': [
198+
'C3661900:not provided',
199+
'C0023264:Leigh syndrome',
200+
'na:FOXRED1-related condition',
201+
'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
202+
],
203+
},
204+
{'VariationID': '6', 'Submitters': ['A'], 'Conditions': ['na:B']},
213205
],
206+
hl.tstruct(
207+
VariationID=hl.tstr,
208+
Submitters=hl.tarray(hl.tstr),
209+
Conditions=hl.tarray(hl.tstr),
210+
),
211+
key='VariationID',
212+
)
213+
expected_clinvar_ht_rows = [
214+
hl.Struct(
215+
locus=hl.Locus(
216+
contig='chr1',
217+
position=871269,
218+
reference_genome='GRCh38',
219+
),
220+
alleles=['A', 'C'],
221+
rsid='5',
222+
info=hl.Struct(ALLELEID=1),
223+
submitters=[
224+
'OMIM',
225+
'Broad Institute Rare Disease Group, Broad Institute',
226+
'PreventionGenetics, part of Exact Sciences',
227+
'Invitae',
228+
],
229+
conditions=[
230+
'C3661900:not provided',
231+
'C0023264:Leigh syndrome',
232+
'na:FOXRED1-related condition',
233+
'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19',
234+
],
235+
),
236+
hl.Struct(
237+
locus=hl.Locus(
238+
contig='chr1',
239+
position=871269,
240+
reference_genome='GRCh38',
241+
),
242+
alleles=['A', 'AC'],
243+
rsid='7',
244+
info=hl.Struct(ALLELEID=1),
245+
submitters=None,
246+
conditions=None,
247+
),
248+
]
249+
250+
mock_download.return_value = submitters_ht
251+
ht = join_to_submission_summary_ht(vcf_ht)
252+
self.assertEqual(
253+
ht.collect(),
254+
expected_clinvar_ht_rows,
214255
)

0 commit comments

Comments
 (0)