Skip to content

Commit 2debf88

Browse files
authored
Parse clinvar version from header (#949)
* Parse clinvar version from header
* responses activate
* fix test
1 parent 7a1966d commit 2debf88

File tree

2 files changed

+38
-18
lines changed

2 files changed

+38
-18
lines changed

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ def download_and_import_latest_clinvar_vcf(
120120
clinvar_url: str,
121121
reference_genome: ReferenceGenome,
122122
) -> hl.Table:
123+
version = parse_clinvar_release_date(clinvar_url)
123124
with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file:
124125
urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310
125126
cached_tmp_file_name = os.path.join(
@@ -139,27 +140,20 @@ def download_and_import_latest_clinvar_vcf(
139140
min_partitions=MIN_HT_PARTITIONS,
140141
force_bgz=True,
141142
)
142-
mt = mt.annotate_globals(version=_parse_clinvar_release_date(tmp_file.name))
143+
mt = mt.annotate_globals(version=version)
143144
return join_to_submission_summary_ht(mt.rows())
144145

145146

146-
def _parse_clinvar_release_date(local_vcf_path: str) -> str:
147-
"""Parse clinvar release date from the VCF header.
148-
149-
Args:
150-
local_vcf_path (str): clinvar vcf path on the local file system.
151-
152-
Returns:
153-
str: return VCF release date as string, or None if release date not found in header.
154-
"""
155-
with gzip.open(local_vcf_path, 'rt') as f:
156-
for line in f:
157-
if line.startswith('##fileDate='):
158-
return line.split('=')[-1].strip()
159-
160-
if not line.startswith('#'):
161-
return None
162-
147+
def parse_clinvar_release_date(clinvar_url: str) -> 'str | None':
    """Parse the ClinVar release date from the header of a remote VCF.

    The gzip-compressed VCF at ``clinvar_url`` is requested as a stream and
    decompressed line by line; scanning stops at the first ``##fileDate=``
    header line or at the first non-header line, whichever comes first.

    Args:
        clinvar_url (str): URL of the gzip-compressed ClinVar VCF.

    Returns:
        The value of the ``##fileDate=`` header line as a string, or None if
        the header ends (or the file ends) before such a line is seen.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    # Use the response as a context manager: we stop reading after a few
    # header lines, so the streamed connection must be released explicitly.
    with requests.get(clinvar_url, stream=True, timeout=10) as response:
        # Fail with a clear HTTP error instead of feeding an error page to gzip.
        response.raise_for_status()
        with gzip.GzipFile(fileobj=response.raw) as vcf_stream:
            for byte_line in vcf_stream:
                line = byte_line.decode('ascii').strip()
                if not line:
                    continue
                if line.startswith('##fileDate='):
                    return line.split('=')[-1].strip()
                if not line.startswith('#'):
                    # First non-header line reached without a fileDate entry.
                    return None
    return None
164158

165159

v03_pipeline/lib/reference_data/clinvar_test.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,43 @@
1+
import gzip
12
import unittest
23
from unittest import mock
34

45
import hail as hl
6+
import responses
57

68
from v03_pipeline.lib.reference_data.clinvar import (
79
import_submission_table,
810
join_to_submission_summary_ht,
11+
parse_clinvar_release_date,
912
parsed_and_mapped_clnsigconf,
1013
parsed_clnsig,
1114
)
1215

16+
CLINVAR_VCF_DATA = b"""
17+
##fileformat=VCFv4.1
18+
##fileDate=2024-10-27
19+
##source=ClinVar
20+
##reference=GRCh37
21+
##ID=<Description="ClinVar Variation ID">
22+
##INFO=<ID=AF_ESP,Number=1,Type=Float,Description="allele frequencies from GO-ESP">
23+
"""
24+
1325

1426
class ClinvarTest(unittest.TestCase):
27+
@responses.activate
28+
def test_parse_clinvar_release_date(self):
29+
clinvar_url = (
30+
'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz'
31+
)
32+
responses.get(
33+
clinvar_url,
34+
body=gzip.compress(CLINVAR_VCF_DATA),
35+
)
36+
self.assertEqual(
37+
parse_clinvar_release_date(clinvar_url),
38+
'2024-10-27',
39+
)
40+
1541
def test_parsed_clnsig(self):
1642
ht = hl.Table.parallelize(
1743
[

0 commit comments

Comments
 (0)