@@ -120,6 +120,7 @@ def download_and_import_latest_clinvar_vcf(
120
120
clinvar_url : str ,
121
121
reference_genome : ReferenceGenome ,
122
122
) -> hl .Table :
123
+ version = parse_clinvar_release_date (clinvar_url )
123
124
with tempfile .NamedTemporaryFile (suffix = '.vcf.gz' , delete = False ) as tmp_file :
124
125
urllib .request .urlretrieve (clinvar_url , tmp_file .name ) # noqa: S310
125
126
cached_tmp_file_name = os .path .join (
@@ -139,27 +140,20 @@ def download_and_import_latest_clinvar_vcf(
139
140
min_partitions = MIN_HT_PARTITIONS ,
140
141
force_bgz = True ,
141
142
)
142
- mt = mt .annotate_globals (version = _parse_clinvar_release_date ( tmp_file . name ) )
143
+ mt = mt .annotate_globals (version = version )
143
144
return join_to_submission_summary_ht (mt .rows ())
144
145
145
146
146
- def _parse_clinvar_release_date (local_vcf_path : str ) -> str :
147
- """Parse clinvar release date from the VCF header.
148
-
149
- Args:
150
- local_vcf_path (str): clinvar vcf path on the local file system.
151
-
152
- Returns:
153
- str: return VCF release date as string, or None if release date not found in header.
154
- """
155
- with gzip .open (local_vcf_path , 'rt' ) as f :
156
- for line in f :
157
- if line .startswith ('##fileDate=' ):
158
- return line .split ('=' )[- 1 ].strip ()
159
-
160
- if not line .startswith ('#' ):
161
- return None
162
-
147
def parse_clinvar_release_date(clinvar_url: str) -> 'str | None':
    """Parse the clinvar release date from the header of a remote gzipped VCF.

    The response body is streamed and decompressed line by line, and reading
    stops at the ``##fileDate=`` line or at the first non-header line,
    whichever comes first.

    Args:
        clinvar_url (str): URL of the gzip-compressed clinvar VCF.

    Returns:
        str: the value of the ``##fileDate=`` header line, or None if no such
            line is found before the first non-header line (or end of file).

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    # The context managers release the streamed connection on every exit
    # path, including the early returns inside the loop.
    with requests.get(clinvar_url, stream=True, timeout=10) as response:
        # Fail with a clear HTTP error instead of handing an error page to gzip.
        response.raise_for_status()
        with gzip.GzipFile(fileobj=response.raw) as header_stream:
            for raw_line in header_stream:
                # Compare as bytes so unrelated header lines never need decoding.
                line = raw_line.strip()
                if not line:
                    continue
                if line.startswith(b'##fileDate='):
                    return line.split(b'=')[-1].strip().decode('ascii')
                if not line.startswith(b'#'):
                    # Header is over: the release date is not present.
                    return None
    return None
164
158
165
159
0 commit comments