Skip to content

Commit 4a94b03

Browse files
authored
Add modified time to callset path (#1036)
* Add modified time to callset path * Fix tests * ruff * add test * test not worth it * actual test
1 parent daca608 commit 4a94b03

File tree

2 files changed

+65
-21
lines changed

2 files changed

+65
-21
lines changed

v03_pipeline/lib/paths.py

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
import os
33
import re
44

5+
import hailtop.fs as hfs
6+
57
from v03_pipeline.lib.model import (
68
AccessControl,
79
DatasetType,
@@ -59,6 +61,25 @@ def _v03_reference_dataset_prefix(
5961
)
6062

6163

64+
def _callset_path_hash(callset_path: str) -> str:
    """Return a SHA-256 hex digest identifying a callset path and its freshness.

    The hashed key is ``callset_path`` followed by the most recent
    modification time among the files the path resolves to, so a callset
    that is rewritten in place produces a different digest. When the path
    resolves to nothing, or no listed entry reports a modification time,
    only the path itself is hashed.

    Args:
        callset_path: Path (optionally containing a wildcard) of the
            callset shards.

    Returns:
        The hex-encoded SHA-256 digest of the key.
    """
    try:
        # hfs.ls throws FileNotFoundError if a non-wildcard is passed
        # but not found, but does not throw if a wildcard is passed and
        # there are no results.
        shards = hfs.ls(callset_path)
    except FileNotFoundError:
        shards = []
    # NOTE(review): hailtop declares the entry's modification_time as
    # optional, so skip entries that do not report one instead of letting
    # max() fail comparing None with a float or hashing the text "None".
    modification_times = [
        f.modification_time for f in shards if f.modification_time is not None
    ]
    key = callset_path
    if modification_times:
        key += str(max(modification_times))
    return hashlib.sha256(key.encode('utf8')).hexdigest()
81+
82+
6283
def family_table_path(
6384
reference_genome: ReferenceGenome,
6485
dataset_type: DatasetType,
@@ -114,7 +135,7 @@ def imported_callset_path(
114135
dataset_type,
115136
),
116137
'imported_callsets',
117-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.mt',
138+
f'{_callset_path_hash(callset_path)}.mt',
118139
)
119140

120141

@@ -178,7 +199,7 @@ def relatedness_check_table_path(
178199
dataset_type,
179200
),
180201
'relatedness_check',
181-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.ht',
202+
f'{_callset_path_hash(callset_path)}.ht',
182203
)
183204

184205

@@ -194,7 +215,7 @@ def relatedness_check_tsv_path(
194215
dataset_type,
195216
),
196217
'relatedness_check',
197-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv',
218+
f'{_callset_path_hash(callset_path)}.tsv',
198219
)
199220

200221

@@ -212,7 +233,7 @@ def remapped_and_subsetted_callset_path(
212233
),
213234
'remapped_and_subsetted_callsets',
214235
project_guid,
215-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.mt',
236+
f'{_callset_path_hash(callset_path)}.mt',
216237
)
217238

218239

@@ -256,7 +277,7 @@ def sex_check_table_path(
256277
dataset_type,
257278
),
258279
'sex_check',
259-
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.ht',
280+
f'{_callset_path_hash(callset_path)}.ht',
260281
)
261282

262283

v03_pipeline/lib/paths_test.py

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import unittest
22
from unittest.mock import patch
33

4+
import hailtop.fs as hfs
5+
46
from v03_pipeline.lib.model import (
57
DatasetType,
68
ReferenceGenome,
@@ -24,6 +26,8 @@
2426
variant_annotations_table_path,
2527
)
2628

29+
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30varia*.vcf'
30+
2731

2832
class TestPaths(unittest.TestCase):
2933
def test_family_table_path(self) -> None:
@@ -39,15 +43,15 @@ def test_family_table_path(self) -> None:
3943
with patch('v03_pipeline.lib.paths.Env') as mock_env, patch(
4044
'v03_pipeline.lib.paths.FeatureFlag',
4145
) as mock_ff:
42-
mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/'
46+
mock_env.HAIL_SEARCH_DATA_DIR = '/var/bucket/'
4347
self.assertEqual(
4448
family_table_path(
4549
ReferenceGenome.GRCh37,
4650
DatasetType.SNV_INDEL,
4751
SampleType.WES,
4852
'franklin',
4953
),
50-
'gs://seqr-datasets/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
54+
'/var/bucket/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
5155
)
5256
mock_ff.INCLUDE_PIPELINE_VERSION_IN_PREFIX = False
5357
self.assertEqual(
@@ -57,15 +61,15 @@ def test_family_table_path(self) -> None:
5761
SampleType.WES,
5862
'franklin',
5963
),
60-
'gs://seqr-datasets/GRCh37/SNV_INDEL/families/WES/franklin.ht',
64+
'/var/bucket/GRCh37/SNV_INDEL/families/WES/franklin.ht',
6165
)
6266

6367
def test_valid_filters_path(self) -> None:
6468
self.assertEqual(
6569
valid_filters_path(
6670
DatasetType.MITO,
6771
SampleType.WES,
68-
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
72+
'/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
6973
),
7074
None,
7175
)
@@ -75,9 +79,9 @@ def test_valid_filters_path(self) -> None:
7579
valid_filters_path(
7680
DatasetType.SNV_INDEL,
7781
SampleType.WES,
78-
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
82+
'/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
7983
),
80-
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
84+
'/var/bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
8185
)
8286

8387
def test_project_table_path(self) -> None:
@@ -105,19 +109,19 @@ def test_sex_check_table_path(self) -> None:
105109
sex_check_table_path(
106110
ReferenceGenome.GRCh38,
107111
DatasetType.SNV_INDEL,
108-
'gs://abc.efg/callset.vcf.gz',
112+
'/var/abc.efg/callset.vcf.gz',
109113
),
110-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
114+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/sex_check/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.ht',
111115
)
112116

113117
def test_relatedness_check_table_path(self) -> None:
114118
self.assertEqual(
115119
relatedness_check_table_path(
116120
ReferenceGenome.GRCh38,
117121
DatasetType.SNV_INDEL,
118-
'gs://abc.efg/callset.vcf.gz',
122+
'/var/abc.efg/callset.vcf.gz',
119123
),
120-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
124+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.ht',
121125
)
122126

123127
def test_validation_errors_for_run_path(self) -> None:
@@ -154,31 +158,50 @@ def test_remapped_and_subsetted_callset_path(self) -> None:
154158
remapped_and_subsetted_callset_path(
155159
ReferenceGenome.GRCh38,
156160
DatasetType.GCNV,
157-
'gs://abc.efg/callset.vcf.gz',
161+
'/var/abc.efg/callset.vcf.gz',
158162
'R0111_tgg_bblanken_wes',
159163
),
160-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
164+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.mt',
161165
)
162166
self.assertEqual(
163167
remapped_and_subsetted_callset_path(
164168
ReferenceGenome.GRCh38,
165169
DatasetType.GCNV,
166-
'gs://abc.efg/callset/*.vcf.gz',
170+
'/var/abc.efg/callset/*.vcf.gz',
167171
'R0111_tgg_bblanken_wes',
168172
),
169-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt',
173+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/26f481b386721f9889250c6549905660728ec9f77be4b8f7eeb6c4facc76282e.mt',
170174
)
171175

172176
def test_imported_callset_path(self) -> None:
173177
self.assertEqual(
174178
imported_callset_path(
175179
ReferenceGenome.GRCh38,
176180
DatasetType.SNV_INDEL,
177-
'gs://abc.efg/callset.vcf.gz',
181+
'/var/abc.efg/callset.vcf.gz',
178182
),
179-
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
183+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/f92b8ab6b5b8c41fa20d7d49a5626b96dcd2ba79fa6f61eab7ffb80d550d951c.mt',
180184
)
181185

186+
with patch('v03_pipeline.lib.paths.hfs.ls') as mock_ls:
187+
mock_ls.return_value = [
188+
hfs.stat_result.FileListEntry(
189+
path='v03_pipeline/var/test/callsets/1kg_30variants.vcf',
190+
owner=None,
191+
size=104481,
192+
typ=hfs.stat_result.FileType(2),
193+
modification_time=1732033623.804012,
194+
),
195+
]
196+
self.assertEqual(
197+
imported_callset_path(
198+
ReferenceGenome.GRCh38,
199+
DatasetType.SNV_INDEL,
200+
TEST_VCF,
201+
),
202+
'/var/seqr/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/42f2c9e2025c4b61106b3fecfd30443f882a1849b73c6f6903a7e421c20117e0.mt',
203+
)
204+
182205
def test_tdr_metrics_path(self) -> None:
183206
self.assertEqual(
184207
tdr_metrics_path(

0 commit comments

Comments
 (0)