Skip to content

Commit e707da8

Browse files
committed
Try a checkpoint before the join
1 parent c30d90f commit e707da8

File tree

3 files changed

+16
-9
lines changed

3 files changed

+16
-9
lines changed

v03_pipeline/lib/misc/io.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -200,21 +200,24 @@ def import_pedigree(pedigree_path: str) -> hl.Table:
200200
)
201201

202202

203-
def write(
204-
t: hl.Table | hl.MatrixTable,
205-
destination_path: str,
206-
) -> hl.Table | hl.MatrixTable:
203+
def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable, str]:
207204
suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
208205
read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table
209206
checkpoint_path = os.path.join(
210207
Env.HAIL_TMPDIR,
211208
f'{uuid.uuid4()}.{suffix}',
212209
)
213-
# not using checkpoint to read/write here because the checkpoint codec is different, leading to a different on disk size.
214210
t.write(checkpoint_path)
215-
t = read_fn(checkpoint_path)
211+
return read_fn(checkpoint_path), checkpoint_path
212+
213+
214+
def write(
215+
t: hl.Table | hl.MatrixTable,
216+
destination_path: str,
217+
) -> hl.Table | hl.MatrixTable:
218+
t, path = checkpoint(t)
216219
t = t.repartition(
217-
compute_hail_n_partitions(file_size_bytes(checkpoint_path)),
220+
compute_hail_n_partitions(file_size_bytes(path)),
218221
shuffle=False,
219222
)
220223
return t.write(destination_path, overwrite=True)

v03_pipeline/lib/reference_data/clinvar.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
183183
os.path.basename(tmp_file.name),
184184
)
185185
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
186-
return hl.import_table(
186+
ht = hl.import_table(
187187
gcs_tmp_file_name,
188188
force=True,
189189
filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line
@@ -193,5 +193,6 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
193193
'ReportedPhenotypeInfo': hl.tstr,
194194
},
195195
missing='-',
196-
min_partitions=MIN_HT_PARTITIONS,
197196
)
197+
# NB: min_partitions fails with force=True but appears overrideable
198+
return ht.reparition(MIN_HT_PARTITIONS)

v03_pipeline/lib/reference_data/dataset_table_operations.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import hail as hl
55
import pytz
66

7+
from v03_pipeline.lib.misc.io import checkpoint
78
from v03_pipeline.lib.misc.nested_field import parse_nested_field
89
from v03_pipeline.lib.model import (
910
DatasetType,
@@ -36,6 +37,7 @@ def update_or_create_joined_ht(
3637

3738
# Join the new one!
3839
dataset_ht = get_dataset_ht(dataset, reference_genome)
40+
dataset_ht, _ = checkpoint(dataset_ht)
3941
joined_ht = joined_ht.join(dataset_ht, 'outer')
4042
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
4143

@@ -213,6 +215,7 @@ def join_hts(
213215
)
214216
for dataset in reference_dataset_collection.datasets(dataset_type):
215217
dataset_ht = get_dataset_ht(dataset, reference_genome)
218+
dataset_ht, _ = checkpoint(dataset_ht)
216219
joined_ht = joined_ht.join(dataset_ht, 'outer')
217220
joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht)
218221
return joined_ht

0 commit comments

Comments
 (0)