Skip to content

Commit 3dd56d6

Browse files
committed
Try more partitions in the repartition
1 parent 99e5d90 commit 3dd56d6

File tree

2 files changed

+11
-4
lines changed

2 files changed

+11
-4
lines changed

v03_pipeline/lib/misc/io.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -271,26 +271,33 @@ def remap_pedigree_hash(remap_path: str, pedigree_path: str) -> hl.Int32Expressi
271271
return hl.int32(int(sha256.hexdigest()[:8], 16))
272272

273273

274-
def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable, str]:
274+
def checkpoint(
275+
t: hl.Table | hl.MatrixTable,
276+
repartition_factor: int = 1,
277+
) -> tuple[hl.Table | hl.MatrixTable, str]:
275278
suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
276279
read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table
277280
checkpoint_path = os.path.join(
278281
Env.HAIL_TMP_DIR,
279282
f'{uuid.uuid4()}.{suffix}',
280283
)
281-
t.write(checkpoint_path)
284+
t.write(checkpoint_path, repartition_factor=repartition_factor)
282285
return read_fn(checkpoint_path), checkpoint_path
283286

284287

285288
def write(
286289
t: hl.Table | hl.MatrixTable,
287290
destination_path: str,
288291
repartition: bool = True,
292+
# May be used to increase the number of partitions beyond
293+
# the optimally computed number. A higher number will
294+
# shard the table into more partitions.
295+
repartition_factor: int = 1,
289296
) -> hl.Table | hl.MatrixTable:
290297
t, path = checkpoint(t)
291298
if repartition:
292299
t = t.repartition(
293-
compute_hail_n_partitions(file_size_bytes(path)),
300+
(compute_hail_n_partitions(file_size_bytes(path)) * repartition_factor),
294301
shuffle=False,
295302
)
296303
return t.write(destination_path, overwrite=True)

v03_pipeline/lib/reference_datasets/splice_ai.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def get_ht(
1313
# of file descriptors on dataproc :/
1414
hl._set_flags(use_new_shuffle=None, no_whole_stage_codegen='1') # noqa: SLF001
1515
ht = vcf_to_ht(paths, reference_genome)
16-
ht, _ = checkpoint(ht)
16+
ht, _ = checkpoint(ht, repartition_factor=2)
1717

1818
# SpliceAI INFO field description from the VCF header: SpliceAIv1.3 variant annotation. These include
1919
# delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and

0 commit comments

Comments
 (0)