
Commit 70dd4f9

Add test and make max gap and max coalesce size config options
1 parent 361a292 commit 70dd4f9

3 files changed: 39 additions & 12 deletions


src/zarr/codecs/sharding.py

Lines changed: 3 additions & 11 deletions
@@ -569,9 +569,10 @@ async def _load_partial_shard_maybe(
     def _coalesce_chunks(
         self,
         chunks: list[_ChunkCoordsByteSlice],
-        max_gap_bytes: int = 2**20,  # 1MiB
-        coalesce_max_bytes: int = 100 * 2**20,  # 100MiB
     ) -> list[list[_ChunkCoordsByteSlice]]:
+        max_gap_bytes = config.get("sharding.read.coalesce_max_gap_bytes")
+        coalesce_max_bytes = config.get("sharding.read.coalesce_max_bytes")
+
         sorted_chunks = sorted(chunks, key=lambda c: c.byte_slice.start)

         groups = []
@@ -590,15 +591,6 @@ def _coalesce_chunks(

             groups.append(current_group)

-        from pprint import pprint
-
-        pprint(
-            [
-                f"{len(g)} chunks, {(g[-1].byte_slice.stop - g[0].byte_slice.start) / 1e6:.1f}MB"
-                for g in groups
-            ]
-        )
-
         return groups

     async def _get_group_bytes(
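With the hard-coded defaults removed from _coalesce_chunks, the coalescing behaviour can now be tuned through zarr's runtime config. A minimal sketch (not part of this commit; the override values are arbitrary, and it assumes zarr.config is the donfig-backed config object that config.get() reads from):

    # Illustration only: override the new keys at runtime (values are arbitrary).
    import zarr

    zarr.config.set(
        {
            "sharding.read.coalesce_max_gap_bytes": 2**19,   # allow up to 512KiB gaps between chunks
            "sharding.read.coalesce_max_bytes": 50 * 2**20,  # cap coalesced reads at 50MiB
        }
    )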

src/zarr/core/config.py

Lines changed: 6 additions & 0 deletions
@@ -108,6 +108,12 @@ def enable_gpu(self) -> ConfigSet:
             },
             "async": {"concurrency": 10, "timeout": None},
             "threading": {"max_workers": None},
+            "sharding": {
+                "read": {
+                    "coalesce_max_bytes": 100 * 2**20,  # 100MiB
+                    "coalesce_max_gap_bytes": 2**20,  # 1MiB
+                }
+            },
             "json_indent": 2,
             "codec_pipeline": {
                 "path": "zarr.core.codec_pipeline.BatchedCodecPipeline",

tests/test_codecs/test_sharding.py

Lines changed: 30 additions & 1 deletion
@@ -243,10 +243,39 @@ def test_partial_shard_read_performance(store: Store) -> None:
         }
     )

-    with open("zarr-python-partial-shard-read-performance-no-coalesce.json", "w") as f:
+    with open("zarr-python-partial-shard-read-performance.json", "w") as f:
         json.dump(experiments, f)


+@pytest.mark.parametrize("index_location", ["start", "end"])
+@pytest.mark.parametrize("store", ["local", "memory", "zip"], indirect=["store"])
+def test_sharding_multiple_chunks_partial_shard_read(
+    store: Store, index_location: ShardingCodecIndexLocation
+) -> None:
+    array_shape = (8, 64)
+    shard_shape = (4, 32)
+    chunk_shape = (2, 4)
+
+    data = np.arange(np.prod(array_shape), dtype="float32").reshape(array_shape)
+
+    a = zarr.create_array(
+        StorePath(store),
+        shape=data.shape,
+        chunks=chunk_shape,
+        shards={"shape": shard_shape, "index_location": index_location},
+        compressors=BloscCodec(cname="lz4"),
+        dtype=data.dtype,
+        fill_value=1,
+    )
+    a[:] = data
+
+    # Reads 2.5 (3 full, one partial) chunks each from 2 shards (a subset of both shards)
+    assert np.allclose(a[0, 22:42], np.arange(22, 42, dtype="float32"))
+
+    # Reads 2 chunks from both shards along dimension 0
+    assert np.allclose(a[:, 0], np.arange(0, data.size, array_shape[1], dtype="float32"))
+
+
 @pytest.mark.parametrize(
     "array_fixture",
     [
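As context for the assertions in the new test, a small sketch (not part of the commit) of the chunk/shard geometry it sets up:

    # Illustration only: derived from array_shape=(8, 64), shard_shape=(4, 32), chunk_shape=(2, 4).
    shards_per_array = (8 // 4, 64 // 32)  # (2, 2) -> 4 shards in total
    chunks_per_shard = (4 // 2, 32 // 4)   # (2, 8) -> 16 chunks per shard

    # a[0, 22:42] stays in shard row 0 but crosses the column-32 shard boundary,
    # so it reads from 2 of the 4 shards and only a few chunks within each.
    # a[:, 0] reads column 0 from both shard rows, again touching 2 shards.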
