Skip to content

Commit f1bd703

Browse files
jhamman and d-v-b authored
feature(store): V3 ZipStore (#2078)
* feature(store): add basic implementation of a zip store * add zip store to array/group/sharding tests * fix sharding and skip tests that require delete * store context managers * fix test typing * add buffer_cls to store test * clean up test failures * class docstring * remove commented out check against zipstore.delete * add api integration test --------- Co-authored-by: Davis Bennett <davis.v.bennett@gmail.com>
1 parent 534e0cd commit f1bd703

File tree

11 files changed

+413
-58
lines changed

11 files changed

+413
-58
lines changed

src/zarr/abc/store.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212

1313
class AccessMode(NamedTuple):
14+
str: AccessModeLiteral
1415
readonly: bool
1516
overwrite: bool
1617
create: bool
@@ -20,6 +21,7 @@ class AccessMode(NamedTuple):
2021
def from_literal(cls, mode: AccessModeLiteral) -> Self:
2122
if mode in ("r", "r+", "a", "w", "w-"):
2223
return cls(
24+
str=mode,
2325
readonly=mode == "r",
2426
overwrite=mode == "w",
2527
create=mode in ("a", "w", "w-"),
@@ -42,6 +44,14 @@ async def open(cls, *args: Any, **kwargs: Any) -> Self:
4244
await store._open()
4345
return store
4446

47+
def __enter__(self) -> Self:
    """Enter a context manager that will close the store upon exiting.

    Returns
    -------
    Self
        This same store instance; entering does not open or modify it.
    """
    return self
50+
51+
def __exit__(self, *args: Any) -> None:
    """Close the store.

    ``*args`` receives the exception details supplied by the ``with``
    statement; they are ignored. Because ``None`` is returned, an exception
    raised inside the ``with`` block is not suppressed.
    """
    self.close()
54+
4555
async def _open(self) -> None:
4656
if self._is_open:
4757
raise ValueError("store is already open")
@@ -143,6 +153,12 @@ async def set(self, key: str, value: Buffer) -> None:
143153
"""
144154
...
145155

156+
@property
@abstractmethod
def supports_deletes(self) -> bool:
    """Does the store support deletes?

    Abstract: every concrete store must provide a value for this name.
    """
    ...
161+
146162
@abstractmethod
147163
async def delete(self, key: str) -> None:
148164
"""Remove a key from the store
@@ -221,7 +237,6 @@ def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
221237
def close(self) -> None:
222238
"""Close the store."""
223239
self._is_open = False
224-
pass
225240

226241

227242
@runtime_checkable

src/zarr/store/__init__.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,14 @@
22
from zarr.store.local import LocalStore
33
from zarr.store.memory import MemoryStore
44
from zarr.store.remote import RemoteStore
5+
from zarr.store.zip import ZipStore
56

6-
__all__ = ["StorePath", "StoreLike", "make_store_path", "RemoteStore", "LocalStore", "MemoryStore"]
7+
__all__ = [
8+
"StorePath",
9+
"StoreLike",
10+
"make_store_path",
11+
"RemoteStore",
12+
"LocalStore",
13+
"MemoryStore",
14+
"ZipStore",
15+
]

src/zarr/store/local.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ def _put(
7373

7474
class LocalStore(Store):
7575
supports_writes: bool = True
76+
supports_deletes: bool = True
7677
supports_partial_writes: bool = True
7778
supports_listing: bool = True
7879

src/zarr/store/memory.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
# When that is done, the `MemoryStore` will just be a store that wraps a dict.
1919
class MemoryStore(Store):
2020
supports_writes: bool = True
21+
supports_deletes: bool = True
2122
supports_partial_writes: bool = True
2223
supports_listing: bool = True
2324

src/zarr/store/remote.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
class RemoteStore(Store):
2121
# based on FSSpec
2222
supports_writes: bool = True
23+
supports_deletes: bool = True
2324
supports_partial_writes: bool = False
2425
supports_listing: bool = True
2526

src/zarr/store/zip.py

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
from __future__ import annotations
2+
3+
import os
4+
import threading
5+
import time
6+
import zipfile
7+
from pathlib import Path
8+
from typing import TYPE_CHECKING, Literal
9+
10+
from zarr.abc.store import Store
11+
from zarr.core.buffer import Buffer, BufferPrototype
12+
13+
if TYPE_CHECKING:
14+
from collections.abc import AsyncGenerator
15+
16+
ZipStoreAccessModeLiteral = Literal["r", "w", "a"]
17+
18+
19+
class ZipStore(Store):
    """
    Storage class using a ZIP file.

    Parameters
    ----------
    path : Path or str
        Location of file.
    mode : {'r', 'w', 'a'}, optional
        One of 'r' to read an existing file, 'w' to truncate and write a new
        file, or 'a' to append to an existing file.
    compression : int, optional
        Compression method to use when writing to the archive.
    allowZip64 : bool, optional
        If True (the default) will create ZIP files that use the ZIP64
        extensions when the zipfile is larger than 2 GiB. If False
        will raise an exception when the ZIP file would require ZIP64
        extensions.

    Notes
    -----
    The archive is not touched by ``__init__``; it is opened by ``_open``.
    Deleting keys and partial writes are not implemented.
    """

    supports_writes: bool = True
    supports_deletes: bool = False
    supports_partial_writes: bool = False
    supports_listing: bool = True

    path: Path
    compression: int
    allowZip64: bool

    _zf: zipfile.ZipFile
    _lock: threading.RLock

    def __init__(
        self,
        path: Path | str,
        *,
        mode: ZipStoreAccessModeLiteral = "r",
        compression: int = zipfile.ZIP_STORED,
        allowZip64: bool = True,
    ) -> None:
        super().__init__(mode=mode)

        # Path() accepts both str and Path and raises TypeError for anything
        # that is not path-like, so validation does not rely on `assert`
        # (which is stripped under `python -O`).
        self.path = Path(path)

        self._zmode = mode
        self.compression = compression
        self.allowZip64 = allowZip64

        # Created here rather than in _open() so that close() (and therefore
        # the context-manager exit) works on a store that was never opened.
        self._lock = threading.RLock()

    async def _open(self) -> None:
        """Open the underlying ZIP archive.

        Raises
        ------
        ValueError
            If the store is already open.
        """
        if self._is_open:
            raise ValueError("store is already open")

        self._zf = zipfile.ZipFile(
            self.path,
            mode=self._zmode,
            compression=self.compression,
            allowZip64=self.allowZip64,
        )

        self._is_open = True

    def close(self) -> None:
        """Close the store and, if it was opened, the underlying archive."""
        super().close()
        with self._lock:
            # `_zf` only exists once _open() has run.
            zf = getattr(self, "_zf", None)
            if zf is not None:
                zf.close()

    async def clear(self) -> None:
        """Remove every key by deleting the archive and creating a new empty one."""
        with self._lock:
            self._check_writable()
            self._zf.close()
            os.remove(self.path)
            self._zf = zipfile.ZipFile(
                self.path, mode="w", compression=self.compression, allowZip64=self.allowZip64
            )

    async def empty(self) -> bool:
        """Return True when the archive contains no members."""
        with self._lock:
            return not self._zf.namelist()

    def __str__(self) -> str:
        return f"zip://{self.path}"

    def __repr__(self) -> str:
        return f"ZipStore({str(self)!r})"

    def __eq__(self, other: object) -> bool:
        """Two stores are equal when they have the same type and the same path."""
        return isinstance(other, type(self)) and self.path == other.path

    def _get(
        self,
        key: str,
        prototype: BufferPrototype,
        byte_range: tuple[int | None, int | None] | None = None,
    ) -> Buffer | None:
        """Read ``key`` from the archive; callers are expected to hold ``self._lock``.

        Parameters
        ----------
        key : str
            Archive member name.
        prototype : BufferPrototype
            Supplies the buffer class used to wrap the bytes read.
        byte_range : tuple of (start, length), optional
            A negative ``start`` is counted from the end of the member. A
            falsy ``start`` reads from the beginning and a falsy ``length``
            reads through to the end of the member.

        Returns
        -------
        Buffer or None
            The requested bytes, or None if ``key`` is not in the archive.
        """
        # Keep the try narrow: only a missing member should map to None.
        try:
            member = self._zf.open(key)
        except KeyError:
            return None

        with member as f:
            if byte_range is None:
                return prototype.buffer.from_bytes(f.read())
            start, length = byte_range
            if start:
                f.seek(start, os.SEEK_END if start < 0 else os.SEEK_SET)
            data = f.read(length) if length else f.read()
            return prototype.buffer.from_bytes(data)

    async def get(
        self,
        key: str,
        prototype: BufferPrototype,
        byte_range: tuple[int | None, int | None] | None = None,
    ) -> Buffer | None:
        """Return the value stored at ``key`` (optionally a byte range), or None if absent."""
        assert isinstance(key, str)

        with self._lock:
            return self._get(key, prototype=prototype, byte_range=byte_range)

    async def get_partial_values(
        self,
        prototype: BufferPrototype,
        key_ranges: list[tuple[str, tuple[int | None, int | None]]],
    ) -> list[Buffer | None]:
        """Read several ``(key, byte_range)`` pairs under a single lock acquisition.

        Returns one entry per request, in request order; missing keys give None.
        """
        with self._lock:
            return [
                self._get(key, prototype=prototype, byte_range=byte_range)
                for key, byte_range in key_ranges
            ]

    def _set(self, key: str, value: Buffer) -> None:
        """Write ``value`` as archive member ``key``.

        Generally, this should be called inside a lock.
        """
        keyinfo = zipfile.ZipInfo(filename=key, date_time=time.localtime()[:6])
        keyinfo.compress_type = self.compression
        # ZIP member names always use "/" as the separator, independent of the
        # platform's os.sep; endswith() is also safe for an empty key.
        if key.endswith("/"):
            keyinfo.external_attr = 0o40775 << 16  # drwxrwxr-x
            keyinfo.external_attr |= 0x10  # MS-DOS directory flag
        else:
            keyinfo.external_attr = 0o644 << 16  # ?rw-r--r--
        self._zf.writestr(keyinfo, value.to_bytes())

    async def set(self, key: str, value: Buffer) -> None:
        """Store ``value`` under ``key``.

        Raises
        ------
        TypeError
            If ``value`` is not a ``Buffer``.
        """
        self._check_writable()
        assert isinstance(key, str)
        if not isinstance(value, Buffer):
            raise TypeError("ZipStore.set(): `value` must be a Buffer instance")
        with self._lock:
            self._set(key, value)

    async def set_partial_values(self, key_start_values: list[tuple[str, int, bytes]]) -> None:
        """Not implemented (``supports_partial_writes`` is False)."""
        raise NotImplementedError

    async def delete(self, key: str) -> None:
        """Not implemented (``supports_deletes`` is False)."""
        raise NotImplementedError

    async def exists(self, key: str) -> bool:
        """Return True if ``key`` is a member of the archive."""
        with self._lock:
            try:
                self._zf.getinfo(key)
            except KeyError:
                return False
            return True

    async def list(self) -> AsyncGenerator[str, None]:
        """Yield every member name in the archive."""
        # Snapshot under the lock and yield outside it, so a consumer that
        # pauses or abandons the generator never leaves the lock held.
        with self._lock:
            keys = self._zf.namelist()
        for key in keys:
            yield key

    async def list_prefix(self, prefix: str) -> AsyncGenerator[str, None]:
        """Yield every member name that starts with ``prefix``."""
        async for key in self.list():
            if key.startswith(prefix):
                yield key

    async def list_dir(self, prefix: str) -> AsyncGenerator[str, None]:
        """Yield the distinct names found directly below ``prefix``.

        A single trailing "/" on ``prefix`` is ignored and an empty prefix
        lists the top level. Each name is yielded once, in order of first
        appearance in the archive.
        """
        prefix = prefix.removesuffix("/")
        root = f"{prefix}/" if prefix else ""

        with self._lock:
            keys = self._zf.namelist()

        seen: set[str] = set()
        for key in keys:
            if not key.startswith(root):
                continue
            child = key[len(root) :].split("/")[0]
            # A directory entry equal to `root` itself has no child name.
            if child and child not in seen:
                seen.add(child)
                yield child

tests/v3/conftest.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from hypothesis import HealthCheck, Verbosity, settings
1111

1212
from zarr import AsyncGroup, config
13-
from zarr.store import LocalStore, MemoryStore, StorePath
13+
from zarr.store import LocalStore, MemoryStore, StorePath, ZipStore
1414
from zarr.store.remote import RemoteStore
1515

1616
if TYPE_CHECKING:
@@ -25,14 +25,16 @@
2525

2626

2727
async def parse_store(
28-
store: Literal["local", "memory", "remote"], path: str
29-
) -> LocalStore | MemoryStore | RemoteStore:
28+
store: Literal["local", "memory", "remote", "zip"], path: str
29+
) -> LocalStore | MemoryStore | RemoteStore | ZipStore:
3030
if store == "local":
3131
return await LocalStore.open(path, mode="w")
3232
if store == "memory":
3333
return await MemoryStore.open(mode="w")
3434
if store == "remote":
3535
return await RemoteStore.open(url=path, mode="w")
36+
if store == "zip":
37+
return await ZipStore.open(path + "/zarr.zip", mode="w")
3638
raise AssertionError
3739

3840

@@ -64,6 +66,11 @@ async def memory_store() -> MemoryStore:
6466
return await MemoryStore.open(mode="w")
6567

6668

69+
@pytest.fixture(scope="function")
async def zip_store(tmpdir: LEGACY_PATH) -> ZipStore:
    """Provide a ZipStore opened in "w" mode on ``zarr.zip`` inside ``tmpdir``."""
    archive_path = str(tmpdir / "zarr.zip")
    return await ZipStore.open(archive_path, mode="w")
72+
73+
6774
@pytest.fixture(scope="function")
6875
async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store:
6976
param = request.param
@@ -73,7 +80,7 @@ async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> Store:
7380
@dataclass
7481
class AsyncGroupRequest:
7582
zarr_format: ZarrFormat
76-
store: Literal["local", "remote", "memory"]
83+
store: Literal["local", "remote", "memory", "zip"]
7784
attributes: dict[str, Any] = field(default_factory=dict)
7885

7986

tests/v3/test_array.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from zarr.store.common import StorePath
1111

1212

13-
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
13+
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
1414
@pytest.mark.parametrize("zarr_format", (2, 3))
1515
@pytest.mark.parametrize("exists_ok", [True, False])
1616
@pytest.mark.parametrize("extant_node", ["array", "group"])
@@ -59,7 +59,7 @@ def test_array_creation_existing_node(
5959
)
6060

6161

62-
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
62+
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
6363
@pytest.mark.parametrize("zarr_format", (2, 3))
6464
def test_array_name_properties_no_group(
6565
store: LocalStore | MemoryStore, zarr_format: ZarrFormat
@@ -70,7 +70,7 @@ def test_array_name_properties_no_group(
7070
assert arr.basename is None
7171

7272

73-
@pytest.mark.parametrize("store", ("local", "memory"), indirect=["store"])
73+
@pytest.mark.parametrize("store", ("local", "memory", "zip"), indirect=["store"])
7474
@pytest.mark.parametrize("zarr_format", (2, 3))
7575
def test_array_name_properties_with_group(
7676
store: LocalStore | MemoryStore, zarr_format: ZarrFormat

0 commit comments

Comments
 (0)