Skip to content

Commit b884714

Browse files
committed
Modify getsize to return total size, not just the top level
1 parent e1d98cd commit b884714

File tree

4 files changed

+44
-46
lines changed

4 files changed

+44
-46
lines changed

docs/release.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ Enhancements
2828

2929
Maintenance
3030
~~~~~~~~~~~
31+
* ``getsize`` now returns the total size of all nested arrays, not just the size of the top level.
32+
By :user:`Ben Jeffery <benjeffery>` :issue:`253`.
3133

3234
Deprecations
3335
~~~~~~~~~~~~

zarr/storage.py

Lines changed: 37 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
from collections import OrderedDict
3131
from collections.abc import MutableMapping
3232
from functools import lru_cache
33-
from os import scandir
3433
from pickle import PicklingError
3534
from threading import Lock, RLock
3635
from typing import Sequence, Mapping, Optional, Union, List, Tuple, Dict, Any
@@ -270,9 +269,15 @@ def _getsize(store: BaseStore, path: Path = None) -> int:
270269
# also include zarr.json?
271270
# members += ['zarr.json']
272271
else:
273-
members = listdir(store, path)
274-
prefix = _path_to_prefix(path)
275-
members = [prefix + k for k in members]
272+
to_visit = [path]
273+
members = []
274+
while to_visit:
275+
print(to_visit)
276+
current_path = to_visit.pop()
277+
current_members = listdir(store, current_path)
278+
prefix = _path_to_prefix(current_path)
279+
members.extend([prefix + k for k in current_members])
280+
to_visit.extend([prefix + k for k in current_members])
276281
for k in members:
277282
try:
278283
v = store[k]
@@ -976,8 +981,12 @@ def getsize(self, path: Path = None):
976981
elif isinstance(value, self.cls):
977982
# total size for directory
978983
size = 0
979-
for v in value.values():
980-
if not isinstance(v, self.cls):
984+
to_visit = list(value.values())
985+
while to_visit:
986+
v = to_visit.pop()
987+
if isinstance(v, self.cls):
988+
to_visit.extend(v.values())
989+
else:
981990
size += buffer_size(v)
982991
return size
983992

@@ -1274,9 +1283,13 @@ def getsize(self, path=None):
12741283
return os.path.getsize(fs_path)
12751284
elif os.path.isdir(fs_path):
12761285
size = 0
1277-
for child in scandir(fs_path):
1278-
if child.is_file():
1279-
size += child.stat().st_size
1286+
for root, _, files in os.walk(fs_path):
1287+
# Include the size of the directory itself, as this can be substantial
1288+
# for directories with many files.
1289+
size += os.path.getsize(root)
1290+
for file in files:
1291+
file_path = os.path.join(root, file)
1292+
size += os.path.getsize(file_path)
12801293
return size
12811294
else:
12821295
return 0
@@ -1921,29 +1934,19 @@ def listdir(self, path=None):
19211934
def getsize(self, path=None):
19221935
path = normalize_storage_path(path)
19231936
with self.mutex:
1924-
children = self.listdir(path)
1925-
if children:
1926-
size = 0
1927-
for child in children:
1928-
if path:
1929-
name = path + "/" + child
1930-
else:
1931-
name = child
1932-
try:
1933-
info = self.zf.getinfo(name)
1934-
except KeyError:
1935-
pass
1936-
else:
1937-
size += info.compress_size
1938-
return size
1939-
elif path:
1937+
to_visit = [path] if path else self.listdir(path)
1938+
total_size = 0
1939+
while to_visit:
1940+
current_path = to_visit.pop()
19401941
try:
1941-
info = self.zf.getinfo(path)
1942-
return info.compress_size
1942+
info = self.zf.getinfo(current_path)
1943+
total_size += info.compress_size
19431944
except KeyError:
1944-
return 0
1945-
else:
1946-
return 0
1945+
children = self.listdir(current_path)
1946+
for child in children:
1947+
full_path = current_path + "/" + child if current_path else child
1948+
to_visit.append(full_path)
1949+
return total_size
19471950

19481951
def clear(self):
19491952
if self.mode == "r":
@@ -2527,6 +2530,8 @@ def listdir(self, path: Path = None):
25272530
return listing
25282531

25292532
def getsize(self, path=None) -> int:
2533+
print("WYF")
2534+
print(self._store, path)
25302535
return getsize(self._store, path=path)
25312536

25322537
def _pop_value(self):
@@ -2795,10 +2800,9 @@ def getsize(self, path=None):
27952800
size = self.cursor.execute(
27962801
"""
27972802
SELECT COALESCE(SUM(LENGTH(v)), 0) FROM zarr
2798-
WHERE k LIKE (? || "%") AND
2799-
0 == INSTR(LTRIM(SUBSTR(k, LENGTH(?) + 1), "/"), "/")
2803+
WHERE k LIKE (? || "%")
28002804
""",
2801-
(path, path),
2805+
(path,),
28022806
)
28032807
for (s,) in size:
28042808
return s

zarr/tests/test_core.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1686,7 +1686,8 @@ def create_store(self):
16861686
def test_nbytes_stored(self):
16871687
# dict as store
16881688
z = self.create_array(shape=1000, chunks=100)
1689-
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())
1689+
# 4096 is the size of the containing directory, which is now included in the total
1690+
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values()) + 4096
16901691
assert expect_nbytes_stored == z.nbytes_stored
16911692
z[:] = 42
16921693
expect_nbytes_stored = sum(buffer_size(v) for v in z.store.values())

zarr/tests/test_storage.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -366,19 +366,10 @@ def test_hierarchy(self):
366366

367367
# test getsize (optional)
368368
if hasattr(store, "getsize"):
369-
# TODO: proper behavior of getsize?
370-
# v3 returns size of all nested arrays, not just the
371-
# size of the arrays in the current folder.
372-
if self.version == 2:
373-
assert 6 == store.getsize()
374-
else:
375-
assert 15 == store.getsize()
369+
assert 15 == store.getsize()
376370
assert 3 == store.getsize("a")
377371
assert 3 == store.getsize("b")
378-
if self.version == 2:
379-
assert 3 == store.getsize("c")
380-
else:
381-
assert 9 == store.getsize("c")
372+
assert 9 == store.getsize("c")
382373
assert 3 == store.getsize("c/d")
383374
assert 6 == store.getsize("c/e")
384375
assert 3 == store.getsize("c/e/f")
@@ -2256,7 +2247,7 @@ def test_getsize():
22562247
store["foo"] = b"aaa"
22572248
store["bar"] = b"bbbb"
22582249
store["baz/quux"] = b"ccccc"
2259-
assert 7 == getsize(store)
2250+
assert 12 == getsize(store)
22602251
assert 5 == getsize(store, "baz")
22612252

22622253
store = KVStore(dict())

0 commit comments

Comments
 (0)