Skip to content

feat: add read/write support #167

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ schema = [
"fastjsonschema",
"importlib-resources; python_version<'3.9'",
]
hdf5 = [
"h5py",
]

[dependency-groups]
docs = [
Expand All @@ -62,6 +65,7 @@ test = [
"boost-histogram>=1.0",
"fastjsonschema",
"importlib-resources; python_version<'3.9'",
"h5py; platform_python_implementation == 'CPython'",
]
dev = [{ include-group = "test"}]

Expand Down Expand Up @@ -89,7 +93,7 @@ warn_unreachable = true
enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"]

[[tool.mypy.overrides]]
module = ["fastjsonschema"]
module = ["fastjsonschema", "h5py"]
ignore_missing_imports = true


Expand Down
20 changes: 20 additions & 0 deletions src/uhi/io/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from __future__ import annotations

__all__ = ["ARRAY_KEYS", "LIST_KEYS"]

# Serialization keys whose values are numeric arrays (stored as HDF5
# datasets / .npy zip members rather than inline JSON).
ARRAY_KEYS = frozenset(
    {
        "values",
        "variances",
        "edges",
        "counts",
        "sum_of_weights",
        "sum_of_weights_squared",
    }
)

# Serialization keys whose values stay plain Python lists (category labels
# may be strings, so they are never converted to numeric arrays).
LIST_KEYS = frozenset({"categories"})
125 changes: 125 additions & 0 deletions src/uhi/io/hdf5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from __future__ import annotations

from typing import Any

import h5py
import numpy as np

from ..typing.serialization import AnyAxis, AnyHistogram, AnyStorage, Histogram
from . import ARRAY_KEYS

__all__ = ["read", "write"]


def __dir__() -> list[str]:
    """Expose only the public API (``read``/``write``) to :func:`dir`."""
    return list(__all__)


def write(grp: h5py.Group, /, histogram: AnyHistogram) -> None:
    """
    Write a histogram to an HDF5 group.

    Layout produced inside ``grp``:

    - ``axes``: a 1D dataset of object references, one per axis, in writing order.
    - ``ref_axes/axis_{i}``: one group per axis holding its scalar fields as
      attributes, plus optional ``edges``/``categories`` datasets and an
      optional ``metadata`` group.
    - ``storage``: a group with a ``type`` attribute and one dataset per
      array-valued storage entry.
    - ``metadata`` (optional): histogram-level metadata stored as attributes.
    """
    # All referenced axis groups are stored inside this "ref_axes" subgroup.
    hist_folder_storage = grp.create_group("ref_axes")

    # Metadata

    if "metadata" in histogram:
        metadata_grp = grp.create_group("metadata")
        for key, val1 in histogram["metadata"].items():
            metadata_grp.attrs[key] = val1

    # Axes: a dataset of HDF5 object references records the axis order.
    axes_dataset = grp.create_dataset(
        "axes", len(histogram["axes"]), dtype=h5py.special_dtype(ref=h5py.Reference)
    )
    for i, axis in enumerate(histogram["axes"]):
        # One group per axis under ref_axes; a reference to the group is
        # stored at position i of `axes_dataset`.
        ax_group = hist_folder_storage.create_group(f"axis_{i}")
        # Copy first so the pops below do not mutate the caller's axis dict.
        ax_info = axis.copy()
        ax_metadata = ax_info.pop("metadata", None)
        ax_edges_raw = ax_info.pop("edges", None)
        ax_edges = np.asarray(ax_edges_raw) if ax_edges_raw is not None else None
        ax_cats: list[int] | list[str] | None = ax_info.pop("categories", None)
        # Remaining scalar fields become attributes on the axis group.
        for key, val2 in ax_info.items():
            ax_group.attrs[key] = val2
        if ax_metadata is not None:
            ax_metadata_grp = ax_group.create_group("metadata")
            for k, v in ax_metadata.items():
                ax_metadata_grp.attrs[k] = v
        if ax_edges is not None:
            ax_group.create_dataset("edges", shape=ax_edges.shape, data=ax_edges)
        if ax_cats is not None:
            ax_group.create_dataset("categories", shape=len(ax_cats), data=ax_cats)
        axes_dataset[i] = ax_group.ref

    # Storage
    storage_grp = grp.create_group("storage")
    storage_type = histogram["storage"]["type"]

    storage_grp.attrs["type"] = storage_type

    # Every storage entry other than "type" is array-like; each becomes a dataset.
    for key, val3 in histogram["storage"].items():
        if key == "type":
            continue
        npvalue = np.asarray(val3)
        storage_grp.create_dataset(key, shape=npvalue.shape, data=npvalue)


def _convert_axes(group: h5py.Group | h5py.Dataset | h5py.Datatype) -> AnyAxis:
    """
    Convert one HDF5 axis group back into an axis dictionary.
    """
    assert isinstance(group, h5py.Group)

    axis: dict[str, Any] = {}
    for attr_name, attr_value in group.attrs.items():
        axis[attr_name] = _convert_item(attr_name, attr_value)

    if "edges" in group:
        edges_ds = group["edges"]
        assert isinstance(edges_ds, h5py.Dataset)
        axis["edges"] = np.asarray(edges_ds)

    if "categories" in group:
        cats_ds = group["categories"]
        assert isinstance(cats_ds, h5py.Dataset)
        axis["categories"] = [_convert_item("", entry) for entry in cats_ds]

    return axis  # type: ignore[return-value]


def _convert_item(name: str, item: Any, /) -> Any:
"""
Convert an HDF5 item to a native Python type.
"""
if isinstance(item, bytes):
return item.decode("utf-8")
if name == "metadata":
return {k: _convert_item("", v) for k, v in item.items()}
if name in ARRAY_KEYS:
return item
if isinstance(item, np.generic):
return item.item()
return item


def read(grp: h5py.Group, /) -> Histogram:
    """
    Read a histogram from an HDF5 group.

    :param grp: The group written by :func:`write`.
    :return: The reconstructed histogram dictionary.
    """
    axes_grp = grp["axes"]
    axes_ref = grp["ref_axes"]
    assert isinstance(axes_ref, h5py.Group)
    assert isinstance(axes_grp, h5py.Dataset)

    # Dereference through the "axes" dataset so axes come back in the order
    # they were written. Iterating the "ref_axes" group directly yields HDF5
    # name order (lexicographic), which misorders 10+ axes since
    # "axis_10" sorts before "axis_2".
    axes = [_convert_axes(grp[axis_ref]) for axis_ref in axes_grp]

    storage_grp = grp["storage"]
    assert isinstance(storage_grp, h5py.Group)
    storage = AnyStorage(type=storage_grp.attrs["type"])
    for key in storage_grp:
        storage[key] = np.asarray(storage_grp[key])  # type: ignore[literal-required]

    histogram_dict = AnyHistogram(axes=axes, storage=storage)
    if "metadata" in grp:
        histogram_dict["metadata"] = _convert_item("metadata", grp["metadata"].attrs)

    return histogram_dict  # type: ignore[return-value]
32 changes: 32 additions & 0 deletions src/uhi/io/json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
from __future__ import annotations

from typing import Any

import numpy as np

from . import ARRAY_KEYS

__all__ = ["default", "object_hook"]


def __dir__() -> list[str]:
    """Expose only the public API (``default``/``object_hook``) to :func:`dir`."""
    return list(__all__)


def default(obj: Any, /) -> Any:
    """
    ``json.dump`` fallback: serialize NumPy arrays as (nested) lists.

    :raises TypeError: For any object that is not a NumPy array.
    """
    if not isinstance(obj, np.ndarray):
        msg = f"Object of type {type(obj)} is not JSON serializable"
        raise TypeError(msg)
    return obj.tolist()  # Convert ndarray to list


def object_hook(dct: dict[str, Any], /) -> dict[str, Any]:
    """
    Decode a histogram from a dictionary.

    Lists stored under known array-valued keys are turned back into
    NumPy arrays; everything else is left as-is.
    """
    for key in dct.keys() & ARRAY_KEYS:
        value = dct[key]
        if isinstance(value, list):
            dct[key] = np.asarray(value)

    return dct
58 changes: 58 additions & 0 deletions src/uhi/io/zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from __future__ import annotations

import json
import zipfile
from typing import Any

import numpy as np

from ..typing.serialization import AnyHistogram, Histogram
from . import ARRAY_KEYS

__all__ = ["read", "write"]


def __dir__() -> list[str]:
    """Expose only the public API (``read``/``write``) to :func:`dir`."""
    return list(__all__)


def write(
    zip_file: zipfile.ZipFile,
    /,
    name: str,
    histogram: AnyHistogram,
) -> None:
    """
    Write a histogram to a zip file.

    Array-valued entries are saved as ``.npy`` members of the zip file and
    referenced by path from the ``{name}.json`` member; everything else is
    stored inline in the JSON document.

    :param zip_file: An open, writable zip file.
    :param name: Base name for all zip members belonging to this histogram.
    :param histogram: The histogram to write (not modified).
    """

    def _extract_arrays(mapping: Any, prefix: str) -> dict[str, Any]:
        # Shallow-copy `mapping`, writing each array-valued entry out as a
        # .npy member and replacing it with that member's path. Copying
        # avoids clobbering the caller's histogram with path strings (the
        # previous version mutated the input in place).
        out = dict(mapping)
        for key in ARRAY_KEYS & out.keys():
            path = f"{prefix}_{key}.npy"
            with zip_file.open(path, "w") as f:
                np.save(f, out[key])
            out[key] = path
        return out

    serializable: dict[str, Any] = dict(histogram)
    serializable["storage"] = _extract_arrays(histogram["storage"], f"{name}_storage")
    # Include the axis index in the member path so two axes that both carry,
    # e.g., "edges" do not collide on the same zip member name (a plain
    # f"{name}_axis_{key}.npy" would make every such axis point at one file).
    serializable["axes"] = [
        _extract_arrays(axis, f"{name}_axis_{i}")
        for i, axis in enumerate(histogram["axes"])
    ]

    zip_file.writestr(f"{name}.json", json.dumps(serializable))


def read(zip_file: zipfile.ZipFile, /, name: str) -> Histogram:
    """
    Read histograms from a zip file.
    """

    def object_hook(dct: dict[str, Any], /) -> dict[str, Any]:
        # Replace stored .npy member paths with the arrays they point at.
        for key in dct.keys() & ARRAY_KEYS:
            stored = dct[key]
            if isinstance(stored, str):
                dct[key] = np.load(zip_file.open(stored))
        return dct

    with zip_file.open(f"{name}.json") as f:
        return json.load(f, object_hook=object_hook)  # type: ignore[no-any-return]
10 changes: 8 additions & 2 deletions src/uhi/resources/histogram.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,21 @@
}
}
},
"ndarray": {
"type": "array",
"items": {
"oneOf": [{ "type": "number" }, { "$ref": "#/$defs/ndarray" }]
},
"description": "An ND (nested) array of numbers."
},
"data_array": {
"oneOf": [
{
"type": "string",
"description": "A path (similar to URI) to the floating point bin data"
},
{
"type": "array",
"items": { "type": "number" }
"$ref": "#/$defs/ndarray"
}
]
},
Expand Down
Loading
Loading