Skip to content

Commit aec277c

Browse files
committed
add schemas
1 parent ab97ddd commit aec277c

17 files changed

+1020
-0
lines changed

src/mdio/schemas/__init__.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
"""MDIO schemas for different data types."""
2+
3+
from mdio.schemas.compressors import ZFP
4+
from mdio.schemas.compressors import Blosc
5+
from mdio.schemas.dimension import NamedDimension
6+
from mdio.schemas.dtype import ScalarType
7+
from mdio.schemas.dtype import StructuredField
8+
from mdio.schemas.dtype import StructuredType
9+
10+
11+
# Names re-exported by this package, grouped by the submodule they come from.
__all__ = [
    # mdio.schemas.compressors
    "Blosc",
    "ZFP",
    # mdio.schemas.dimension
    "NamedDimension",
    # mdio.schemas.dtype
    "ScalarType",
    "StructuredField",
    "StructuredType",
]

src/mdio/schemas/base.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
"""Base models to subclass from."""
2+
3+
from pydantic import ConfigDict
4+
from pydantic import Field
5+
from pydantic.json_schema import GenerateJsonSchema
6+
7+
from mdio.schemas.compressors import ZFP
8+
from mdio.schemas.compressors import Blosc
9+
from mdio.schemas.core import CamelCaseStrictModel
10+
from mdio.schemas.dimension import NamedDimension
11+
from mdio.schemas.dtype import DataTypeModel
12+
13+
14+
# Schema dialect taken from Pydantic's `GenerateJsonSchema` generator class.
JSON_SCHEMA_DIALECT = GenerateJsonSchema.schema_dialect


class BaseDataset(CamelCaseStrictModel):
    """A base class for MDIO datasets.

    We add schema dialect to extend the config of `CamelCaseStrictModel`.
    We use the default Pydantic schema generator `GenerateJsonSchema` to
    define the JSON schema dialect accurately.
    """

    # Extra JSON-schema content: a "$schema" entry carrying the dialect above.
    model_config = ConfigDict(json_schema_extra={"$schema": JSON_SCHEMA_DIALECT})
26+
27+
28+
class BaseArray(DataTypeModel, CamelCaseStrictModel):
    """A base array schema."""

    # Required: either inline dimension definitions or plain dimension names.
    dimensions: list[NamedDimension] | list[str] = Field(
        ...,
        description="List of Dimension collection or reference to dimension names.",
    )

    # Optional: stays None unless a Blosc or ZFP configuration is supplied.
    compressor: Blosc | ZFP | None = Field(
        default=None,
        description="Compression settings.",
    )
37+
38+
39+
class NamedArray(BaseArray):
    """An array with a name."""

    # Required identifier of the array.
    name: str = Field(
        ...,
        description="Name of the array.",
    )

    # Optional human-readable name; defaults to None.
    long_name: str | None = Field(
        default=None,
        description="Fully descriptive name.",
    )

src/mdio/schemas/builder.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
"""Schema builders."""
2+
3+
from __future__ import annotations
4+
5+
from typing import Any
6+
7+
from mdio.schemas import NamedDimension
8+
from mdio.schemas.v1.dataset import Dataset
9+
from mdio.schemas.v1.dataset import DatasetMetadata
10+
from mdio.schemas.v1.variable import Variable
11+
from mdio.schemas.v1.variable import VariableMetadata
12+
13+
14+
class VariableBuilder:
    """Fluent builder for a ``Variable`` model.

    Every ``set_*`` / ``add_*`` method records its input on the builder and
    returns the builder itself so calls can be chained. ``build`` assembles
    the recorded state into a ``Variable``.
    """

    def __init__(self) -> None:
        """Initialize the builder with nothing configured."""
        self.name = None
        self.long_name = None
        self.dtype = None
        self.chunks = None
        self.dims = None
        self.coords = None
        self.compressor = None
        self.meta_dict = None

    def _metadata(self) -> dict[str, Any]:
        """Return the metadata dictionary, creating it on first use."""
        if self.meta_dict is None:
            self.meta_dict = {}
        return self.meta_dict

    def set_name(self, name: str) -> VariableBuilder:
        """Set variable name."""
        self.name = name
        return self

    def set_long_name(self, long_name: str) -> VariableBuilder:
        """Add long, descriptive name to the variable."""
        self.long_name = long_name
        return self

    def set_compressor(self, compressor: dict[str, Any]) -> VariableBuilder:
        """Set the compressor configuration of the variable."""
        self.compressor = compressor
        return self

    def add_dimension(self, *dimensions: str | dict[str, int]) -> VariableBuilder:
        """Add one or more dimensions to the variable.

        Args:
            *dimensions: Dimension names given as strings, or ``{name: size}``
                mappings. Every entry of a mapping becomes a
                ``NamedDimension``.

        Returns:
            The builder, for chaining.

        Raises:
            NotImplementedError: If an argument is neither a string nor a dict.
        """
        # Nothing to add; leave ``dims`` untouched instead of failing on an
        # empty argument tuple.
        if not dimensions:
            return self

        # Validate and convert everything first so a bad argument does not
        # leave the builder partially updated.
        new_dims = []
        for dim in dimensions:
            if isinstance(dim, str):
                new_dims.append(dim)
            elif isinstance(dim, dict):
                new_dims.extend(
                    NamedDimension(name=name, size=size) for name, size in dim.items()
                )
            else:
                raise NotImplementedError

        if self.dims is None:
            self.dims = []
        self.dims.extend(new_dims)
        return self

    def add_coordinate(self, *names: str) -> VariableBuilder:
        """Add one or more coordinate names to the variable."""
        if self.coords is None:
            self.coords = []

        self.coords.extend(names)
        return self

    def set_format(self, format_: str | dict[str, str]) -> VariableBuilder:
        """Set variable format.

        Args:
            format_: A single format string, or a ``{field_name: format}``
                mapping. A mapping is stored as
                ``{"fields": [{"name": ..., "format": ...}, ...]}``.

        Returns:
            The builder, for chaining.
        """
        if isinstance(format_, dict):
            fields = [{"name": n, "format": f} for n, f in format_.items()]
            format_ = {"fields": fields}

        self.dtype = format_
        return self

    def set_chunks(self, chunks: list[int]) -> VariableBuilder:
        """Set variable chunks.

        The chunk shape is written to the ``chunkGrid`` metadata entry and is
        also kept on ``self.chunks`` so ``build`` can compare it with the
        number of dimensions.
        """
        self.chunks = chunks
        self._metadata()["chunkGrid"] = {"configuration": {"chunkShape": chunks}}
        return self

    def set_units(self, units: dict[str, str]) -> VariableBuilder:
        """Set variable units."""
        self._metadata()["unitsV1"] = units
        return self

    def add_attribute(self, key: str, value: Any) -> VariableBuilder:  # noqa: ANN401
        """Add a user attribute to the variable metadata.

        Attributes accumulate across calls; setting an existing key again
        replaces only that key's value.
        """
        self._metadata().setdefault("attributes", {})[key] = value
        return self

    def build(self) -> Variable:
        """Build the variable model.

        Returns:
            A ``Variable`` assembled from the recorded state.

        Raises:
            ValueError: If chunks were set and their count differs from the
                number of dimensions added so far.
        """
        # ``dims`` is still None when no dimension was added; count that as 0.
        if self.chunks is not None and len(self.chunks) != len(self.dims or ()):
            msg = "Variable chunks must have same number of dimensions."
            raise ValueError(msg)

        # Only pass ``metadata`` when something was actually recorded.
        var_kwargs = {}
        if self.meta_dict is not None:
            var_kwargs["metadata"] = VariableMetadata.model_validate(self.meta_dict)

        return Variable(
            name=self.name,
            long_name=self.long_name,
            data_type=self.dtype,
            dimensions=self.dims,
            coordinates=self.coords,
            compressor=self.compressor,
            **var_kwargs,
        )
123+
124+
125+
class DatasetBuilder:
    """Fluent builder for a ``Dataset`` model.

    Every ``set_*`` / ``add_*`` method records its input on the builder and
    returns the builder itself so calls can be chained.
    """

    def __init__(self) -> None:
        """Initialize the builder with no variables, name, or metadata."""
        self.variables = []
        self.name = None
        self.metadata = None

    def set_name(self, name: str) -> DatasetBuilder:
        """Set dataset name."""
        self.name = name
        return self

    def add_variable(self, variable: Variable) -> DatasetBuilder:
        """Add a variable to the dataset."""
        self.variables.append(variable)
        return self

    def add_variables(self, variables: list[Variable]) -> DatasetBuilder:
        """Add multiple variables to the dataset, preserving their order."""
        # Plain loop: the calls are made for their side effect only, so there
        # is no reason to build a throwaway list of return values.
        for variable in variables:
            self.add_variable(variable)
        return self

    def set_metadata(self, metadata: DatasetMetadata) -> DatasetBuilder:
        """Set the metadata of the dataset."""
        self.metadata = metadata
        return self

    def build(self) -> Dataset:
        """Build the dataset model from the recorded variables and metadata."""
        # NOTE(review): ``self.name`` is recorded by ``set_name`` but is not
        # passed to ``Dataset`` here - confirm whether it should be.
        return Dataset(variables=self.variables, metadata=self.metadata)

src/mdio/schemas/chunk_grid.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
"""This module contains data models for Zarr's chunk grid."""
2+
3+
from __future__ import annotations
4+
5+
from pydantic import Field
6+
7+
from mdio.schemas.core import CamelCaseStrictModel
8+
9+
10+
class RegularChunkShape(CamelCaseStrictModel):
    """Represents regular chunk sizes along each dimension."""

    # Required: a flat list with one integer per array dimension.
    chunk_shape: list[int] = Field(
        ...,
        description="Lengths of the chunk along each dimension of the array.",
    )
16+
17+
18+
class RectilinearChunkShape(CamelCaseStrictModel):
    """Represents irregular chunk sizes along each dimension."""

    # Required: a list of integer lists, unlike the flat list used by
    # RegularChunkShape.
    chunk_shape: list[list[int]] = Field(
        ..., description="Lengths of the chunk along each dimension of the array."
    )
25+
26+
27+
class RegularChunkGrid(CamelCaseStrictModel):
    """Represents a rectangular and regularly spaced chunk grid."""

    # Defaults to "regular" when not supplied.
    name: str = Field(
        default="regular",
        description="The name of the chunk grid.",
    )

    # Required: the regular chunk shape for this grid.
    configuration: RegularChunkShape = Field(
        ...,
        description="Configuration of the regular chunk grid.",
    )
35+
36+
37+
class RectilinearChunkGrid(CamelCaseStrictModel):
    """Represents a rectangular and irregularly spaced chunk grid."""

    # Defaults to "rectilinear" when not supplied.
    name: str = Field(
        default="rectilinear",
        description="The name of the chunk grid.",
    )

    # Required: the per-dimension chunk lengths for this grid.
    configuration: RectilinearChunkShape = Field(
        ...,
        description="Configuration of the irregular chunk grid.",
    )

0 commit comments

Comments
 (0)