From 9dd9fbc88cadd4e23a0ddfd6cc1689db8aa26c17 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 24 Jun 2025 20:17:55 +0000 Subject: [PATCH 01/27] schema_v1-dataset_builder-add_dimension --- .devcontainer/Dockerfile.cli | 47 +++ .devcontainer/Dockerfile.dev | 61 ++++ .devcontainer/Dockerfile.nox | 49 +++ .devcontainer/devcontainer.json | 19 +- DEVELOPER_NOTES.md | 107 ++++++ src/mdio/schemas/v1/dataset_builder.py | 195 +++++++++++ tests/unit/v1/__init__.py | 1 + .../v1/test_dataset_builder_add_dimension.py | 307 ++++++++++++++++++ .../unit/v1/test_dataset_builder_internals.py | 208 ++++++++++++ tests/unit/v1/test_dataset_builder_state.py | 88 +++++ 10 files changed, 1073 insertions(+), 9 deletions(-) create mode 100644 .devcontainer/Dockerfile.cli create mode 100644 .devcontainer/Dockerfile.dev create mode 100644 .devcontainer/Dockerfile.nox create mode 100644 DEVELOPER_NOTES.md create mode 100644 src/mdio/schemas/v1/dataset_builder.py create mode 100644 tests/unit/v1/__init__.py create mode 100644 tests/unit/v1/test_dataset_builder_add_dimension.py create mode 100644 tests/unit/v1/test_dataset_builder_internals.py create mode 100644 tests/unit/v1/test_dataset_builder_state.py diff --git a/.devcontainer/Dockerfile.cli b/.devcontainer/Dockerfile.cli new file mode 100644 index 00000000..92720e34 --- /dev/null +++ b/.devcontainer/Dockerfile.cli @@ -0,0 +1,47 @@ +# HOW TO BUILD AND RUN THIS DOCKERFILE +# * Clone mdio-python and build a Docker image: +# git clone https://github.com/TGSAI/mdio-python.git +# cd mdio-python +# docker build -t mdio-cli -f .devcontainer/Dockerfile.cli . +# * Run /bin/bash in the Docker container: +# +# +# USAGE: +# docker run -it --rm --name mdio-cli mdio-cli --version +# docker run -it --rm --name mdio-cli mdio-cli --help +# +# LOCAL_DATA_DIR=$(pwd); \ +# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ +# segy import \ +# /DATA/segy_file.segy \ +# /DATA/mdio_file.mdio \ +# -loc 181,185 \ +# -names inline,crossline +# +# LOCAL_DATA_DIR=$(pwd); \ +# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ +# segy export \ +# /DATA/mdio_file.mdio \ +# /DATA/segy_file_copy.segy +# +FROM python:3.13-bookworm +# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) +ENV USERNAME=python +ENV USER_UID=1000 +ENV USER_GID=$USER_UID +RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME + +# Set the default non-root user +USER $USERNAME + +# Add path to the user-installed packages +ENV PYTHONUSERBASE=/home/$USERNAME/.local +ENV PATH="$PYTHONUSERBASE/bin:$PATH" + +COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python + +WORKDIR /home/$USERNAME/mdio-python +RUN pip install . + +ENTRYPOINT ["mdio"] +CMD ["--version"] diff --git a/.devcontainer/Dockerfile.dev b/.devcontainer/Dockerfile.dev new file mode 100644 index 00000000..05f13579 --- /dev/null +++ b/.devcontainer/Dockerfile.dev @@ -0,0 +1,61 @@ +# USAGE: +# This file will be used by the VS Code DevContainer extension +# to create a development environment for the mdio-python project. +# HOW TO RUN TESTS +# 1. Open the project in VS Code. +# 2. Open the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container". +# 3. Once the container is running, open a terminal in VS Code. +# 4. Run the tests using the command: `nox -s test`. +# HOW TO MANUALLY BUILD AND RUN THE CONTAINER +# docker build -t mdio-dev -f .devcontainer/Dockerfile.dev . 
+# docker run -it --rm --entrypoint /bin/bash --name mdio-dev mdio-dev +# NOTES: +# 1. The container will be run as the non-root user 'vscode' with UID 1000. +# 2. The virtual environment will be setup at /home/vscode/venv +# 3. The project source code will be mounted at /workspaces/mdio-python +ARG PYTHON_VERSION="3.13" +ARG LINUX_DISTRO="bookworm" +ARG UV_VERSION="0.6.11" +ARG NOX_VERSION="2025.2.9" +FROM mcr.microsoft.com/devcontainers/python:1-${PYTHON_VERSION}-${LINUX_DISTRO} + +# Install git for nox pre-commit +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* + +ENV USERNAME="vscode" +USER $USERNAME + +# # Add path to the user-installed packages +# ENV PYTHONUSERBASE=/home/$USERNAME/.local +# ENV PATH="$PYTHONUSERBASE/bin:$PATH" + +COPY --chown=$USERNAME:$USERNAME ./ /workspaces/mdio-python + +WORKDIR /workspaces/mdio-python + +ARG UV_VERSION +ARG NOX_VERSION +RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel + +# Initialize virtual environement in the container +ENV VIRTUAL_ENV="/home/$USERNAME/venv" +RUN python3 -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# installing pytest is required for VS Code Python Testing +RUN pip install pytest pytest-cov pytest-mock pytest-asyncio + +# Install the project in editable mode +# This allows for live reloading of the code during development +RUN pip install -e . + +# RUN uv pip install snakeviz + + + + + + diff --git a/.devcontainer/Dockerfile.nox b/.devcontainer/Dockerfile.nox new file mode 100644 index 00000000..103673fd --- /dev/null +++ b/.devcontainer/Dockerfile.nox @@ -0,0 +1,49 @@ +# HOW TO BUILD AND RUN THIS DOCKERFILE +# 1. Make sure you have Docker installed and running. +# 2. Clone mdio-python and build the Docker image: +# git clone https://github.com/TGSAI/mdio-python.git +# cd mdio-python +# docker build -t mdio-nox -f .devcontainer/Dockerfile.nox . +# 3. Run /bin/bash in the Docker container : +# LOCAL_DATA_DIR=$(pwd); \ +# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --entrypoint /bin/bash --name mdio-nox mdio-nox +# +# USAGE: +# docker run -it --rm mdio-nox --list +# docker run -it --rm mdio-nox -s tests-3.13 +# docker run -it --rm mdio-nox --no-stop-on-first-error +# +# NOTE: nox will fail if run in the directory mounted from the host machine +ARG PYTHON_VERSION="3.13" +ARG LINUX_DISTRO="bookworm" +ARG UV_VERSION="0.6.11" +ARG NOX_VERSION="2025.2.9" +FROM python:${PYTHON_VERSION}-${LINUX_DISTRO} +ARG PYTHON_VERSION +ARG LINUX_DISTRO +RUN echo "Using python:${PYTHON_VERSION}-${LINUX_DISTRO}" +# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) +ENV USERNAME=python +ENV USER_UID=1000 +ENV USER_GID=$USER_UID +RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME +# Set the default non-root user +USER $USERNAME + +# Add path to the user-installed packages +ENV PYTHONUSERBASE=/home/$USERNAME/.local +ENV PATH="$PYTHONUSERBASE/bin:$PATH" + +COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python + +WORKDIR /home/$USERNAME/mdio-python +RUN pip install . 
+ +# Install UV dependency manager and Nox test automator +ARG UV_VERSION +ARG NOX_VERSION +RUN echo "Using uv: $UV_VERSION and nox: $NOX_VERSION" +RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel + +ENTRYPOINT ["nox"] +CMD ["--list"] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b618a526..ea5dc99e 100755 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,12 +2,12 @@ // README at: https://github.com/devcontainers/templates/tree/main/src/python { "build": { - "dockerfile": "Dockerfile", + "dockerfile": "Dockerfile.dev", "context": ".." }, // Use 'postCreateCommand' to run commands after the container is created. "postCreateCommand": { - "post_create_script": "bash ./.devcontainer/post-install.sh" + // "post_create_script": "bash ./.devcontainer/post-install.sh" }, // Forward 8787 to enable us to view dask dashboard "forwardPorts": [8787], @@ -16,8 +16,9 @@ // Configure properties specific to VS Code. "vscode": { "settings": { - "python.terminal.activateEnvInCurrentTerminal": true, - "python.defaultInterpreterPath": "/opt/venv/bin/python" + "python.testing.pytestArgs": ["tests"], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true }, "extensions": [ "ms-python.python", @@ -27,17 +28,17 @@ "ms-toolsai.jupyter-renderers", "vscode-icons-team.vscode-icons", "wayou.vscode-todo-highlight", - "streetsidesoftware.code-spell-checker" + "streetsidesoftware.code-spell-checker", + "eamodio.gitlens" ] } }, // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root", "updateRemoteUserUID": true, + "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/mdio-python,type=bind", + "workspaceFolder": "/workspaces/mdio-python", "mounts": [ - // Re-use local Git configuration - "source=${localEnv:HOME}/.gitconfig,target=/home/vscode/.gitconfig_tmp,type=bind,consistency=cached", - "source=${localEnv:HOME}/.gitconfig,target=/root/.gitconfig_tmp,type=bind,consistency=cached", - "source=${localEnv:SCRATCH_DIR}/${localEnv:USER},target=/scratch/,type=bind,consistency=cached" + // "source=${localWorkspaceFolder}/../DATA/,target=/DATA/,type=bind,consistency=cached" ] } diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md new file mode 100644 index 00000000..7dcda95d --- /dev/null +++ b/DEVELOPER_NOTES.md @@ -0,0 +1,107 @@ +# Developer Notes +What are the goals for MDIO v1: + +## Overall API design and implementation +1. Do we want to have a strongly-typed (see pydantic) or dynamic-typed (see dictionary args) API? 
   For example:
```Python
    # Strongly typed
    builder.add_dimension(
        "length",
        size=100,
        data_type=ScalarType.FLOAT32,
        metadata=[
            AllUnits(units_v1=LengthUnitModel(
                length=LengthUnitEnum.FOOT)),
            UserAttributes(
                attributes={"MGA": 51, "UnitSystem": "Imperial"}),
            ChunkGridMetadata(
                chunk_grid=RegularChunkGrid(
                    configuration=RegularChunkShape(
                        chunk_shape=[20]))),
            StatisticsMetadata(stats_v1=SummaryStatistics(
                count=100,
                sum=1215.1,
                sumSquares=125.12,
                min=5.61,
                max=10.84,
                histogram=CenteredBinHistogram(
                    binCenters=[1, 2],
                    counts=[10, 15])))
        ]
    )

    # Dynamically typed
    builder.add_dimension(
        "depth",
        size=100,
        data_type=ScalarType.FLOAT32,
        metadata={
            "unitsV1": {"length": "m"},
            "attributes": {"MGA": 51},
            "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}},
            "statsV1": {
                "count": 100,
                "sum": 1215.1,
                "sumSquares": 125.12,
                "min": 5.61,
                "max": 10.84,
                "histogram": {"binCenters": [1, 2], "counts": [10, 15]},
            },
        },
    )
```
2. How extensive should the handling of edge cases and invalid arguments be? This affects the amount of validation code that needs to be written.
   For example:
   * Should we validate in the code that the units list contains a single item for the dimension's units,
     or should we expect the developer to always pass a single-item list?
   * Should we test the statistics for count > 0 or dim(binCenter) == dim(count) in the case above?

## V1 Schema questions
1. Why do we allow default / empty names?
2. Adding a dimension with the same name multiple times: is it allowed or should it raise an error?
   * It is currently allowed: the second request is ignored
   * Adding a dimension with the same name but a different size currently throws an error
3. Why do we allow methods with dictionary parameters (non-strongly-typed)?
4. For add_dimension():
   * Can AllUnits / UserAttributes / ChunkGridMetadata / StatisticsMetadata be repeated in the metadata list?
   * For the units, chunkGrid, and statsV1 dicts, should we validate the structure of the data passed in?
   * Do we validate the unit string supplied in dictionary parameters? What if someone supplies ftUS instead of ft?
   * Are multiple dimension attributes allowed (I assume yes)?
5. It is not clear how RectilinearChunkGrid can be mapped to a single dimension:
   ```RectilinearChunkGrid(configuration=RectilinearChunkShape(chunk_shape=[[2,3,4],[2,3,4]]))```
6. StatisticsMetadata accepts list[SummaryStatistics]. What does this mean, and does it need to be tested?
7. The pydantic attribute names are different from the v1 schema attribute names:
   'statsV1' <-> 'stats_v1', 'unitsV1' <-> 'units_v1', 'chunkGrid' <-> 'chunk_grid'
   Thus, we will pass `units_v1` if we use the typesafe API and `unitsV1` if we use the dictionary API.
8. Can we add two variables with the same name?
9. Why does the histogram (e.g., SummaryStatistics) not have a `histogram_type` attribute?
10. Why is 'ftUS' not supported by the schema?
    Units: which foot does MDIO use, the U.S. survey foot or the international foot?
    The U.S. survey foot is defined as 1200/3937 meters, while the international foot is defined as exactly 0.3048 meters.
    https://www.axiomint.com/survey-foot-versus-international-foot-whats-the-difference/
    "The REAL issue is when ... applied to State Plane coordinates in the N2,000,000 and E6,000,000 range!
    This ... moves a State Plane coordinate position 4 feet by 12 feet." (See the sketch below.)
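
The quoted "4 feet by 12 feet" figure can be sanity-checked with a few lines of plain Python. This is a standalone sketch, not part of the MDIO API; the constant and function names here are ours:

```Python
# The two foot definitions are exact and differ by about 2 parts per million.
US_SURVEY_FOOT = 1200 / 3937  # meters per U.S. survey foot
INTERNATIONAL_FOOT = 0.3048  # meters per international foot


def misread_shift_ft(coordinate_ft: float) -> float:
    """Positional error (in ft) when survey-foot coordinates are read as international feet."""
    return coordinate_ft * (US_SURVEY_FOOT - INTERNATIONAL_FOOT) / INTERNATIONAL_FOOT


print(round(misread_shift_ft(2_000_000), 1))  # ~4 ft at a N2,000,000 northing
print(round(misread_shift_ft(6_000_000), 1))  # ~12 ft at an E6,000,000 easting
```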
## Unclear
* Did we have a notion of a fixed increment for inline & xline annotations?
* How is rotation of the East/North axes relative to the inline/xline axes handled?
* How are right-handed and left-handed surveys handled?
* add_variable - should the dimensions argument be required?

(src/mdio/schemas/v1/dataset_builder.py)
## Design suggestions
1. Instead of trying to track the state, should we just return a wrapper/pimpl class with the permitted methods?
2. Should we rename add_dimension to add_dimension_variable / add_dimension_annotation to indicate that we are not just
   providing the dimension name, but also creating the dimension variable?
3. add_variable - should we call it `append_variable`? `add` implies that either a name or an index must be provided.

## Under construction
* TODO: ??? refactor _BuilderState to make inner class ???
* TODO: Need an example of EdgeDefinedHistogram for add_dimension with histogram

## Bugs
1. I assume we do not want attribute.attribute in the contract (see docs/tutorials/builder.ipynb):
   'metadata': {'unitsV1': {'length': 'm'}, 'attributes': {'attributes': {'MGA': 51}}}

https://osdu.pages.opengroup.org/platform/domain-data-mgmt-services/seismic/open-vds/vds/specification/Metadata.html
\ No newline at end of file
diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py
new file mode 100644
index 00000000..ea431e4c
--- /dev/null
+++ b/src/mdio/schemas/v1/dataset_builder.py
@@ -0,0 +1,195 @@
"""Builder pattern implementation for MDIO v1 schema models."""

from collections.abc import Mapping
from datetime import UTC
from datetime import datetime
from enum import Enum
from enum import auto
from typing import Any

from pydantic import BaseModel
from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding  # noqa: F401

from mdio.schemas.compressors import ZFP
from mdio.schemas.compressors import Blosc
from mdio.schemas.dimension import NamedDimension
from mdio.schemas.dtype import ScalarType, StructuredType
from mdio.schemas.metadata import ChunkGridMetadata, UserAttributes
from mdio.schemas.v1.dataset import Dataset
from mdio.schemas.v1.stats import StatisticsMetadata, SummaryStatistics
from mdio.schemas.v1.units import AllUnitModel, AllUnits
from mdio.schemas.v1.variable import Coordinate, Variable, VariableMetadata

# TODO: Why do we allow default names?
# TODO: Instead of trying to track the state, should we just return a wrapper class with permitted methods?
# TODO: refactor to make inner class
class _BuilderState(Enum):
    """States for the template builder."""

    INITIAL = auto()
    HAS_DIMENSIONS = auto()
    HAS_COORDINATES = auto()
    HAS_VARIABLES = auto()

def contains_dimension(dimensions: list[NamedDimension], name_or_NamedDimension: str | NamedDimension) -> bool:
    """Check if a dimension with the given name exists in the list."""
    if isinstance(name_or_NamedDimension, str):
        name = name_or_NamedDimension
        return get_dimension(dimensions, name) is not None
    elif isinstance(name_or_NamedDimension, NamedDimension):
        named_dimension = name_or_NamedDimension
        return get_dimension(dimensions, named_dimension.name, named_dimension.size) is not None
    else:
        msg = f"Expected str or NamedDimension, got {type(name_or_NamedDimension).__name__}"
        raise TypeError(msg)

def get_dimension(dimensions: list[NamedDimension], name: str, size: int | None = None) -> NamedDimension | None:
    """Get a dimension by name from the list."""
    if dimensions is None:
        return None  # match the declared NamedDimension | None return type
    if not isinstance(name, str):
        raise TypeError(f"Expected str, got {type(name).__name__}")

    nd = next((dim for dim in dimensions if dim.name == name), None)
    if nd is None:
        return None
    if size is not None and nd.size != size:
        msg = f"Dimension {name!r} found but size {nd.size} does not match expected size {size}"
        raise ValueError(msg)
    return nd

def to_dictionary(val: BaseModel) -> dict[str, Any]:
    """Convert a pydantic BaseModel to a dictionary."""
    if not isinstance(val, BaseModel):
        raise TypeError(f"Expected BaseModel, got {type(val).__name__}")
    return val.model_dump(mode="json", by_alias=True)

class MDIODatasetBuilder:
    """Builder for creating MDIO datasets with enforced build order.

    This builder implements the builder pattern to create MDIO datasets with a v1 schema.
    It enforces a specific build order to ensure valid dataset construction:
    1. Must add dimensions first via add_dimension()
    2. Can optionally add coordinates via add_coordinate()
    3. Must add variables via add_variable()
    4. Must call build() to create the dataset.
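
    Example (hypothetical usage; only add_dimension() is implemented in this
    commit, the remaining build steps are the intended shape of the API):

        builder = MDIODatasetBuilder("test_dataset")
        builder.add_dimension("x", 100)
        builder.add_dimension("y", 200, data_type=ScalarType.UINT32)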
+ """ + + def __init__(self, name: str, attributes: dict[str, Any] | None = None): + self.name = name + self.api_version = "1.0.0" # TODO(BrianMichell, #0): Pull from package metadata + self.created_on = datetime.now(UTC) + self.attributes = attributes + self._dimensions: list[NamedDimension] = [] + self._coordinates: list[Coordinate] = [] + self._variables: list[Variable] = [] + self._state = _BuilderState.INITIAL + self._unnamed_variable_counter = 0 + + + def _add_named_dimensions(self, dimensions: list[NamedDimension | str] | None) -> list[NamedDimension]: + if dimensions is None: + return [] + + added_dims = [] + for dim in dimensions: + if isinstance(dim, str): + if not contains_dimension(self._dimensions, dim): + raise ValueError(f"Dimension named {dim!r} is not found") + else: + if not isinstance(dim, NamedDimension): + raise TypeError(f"Expected NamedDimension or str, got {type(dim).__name__}") + if contains_dimension(self._dimensions, dim): + continue + self._dimensions.append(dim) + added_dims.append(dim) + return added_dims + + + def _make_VariableMetadata_from_list(metadata: list[AllUnits | UserAttributes]) -> Any: + metadata_dict = {} + for md in metadata: + # NOTE: the pydentic attribute names are different from the v1 schema attributes names + # 'statsV1' <-> 'stats_v1', 'unitsV1' <-> 'units_v1', 'chunkGrid' <-> 'chunk_grid' + if isinstance(md, AllUnits): + val = md.units_v1 + metadata_dict["unitsV1"] = to_dictionary(val) + elif isinstance(md, UserAttributes): + # NOTE: md.attributes is not pydantic type, but a dictionary + metadata_dict["attributes"] = to_dictionary(md)["attributes"] + elif isinstance(md, ChunkGridMetadata): + val = md.chunk_grid + metadata_dict["chunkGrid"] = to_dictionary(val) + elif isinstance(md, StatisticsMetadata): + val = md.stats_v1 + metadata_dict["statsV1"] = to_dictionary(val) + else: + raise TypeError(f"Unsupported metadata type: {type(md)}") + return VariableMetadata(**metadata_dict) + + + def _make_VariableMetadata_from_dict(metadata: dict[str, Any]) -> type[BaseModel]: + converted_dict = {} + for key, value in metadata.items(): + if key == "unitsV1" or key == "statsV1" or key == "chunkGrid" or key == "attributes": + # TODO: Should we validate the structure of the value passed in? + if not isinstance(value, dict): + raise TypeError(f"Invalid value for key '{key}': {value!r}. Expected a dictionary.") + else: + raise TypeError(f"Unsupported metadata key: '{key}'. Expected 'unitsV1', 'attributes', 'chunkGrid', or 'statsV1.") + converted_dict[key] = value + return VariableMetadata(**converted_dict) + + + def _make_VariableMetadata(metadata: list[AllUnits | UserAttributes] | dict[str, Any] | None = None) -> Any | None: + if metadata is None: + return None + + if isinstance(metadata, list): + return MDIODatasetBuilder._make_VariableMetadata_from_list(metadata) + + if isinstance(metadata, dict): + return MDIODatasetBuilder._make_VariableMetadata_from_dict(metadata) + + raise TypeError(f"Unsupported metadata type: {type(metadata)}") + + + def add_dimension( # noqa: PLR0913 + self, + name: str, + size: int, + long_name: str = None, + data_type: ScalarType | StructuredType = ScalarType.INT32, + metadata: list[AllUnits | UserAttributes] | None | dict[str, Any] = None, + ) -> "MDIODatasetBuilder": + """Add a dimension. + + This must be called at least once before adding coordinates or variables. 
+ + Args: + name: Name of the dimension + size: Size of the dimension + long_name: Optional long name for the dimension variable + data_type: Data type for the dimension variable (defaults to INT32) + metadata: Optional metadata for the dimension variable + + Returns: + self: Returns self for method chaining + """ + + added_dims = self._add_named_dimensions([NamedDimension(name=name, size=size)]) + if added_dims: + # Create a variable for the dimension + dim_var = Variable( + name=name, + longName=long_name, + dimensions=added_dims, + dataType=data_type, + compressor=None, + coordinates=None, + metadata=MDIODatasetBuilder._make_VariableMetadata(metadata) + ) + self._variables.append(dim_var) + + self._state = _BuilderState.HAS_DIMENSIONS + return self diff --git a/tests/unit/v1/__init__.py b/tests/unit/v1/__init__.py new file mode 100644 index 00000000..3db3a8e5 --- /dev/null +++ b/tests/unit/v1/__init__.py @@ -0,0 +1 @@ +"""Unit tests for parts of the MDIO package related to the v1 schema""" \ No newline at end of file diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py new file mode 100644 index 00000000..bac9a838 --- /dev/null +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -0,0 +1,307 @@ + +import pytest + +from datetime import datetime + +from mdio.schemas.chunk_grid import RectilinearChunkGrid, RectilinearChunkShape, RegularChunkGrid, RegularChunkShape +from mdio.schemas.dtype import ScalarType, StructuredType + +from mdio.schemas.dimension import NamedDimension +from mdio.schemas.metadata import ChunkGridMetadata, UserAttributes +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, contains_dimension, get_dimension, _BuilderState +from mdio.schemas.v1.stats import CenteredBinHistogram, Histogram, StatisticsMetadata, SummaryStatistics +from mdio.schemas.v1.units import AllUnits, LengthUnitEnum, LengthUnitModel + +def test_add_dimension() -> None: + """Test adding a dimension to the dataset builder.""" + builder = MDIODatasetBuilder("Test Dataset Builder") + + builder.add_dimension(name="inline", size=2, long_name="Inline dimension") + assert len(builder._dimensions) == 1 + assert builder._dimensions[0] == NamedDimension(name="inline", size=2) + assert len(builder._variables) == 1 + assert builder._state == _BuilderState.HAS_DIMENSIONS + + +def test_add_dimension() -> None: + """Test dimension builder state transitions and functionality.""" + builder = MDIODatasetBuilder("test_dataset") + + # First dimension should change state to HAS_DIMENSIONS and create a variable + builder.add_dimension("x", 100, long_name="X Dimension") + assert builder._state == _BuilderState.HAS_DIMENSIONS + assert len(builder._dimensions) == 1 # noqa: PLR2004 + assert len(builder._variables) == 1 # noqa: PLR2004 + assert builder._dimensions[0].name == "x" + assert builder._dimensions[0].size == 100 # noqa: PLR2004 + var0 = builder._variables[0] + assert var0.name == "x" + assert var0.long_name == "X Dimension" + assert var0.data_type == ScalarType.INT32 + assert var0.dimensions[0].name == "x" + + # Adding another dimension should maintain state and create another variable + builder.add_dimension("y", 200, data_type=ScalarType.UINT32) + assert builder._state == _BuilderState.HAS_DIMENSIONS + assert len(builder._dimensions) == 2 # noqa: PLR2004 + assert len(builder._variables) == 2 # noqa: PLR2004 + assert builder._dimensions[1].name == "y" + assert builder._dimensions[1].size == 200 # noqa: PLR2004 + var1 = 
builder._variables[1] + assert var1.name == "y" + assert var1.data_type == ScalarType.UINT32 + assert var1.dimensions[0].name == "y" + + # TODO: Adding a dimension with the same: is allowed allowed (currently ignored) or should have raise an error? + builder.add_dimension("x", 100, long_name="X Dimension") + assert builder._state == _BuilderState.HAS_DIMENSIONS + assert len(builder._dimensions) == 2 # noqa: PLR2004 + assert len(builder._variables) == 2 # noqa: PLR2004 + + # Adding a dimension with the same name and different size throws an error + with pytest.raises(ValueError, match="Dimension 'x' found but size 100 does not match expected size 200"): + builder.add_dimension("x", 200, long_name="X Dimension") + assert builder._state == _BuilderState.HAS_DIMENSIONS + + + +def test_add_dimension_with_units() -> None: + """Test adding dimensions with units.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with units as a dictionary + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={"unitsV1": {"length": "m"}}, + ) + assert len(builder._variables) == 1 + var0 = builder._variables[0] + assert var0.name == "depth" + assert var0.data_type == ScalarType.FLOAT32 + assert var0.metadata.units_v1.length == "m" + + # Add dimension with strongly-typed unit list of single-item + builder.add_dimension( + "length", + size=100, + data_type=ScalarType.FLOAT64, + metadata=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT))], + ) + assert len(builder._variables) == 2 + var1 = builder._variables[1] + assert var1.name == "length" + assert var1.data_type == ScalarType.FLOAT64 + assert var1.metadata.units_v1.length == "ft" + + +def test_add_dimension_with_attributes() -> None: + """Test adding dimensions with attributes.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with attributes as dictionary + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={"attributes": {"MGA": 51, "UnitSystem": "Imperial"}} + ) + assert len(builder._variables) == 1 + var0 = builder._variables[0] + assert var0.name == "depth" + assert var0.data_type == ScalarType.FLOAT32 + assert var0.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert var0.metadata.attributes["UnitSystem"] == "Imperial" + + # Add dimension with strongly-typed attribute list + builder.add_dimension( + "length", + size=100, + data_type=ScalarType.FLOAT32, + metadata=[UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})] + ) + assert len(builder._variables) == 2 + var1 = builder._variables[1] + assert var1.name == "length" + assert var1.data_type == ScalarType.FLOAT32 + assert var1.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert var1.metadata.attributes["UnitSystem"] == "Imperial" + + +def test_add_dimension_with_chunk_grid() -> None: + """Test adding dimensions with chunk grid.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with chunk grid as dictionary + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={"chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}}, + ) + assert len(builder._variables) == 1 + var0 = builder._variables[0] + assert var0.name == "depth" + assert var0.data_type == ScalarType.FLOAT32 + assert var0.metadata.chunk_grid.name == "regular" + assert var0.metadata.chunk_grid.configuration.chunk_shape == [20] + + + # Add dimension with strongly-typed chunk grid + # TODO: It is not clear, how RectilinearChunkGrid can 
be mapped to a single dimension + # grid_definition = RectilinearChunkGrid(configuration=RectilinearChunkShape(chunk_shape=[[2,3,4],[2,3,4]])) + grid_definition = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) + builder.add_dimension( + "length", + size=100, + data_type=ScalarType.FLOAT32, + metadata=[ChunkGridMetadata(chunk_grid=grid_definition)] + ) + assert len(builder._variables) == 2 + var1 = builder._variables[1] + assert var1.name == "length" + assert var1.data_type == ScalarType.FLOAT32 + assert var1.metadata.chunk_grid.name == "regular" + assert var1.metadata.chunk_grid.configuration.chunk_shape == [20] + +def test_add_dimension_with_stats() -> None: + """Test adding dimensions with stats.""" + builder = MDIODatasetBuilder("test_dataset") + + # TODO: Are multiple statistic object supported? + # TODO: StatisticsMetadata accepts list[SummaryStatistics], what does this mean and does it need to be tested? + + # TODO: What is the proper spelling 'statsV1' or 'stats_v1'? Needs to be documented. + + # Add dimension with strongly-typed stats + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata=[StatisticsMetadata(stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + # TODO: Also test EdgeDefinedHistogram + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]) + ))] + ) + assert len(builder._variables) == 1 + var0 = builder._variables[0] + assert var0.name == "depth" + assert var0.data_type == ScalarType.FLOAT32 + assert var0.metadata.stats_v1.count == 100 # noqa: PLR2004 + assert var0.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 + + # Add dimension with dictionary stats + builder.add_dimension( + "length", + size=100, + data_type=ScalarType.FLOAT32, + metadata={ + "statsV1": { + "count": 100, + "sum": 1215.1, + "sumSquares": 125.12, + "min": 5.61, + "max": 10.84, + "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, + } + }, + ) + assert len(builder._variables) == 2 + var1 = builder._variables[1] + assert var1.name == "length" + assert var1.data_type == ScalarType.FLOAT32 + assert var1.metadata.stats_v1.count == 100 # noqa: PLR2004 + assert var1.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 + +def test_add_dimension_with_full_metadata() -> None: + """Test adding dimensions with all metadata.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with all metadata as dictionary + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={ + "unitsV1": {"length": "m"}, + "attributes": {"MGA": 51}, + "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}, + "statsV1": { + "count": 100, + "sum": 1215.1, + "sumSquares": 125.12, + "min": 5.61, + "max": 10.84, + "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, + }, + }, + ) + + assert len(builder._variables) == 1 + var0 = builder._variables[0] + assert var0.name == "depth" + assert var0.data_type == ScalarType.FLOAT32 + assert var0.metadata.units_v1.length == "m" + assert var0.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert var0.metadata.chunk_grid.name == "regular" + assert var0.metadata.chunk_grid.configuration.chunk_shape == [20] # noqa: PLR2004 + assert var0.metadata.stats_v1.count == 100 # noqa: PLR2004 + assert var0.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 + assert var0.metadata.stats_v1.sum_squares == 125.12 # noqa: PLR2004 + assert var0.metadata.stats_v1.min == 5.61 # noqa: PLR2004 + assert var0.metadata.stats_v1.max == 
10.84 # noqa: PLR2004 + assert var0.metadata.stats_v1.histogram.bin_centers == [1, 2] # noqa: PLR2004 + assert var0.metadata.stats_v1.histogram.counts == [10, 15] # noqa: PLR2004 + + # Add dimension with all strongly-typed metadata + builder.add_dimension( + "length", + size=100, + data_type=ScalarType.FLOAT32, + metadata=[ + AllUnits(units_v1=LengthUnitModel( + length=LengthUnitEnum.FOOT)), + UserAttributes( + attributes={"MGA": 51, "UnitSystem": "Imperial"}), + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape( + chunk_shape=[20]))), + StatisticsMetadata(stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram( + binCenters=[1, 2], + counts=[10, 15]))) + ] + ) + + assert len(builder._variables) == 2 + var1 = builder._variables[1] + assert var1.name == "length" + assert var1.data_type == ScalarType.FLOAT32 + assert var1.metadata.units_v1.length == "ft" + assert var1.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert var1.metadata.attributes["UnitSystem"] == "Imperial" # noqa: PLR2004 + assert var1.metadata.chunk_grid.name == "regular" + assert var1.metadata.chunk_grid.configuration.chunk_shape == [20] # noqa: PLR2004 + assert var1.metadata.stats_v1.count == 100 # noqa: PLR2004 + assert var1.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 + assert var1.metadata.stats_v1.sum_squares == 125.12 # noqa: PLR2004 + assert var1.metadata.stats_v1.min == 5.61 # noqa: PLR2004 + assert var1.metadata.stats_v1.max == 10.84 # noqa: PLR2004 + assert var1.metadata.stats_v1.histogram.bin_centers == [1, 2] # noqa: PLR2004 + assert var1.metadata.stats_v1.histogram.counts == [10, 15] # noqa: PLR2004 + + + # j = builder.build().json() + # print(j) \ No newline at end of file diff --git a/tests/unit/v1/test_dataset_builder_internals.py b/tests/unit/v1/test_dataset_builder_internals.py new file mode 100644 index 00000000..100b5525 --- /dev/null +++ b/tests/unit/v1/test_dataset_builder_internals.py @@ -0,0 +1,208 @@ + +from datetime import datetime +from pydantic import BaseModel, Field +import pytest +from mdio.schemas.core import StrictModel +from mdio.schemas.dimension import NamedDimension +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, contains_dimension, get_dimension, to_dictionary +from mdio.schemas.v1.units import LengthUnitEnum, LengthUnitModel +from mdio.schemas.v1.variable import VariableMetadata, AllUnits, UserAttributes + +def test__get_dimension() -> None: + """Test getting a dimension by name from the list of dimensions.""" + dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] + + assert get_dimension([], "inline") is None + + assert get_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) + assert get_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) + assert get_dimension(dimensions, "time") is None + + with pytest.raises(TypeError, match="Expected str, got NoneType"): + get_dimension(dimensions, None) + with pytest.raises(TypeError, match="Expected str, got int"): + get_dimension(dimensions, 42) + with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): + get_dimension(dimensions, "inline", size=200) + + +def test__contains_dimension() -> None: + """Test if a dimension with a given name exists in the list of dimensions.""" + dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] + 
+ assert contains_dimension([], "inline") is False + + assert contains_dimension(dimensions, "inline") is True + assert contains_dimension(dimensions, "crossline") is True + assert contains_dimension(dimensions, "time") is False + + with pytest.raises(TypeError, match="Expected str or NamedDimension, got NoneType"): + contains_dimension(dimensions, None) + with pytest.raises(TypeError, match="Expected str or NamedDimension, got int"): + contains_dimension(dimensions, 42) + with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): + contains_dimension(dimensions, NamedDimension(name="inline", size=200)) + + + +def test__add_named_dimensions() -> None: + """Test adding named dimensions to a dataset.""" + + builder = MDIODatasetBuilder("Test Dataset Builder") + # + # Validate initial state + # + assert builder._dimensions is not None + assert len(builder._dimensions) == 0 + + # + # Validate that adding empty dimensions does not change the state + # + added_dims = builder._add_named_dimensions(None) + assert len(builder._dimensions) == 0 + assert len(added_dims) == 0 + added_dims = builder._add_named_dimensions([]) + assert len(builder._dimensions) == 0 + assert len(added_dims) == 0 + added_dims = builder._add_named_dimensions({}) + assert len(builder._dimensions) == 0 + assert len(added_dims) == 0 + + # + # Add named dimensions + # + inline_dim = NamedDimension(name="inline", size=2) + added_dims = builder._add_named_dimensions([inline_dim]) + assert len(builder._dimensions) == 1 + assert len(added_dims) == 1 + assert contains_dimension(added_dims, inline_dim) + + crossline_dim = NamedDimension(name="crossline", size=3) + time_dim = NamedDimension(name="time", size=4) + added_dims = builder._add_named_dimensions([crossline_dim, time_dim]) + assert len(builder._dimensions) == 3 + assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + assert contains_dimension(added_dims, crossline_dim) + assert contains_dimension(added_dims, time_dim) + + # + # Add invalid object type + # + with pytest.raises(TypeError, match="Expected NamedDimension or str, got int"): + builder._add_named_dimensions([42]) + assert len(builder._dimensions) == 3 + + # + # Add dimensions with the same names again does nothing + # (make sure we are passing different instances) + # + inline_dim2 = NamedDimension(name=inline_dim.name, size=inline_dim.size) + crossline_dim2 = NamedDimension(name=crossline_dim.name, size=crossline_dim.size) + time_dim2 = NamedDimension(name=time_dim.name, size=time_dim.size) + added_dims = builder._add_named_dimensions([inline_dim2, crossline_dim2, time_dim2]) + # Validate that the dimensions and variables are not duplicated + assert len(builder._dimensions) == 3 + assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + assert len(added_dims) == 0 + + # Add dimensions with the same name, but different size again + with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): + inline_dim2 = NamedDimension(name="inline", size=200) + builder._add_named_dimensions([inline_dim2]) + assert len(builder._dimensions) == 3 + # + # Add existing dimension using its name + # + added_dims = builder._add_named_dimensions(["inline", "crossline"]) + assert 
len(builder._dimensions) == 3 + assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + assert len(added_dims) == 0 + + # + # Add non-existing dimension using its name is not allowed + # + with pytest.raises(ValueError, match="Dimension named 'offset' is not found"): + builder._add_named_dimensions(["offset"]) + assert len(builder._dimensions) == 3 + assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + + +def test__to_dictionary() -> None: + """Test converting a BaseModel to a dictionary.""" + + with pytest.raises(TypeError, match="Expected BaseModel, got datetime"): + # This should raise an error because datetime is not a BaseModel + to_dictionary(datetime.now()) + + class SomeModel(StrictModel): + count: int = Field(default=None, description="Samples count") + samples: list[float] = Field(default_factory=list, description="Samples.") + created: datetime = Field(default_factory=datetime.now, description="Creation time with TZ info.") + + m = SomeModel( + count = 3, + samples = [1.0, 2.0, 3.0], + created = datetime(2023, 10, 1, 12, 0, 0, tzinfo=None) + ) + result = to_dictionary(m) + assert isinstance(result, dict) + assert result == {'count': 3, 'created': '2023-10-01T12:00:00', 'samples': [1.0, 2.0, 3.0]} + + +def test__make_VariableMetadata_from_list() -> None: + """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" + + units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) + attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) + meta_list=[units, attrs] + + # TODO: I assume we do not want attribute.attribute in the contract: + # 'metadata': {'unitsV1': {'length': 'm'}, 'attributes': {'attributes': {'MGA': 51}}} + # TODO: Are multiple attributes allowed (I assume they are)? + metadata = MDIODatasetBuilder._make_VariableMetadata_from_list(meta_list) + assert isinstance(metadata, VariableMetadata) + assert metadata.units_v1.length == "ft" + assert metadata.attributes["MGA"] == 51 + assert metadata.attributes["UnitSystem"] == "Imperial" + + with pytest.raises(TypeError, match="Unsupported metadata type: "): + meta_list = ["ft"] + MDIODatasetBuilder._make_VariableMetadata_from_list(meta_list) + +def test__make_VariableMetadata_from_dict() -> None: + """Test creating VariableMetadata from a dictionary.""" + + # TODO: What is the key for units: it unitsV1 or units_v1? + # TODO: Are multiple attributes allowed (I assume they are)? + # TODO: Do we validate the unit string supplied in dictionary parameters? What what if someone supplies ftUS instead of ft? + meta_dict={"unitsV1": {"length": "ft"}, "attributes": {"MGA": 51, "UnitSystem": "Imperial"}} + metadata = MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict) + assert isinstance(metadata, VariableMetadata) + assert metadata.units_v1.length == "ft" + assert metadata.attributes["MGA"] == 51 + assert metadata.attributes["UnitSystem"] == "Imperial" + + with pytest.raises(TypeError, match="Unsupported metadata key: 'units_v1'. Expected 'unitsV1', 'attributes', 'chunkGrid', or 'statsV1."): + meta_dict = {"units_v1": "ft"} + MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict) + + with pytest.raises(TypeError, match="Invalid value for key 'attributes': 42. 
Expected a dictionary."): + meta_dict = {"attributes": 42} + MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict) + + # *** We currently do not validate the structure of the value dictionaries *** + # Pass unit object with invalid structure + # with pytest.raises(TypeError, match="Invalid value format for key 'unitsV1': {'length': 'm', 'time': 'sec'}. "): + # meta_dict1 = {"unitsV1": {"length": "m", "time": "sec"}} + # MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict1) + + + diff --git a/tests/unit/v1/test_dataset_builder_state.py b/tests/unit/v1/test_dataset_builder_state.py new file mode 100644 index 00000000..31c570a8 --- /dev/null +++ b/tests/unit/v1/test_dataset_builder_state.py @@ -0,0 +1,88 @@ + +from datetime import datetime +import pytest + +from mdio.schemas.v1.dataset_builder import _BuilderState, MDIODatasetBuilder + +def test_builder_initialization() -> None: + """Test basic builder initialization.""" + builder = MDIODatasetBuilder("test_dataset") + assert builder.name == "test_dataset" + assert builder.api_version == "1.0.0" + assert isinstance(builder.created_on, datetime) + assert len(builder._dimensions) == 0 + assert len(builder._coordinates) == 0 + assert len(builder._variables) == 0 + assert builder._state == _BuilderState.INITIAL + +def test_builder_add_dimension_state() -> None: + """Test coordinate builder before and after add_dimension.""" + builder = MDIODatasetBuilder("test_dataset") + assert builder._state == _BuilderState.INITIAL + + # One should be able to add dimension any time after the builder has been created + + # Add dimensions first + builder = builder.add_dimension("x", 100) + assert builder._state == _BuilderState.HAS_DIMENSIONS + builder = builder.add_dimension("y", 200) + assert builder._state == _BuilderState.HAS_DIMENSIONS + +@pytest.mark.skip(reason="Under construction.") +def test_coordinate_builder_state() -> None: + """Test coordinate builder state transitions and functionality.""" + builder = MDIODatasetBuilder("test_dataset") + + # Should not be able to add coordinates before dimensions + with pytest.raises( + ValueError, match="Must add at least one dimension before adding coordinates" + ): + builder.add_coordinate("x_coord", dimensions=["x"]) + + # Add dimensions first + builder = builder.add_dimension("x", 100) + builder = builder.add_dimension("y", 200) + + # Adding coordinate should change state to HAS_COORDINATES + builder = builder.add_coordinate("x_coord", dimensions=["x"], long_name="X Coordinate") + assert builder._state == _BuilderState.HAS_COORDINATES + assert len(builder._coordinates) == 1 # noqa: PLR2004 + assert builder._coordinates[0].name == "x_coord" + assert builder._coordinates[0].long_name == "X Coordinate" + assert builder._coordinates[0].dimensions[0].name == "x" + + # Adding another coordinate should maintain state + builder = builder.add_coordinate("y_coord", dimensions=["y"]) + assert builder._state == _BuilderState.HAS_COORDINATES + assert len(builder._coordinates) == 2 # noqa: PLR2004 + assert builder._coordinates[1].name == "y_coord" + assert builder._coordinates[1].dimensions[0].name == "y" + +@pytest.mark.skip(reason="Under construction.") +def test_variable_builder_state() -> None: + """Test variable builder state transitions and functionality.""" + builder = MDIODatasetBuilder("test_dataset") + + # Should not be able to add variables before dimensions + with pytest.raises(ValueError, match="Must add at least one dimension before adding variables"): + builder.add_variable("data", 
dimensions=["x"]) + + # Add dimension first + builder = builder.add_dimension("x", 100) + + # Adding variable should change state to HAS_VARIABLES + builder = builder.add_variable("data", dimensions=["x"], long_name="Data Variable") + assert builder._state == _BuilderState.HAS_VARIABLES + # One for dimension, one for variable + assert len(builder._variables) == 2 # noqa: PLR2004 + assert builder._variables[1].name == "data" + assert builder._variables[1].long_name == "Data Variable" + assert builder._variables[1].dimensions[0].name == "x" + + # Adding another variable should maintain state + builder = builder.add_variable("data2", dimensions=["x"]) + assert builder._state == _BuilderState.HAS_VARIABLES + # One for dimension, two for variables + assert len(builder._variables) == 3 # noqa: PLR2004 + assert builder._variables[2].name == "data2" + assert builder._variables[2].dimensions[0].name == "x" \ No newline at end of file From 1358f95bea4ad3f781cb353fc6f1d6d27ba99dab Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Fri, 27 Jun 2025 00:22:28 +0000 Subject: [PATCH 02/27] First take on add_dimension(), add_coordinate(), add_variable() --- DEVELOPER_NOTES.md | 213 ++++++----- src/mdio/schemas/v1/dataset_builder.py | 282 +++++++++----- tests/unit/v1/__init__.py | 2 +- .../v1/test_dataset_builder_add_coordinate.py | 144 ++++++++ .../v1/test_dataset_builder_add_dimension.py | 348 +++++++----------- .../v1/test_dataset_builder_add_variable.py | 123 +++++++ tests/unit/v1/test_dataset_builder_helpers.py | 265 +++++++++++++ .../unit/v1/test_dataset_builder_internals.py | 208 ----------- tests/unit/v1/test_dataset_builder_state.py | 88 ----- 9 files changed, 963 insertions(+), 710 deletions(-) create mode 100644 tests/unit/v1/test_dataset_builder_add_coordinate.py create mode 100644 tests/unit/v1/test_dataset_builder_add_variable.py create mode 100644 tests/unit/v1/test_dataset_builder_helpers.py delete mode 100644 tests/unit/v1/test_dataset_builder_internals.py delete mode 100644 tests/unit/v1/test_dataset_builder_state.py diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md index 7dcda95d..427801ee 100644 --- a/DEVELOPER_NOTES.md +++ b/DEVELOPER_NOTES.md @@ -1,107 +1,126 @@ # Developer Notes -What are the goals for MDIO v1: + +## MDIO v1 scope of work + +### TASK 1: Creation an empty MDIO v1 dataset with metadata defined using the v1 schema +#### DESCRIPTION +In the v0, the following code was used to create an empty dataset: + +```Python +grid = Grid([Dimension("inline",...), Dimension("crossline", ...), Dimension("depth", ...)]) +variable = MDIOVariableConfig("stack_amplitude", ...) +create_conf = MDIOCreateConfig(path="demo.mdio", grid=grid, variables=[variable]) +create_empty(config=create_conf) +``` + +In the v1 it is replaced with the following API, which uses v1 schema: + +```Python +builder = MDIODatasetBuilder(...) +builder.add_dimension("inline", ...) +builder.add_dimension("crossline",...) +builder.add_dimension("depth", ...) +builder.add_coordinate("cdp_x",...) +builder.add_coordinate("cdp_y",...) +builder.add_variable("stack_amplitude",...) +builder.to_mdio(store="demo.mdio") +``` + +#### DEFINITION OF DONE +* The resulting v1 MDIO control `demo.mdio` file structure must be identical between Python and C++ +* Code coverage 90% +* Code documentation will be updated: + * API doc strings are reviewed + * docs/tutorials/creation.ipynb - current version describes v0 API. 
Should be updated with v1 API + * docs/api_reference.md - will be updated with new API + +#### ASSUMPTIONS +We expect that the following v0 workflows to keep working with this change +* Populating MDIOs +* Updating File and Checking with MDIOReader +* Write to SEG-Y ## Overall API design and implementation -1. Do we want to have a strongly-typed (see pydantic) or dynamic-typed (see dictionary args) API? - For example +We will have only a strongly-typed (see pydantic) API. For example: + ```Python - # Strongly typed - builder.add_dimension( - "length", - size=100, - data_type=ScalarType.FLOAT32, - metadata=[ - AllUnits(units_v1=LengthUnitModel( - length=LengthUnitEnum.FOOT)), - UserAttributes( - attributes={"MGA": 51, "UnitSystem": "Imperial"}), - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape( - chunk_shape=[20]))), - StatisticsMetadata(stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram( - binCenters=[1, 2], - counts=[10, 15]))) - ] - ) - - # dynamically-typed - builder.add_dimension( - "depth", - size=100, - data_type=ScalarType.FLOAT32, - metadata={ - "unitsV1": {"length": "m"}, - "attributes": {"MGA": 51}, - "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}, - "statsV1": { - "count": 100, - "sum": 1215.1, - "sumSquares": 125.12, - "min": 5.61, - "max": 10.84, - "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, - }, - }, - ) +VariableMetadataList: TypeAlias = list[AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata] +def add_dimension( + self, + name: str, + size: int, + long_name: str = None, + data_type: ScalarType | StructuredType = ScalarType.INT32, + metadata_info: VariableMetadataList | None = None, +) -> "MDIODatasetBuilder": ``` -2. How extensive the handling of the edge cases and the invalid arguments should be? This affects the ammount of validation code that needs to be written - For example, - * Should we validate in the code that the units list contain a single item for the dimensions units - or should we expect the developer to always pass a single-item list? - * Should we test the statistics for count > 0 or dim(binCenter) == dim(count) in the case above? - -## V1 Schema questions -1. Why do we allow default / empty names? -2. Adding a dimension with the same name multiple times: is it allowed or should it raise an error? - * It is currently allowed: the second request is ignored - * Adding a dimension with the same name, but the different size currently throws an error -3. Why do we allow methods with dictionary parameters (non-strongly-typed)? -4. For the add_dimension(): - * Can AllUnits / UserAttributes / ChunkGridMetadata / StatisticsMetadata be repeated in the metadata list? - * For units, chunkGrid, statsV1 dict should we validate structure of the data passed in? - * Do we validate the unit string supplied in dictionary parameters? What what if someone supplies ftUS instead of ft? - * Are multiple dimension attributes allowed (I assume yes)? -5. It is not clear, how RectilinearChunkGrid can be mapped to a single dimension - ```RectilinearChunkGrid(configuration=RectilinearChunkShape(chunk_shape=[[2,3,4],[2,3,4]]))``` -6. StatisticsMetadata accepts list[SummaryStatistics]. what does this mean and does it need to be tested? -7. The pydentic attribute names are different from the v1 schema attributes names. 
- 'statsV1' <-> 'stats_v1', 'unitsV1' <-> 'units_v1', 'chunkGrid' <-> 'chunk_grid' - Tgus, we will pass `units_v1` if we use the typesafe API and `'unitsV1` if we use dictionary API -8. Can we add two variables with the same name? -9. Why histogram (e.g., SummaryStatistics) does not have a `histogram_type` attribute? -10. Why 'ftUS' is not supported by the schema? - Units: what foot does the MDIO uses: the U.S. survey foot or the International Foot? - The U.S. survey foot is defined as 1200/3937 meters, while the international foot is defined as exactly 0.3048 meters. + +Which will be used as following: + +```Python +builder.add_dimension( + "length", + size=100, + data_type=ScalarType.FLOAT32, + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), + UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape( + chunk_shape=[20]))), + StatisticsMetadata(stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram( + binCenters=[1, 2], + counts=[10, 15]))) + ] +) +``` +### Notes +* When a coordinate or a variable is created, their schema allows to store their dimensions either as + * list of dimensions name `list[str]`, where the names refer to the dimensions defined in the builder._dimensions + * list of named dimensions `list[NamedDimension]`, which duplicate the dimensions defined in the builder._dimensions + * Mixture of the two above `list[NamedDimension | str]` + + We will be using the first approach. + + **IMPORTANT: For binary compatibility, We need to ensure that the C++ code follows the same logic** + +* Metadata population from a dictionary in add_coordinate() and add_variable() will not be supported to ensure that the API is strongly-typed. If it is needed, such conversion should be done as a separate step: + ```Python + def make_variable_metadata_list_from_dict(metadata: dict[str, Any]) -> VariableMetadataList: + # Implementation goes here + def make_coordinate_metadata_list_from_dict(metadata: dict[str, Any]) -> CoordinateMetadataList: + # Implementation goes here + ``` + +## Schema V1 questions + +* add_dimension(): Can a dimension with the same name be added multiple times. Options: + * Allowed: the second request is ignored (current implementation) + * Adding a dimension with the same name, but the different size currently throws an error + * Not Allowed: should it raise an error? +* The pydantic attribute names are different from the v1 schema attributes names. What are the repercussions? + ``` + 'statsV1' <-> 'stats_v1' + 'unitsV1' <-> 'units_v1' + 'chunkGrid' <-> 'chunk_grid' + ``` +* Should histogram (e.g., SummaryStatistics) have a `histogram_type` attribute? +* Units + * Why 'ftUS' is not supported by the schema? U.S. survey foot vs the International Foot: + *"The U.S. survey foot is defined as 1200/3937 meters, while the international foot is defined as exactly 0.3048 meters. https://www.axiomint.com/survey-foot-versus-international-foot-whats-the-difference/ "The REAL issue is when ... applied to State Plane coordinates in the N2,000,000 and E6,000,000 range! - This ... moves a State Plane coordinate position 4 feet by 12 feet. - -## Unclear -* Did we have a notion of the fixed increment for inline & xline annotations? -* How is rotation of East/North axes relatively to inline/xline axes is handled -* How is right-handed and left-handed surveys are handled? 
-* add_variable - should dimensions argument be required?? - -(src/mdio/schemas/v1/dataset_builder.py) -## Design suggestions -1. Instead of trying to track the state, should we just return a wrapper/pimpl class with the permitted methods? -2. Should we rename add_dimension to add_dimension_variable / add_dimension_annotation to indicate that we not just - providing the dimension name, but also creating the dimension variable -4. add_variable - should we call it `append_variable`. add implies that either name or index must be provided. + This ... moves a State Plane coordinate position 4 feet by 12 feet."* + * Why there are no dimensionless unis (for seismic amplitudes, inlines, etc.) -## Under constructions -* TODO: ??? refactor _BuilderState to make inner class ??? -* TODO: Need an example of EdgeDefinedHistogram for add_dimension with histogram +## Design suggestions +* Should we rename add_dimension to add_dimension_variable (or similar) to indicate that we not just providing the dimension name, but also creating the dimension variable -## Bugs -1. I assume we do not want attribute.attribute in the contract (see docs\tutorials\builder.ipynb) - 'metadata': {'unitsV1': {'length': 'm'}, 'attributes': {'attributes': {'MGA': 51}}} -https://osdu.pages.opengroup.org/platform/domain-data-mgmt-services/seismic/open-vds/vds/specification/Metadata.html \ No newline at end of file diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index ea431e4c..b97ae7ee 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -1,28 +1,31 @@ """Builder pattern implementation for MDIO v1 schema models.""" -from collections.abc import Mapping from datetime import UTC from datetime import datetime from enum import Enum from enum import auto from typing import Any +from typing import TypeAlias from pydantic import BaseModel from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding # noqa: F401 -from mdio.schemas.compressors import ZFP -from mdio.schemas.compressors import Blosc +from mdio.schemas.compressors import ZFP, Blosc from mdio.schemas.dimension import NamedDimension -from mdio.schemas.dtype import ScalarType, StructuredType -from mdio.schemas.metadata import ChunkGridMetadata, UserAttributes -from mdio.schemas.v1.dataset import Dataset -from mdio.schemas.v1.stats import StatisticsMetadata, SummaryStatistics -from mdio.schemas.v1.units import AllUnitModel, AllUnits -from mdio.schemas.v1.variable import Coordinate, Variable, VariableMetadata - -# TODO: Why do we allow default names? -# TODO: Instead of trying to track the state, should we just return a wrapper class with permitted methods? 
diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py
index ea431e4c..b97ae7ee 100644
--- a/src/mdio/schemas/v1/dataset_builder.py
+++ b/src/mdio/schemas/v1/dataset_builder.py
@@ -1,28 +1,31 @@
 """Builder pattern implementation for MDIO v1 schema models."""
 
-from collections.abc import Mapping
 from datetime import UTC
 from datetime import datetime
 from enum import Enum
 from enum import auto
 from typing import Any
+from typing import TypeAlias
 
 from pydantic import BaseModel
 from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding  # noqa: F401
 
-from mdio.schemas.compressors import ZFP
-from mdio.schemas.compressors import Blosc
+from mdio.schemas.compressors import ZFP, Blosc
 from mdio.schemas.dimension import NamedDimension
-from mdio.schemas.dtype import ScalarType, StructuredType
-from mdio.schemas.metadata import ChunkGridMetadata, UserAttributes
-from mdio.schemas.v1.dataset import Dataset
-from mdio.schemas.v1.stats import StatisticsMetadata, SummaryStatistics
-from mdio.schemas.v1.units import AllUnitModel, AllUnits
-from mdio.schemas.v1.variable import Coordinate, Variable, VariableMetadata
-
-# TODO: Why do we allow default names?
-# TODO: Instead of trying to track the state, should we just return a wrapper class with permitted methods?
-# TODO: refactor to make inner class
+from mdio.schemas.dtype import ScalarType
+from mdio.schemas.dtype import StructuredType
+from mdio.schemas.metadata import ChunkGridMetadata
+from mdio.schemas.metadata import UserAttributes
+from mdio.schemas.v1.stats import StatisticsMetadata
+from mdio.schemas.v1.units import AllUnits
+from mdio.schemas.v1.variable import Coordinate, CoordinateMetadata
+from mdio.schemas.v1.variable import Variable
+from mdio.schemas.v1.variable import VariableMetadata
+
+
+CoordinateMetadataList: TypeAlias = list[AllUnits | UserAttributes]
+VariableMetadataList: TypeAlias = list[AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata]
+
 
 class _BuilderState(Enum):
     """States for the template builder."""
@@ -31,24 +34,30 @@ class _BuilderState(Enum):
     HAS_COORDINATES = auto()
     HAS_VARIABLES = auto()
 
-def contains_dimension(dimensions: list[NamedDimension], name_or_NamedDimension: str | NamedDimension) -> bool:
+
+def contains_dimension(
+    dimensions: list[NamedDimension], name_or_dimension: str | NamedDimension
+) -> bool:
     """Check if a dimension with the given name exists in the list."""
-    if isinstance(name_or_NamedDimension, str):
-        name = name_or_NamedDimension
+    if isinstance(name_or_dimension, str):
+        name = name_or_dimension
         return get_dimension(dimensions, name) is not None
-    elif isinstance(name_or_NamedDimension, NamedDimension):
-        named_dimension = name_or_NamedDimension
-        return get_dimension(dimensions, named_dimension.name, named_dimension.size) is not None
-    else:
-        msg = f"Expected str or NamedDimension, got {type(name_or_NamedDimension).__name__}"
-        raise TypeError(msg)
+    if isinstance(name_or_dimension, NamedDimension):
+        dimension = name_or_dimension
+        return get_dimension(dimensions, dimension.name, dimension.size) is not None
+    msg = f"Expected str or NamedDimension, got {type(name_or_dimension).__name__}"
+    raise TypeError(msg)
+
 
-def get_dimension(dimensions: list[NamedDimension], name: str, size: int | None = None) -> NamedDimension | None:
+def get_dimension(
+    dimensions: list[NamedDimension], name: str, size: int | None = None
+) -> NamedDimension | None:
     """Get a dimension by name from the list."""
     if dimensions is None:
         return False
     if not isinstance(name, str):
-        raise TypeError(f"Expected str, got {type(name).__name__}")
+        msg = f"Expected str, got {type(name).__name__}"
+        raise TypeError(msg)
 
     nd = next((dim for dim in dimensions if dim.name == name), None)
     if nd is None:
@@ -58,11 +67,71 @@ def get_dimension(dimensions: list[NamedDimension], name: str, size: int | None
         raise ValueError(msg)
     return nd
 
-def to_dictionary(val: BaseModel) -> dict[str, Any]:
+
+def get_dimension_names(dimensions: list[NamedDimension | str]) -> list[str]:
+    """Get the list of dimension names from a list of NamedDimension or str."""
+    names = []
+    if dimensions is None:
+        return names
+    for dim in dimensions:
+        if isinstance(dim, NamedDimension):
+            names.append(dim.name)
+        elif isinstance(dim, str):
+            names.append(dim)
+    return names
+
+
+def _to_dictionary(val: BaseModel) -> dict[str, Any]:
     """Convert a pydantic BaseModel to a dictionary."""
     if not isinstance(val, BaseModel):
-        raise TypeError(f"Expected BaseModel, got {type(val).__name__}")
-    return val.model_dump(mode="json", by_alias=True)
+        msg = f"Expected BaseModel, got {type(val).__name__}"
+        raise TypeError(msg)
+    return val.model_dump(mode="json", by_alias=True)
+
+def _make_coordinate_metadata(metadata: CoordinateMetadataList | None) -> CoordinateMetadata | None:
+    if metadata is None or not metadata:
+        return None
+
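+    # Map each strongly-typed metadata object onto the camelCase key expected by
+    # the schema model (e.g. AllUnits.units_v1 is stored under "unitsV1").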
+ metadata_dict = {} + for md in metadata: + # NOTE: the pydantic attribute names are different from the v1 schema attributes names + # 'unitsV1' <-> 'units_v1' + if isinstance(md, AllUnits): + val = md.units_v1 + metadata_dict["unitsV1"] = _to_dictionary(val) + elif isinstance(md, UserAttributes): + # NOTE: md.attributes is not pydantic type, but a dictionary + metadata_dict["attributes"] = _to_dictionary(md)["attributes"] + else: + msg = f"Unsupported metadata type: {type(md)}" + raise TypeError(msg) + return CoordinateMetadata(**metadata_dict) + + +def _make_variable_metadata(metadata: VariableMetadataList | None) -> VariableMetadata | None: + if metadata is None or not metadata: + return None + + metadata_dict = {} + for md in metadata: + # NOTE: the pydantic attribute names are different from the v1 schema attributes names + # 'statsV1' <-> 'stats_v1', 'unitsV1' <-> 'units_v1', 'chunkGrid' <-> 'chunk_grid' + if isinstance(md, AllUnits): + val = md.units_v1 + metadata_dict["unitsV1"] = _to_dictionary(val) + elif isinstance(md, UserAttributes): + # NOTE: md.attributes is not pydantic type, but a dictionary + metadata_dict["attributes"] = _to_dictionary(md)["attributes"] + elif isinstance(md, ChunkGridMetadata): + val = md.chunk_grid + metadata_dict["chunkGrid"] = _to_dictionary(val) + elif isinstance(md, StatisticsMetadata): + val = md.stats_v1 + metadata_dict["statsV1"] = _to_dictionary(val) + else: + msg = f"Unsupported metadata type: {type(md)}" + raise TypeError(msg) + return VariableMetadata(**metadata_dict) class MDIODatasetBuilder: """Builder for creating MDIO datasets with enforced build order. @@ -77,7 +146,8 @@ class MDIODatasetBuilder: def __init__(self, name: str, attributes: dict[str, Any] | None = None): self.name = name - self.api_version = "1.0.0" # TODO(BrianMichell, #0): Pull from package metadata + # TODO(BrianMichell, #0): Pull from package metadata + self.api_version = "1.0.0" self.created_on = datetime.now(UTC) self.attributes = attributes self._dimensions: list[NamedDimension] = [] @@ -86,8 +156,9 @@ def __init__(self, name: str, attributes: dict[str, Any] | None = None): self._state = _BuilderState.INITIAL self._unnamed_variable_counter = 0 - - def _add_named_dimensions(self, dimensions: list[NamedDimension | str] | None) -> list[NamedDimension]: + def _add_dimensions_if_needed( + self, dimensions: list[NamedDimension | str] | None + ) -> list[NamedDimension]: if dimensions is None: return [] @@ -95,72 +166,27 @@ def _add_named_dimensions(self, dimensions: list[NamedDimension | str] | None) - for dim in dimensions: if isinstance(dim, str): if not contains_dimension(self._dimensions, dim): - raise ValueError(f"Dimension named {dim!r} is not found") + msg = f"Pre-existing dimension named {dim!r} is not found" + raise ValueError(msg) else: if not isinstance(dim, NamedDimension): - raise TypeError(f"Expected NamedDimension or str, got {type(dim).__name__}") - if contains_dimension(self._dimensions, dim): + msg = f"Expected NamedDimension or str, got {type(dim).__name__}" + raise TypeError(msg) + if contains_dimension(self._dimensions, dim): continue - self._dimensions.append(dim) - added_dims.append(dim) + # Use value instead of a reference + d = NamedDimension(name=dim.name, size=dim.size) + self._dimensions.append(d) + added_dims.append(d) return added_dims - - def _make_VariableMetadata_from_list(metadata: list[AllUnits | UserAttributes]) -> Any: - metadata_dict = {} - for md in metadata: - # NOTE: the pydentic attribute names are different from the v1 schema 
attributes names - # 'statsV1' <-> 'stats_v1', 'unitsV1' <-> 'units_v1', 'chunkGrid' <-> 'chunk_grid' - if isinstance(md, AllUnits): - val = md.units_v1 - metadata_dict["unitsV1"] = to_dictionary(val) - elif isinstance(md, UserAttributes): - # NOTE: md.attributes is not pydantic type, but a dictionary - metadata_dict["attributes"] = to_dictionary(md)["attributes"] - elif isinstance(md, ChunkGridMetadata): - val = md.chunk_grid - metadata_dict["chunkGrid"] = to_dictionary(val) - elif isinstance(md, StatisticsMetadata): - val = md.stats_v1 - metadata_dict["statsV1"] = to_dictionary(val) - else: - raise TypeError(f"Unsupported metadata type: {type(md)}") - return VariableMetadata(**metadata_dict) - - - def _make_VariableMetadata_from_dict(metadata: dict[str, Any]) -> type[BaseModel]: - converted_dict = {} - for key, value in metadata.items(): - if key == "unitsV1" or key == "statsV1" or key == "chunkGrid" or key == "attributes": - # TODO: Should we validate the structure of the value passed in? - if not isinstance(value, dict): - raise TypeError(f"Invalid value for key '{key}': {value!r}. Expected a dictionary.") - else: - raise TypeError(f"Unsupported metadata key: '{key}'. Expected 'unitsV1', 'attributes', 'chunkGrid', or 'statsV1.") - converted_dict[key] = value - return VariableMetadata(**converted_dict) - - - def _make_VariableMetadata(metadata: list[AllUnits | UserAttributes] | dict[str, Any] | None = None) -> Any | None: - if metadata is None: - return None - - if isinstance(metadata, list): - return MDIODatasetBuilder._make_VariableMetadata_from_list(metadata) - - if isinstance(metadata, dict): - return MDIODatasetBuilder._make_VariableMetadata_from_dict(metadata) - - raise TypeError(f"Unsupported metadata type: {type(metadata)}") - - def add_dimension( # noqa: PLR0913 self, name: str, size: int, - long_name: str = None, - data_type: ScalarType | StructuredType = ScalarType.INT32, - metadata: list[AllUnits | UserAttributes] | None | dict[str, Any] = None, + var_long_name: str = None, + var_data_type: ScalarType | StructuredType = ScalarType.INT32, + var_metadata_info: VariableMetadataList | None = None, ) -> "MDIODatasetBuilder": """Add a dimension. 
@@ -171,25 +197,97 @@ def add_dimension(  # noqa: PLR0913
             size: Size of the dimension
-            long_name: Optional long name for the dimension variable
-            data_type: Data type for the dimension variable (defaults to INT32)
-            metadata: Optional metadata for the dimension variable
+            var_long_name: Optional long name for the dimension variable
+            var_data_type: Data type for the dimension variable (defaults to INT32)
+            var_metadata_info: Optional metadata information for the dimension variable
 
         Returns:
             self: Returns self for method chaining
         """
+        if not name:
+            raise ValueError("'name' must be a non-empty string")
 
-        added_dims = self._add_named_dimensions([NamedDimension(name=name, size=size)])
+        added_dims = self._add_dimensions_if_needed(
+            [NamedDimension(name=name, size=size)])
         if added_dims:
             # Create a variable for the dimension
             dim_var = Variable(
                 name=name,
-                longName=long_name,
+                longName=var_long_name,
                 dimensions=added_dims,
-                dataType=data_type,
+                dataType=var_data_type,
                 compressor=None,
                 coordinates=None,
-                metadata=MDIODatasetBuilder._make_VariableMetadata(metadata)
+                metadata=_make_variable_metadata(var_metadata_info),
             )
             self._variables.append(dim_var)
 
         self._state = _BuilderState.HAS_DIMENSIONS
         return self
+
+    def add_coordinate(  # noqa: PLR0913
+        self,
+        name: str,
+        *,
+        long_name: str = None,
+        dimensions: list[NamedDimension | str],
+        data_type: ScalarType | StructuredType = ScalarType.FLOAT32,
+        metadata_info: CoordinateMetadataList | None = None,
+    ) -> "MDIODatasetBuilder":
+        """Add a coordinate after adding at least one dimension."""
+        if self._state == _BuilderState.INITIAL:
+            msg = "Must add at least one dimension before adding coordinates"
+            raise ValueError(msg)
+        if not name:
+            raise ValueError("'name' must be a non-empty string")
+        if dimensions is None or not dimensions:
+            raise ValueError("'dimensions' must be a non-empty list")
+
+        self._add_dimensions_if_needed(dimensions)
+        dim_names = get_dimension_names(dimensions)
+        self._coordinates.append(
+            Coordinate(
+                name=name,
+                longName=long_name,
+                dimensions=dim_names,  # We pass names (str), not list[NamedDimension | str]
+                dataType=data_type,
+                metadata=_make_coordinate_metadata(metadata_info),
+            )
+        )
+        self._state = _BuilderState.HAS_COORDINATES
+        return self
+
+
+    def add_variable(  # noqa: PLR0913
+        self,
+        name: str,
+        *,
+        long_name: str = None,
+        dimensions: list[NamedDimension | str],
+        data_type: ScalarType | StructuredType = ScalarType.FLOAT32,
+        compressor: Blosc | ZFP | None = None,
+        coordinates: list[Coordinate | str] | None = None,
+        metadata_info: VariableMetadataList | None = None,
+    ) -> "MDIODatasetBuilder":
+        """Add a variable after adding at least one dimension."""
+        if self._state == _BuilderState.INITIAL:
+            msg = "Must add at least one dimension before adding variables"
+            raise ValueError(msg)
+        if not name:
+            raise ValueError("'name' must be a non-empty string")
+        if dimensions is None or not dimensions:
+            raise ValueError("'dimensions' must be a non-empty list")
+
+        self._add_dimensions_if_needed(dimensions)
+        dim_names = get_dimension_names(dimensions)
+        self._variables.append(
+            Variable(
+                name=name,
+                long_name=long_name,
+                dimensions=dim_names,
+                data_type=data_type,
+                compressor=compressor,
+                coordinates=coordinates,
+                metadata=_make_variable_metadata(metadata_info),
+            )
+        )
+        self._state = _BuilderState.HAS_VARIABLES
+        return self
diff --git a/tests/unit/v1/__init__.py b/tests/unit/v1/__init__.py
index 3db3a8e5..fa2ea633 100644
--- a/tests/unit/v1/__init__.py
+++ b/tests/unit/v1/__init__.py
@@ -1 +1 @@
-"""Unit tests for parts of the MDIO package related to the v1 schema"""
\ No newline at end of file
+"""Unit tests for parts of the MDIO 
package related to the v1 schema.""" diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py new file mode 100644 index 00000000..b38da14b --- /dev/null +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -0,0 +1,144 @@ +# ruff: noqa: PLR2004 +# PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable +# The above erroneous warning is generated for every numerical assert. +# Thus, disable it for this file +"""Tests the schema v1 dataset_builder.add_coordinate() public API.""" + +import pytest + +from mdio.schemas.chunk_grid import RegularChunkGrid +from mdio.schemas.chunk_grid import RegularChunkShape +from mdio.schemas.dimension import NamedDimension +from mdio.schemas.dtype import ScalarType +from mdio.schemas.metadata import ChunkGridMetadata +from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, contains_dimension +from mdio.schemas.v1.dataset_builder import _BuilderState +from mdio.schemas.v1.stats import CenteredBinHistogram +from mdio.schemas.v1.stats import StatisticsMetadata +from mdio.schemas.v1.stats import SummaryStatistics +from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.units import LengthUnitEnum +from mdio.schemas.v1.units import LengthUnitModel + + +def test_add_coordinate() -> None: + """Test adding coordinates. Check the state transition and validate required parameters.""" + builder = MDIODatasetBuilder("test_dataset") + assert builder._state == _BuilderState.INITIAL + + with pytest.raises(ValueError, match="Must add at least one dimension before adding coordinates"): + builder.add_coordinate("amplitude", dimensions=["speed"]) + + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + + bad_name = None + with pytest.raises(ValueError, match="'name' must be a non-empty string"): + builder.add_coordinate(bad_name, dimensions=["speed"]) + with pytest.raises(ValueError, match="'name' must be a non-empty string"): + builder.add_coordinate("", dimensions=["speed"]) + with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): + builder.add_coordinate("amplitude", dimensions=None) + with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): + builder.add_coordinate("amplitude", dimensions=[]) + + builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) + assert builder._state == _BuilderState.HAS_COORDINATES + assert len(builder._dimensions) == 2 + assert len(builder._variables) == 2 + assert len(builder._coordinates) == 1 + +def test_add_coordinate_with_defaults() -> None: + """Test adding coordinates with default arguments.""" + builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + # Add coordinate using defaults + builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) + assert len(builder._dimensions) == 2 + assert len(builder._variables) == 2 + assert len(builder._coordinates) == 1 + # NOTE: add_variable() stores dimensions as names + crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) + assert crd0 is not None + assert set(crd0.dimensions) == set(["inline", "crossline"]) + assert crd0.long_name is None # Default value + assert crd0.data_type == ScalarType.FLOAT32 # Default value + assert crd0.metadata is None # Default value + +def test_coordinate_with_units() -> None: + """Test adding coordinates with units.""" + 
builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + # Add coordinate with units + builder.add_coordinate( + "cdp", + dimensions=["inline", "crossline"], + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT))] + ) + assert len(builder._dimensions) == 2 + assert len(builder._variables) == 2 + assert len(builder._coordinates) == 1 + # NOTE: add_coordinate() stores dimensions as names + crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) + assert crd0 is not None + assert set(crd0.dimensions) == set(["inline", "crossline"]) + assert crd0.long_name is None # Default value + assert crd0.data_type == ScalarType.FLOAT32 # Default value + assert crd0.metadata.attributes is None + assert crd0.metadata.units_v1.length == "ft" + + +def test_coordinate_with_attributes() -> None: + """Test adding coordinates with attributes.""" + builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + # Add coordinate with attributes + builder.add_coordinate( + "cdp", + dimensions=["inline", "crossline"], + metadata_info=[UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})], + ) + assert len(builder._dimensions) == 2 + assert len(builder._variables) == 2 + assert len(builder._coordinates) == 1 + # NOTE: add_coordinate() stores dimensions as names + crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) + assert crd0 is not None + assert set(crd0.dimensions) == set(["inline", "crossline"]) + assert crd0.long_name is None # Default value + assert crd0.data_type == ScalarType.FLOAT32 # Default value + assert crd0.metadata.attributes["MGA"] == 51 + assert crd0.metadata.attributes["UnitSystem"] == "Imperial" + assert crd0.metadata.units_v1 is None + + +def test_coordinate_with_full_metadata() -> None: + """Test adding coordinates with all metadata.""" + builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + # Add coordinate with all metadata + builder.add_coordinate( + "cdp", + dimensions=["inline", "crossline"], + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), + UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})] + ) + assert len(builder._dimensions) == 2 + assert len(builder._variables) == 2 + assert len(builder._coordinates) == 1 + # NOTE: add_coordinate() stores dimensions as names + crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) + assert crd0 is not None + assert set(crd0.dimensions) == set(["inline", "crossline"]) + assert crd0.long_name is None # Default value + assert crd0.data_type == ScalarType.FLOAT32 # Default value + assert crd0.metadata.attributes["MGA"] == 51 + assert crd0.metadata.attributes["UnitSystem"] == "Imperial" + assert crd0.metadata.units_v1.length == "ft" + diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index bac9a838..7adebb4b 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -1,307 +1,207 @@ +# ruff: noqa: PLR2004 +# PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable +# The above erroneous warning is generated for every numerical assert. 
+# Thus, disable it for this file +"""Tests the schema v1 dataset_builder.add_dimension() public API.""" import pytest -from datetime import datetime - -from mdio.schemas.chunk_grid import RectilinearChunkGrid, RectilinearChunkShape, RegularChunkGrid, RegularChunkShape -from mdio.schemas.dtype import ScalarType, StructuredType - +from mdio.schemas.chunk_grid import RegularChunkGrid +from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.dimension import NamedDimension -from mdio.schemas.metadata import ChunkGridMetadata, UserAttributes -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, contains_dimension, get_dimension, _BuilderState -from mdio.schemas.v1.stats import CenteredBinHistogram, Histogram, StatisticsMetadata, SummaryStatistics -from mdio.schemas.v1.units import AllUnits, LengthUnitEnum, LengthUnitModel +from mdio.schemas.dtype import ScalarType +from mdio.schemas.metadata import ChunkGridMetadata +from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _BuilderState +from mdio.schemas.v1.stats import CenteredBinHistogram +from mdio.schemas.v1.stats import StatisticsMetadata +from mdio.schemas.v1.stats import SummaryStatistics +from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.units import LengthUnitEnum +from mdio.schemas.v1.units import LengthUnitModel + def test_add_dimension() -> None: - """Test adding a dimension to the dataset builder.""" - builder = MDIODatasetBuilder("Test Dataset Builder") + """Test adding dimension. Check the state transition and validate required parameters.""" + builder = MDIODatasetBuilder("test_dataset") + assert builder._state == _BuilderState.INITIAL - builder.add_dimension(name="inline", size=2, long_name="Inline dimension") - assert len(builder._dimensions) == 1 - assert builder._dimensions[0] == NamedDimension(name="inline", size=2) - assert len(builder._variables) == 1 + bad_name = None + with pytest.raises(ValueError, match="'name' must be a non-empty string"): + builder.add_dimension(bad_name, 200) + with pytest.raises(ValueError, match="'name' must be a non-empty string"): + builder.add_dimension("", 200) + + # First dimension should change state to HAS_DIMENSIONS and create a variable + builder.add_dimension("x", 100) assert builder._state == _BuilderState.HAS_DIMENSIONS + assert len(builder._dimensions) == 1 + assert builder._dimensions[0] == NamedDimension(name="x", size=100) + assert len(builder._variables) == 1 + # Adding a dimension with the same name as an existing dimension. 
+ # Currently, it does not raise an error, but just ignores this call + builder.add_dimension("x", 100, var_long_name="X Dimension") + assert builder._state == _BuilderState.HAS_DIMENSIONS + assert len(builder._dimensions) == 1 + assert len(builder._variables) == 1 -def test_add_dimension() -> None: + # Adding a dimension with the same name and different size throws an error + err_msg = "Dimension 'x' found but size 100 does not match expected size 200" + with pytest.raises(ValueError, match=err_msg): + builder.add_dimension("x", 200, var_long_name="X Dimension") + +def test_add_dimension_with_defaults() -> None: """Test dimension builder state transitions and functionality.""" builder = MDIODatasetBuilder("test_dataset") # First dimension should change state to HAS_DIMENSIONS and create a variable - builder.add_dimension("x", 100, long_name="X Dimension") + builder.add_dimension("x", 100) assert builder._state == _BuilderState.HAS_DIMENSIONS - assert len(builder._dimensions) == 1 # noqa: PLR2004 - assert len(builder._variables) == 1 # noqa: PLR2004 - assert builder._dimensions[0].name == "x" - assert builder._dimensions[0].size == 100 # noqa: PLR2004 + assert len(builder._dimensions) == 1 + assert builder._dimensions[0] == NamedDimension(name="x", size=100) + assert len(builder._variables) == 1 var0 = builder._variables[0] assert var0.name == "x" - assert var0.long_name == "X Dimension" + assert var0.long_name is None assert var0.data_type == ScalarType.INT32 - assert var0.dimensions[0].name == "x" - - # Adding another dimension should maintain state and create another variable - builder.add_dimension("y", 200, data_type=ScalarType.UINT32) - assert builder._state == _BuilderState.HAS_DIMENSIONS - assert len(builder._dimensions) == 2 # noqa: PLR2004 - assert len(builder._variables) == 2 # noqa: PLR2004 - assert builder._dimensions[1].name == "y" - assert builder._dimensions[1].size == 200 # noqa: PLR2004 - var1 = builder._variables[1] - assert var1.name == "y" - assert var1.data_type == ScalarType.UINT32 - assert var1.dimensions[0].name == "y" - - # TODO: Adding a dimension with the same: is allowed allowed (currently ignored) or should have raise an error? 
- builder.add_dimension("x", 100, long_name="X Dimension") - assert builder._state == _BuilderState.HAS_DIMENSIONS - assert len(builder._dimensions) == 2 # noqa: PLR2004 - assert len(builder._variables) == 2 # noqa: PLR2004 - - # Adding a dimension with the same name and different size throws an error - with pytest.raises(ValueError, match="Dimension 'x' found but size 100 does not match expected size 200"): - builder.add_dimension("x", 200, long_name="X Dimension") - assert builder._state == _BuilderState.HAS_DIMENSIONS - - + assert var0.compressor is None + assert var0.coordinates is None + assert var0.metadata is None def test_add_dimension_with_units() -> None: """Test adding dimensions with units.""" builder = MDIODatasetBuilder("test_dataset") - # Add dimension with units as a dictionary - builder.add_dimension( - "depth", - size=100, - data_type=ScalarType.FLOAT32, - metadata={"unitsV1": {"length": "m"}}, - ) - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "depth" - assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.units_v1.length == "m" - # Add dimension with strongly-typed unit list of single-item builder.add_dimension( "length", size=100, - data_type=ScalarType.FLOAT64, - metadata=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT))], + var_data_type=ScalarType.FLOAT64, + var_metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT))] ) - assert len(builder._variables) == 2 - var1 = builder._variables[1] - assert var1.name == "length" - assert var1.data_type == ScalarType.FLOAT64 - assert var1.metadata.units_v1.length == "ft" - + assert len(builder._variables) == 1 + var0 = builder._variables[0] + assert var0.name == "length" + assert var0.long_name is None + assert var0.data_type == ScalarType.FLOAT64 + assert var0.compressor is None + assert var0.coordinates is None + assert var0.metadata.units_v1.length == "ft" def test_add_dimension_with_attributes() -> None: """Test adding dimensions with attributes.""" builder = MDIODatasetBuilder("test_dataset") - # Add dimension with attributes as dictionary + # Add dimension with strongly-typed attribute list builder.add_dimension( - "depth", + "length", size=100, - data_type=ScalarType.FLOAT32, - metadata={"attributes": {"MGA": 51, "UnitSystem": "Imperial"}} + var_data_type=ScalarType.FLOAT32, + var_metadata_info=[UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})], ) assert len(builder._variables) == 1 var0 = builder._variables[0] - assert var0.name == "depth" + assert var0.name == "length" assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert var0.metadata.attributes["MGA"] == 51 assert var0.metadata.attributes["UnitSystem"] == "Imperial" - # Add dimension with strongly-typed attribute list - builder.add_dimension( - "length", - size=100, - data_type=ScalarType.FLOAT32, - metadata=[UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})] - ) - assert len(builder._variables) == 2 - var1 = builder._variables[1] - assert var1.name == "length" - assert var1.data_type == ScalarType.FLOAT32 - assert var1.metadata.attributes["MGA"] == 51 # noqa: PLR2004 - assert var1.metadata.attributes["UnitSystem"] == "Imperial" - def test_add_dimension_with_chunk_grid() -> None: """Test adding dimensions with chunk grid.""" builder = MDIODatasetBuilder("test_dataset") - # Add dimension with chunk grid as dictionary + # Add dimension with strongly-typed chunk grid + grid_definition = 
RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) builder.add_dimension( - "depth", + "length", size=100, - data_type=ScalarType.FLOAT32, - metadata={"chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}}, + var_data_type=ScalarType.FLOAT32, + var_metadata_info=[ChunkGridMetadata(chunk_grid=grid_definition)], ) assert len(builder._variables) == 1 var0 = builder._variables[0] - assert var0.name == "depth" + assert var0.name == "length" assert var0.data_type == ScalarType.FLOAT32 assert var0.metadata.chunk_grid.name == "regular" assert var0.metadata.chunk_grid.configuration.chunk_shape == [20] - # Add dimension with strongly-typed chunk grid - # TODO: It is not clear, how RectilinearChunkGrid can be mapped to a single dimension - # grid_definition = RectilinearChunkGrid(configuration=RectilinearChunkShape(chunk_shape=[[2,3,4],[2,3,4]])) - grid_definition = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) - builder.add_dimension( - "length", - size=100, - data_type=ScalarType.FLOAT32, - metadata=[ChunkGridMetadata(chunk_grid=grid_definition)] - ) - assert len(builder._variables) == 2 - var1 = builder._variables[1] - assert var1.name == "length" - assert var1.data_type == ScalarType.FLOAT32 - assert var1.metadata.chunk_grid.name == "regular" - assert var1.metadata.chunk_grid.configuration.chunk_shape == [20] - def test_add_dimension_with_stats() -> None: """Test adding dimensions with stats.""" builder = MDIODatasetBuilder("test_dataset") - # TODO: Are multiple statistic object supported? - # TODO: StatisticsMetadata accepts list[SummaryStatistics], what does this mean and does it need to be tested? - - # TODO: What is the proper spelling 'statsV1' or 'stats_v1'? Needs to be documented. - # Add dimension with strongly-typed stats builder.add_dimension( "depth", size=100, - data_type=ScalarType.FLOAT32, - metadata=[StatisticsMetadata(stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - # TODO: Also test EdgeDefinedHistogram - histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]) - ))] + var_data_type=ScalarType.FLOAT32, + var_metadata_info=[ + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + # TODO(DmitriyRepin, #0): Also test EdgeDefinedHistogram + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), + ) + ) + ], ) assert len(builder._variables) == 1 var0 = builder._variables[0] assert var0.name == "depth" assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.stats_v1.count == 100 # noqa: PLR2004 - assert var0.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 + assert var0.metadata.stats_v1.count == 100 + assert var0.metadata.stats_v1.sum == 1215.1 - # Add dimension with dictionary stats - builder.add_dimension( - "length", - size=100, - data_type=ScalarType.FLOAT32, - metadata={ - "statsV1": { - "count": 100, - "sum": 1215.1, - "sumSquares": 125.12, - "min": 5.61, - "max": 10.84, - "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, - } - }, - ) - assert len(builder._variables) == 2 - var1 = builder._variables[1] - assert var1.name == "length" - assert var1.data_type == ScalarType.FLOAT32 - assert var1.metadata.stats_v1.count == 100 # noqa: PLR2004 - assert var1.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 def test_add_dimension_with_full_metadata() -> None: """Test adding dimensions with all metadata.""" builder = MDIODatasetBuilder("test_dataset") - # Add dimension 
with all metadata as dictionary - builder.add_dimension( - "depth", - size=100, - data_type=ScalarType.FLOAT32, - metadata={ - "unitsV1": {"length": "m"}, - "attributes": {"MGA": 51}, - "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}, - "statsV1": { - "count": 100, - "sum": 1215.1, - "sumSquares": 125.12, - "min": 5.61, - "max": 10.84, - "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, - }, - }, - ) - - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "depth" - assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.units_v1.length == "m" - assert var0.metadata.attributes["MGA"] == 51 # noqa: PLR2004 - assert var0.metadata.chunk_grid.name == "regular" - assert var0.metadata.chunk_grid.configuration.chunk_shape == [20] # noqa: PLR2004 - assert var0.metadata.stats_v1.count == 100 # noqa: PLR2004 - assert var0.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 - assert var0.metadata.stats_v1.sum_squares == 125.12 # noqa: PLR2004 - assert var0.metadata.stats_v1.min == 5.61 # noqa: PLR2004 - assert var0.metadata.stats_v1.max == 10.84 # noqa: PLR2004 - assert var0.metadata.stats_v1.histogram.bin_centers == [1, 2] # noqa: PLR2004 - assert var0.metadata.stats_v1.histogram.counts == [10, 15] # noqa: PLR2004 # Add dimension with all strongly-typed metadata builder.add_dimension( "length", size=100, - data_type=ScalarType.FLOAT32, - metadata=[ - AllUnits(units_v1=LengthUnitModel( - length=LengthUnitEnum.FOOT)), - UserAttributes( - attributes={"MGA": 51, "UnitSystem": "Imperial"}), + var_data_type=ScalarType.FLOAT32, + var_metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), + UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape( - chunk_shape=[20]))), - StatisticsMetadata(stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram( - binCenters=[1, 2], - counts=[10, 15]))) - ] + chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) + ), + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), + ) + ), + ], ) - assert len(builder._variables) == 2 - var1 = builder._variables[1] - assert var1.name == "length" - assert var1.data_type == ScalarType.FLOAT32 - assert var1.metadata.units_v1.length == "ft" - assert var1.metadata.attributes["MGA"] == 51 # noqa: PLR2004 - assert var1.metadata.attributes["UnitSystem"] == "Imperial" # noqa: PLR2004 - assert var1.metadata.chunk_grid.name == "regular" - assert var1.metadata.chunk_grid.configuration.chunk_shape == [20] # noqa: PLR2004 - assert var1.metadata.stats_v1.count == 100 # noqa: PLR2004 - assert var1.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 - assert var1.metadata.stats_v1.sum_squares == 125.12 # noqa: PLR2004 - assert var1.metadata.stats_v1.min == 5.61 # noqa: PLR2004 - assert var1.metadata.stats_v1.max == 10.84 # noqa: PLR2004 - assert var1.metadata.stats_v1.histogram.bin_centers == [1, 2] # noqa: PLR2004 - assert var1.metadata.stats_v1.histogram.counts == [10, 15] # noqa: PLR2004 - - - # j = builder.build().json() - # print(j) \ No newline at end of file + assert len(builder._variables) == 1 + var0 = builder._variables[0] + assert var0.name == "length" + assert var0.data_type == ScalarType.FLOAT32 + assert 
var0.metadata.units_v1.length == "ft"
+    assert var0.metadata.attributes["MGA"] == 51
+    assert var0.metadata.attributes["UnitSystem"] == "Imperial"
+    assert var0.metadata.chunk_grid.name == "regular"
+    assert var0.metadata.chunk_grid.configuration.chunk_shape == [20]
+    assert var0.metadata.stats_v1.count == 100
+    assert var0.metadata.stats_v1.sum == 1215.1
+    assert var0.metadata.stats_v1.sum_squares == 125.12
+    assert var0.metadata.stats_v1.min == 5.61
+    assert var0.metadata.stats_v1.max == 10.84
+    assert var0.metadata.stats_v1.histogram.bin_centers == [1, 2]
+    assert var0.metadata.stats_v1.histogram.counts == [10, 15]
diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py
new file mode 100644
index 00000000..bc168df2
--- /dev/null
+++ b/tests/unit/v1/test_dataset_builder_add_variable.py
@@ -0,0 +1,123 @@
+# ruff: noqa: PLR2004
+# PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable
+# The above erroneous warning is generated for every numerical assert.
+# Thus, disable it for this file
+"""Tests the schema v1 dataset_builder.add_variable() public API."""
+
+import pytest
+
+from mdio.schemas.dtype import ScalarType
+from mdio.schemas.metadata import ChunkGridMetadata
+from mdio.schemas.metadata import UserAttributes
+from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder
+from mdio.schemas.v1.dataset_builder import _BuilderState
+from mdio.schemas.v1.stats import StatisticsMetadata
+from mdio.schemas.v1.units import AllUnits
+from mdio.schemas.v1.variable import VariableMetadata
+
+def test_dataset_builder_add_variable() -> None:
+    """Test adding a variable. Check the state transition and validate required parameters."""
+    builder = MDIODatasetBuilder("test_dataset")
+    assert builder._state == _BuilderState.INITIAL
+
+    with pytest.raises(ValueError, match="Must add at least one dimension before adding variables"):
+        builder.add_variable("amplitude", dimensions=["speed"])
+
+    builder.add_dimension("inline", 100)
+    builder.add_dimension("crossline", 100)
+    builder.add_dimension("depth", 100)
+
+    bad_name = None
+    with pytest.raises(ValueError, match="'name' must be a non-empty string"):
+        builder.add_variable(bad_name, dimensions=["speed"])
+    with pytest.raises(ValueError, match="'name' must be a non-empty string"):
+        builder.add_variable("", dimensions=["speed"])
+    with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"):
+        builder.add_variable("amplitude", dimensions=None)
+    with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"):
+        builder.add_variable("amplitude", dimensions=[])
+
+    builder.add_variable("seismic_amplitude", dimensions=["inline", "crossline", "depth"])
+    assert builder._state == _BuilderState.HAS_VARIABLES
+    assert len(builder._dimensions) == 3
+    assert len(builder._variables) == 4
+    assert len(builder._coordinates) == 0
+
+def test_add_variable_with_defaults() -> None:
+    """Test adding a variable with default arguments."""
+    builder = MDIODatasetBuilder("test_dataset")
+    builder.add_dimension("inline", 100)
+    builder.add_dimension("crossline", 100)
+    builder.add_dimension("depth", 100)
+    # Add variable using defaults
+    builder.add_variable("seismic_amplitude", dimensions=["inline", "crossline", "depth"])
+    assert len(builder._dimensions) == 3
+    assert len(builder._variables) == 4
+    assert len(builder._coordinates) == 0
+    var0 = next((e for e in builder._variables if e.name == "seismic_amplitude"), None)
+    assert var0 is not None
+    # NOTE: add_variable() stores dimensions as names
+    assert 
set(var0.dimensions) == set(["inline", "crossline", "depth"]) + assert var0.long_name is None # Default value + assert var0.data_type == ScalarType.FLOAT32 # Default value + assert var0.compressor is None # Default value + assert var0.coordinates is None # Default value + assert var0.metadata is None # Default value + +# def test__make_variable_metadata() -> None: +# """Test creating VariableMetadata from a dictionary.""" + +# meta_dict = None +# metadata = make_variable_metadata_list(meta_dict) +# assert metadata is None + +# meta_dict = {} +# metadata = make_variable_metadata_list(meta_dict) +# assert metadata is None + +# meta_dict = { +# "unitsV1": {"length": "m"}, +# "attributes": {"MGA": 51}, +# "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}, +# "statsV1": { +# "count": 100, +# "sum": 1215.1, +# "sumSquares": 125.12, +# "min": 5.61, +# "max": 10.84, +# "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, +# }, +# } +# metadata = make_variable_metadata_list(meta_dict) +# assert isinstance(metadata, VariableMetadata) +# assert metadata.units_v1.length == "m" +# assert metadata.attributes["MGA"] == 51 +# assert metadata.chunk_grid.name == "regular" +# assert metadata.chunk_grid.configuration.chunk_shape == [20] +# assert metadata.stats_v1.count == 100 +# assert metadata.stats_v1.sum == 1215.1 +# assert metadata.stats_v1.sum_squares == 125.12 +# assert metadata.stats_v1.min == 5.61 +# assert metadata.stats_v1.max == 10.84 +# assert metadata.stats_v1.histogram.bin_centers == [1, 2] +# assert metadata.stats_v1.histogram.counts == [10, 15] + +# # NOTE: the v1 schema has 'units_v1' property, but pydantic requires 'unitsV1' for CamelCaseStrictModel +# meta_dict = {"units_v1": {"length": "m"}} +# err_msg = ( +# "Unsupported metadata key: 'units_v1'. " +# "Expected 'unitsV1', 'attributes', 'chunkGrid', or 'statsV1." +# ) +# with pytest.raises(TypeError, match=err_msg): +# make_variable_metadata_list(meta_dict) + +# meta_dict = {"attributes": 42} +# with pytest.raises( +# TypeError, match="Invalid value for key 'attributes': 42. Expected a dictionary." +# ): +# make_variable_metadata_list(meta_dict) + +# # *** We currently do not validate the structure of the value dictionaries *** +# # Pass unit object with invalid structure +# # with pytest.raises( +# # TypeError, match="Invalid value format for key 'unitsV1': {'length': 'm', 'time': 'sec'}. "): +# # meta_dict1 = {"unitsV1": {"length": "m", "time": "sec"}} +# # _make_VariableMetadata_from_dict(meta_dict1) diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py new file mode 100644 index 00000000..b9343b99 --- /dev/null +++ b/tests/unit/v1/test_dataset_builder_helpers.py @@ -0,0 +1,265 @@ +# ruff: noqa: PLR2004 +# PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable +# The above erroneous warning is generated for every numerical assert. 
+# Thus, disable it for this file +"""Tests the schema v1 dataset_builder internal methods.""" + +from datetime import UTC +from datetime import datetime + +import pytest +from pydantic import Field + +from mdio.schemas.chunk_grid import RegularChunkGrid, RegularChunkShape +from mdio.schemas.core import StrictModel +from mdio.schemas.dimension import NamedDimension +from mdio.schemas.metadata import ChunkGridMetadata +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, _make_coordinate_metadata +from mdio.schemas.v1.dataset_builder import get_dimension_names +from mdio.schemas.v1.dataset_builder import _make_variable_metadata +from mdio.schemas.v1.dataset_builder import _to_dictionary +from mdio.schemas.v1.dataset_builder import contains_dimension +from mdio.schemas.v1.dataset_builder import get_dimension +from mdio.schemas.v1.stats import CenteredBinHistogram, StatisticsMetadata, SummaryStatistics +from mdio.schemas.v1.units import LengthUnitEnum +from mdio.schemas.v1.units import LengthUnitModel +from mdio.schemas.v1.variable import AllUnits, CoordinateMetadata +from mdio.schemas.v1.variable import UserAttributes +from mdio.schemas.v1.variable import VariableMetadata + + +def test__get_dimension() -> None: + """Test getting a dimension by name from the list of dimensions.""" + dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] + + assert get_dimension([], "inline") is None + + assert get_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) + assert get_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) + assert get_dimension(dimensions, "time") is None + + with pytest.raises(TypeError, match="Expected str, got NoneType"): + get_dimension(dimensions, None) + with pytest.raises(TypeError, match="Expected str, got int"): + get_dimension(dimensions, 42) + with pytest.raises( + ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200" + ): + get_dimension(dimensions, "inline", size=200) + + +def test__contains_dimension() -> None: + """Test if a dimension with a given name exists in the list of dimensions.""" + dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] + + assert contains_dimension([], "inline") is False + + assert contains_dimension(dimensions, "inline") is True + assert contains_dimension(dimensions, "crossline") is True + assert contains_dimension(dimensions, "time") is False + + with pytest.raises(TypeError, match="Expected str or NamedDimension, got NoneType"): + contains_dimension(dimensions, None) + with pytest.raises(TypeError, match="Expected str or NamedDimension, got int"): + contains_dimension(dimensions, 42) + with pytest.raises( + ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200" + ): + contains_dimension(dimensions, NamedDimension(name="inline", size=200)) + + +def test_get_dimension_names() -> None: + """Test getting a list of dimension names from list[NamedDimension | str].""" + + empty_list = get_dimension_names(None) + assert empty_list is not None + assert isinstance(empty_list, list) + assert len(empty_list) == 0 + + empty_list = get_dimension_names([]) + assert empty_list is not None + assert isinstance(empty_list, list) + assert len(empty_list) == 0 + + dim_list = get_dimension_names([ + NamedDimension(name="inline", size=2), + "amplitude", + NamedDimension(name="crossline", size=3) + ]) + assert dim_list is not None + assert isinstance(dim_list, 
list) + assert set(dim_list) == set(["inline", "amplitude", "crossline"]) + + +def test_add_dimensions_if_needed() -> None: + """Test adding named dimensions to a dataset.""" + builder = MDIODatasetBuilder("Test Dataset Builder") + # + # Validate initial state + # + assert builder._dimensions is not None + assert len(builder._dimensions) == 0 + + # + # Validate that adding empty dimensions does not change the state + # + added_dims = builder._add_dimensions_if_needed(None) + assert len(builder._dimensions) == 0 + assert len(added_dims) == 0 + added_dims = builder._add_dimensions_if_needed([]) + assert len(builder._dimensions) == 0 + assert len(added_dims) == 0 + added_dims = builder._add_dimensions_if_needed({}) + assert len(builder._dimensions) == 0 + assert len(added_dims) == 0 + + # + # Add named dimensions + # + inline_dim = NamedDimension(name="inline", size=2) + added_dims = builder._add_dimensions_if_needed([inline_dim]) + assert len(builder._dimensions) == 1 + assert len(added_dims) == 1 + assert contains_dimension(added_dims, inline_dim) + + crossline_dim = NamedDimension(name="crossline", size=3) + time_dim = NamedDimension(name="time", size=4) + added_dims = builder._add_dimensions_if_needed([crossline_dim, time_dim]) + assert len(builder._dimensions) == 3 + assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + assert contains_dimension(added_dims, crossline_dim) + assert contains_dimension(added_dims, time_dim) + + # + # Add invalid object type + # + with pytest.raises(TypeError, match="Expected NamedDimension or str, got int"): + builder._add_dimensions_if_needed([42]) + # Assert that the number of dimensions has not increased + assert len(builder._dimensions) == 3 + +def test__add_dimensions_if_needed_when_one_already_exists() -> None: + """Test adding existing named dimensions to a dataset.""" + builder = MDIODatasetBuilder("Test Dataset Builder") + + inline_dim = NamedDimension(name="inline", size=2) + crossline_dim = NamedDimension(name="crossline", size=3) + time_dim = NamedDimension(name="time", size=4) + # + # Add dimensions with the same names again does nothing + # (make sure we are passing different instances) + # + inline_dim2 = NamedDimension(name=inline_dim.name, size=inline_dim.size) + crossline_dim2 = NamedDimension(name=crossline_dim.name, size=crossline_dim.size) + time_dim2 = NamedDimension(name=time_dim.name, size=time_dim.size) + builder._add_dimensions_if_needed([inline_dim2, crossline_dim2, time_dim2]) + added_dims = builder._add_dimensions_if_needed([inline_dim2, crossline_dim2, time_dim2]) + # Validate that the dimensions and variables are not duplicated + assert len(builder._dimensions) == 3 + assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + assert len(added_dims) == 0 + + # Add dimensions with the same name, but different size again + with pytest.raises( + ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200" + ): + builder._add_dimensions_if_needed([NamedDimension(name="inline", size=200)]) + # Assert that the number of dimensions has not increased + assert len(builder._dimensions) == 3 + + # + # Add existing dimension using its name + # + added_dims = builder._add_dimensions_if_needed(["inline", "crossline"]) + assert len(builder._dimensions) == 3 + 
assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + assert len(added_dims) == 0 + + # + # Add non-existing dimension using its name is not allowed + # + with pytest.raises(ValueError, match="Pre-existing dimension named 'offset' is not found"): + builder._add_dimensions_if_needed(["offset"]) + assert len(builder._dimensions) == 3 + assert contains_dimension(builder._dimensions, inline_dim) + assert contains_dimension(builder._dimensions, crossline_dim) + assert contains_dimension(builder._dimensions, time_dim) + +def test__to_dictionary() -> None: + """Test converting a BaseModel to a dictionary.""" + with pytest.raises(TypeError, match="Expected BaseModel, got datetime"): + # This should raise an error because datetime is not a BaseModel + _to_dictionary(datetime.now(UTC)) + + class SomeModel(StrictModel): + count: int = Field(default=None, description="Samples count") + samples: list[float] = Field(default_factory=list, description="Samples.") + created: datetime = Field( + default_factory=datetime.now, description="Creation time with TZ info." + ) + + m = SomeModel(count=3, + samples=[1.0, 2.0, 3.0], + created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)) + result = _to_dictionary(m) + assert isinstance(result, dict) + assert result == {"count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]} + + +def test__make_coordinate_metadata() -> None: + """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" + units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) + attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) + meta_list = [units, attrs] + + # Assume that multiple attributes are allowed + metadata = _make_coordinate_metadata(meta_list) + assert isinstance(metadata, CoordinateMetadata) + assert metadata.units_v1.length == "ft" + assert metadata.attributes["MGA"] == 51 + assert metadata.attributes["UnitSystem"] == "Imperial" + + meta_list = ["ft"] + with pytest.raises(TypeError, match="Unsupported metadata type: "): + _make_variable_metadata(meta_list) + +def test__make_variable_metadata() -> None: + """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" + units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) + attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) + chgrd = ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[20]))) + stats = StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]))) + metadata_info = [units, attrs, chgrd, stats] + metadata = _make_variable_metadata(metadata_info) + assert isinstance(metadata, VariableMetadata) + assert metadata.units_v1.length == "ft" + assert metadata.attributes["MGA"] == 51 + assert metadata.attributes["UnitSystem"] == "Imperial" + assert metadata.chunk_grid.name == "regular" + assert metadata.chunk_grid.configuration.chunk_shape == [20] + assert metadata.stats_v1.count == 100 + assert metadata.stats_v1.sum == 1215.1 + assert metadata.stats_v1.sum_squares == 125.12 + assert metadata.stats_v1.min == 5.61 + assert metadata.stats_v1.max == 10.84 + assert metadata.stats_v1.histogram.bin_centers == [1, 2] + assert metadata.stats_v1.histogram.counts == [10, 15] + + 
meta_list = ["ft"] + with pytest.raises(TypeError, match="Unsupported metadata type: "): + _make_variable_metadata(meta_list) diff --git a/tests/unit/v1/test_dataset_builder_internals.py b/tests/unit/v1/test_dataset_builder_internals.py deleted file mode 100644 index 100b5525..00000000 --- a/tests/unit/v1/test_dataset_builder_internals.py +++ /dev/null @@ -1,208 +0,0 @@ - -from datetime import datetime -from pydantic import BaseModel, Field -import pytest -from mdio.schemas.core import StrictModel -from mdio.schemas.dimension import NamedDimension -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, contains_dimension, get_dimension, to_dictionary -from mdio.schemas.v1.units import LengthUnitEnum, LengthUnitModel -from mdio.schemas.v1.variable import VariableMetadata, AllUnits, UserAttributes - -def test__get_dimension() -> None: - """Test getting a dimension by name from the list of dimensions.""" - dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] - - assert get_dimension([], "inline") is None - - assert get_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) - assert get_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) - assert get_dimension(dimensions, "time") is None - - with pytest.raises(TypeError, match="Expected str, got NoneType"): - get_dimension(dimensions, None) - with pytest.raises(TypeError, match="Expected str, got int"): - get_dimension(dimensions, 42) - with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): - get_dimension(dimensions, "inline", size=200) - - -def test__contains_dimension() -> None: - """Test if a dimension with a given name exists in the list of dimensions.""" - dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] - - assert contains_dimension([], "inline") is False - - assert contains_dimension(dimensions, "inline") is True - assert contains_dimension(dimensions, "crossline") is True - assert contains_dimension(dimensions, "time") is False - - with pytest.raises(TypeError, match="Expected str or NamedDimension, got NoneType"): - contains_dimension(dimensions, None) - with pytest.raises(TypeError, match="Expected str or NamedDimension, got int"): - contains_dimension(dimensions, 42) - with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): - contains_dimension(dimensions, NamedDimension(name="inline", size=200)) - - - -def test__add_named_dimensions() -> None: - """Test adding named dimensions to a dataset.""" - - builder = MDIODatasetBuilder("Test Dataset Builder") - # - # Validate initial state - # - assert builder._dimensions is not None - assert len(builder._dimensions) == 0 - - # - # Validate that adding empty dimensions does not change the state - # - added_dims = builder._add_named_dimensions(None) - assert len(builder._dimensions) == 0 - assert len(added_dims) == 0 - added_dims = builder._add_named_dimensions([]) - assert len(builder._dimensions) == 0 - assert len(added_dims) == 0 - added_dims = builder._add_named_dimensions({}) - assert len(builder._dimensions) == 0 - assert len(added_dims) == 0 - - # - # Add named dimensions - # - inline_dim = NamedDimension(name="inline", size=2) - added_dims = builder._add_named_dimensions([inline_dim]) - assert len(builder._dimensions) == 1 - assert len(added_dims) == 1 - assert contains_dimension(added_dims, inline_dim) - - crossline_dim = 
NamedDimension(name="crossline", size=3) - time_dim = NamedDimension(name="time", size=4) - added_dims = builder._add_named_dimensions([crossline_dim, time_dim]) - assert len(builder._dimensions) == 3 - assert contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - assert contains_dimension(added_dims, crossline_dim) - assert contains_dimension(added_dims, time_dim) - - # - # Add invalid object type - # - with pytest.raises(TypeError, match="Expected NamedDimension or str, got int"): - builder._add_named_dimensions([42]) - assert len(builder._dimensions) == 3 - - # - # Add dimensions with the same names again does nothing - # (make sure we are passing different instances) - # - inline_dim2 = NamedDimension(name=inline_dim.name, size=inline_dim.size) - crossline_dim2 = NamedDimension(name=crossline_dim.name, size=crossline_dim.size) - time_dim2 = NamedDimension(name=time_dim.name, size=time_dim.size) - added_dims = builder._add_named_dimensions([inline_dim2, crossline_dim2, time_dim2]) - # Validate that the dimensions and variables are not duplicated - assert len(builder._dimensions) == 3 - assert contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - assert len(added_dims) == 0 - - # Add dimensions with the same name, but different size again - with pytest.raises(ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200"): - inline_dim2 = NamedDimension(name="inline", size=200) - builder._add_named_dimensions([inline_dim2]) - assert len(builder._dimensions) == 3 - # - # Add existing dimension using its name - # - added_dims = builder._add_named_dimensions(["inline", "crossline"]) - assert len(builder._dimensions) == 3 - assert contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - assert len(added_dims) == 0 - - # - # Add non-existing dimension using its name is not allowed - # - with pytest.raises(ValueError, match="Dimension named 'offset' is not found"): - builder._add_named_dimensions(["offset"]) - assert len(builder._dimensions) == 3 - assert contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - - -def test__to_dictionary() -> None: - """Test converting a BaseModel to a dictionary.""" - - with pytest.raises(TypeError, match="Expected BaseModel, got datetime"): - # This should raise an error because datetime is not a BaseModel - to_dictionary(datetime.now()) - - class SomeModel(StrictModel): - count: int = Field(default=None, description="Samples count") - samples: list[float] = Field(default_factory=list, description="Samples.") - created: datetime = Field(default_factory=datetime.now, description="Creation time with TZ info.") - - m = SomeModel( - count = 3, - samples = [1.0, 2.0, 3.0], - created = datetime(2023, 10, 1, 12, 0, 0, tzinfo=None) - ) - result = to_dictionary(m) - assert isinstance(result, dict) - assert result == {'count': 3, 'created': '2023-10-01T12:00:00', 'samples': [1.0, 2.0, 3.0]} - - -def test__make_VariableMetadata_from_list() -> None: - """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" - - units = 
AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) - attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) - meta_list=[units, attrs] - - # TODO: I assume we do not want attribute.attribute in the contract: - # 'metadata': {'unitsV1': {'length': 'm'}, 'attributes': {'attributes': {'MGA': 51}}} - # TODO: Are multiple attributes allowed (I assume they are)? - metadata = MDIODatasetBuilder._make_VariableMetadata_from_list(meta_list) - assert isinstance(metadata, VariableMetadata) - assert metadata.units_v1.length == "ft" - assert metadata.attributes["MGA"] == 51 - assert metadata.attributes["UnitSystem"] == "Imperial" - - with pytest.raises(TypeError, match="Unsupported metadata type: "): - meta_list = ["ft"] - MDIODatasetBuilder._make_VariableMetadata_from_list(meta_list) - -def test__make_VariableMetadata_from_dict() -> None: - """Test creating VariableMetadata from a dictionary.""" - - # TODO: What is the key for units: it unitsV1 or units_v1? - # TODO: Are multiple attributes allowed (I assume they are)? - # TODO: Do we validate the unit string supplied in dictionary parameters? What what if someone supplies ftUS instead of ft? - meta_dict={"unitsV1": {"length": "ft"}, "attributes": {"MGA": 51, "UnitSystem": "Imperial"}} - metadata = MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict) - assert isinstance(metadata, VariableMetadata) - assert metadata.units_v1.length == "ft" - assert metadata.attributes["MGA"] == 51 - assert metadata.attributes["UnitSystem"] == "Imperial" - - with pytest.raises(TypeError, match="Unsupported metadata key: 'units_v1'. Expected 'unitsV1', 'attributes', 'chunkGrid', or 'statsV1."): - meta_dict = {"units_v1": "ft"} - MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict) - - with pytest.raises(TypeError, match="Invalid value for key 'attributes': 42. Expected a dictionary."): - meta_dict = {"attributes": 42} - MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict) - - # *** We currently do not validate the structure of the value dictionaries *** - # Pass unit object with invalid structure - # with pytest.raises(TypeError, match="Invalid value format for key 'unitsV1': {'length': 'm', 'time': 'sec'}. 
"): - # meta_dict1 = {"unitsV1": {"length": "m", "time": "sec"}} - # MDIODatasetBuilder._make_VariableMetadata_from_dict(meta_dict1) - - - diff --git a/tests/unit/v1/test_dataset_builder_state.py b/tests/unit/v1/test_dataset_builder_state.py deleted file mode 100644 index 31c570a8..00000000 --- a/tests/unit/v1/test_dataset_builder_state.py +++ /dev/null @@ -1,88 +0,0 @@ - -from datetime import datetime -import pytest - -from mdio.schemas.v1.dataset_builder import _BuilderState, MDIODatasetBuilder - -def test_builder_initialization() -> None: - """Test basic builder initialization.""" - builder = MDIODatasetBuilder("test_dataset") - assert builder.name == "test_dataset" - assert builder.api_version == "1.0.0" - assert isinstance(builder.created_on, datetime) - assert len(builder._dimensions) == 0 - assert len(builder._coordinates) == 0 - assert len(builder._variables) == 0 - assert builder._state == _BuilderState.INITIAL - -def test_builder_add_dimension_state() -> None: - """Test coordinate builder before and after add_dimension.""" - builder = MDIODatasetBuilder("test_dataset") - assert builder._state == _BuilderState.INITIAL - - # One should be able to add dimension any time after the builder has been created - - # Add dimensions first - builder = builder.add_dimension("x", 100) - assert builder._state == _BuilderState.HAS_DIMENSIONS - builder = builder.add_dimension("y", 200) - assert builder._state == _BuilderState.HAS_DIMENSIONS - -@pytest.mark.skip(reason="Under construction.") -def test_coordinate_builder_state() -> None: - """Test coordinate builder state transitions and functionality.""" - builder = MDIODatasetBuilder("test_dataset") - - # Should not be able to add coordinates before dimensions - with pytest.raises( - ValueError, match="Must add at least one dimension before adding coordinates" - ): - builder.add_coordinate("x_coord", dimensions=["x"]) - - # Add dimensions first - builder = builder.add_dimension("x", 100) - builder = builder.add_dimension("y", 200) - - # Adding coordinate should change state to HAS_COORDINATES - builder = builder.add_coordinate("x_coord", dimensions=["x"], long_name="X Coordinate") - assert builder._state == _BuilderState.HAS_COORDINATES - assert len(builder._coordinates) == 1 # noqa: PLR2004 - assert builder._coordinates[0].name == "x_coord" - assert builder._coordinates[0].long_name == "X Coordinate" - assert builder._coordinates[0].dimensions[0].name == "x" - - # Adding another coordinate should maintain state - builder = builder.add_coordinate("y_coord", dimensions=["y"]) - assert builder._state == _BuilderState.HAS_COORDINATES - assert len(builder._coordinates) == 2 # noqa: PLR2004 - assert builder._coordinates[1].name == "y_coord" - assert builder._coordinates[1].dimensions[0].name == "y" - -@pytest.mark.skip(reason="Under construction.") -def test_variable_builder_state() -> None: - """Test variable builder state transitions and functionality.""" - builder = MDIODatasetBuilder("test_dataset") - - # Should not be able to add variables before dimensions - with pytest.raises(ValueError, match="Must add at least one dimension before adding variables"): - builder.add_variable("data", dimensions=["x"]) - - # Add dimension first - builder = builder.add_dimension("x", 100) - - # Adding variable should change state to HAS_VARIABLES - builder = builder.add_variable("data", dimensions=["x"], long_name="Data Variable") - assert builder._state == _BuilderState.HAS_VARIABLES - # One for dimension, one for variable - assert len(builder._variables) == 2 # 
noqa: PLR2004
-    assert builder._variables[1].name == "data"
-    assert builder._variables[1].long_name == "Data Variable"
-    assert builder._variables[1].dimensions[0].name == "x"
-
-    # Adding another variable should maintain state
-    builder = builder.add_variable("data2", dimensions=["x"])
-    assert builder._state == _BuilderState.HAS_VARIABLES
-    # One for dimension, two for variables
-    assert len(builder._variables) == 3  # noqa: PLR2004
-    assert builder._variables[2].name == "data2"
-    assert builder._variables[2].dimensions[0].name == "x"
\ No newline at end of file

From e5261cb1af1a6cfadb6ddf9eac7c0c34bf544054 Mon Sep 17 00:00:00 2001
From: Dmitriy Repin
Date: Sat, 28 Jun 2025 00:11:13 +0000
Subject: [PATCH 03/27] Finished add_dimension, add_coordinate, add_variable

---
 DEVELOPER_NOTES.md                            |  16 +-
 src/mdio/schemas/v1/dataset_builder.py        |  60 +++++---
 .../v1/test_dataset_builder_add_coordinate.py |  31 ++--
 .../v1/test_dataset_builder_add_dimension.py  |  15 +-
 .../v1/test_dataset_builder_add_variable.py   | 143 ++++++++++--------
 tests/unit/v1/test_dataset_builder_helpers.py |  21 +--
 6 files changed, 167 insertions(+), 119 deletions(-)

diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md
index 427801ee..dec21dfe 100644
--- a/DEVELOPER_NOTES.md
+++ b/DEVELOPER_NOTES.md
@@ -85,9 +85,11 @@ builder.add_dimension(
 * When a coordinate or a variable is created, their schema allows storing their dimensions either as
   * a list of dimension names `list[str]`, where the names refer to the dimensions defined in the builder._dimensions
   * a list of named dimensions `list[NamedDimension]`, which duplicate the dimensions defined in the builder._dimensions
   * a mixture of the two above `list[NamedDimension | str]`
 
-  We will be using the first approach.
+  Which approach should be used?
+
+  `RESOLUTION: We will be using the first approach.`
 
 **IMPORTANT: For binary compatibility, we need to ensure that the C++ code follows the same logic**
 
@@ -98,20 +100,25 @@ def make_coordinate_metadata_list_from_dict(metadata: dict[str, Any]) -> CoordinateMetadataList:
         # Implementation goes here
 ```
+  `RESOLUTION: The approach is confirmed.`
 
 ## Schema V1 questions
 
 * add_dimension(): Can a dimension with the same name be added multiple times? Options:
   * Allowed: the second request is ignored (current implementation)
-  * Adding a dimension with the same name, but a different size, currently throws an error
   * Not Allowed: should it raise an error?
+
+  `RESOLUTION: Dimensions with the same name are not allowed`
 * The pydantic attribute names are different from the v1 schema attribute names. What are the repercussions?
 ```
 'statsV1'   <-> 'stats_v1'
 'unitsV1'   <-> 'units_v1'
 'chunkGrid' <-> 'chunk_grid'
 ```
+  `Under investigation`
 * Should histogram (e.g., SummaryStatistics) have a `histogram_type` attribute?
+
+  `Under investigation`
 * Units
   * Why is 'ftUS' not supported by the schema? U.S. survey foot vs the International Foot:
     *"The U.S. survey foot is defined as 1200/3937 meters, while the international foot is defined as exactly 0.3048 meters.
     This ... moves a State Plane coordinate position 4 feet by 12 feet."*
   * Why are there no dimensionless units (for seismic amplitudes, inlines, etc.)
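+
+  (For illustration, assuming the current schema: the closest available option
+  today appears to be omitting units metadata entirely, e.g.
+  `builder.add_variable("amplitude", dimensions=["inline", "crossline"])` with no
+  `metadata_info`, which records no units at all rather than "dimensionless".)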
+  `Under investigation`
+
 ## Design suggestions
 
 * Should we rename add_dimension to add_dimension_variable (or similar) to indicate that we are not just providing the dimension name, but also creating the dimension variable
+  `RESOLUTION: Shorter names are preferable for the public API. The function behavior will be described in the docs`
diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py
index b97ae7ee..2cc65812 100644
--- a/src/mdio/schemas/v1/dataset_builder.py
+++ b/src/mdio/schemas/v1/dataset_builder.py
@@ -10,7 +10,8 @@
 from pydantic import BaseModel
 from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding  # noqa: F401
 
-from mdio.schemas.compressors import ZFP, Blosc
+from mdio.schemas.compressors import ZFP
+from mdio.schemas.compressors import Blosc
 from mdio.schemas.dimension import NamedDimension
 from mdio.schemas.dtype import ScalarType
 from mdio.schemas.dtype import StructuredType
@@ -18,13 +19,18 @@
 from mdio.schemas.metadata import UserAttributes
 from mdio.schemas.v1.stats import StatisticsMetadata
 from mdio.schemas.v1.units import AllUnits
-from mdio.schemas.v1.variable import Coordinate, CoordinateMetadata
+from mdio.schemas.v1.variable import Coordinate
+from mdio.schemas.v1.variable import CoordinateMetadata
 from mdio.schemas.v1.variable import Variable
 from mdio.schemas.v1.variable import VariableMetadata
 
+CoordinateMetadataList: TypeAlias = list[AllUnits |
+                                         UserAttributes]
+VariableMetadataList: TypeAlias = list[AllUnits |
+                                       UserAttributes |
+                                       ChunkGridMetadata |
+                                       StatisticsMetadata]
 
-CoordinateMetadataList: TypeAlias = list[AllUnits | UserAttributes]
-VariableMetadataList: TypeAlias = list[AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata]
 
 class _BuilderState(Enum):
     """States for the template builder."""
@@ -68,7 +74,7 @@ def get_dimension(
         return nd
 
 
-def get_dimension_names( dimensions: list[NamedDimension | str]) -> list[str]:
+def get_dimension_names(dimensions: list[NamedDimension | str]) -> list[str]:
     """Get the dimension names from the list."""
     names = []
     if dimensions is None:
@@ -88,10 +94,11 @@ def _to_dictionary(val: BaseModel) -> dict[str, Any]:
         raise TypeError(msg)
     return val.model_dump(mode="json", by_alias=True)
 
+
 def _make_coordinate_metadata(metadata: CoordinateMetadataList | None) -> CoordinateMetadata | None:
     if metadata is None or not metadata:
         return None
-    
+
     metadata_dict = {}
     for md in metadata:
         # NOTE: the pydantic attribute names are different from the v1 schema attributes names
@@ -111,7 +118,7 @@ def _make_coordinate_metadata(metadata: CoordinateMetadataList | None) -> Coordi
 def _make_variable_metadata(metadata: VariableMetadataList | None) -> VariableMetadata | None:
     if metadata is None or not metadata:
         return None
-    
+
     metadata_dict = {}
     for md in metadata:
         # NOTE: the pydantic attribute names are different from the v1 schema attributes names
@@ -133,6 +140,7 @@ def _make_variable_metadata(metadata: VariableMetadataList | None) -> VariableMe
         raise TypeError(msg)
     return VariableMetadata(**metadata_dict)
 
+
 class MDIODatasetBuilder:
     """Builder for creating MDIO datasets with enforced build order.
 
@@ -191,19 +199,25 @@ def add_dimension(  # noqa: PLR0913
         """Add a dimension.
 
         This must be called at least once before adding coordinates or variables.
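+
+        For example (a minimal usage sketch; the dataset name, dimension names,
+        and sizes are illustrative only):
+
+            builder = MDIODatasetBuilder("test_dataset")
+            builder.add_dimension("inline", 100)
+            builder.add_dimension("crossline", 200)
+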
+        This call will create a variable, if one does not yet exist.
 
         Args:
             name: Name of the dimension
             size: Size of the dimension
-            long_name: Optional long name for the dimension variable
-            data_type: Data type for the dimension variable (defaults to INT32)
-            metadata_info: Optional metadata information for the dimension variable
+            var_long_name: Optional long name for the dimension variable
+            var_data_type: Data type for the dimension variable (defaults to INT32)
+            var_metadata_info: Optional metadata information for the dimension variable
 
         Returns:
             self: Returns self for method chaining
         """
         if not name:
-            raise ValueError("'name' must be a non-empty string")
+            msg = "'name' must be a non-empty string"
+            raise ValueError(msg)
+        old_var = next((e for e in self._dimensions if e.name == name), None)
+        if old_var is not None:
+            msg = "Adding dimension with the same name twice is not allowed"
+            raise ValueError(msg)
 
         added_dims = self._add_dimensions_if_needed(
             [NamedDimension(name=name, size=size)])
@@ -237,9 +251,15 @@ def add_coordinate(  # noqa: PLR0913
             msg = "Must add at least one dimension before adding coordinates"
             raise ValueError(msg)
         if not name:
-            raise ValueError("'name' must be a non-empty string")
+            msg = "'name' must be a non-empty string"
+            raise ValueError(msg)
         if dimensions is None or not dimensions:
-            raise ValueError("'dimensions' must be a non-empty list")
+            msg = "'dimensions' must be a non-empty list"
+            raise ValueError(msg)
+        old_var = next((e for e in self._coordinates if e.name == name), None)
+        if old_var is not None:
+            msg = "Adding coordinate with the same name twice is not allowed"
+            raise ValueError(msg)
 
         self._add_dimensions_if_needed(dimensions)
         dim_names = get_dimension_names(dimensions)
@@ -247,7 +267,8 @@
             Coordinate(
                 name=name,
                 longName=long_name,
-                dimensions=dim_names,  # We pass names (str), not list[NamedDimension | str]
+                # We pass names (str), not list[NamedDimension | str]
+                dimensions=dim_names,
                 dataType=data_type,
                 metadata=_make_coordinate_metadata(metadata_info),
             )
@@ -255,7 +276,6 @@
         self._state = _BuilderState.HAS_COORDINATES
         return self
 
-
     def add_variable(  # noqa: PLR0913
         self,
         name: str,
@@ -272,9 +292,15 @@ def add_variable(  # noqa: PLR0913
             msg = "Must add at least one dimension before adding variables"
             raise ValueError(msg)
         if not name:
-            raise ValueError("'name' must be a non-empty string")
+            msg = "'name' must be a non-empty string"
+            raise ValueError(msg)
         if dimensions is None or not dimensions:
-            raise ValueError("'dimensions' must be a non-empty list")
+            msg = "'dimensions' must be a non-empty list"
+            raise ValueError(msg)
+        old_var = next((e for e in self._variables if e.name == name), None)
+        if old_var is not None:
+            msg = "Adding variable with the same name twice is not allowed"
+            raise ValueError(msg)
 
         self._add_dimensions_if_needed(dimensions)
         dim_names = get_dimension_names(dimensions)
diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py
index b38da14b..659fedac 100644
--- a/tests/unit/v1/test_dataset_builder_add_coordinate.py
+++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py
@@ -6,17 +6,10 @@
 
 import pytest
 
-from mdio.schemas.chunk_grid import RegularChunkGrid
-from mdio.schemas.chunk_grid import RegularChunkShape
-from mdio.schemas.dimension import NamedDimension
 from mdio.schemas.dtype import ScalarType
-from mdio.schemas.metadata import ChunkGridMetadata
 from mdio.schemas.metadata import 
UserAttributes -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, contains_dimension +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_builder import _BuilderState -from mdio.schemas.v1.stats import CenteredBinHistogram -from mdio.schemas.v1.stats import StatisticsMetadata -from mdio.schemas.v1.stats import SummaryStatistics from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel @@ -27,8 +20,9 @@ def test_add_coordinate() -> None: builder = MDIODatasetBuilder("test_dataset") assert builder._state == _BuilderState.INITIAL - with pytest.raises(ValueError, match="Must add at least one dimension before adding coordinates"): - builder.add_coordinate("amplitude", dimensions=["speed"]) + msg = "Must add at least one dimension before adding coordinates" + with pytest.raises(ValueError, match=msg): + builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) builder.add_dimension("inline", 100) builder.add_dimension("crossline", 100) @@ -49,6 +43,11 @@ def test_add_coordinate() -> None: assert len(builder._variables) == 2 assert len(builder._coordinates) == 1 + # Adding coordinate with the same name twice + msg="Adding coordinate with the same name twice is not allowed" + with pytest.raises(ValueError, match=msg): + builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) + def test_add_coordinate_with_defaults() -> None: """Test adding coordinates with default arguments.""" builder = MDIODatasetBuilder("test_dataset") @@ -59,10 +58,10 @@ def test_add_coordinate_with_defaults() -> None: assert len(builder._dimensions) == 2 assert len(builder._variables) == 2 assert len(builder._coordinates) == 1 - # NOTE: add_variable() stores dimensions as names crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) assert crd0 is not None - assert set(crd0.dimensions) == set(["inline", "crossline"]) + # NOTE: add_variable() stores dimensions as names + assert set(crd0.dimensions) == {"inline", "crossline"} assert crd0.long_name is None # Default value assert crd0.data_type == ScalarType.FLOAT32 # Default value assert crd0.metadata is None # Default value @@ -81,10 +80,10 @@ def test_coordinate_with_units() -> None: assert len(builder._dimensions) == 2 assert len(builder._variables) == 2 assert len(builder._coordinates) == 1 - # NOTE: add_coordinate() stores dimensions as names crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) assert crd0 is not None - assert set(crd0.dimensions) == set(["inline", "crossline"]) + # NOTE: add_coordinate() stores dimensions as names + assert set(crd0.dimensions) == {"inline", "crossline"} assert crd0.long_name is None # Default value assert crd0.data_type == ScalarType.FLOAT32 # Default value assert crd0.metadata.attributes is None @@ -108,7 +107,7 @@ def test_coordinate_with_attributes() -> None: # NOTE: add_coordinate() stores dimensions as names crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) assert crd0 is not None - assert set(crd0.dimensions) == set(["inline", "crossline"]) + assert set(crd0.dimensions) == {"inline", "crossline"} assert crd0.long_name is None # Default value assert crd0.data_type == ScalarType.FLOAT32 # Default value assert crd0.metadata.attributes["MGA"] == 51 @@ -135,7 +134,7 @@ def test_coordinate_with_full_metadata() -> None: # NOTE: add_coordinate() stores dimensions as names crd0 = next((e for e in builder._coordinates if e.name == 
"cdp"), None) assert crd0 is not None - assert set(crd0.dimensions) == set(["inline", "crossline"]) + assert set(crd0.dimensions) == {"inline", "crossline"} assert crd0.long_name is None # Default value assert crd0.data_type == ScalarType.FLOAT32 # Default value assert crd0.metadata.attributes["MGA"] == 51 diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index 7adebb4b..6ff1f259 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -40,17 +40,10 @@ def test_add_dimension() -> None: assert builder._dimensions[0] == NamedDimension(name="x", size=100) assert len(builder._variables) == 1 - # Adding a dimension with the same name as an existing dimension. - # Currently, it does not raise an error, but just ignores this call - builder.add_dimension("x", 100, var_long_name="X Dimension") - assert builder._state == _BuilderState.HAS_DIMENSIONS - assert len(builder._dimensions) == 1 - assert len(builder._variables) == 1 - - # Adding a dimension with the same name and different size throws an error - err_msg = "Dimension 'x' found but size 100 does not match expected size 200" - with pytest.raises(ValueError, match=err_msg): - builder.add_dimension("x", 200, var_long_name="X Dimension") + # Adding dimension with the same name twice + msg="Adding dimension with the same name twice is not allowed" + with pytest.raises(ValueError, match=msg): + builder.add_dimension("x", 200) def test_add_dimension_with_defaults() -> None: """Test dimension builder state transitions and functionality.""" diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index bc168df2..7c0b46ba 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -5,20 +5,31 @@ """Tests the schema v1 Variable public API.""" import pytest + +from mdio.schemas.chunk_grid import RegularChunkGrid +from mdio.schemas.chunk_grid import RegularChunkShape +from mdio.schemas.compressors import Blosc from mdio.schemas.dtype import ScalarType from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes -from mdio.schemas.v1.dataset_builder import _BuilderState, MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _BuilderState +from mdio.schemas.v1.stats import CenteredBinHistogram from mdio.schemas.v1.stats import StatisticsMetadata +from mdio.schemas.v1.stats import SummaryStatistics from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.units import LengthUnitEnum +from mdio.schemas.v1.units import LengthUnitModel from mdio.schemas.v1.variable import VariableMetadata -def test_dataset_builder_add_variable() -> None: + +def test_add_variable() -> None: """Test adding variable. 
Check the state transition and validate required parameters..""" builder = MDIODatasetBuilder("test_dataset") assert builder._state == _BuilderState.INITIAL - with pytest.raises(ValueError, match="Must add at least one dimension before adding variables"): + msg = "Must add at least one dimension before adding variables" + with pytest.raises(ValueError, match=msg): builder.add_variable("amplitude", dimensions=["speed"]) builder.add_dimension("inline", 100) @@ -35,12 +46,18 @@ def test_dataset_builder_add_variable() -> None: with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): builder.add_variable("amplitude", dimensions=[]) - builder.add_variable("seismic_amplitude", dimensions=["inline", "crossline", "depth"]) + builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"]) assert builder._state == _BuilderState.HAS_VARIABLES assert len(builder._dimensions) == 3 assert len(builder._variables) == 4 assert len(builder._coordinates) == 0 + # Adding variable with the same name twice + msg="Adding variable with the same name twice is not allowed" + with pytest.raises(ValueError, match=msg): + builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"]) + + def test_add_variable_with_defaults() -> None: """Test adding variable with default arguments.""" builder = MDIODatasetBuilder("test_dataset") @@ -55,69 +72,69 @@ def test_add_variable_with_defaults() -> None: var0 = next((e for e in builder._variables if e.name == "seismic_amplitude"), None) assert var0 is not None # NOTE: add_variable() stores dimensions as names - assert set(var0.dimensions) == set(["inline", "crossline", "depth"]) + assert set(var0.dimensions) == {"inline", "crossline", "depth"} assert var0.long_name is None # Default value assert var0.data_type == ScalarType.FLOAT32 # Default value assert var0.compressor is None # Default value assert var0.coordinates is None # Default value assert var0.metadata is None # Default value -# def test__make_variable_metadata() -> None: -# """Test creating VariableMetadata from a dictionary.""" - -# meta_dict = None -# metadata = make_variable_metadata_list(meta_dict) -# assert metadata is None - -# meta_dict = {} -# metadata = make_variable_metadata_list(meta_dict) -# assert metadata is None - -# meta_dict = { -# "unitsV1": {"length": "m"}, -# "attributes": {"MGA": 51}, -# "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}, -# "statsV1": { -# "count": 100, -# "sum": 1215.1, -# "sumSquares": 125.12, -# "min": 5.61, -# "max": 10.84, -# "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, -# }, -# } -# metadata = make_variable_metadata_list(meta_dict) -# assert isinstance(metadata, VariableMetadata) -# assert metadata.units_v1.length == "m" -# assert metadata.attributes["MGA"] == 51 -# assert metadata.chunk_grid.name == "regular" -# assert metadata.chunk_grid.configuration.chunk_shape == [20] -# assert metadata.stats_v1.count == 100 -# assert metadata.stats_v1.sum == 1215.1 -# assert metadata.stats_v1.sum_squares == 125.12 -# assert metadata.stats_v1.min == 5.61 -# assert metadata.stats_v1.max == 10.84 -# assert metadata.stats_v1.histogram.bin_centers == [1, 2] -# assert metadata.stats_v1.histogram.counts == [10, 15] - -# # NOTE: the v1 schema has 'units_v1' property, but pydantic requires 'unitsV1' for CamelCaseStrictModel -# meta_dict = {"units_v1": {"length": "m"}} -# err_msg = ( -# "Unsupported metadata key: 'units_v1'. " -# "Expected 'unitsV1', 'attributes', 'chunkGrid', or 'statsV1." 
-# ) -# with pytest.raises(TypeError, match=err_msg): -# make_variable_metadata_list(meta_dict) - -# meta_dict = {"attributes": 42} -# with pytest.raises( -# TypeError, match="Invalid value for key 'attributes': 42. Expected a dictionary." -# ): -# make_variable_metadata_list(meta_dict) +def test_add_variable_full_parameters() -> None: + """Test adding variable with full parameters.""" + builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + builder.add_dimension("depth", 100) + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"]) + builder.add_variable("seismic_amplitude", + long_name="Amplitude (dimensionless)", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT64, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), + UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), + ChunkGridMetadata( + chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) + ), + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), + ) + ), + ]) + assert len(builder._dimensions) == 3 + assert len(builder._coordinates) == 2 + assert len(builder._variables) == 4 + v = next((v for v in builder._variables if v.name == "seismic_amplitude"), None) + assert v is not None + assert v.long_name == "Amplitude (dimensionless)" + # NOTE: add_variable() stores dimensions as names + assert set(v.dimensions) == {"inline", "crossline", "depth"} + assert v.data_type == ScalarType.FLOAT64 + assert isinstance(v.compressor, Blosc) + assert v.compressor.algorithm == "zstd" + # NOTE: add_variable() stores coordinates as names + assert set(v.coordinates) == {"inline", "crossline", "depth", "cdp-x", "cdp-y"} + assert v.metadata.stats_v1.count == 100 + assert isinstance(v.metadata, VariableMetadata) + assert v.metadata.units_v1.length == "ft" + assert v.metadata.attributes["MGA"] == 51 + assert v.metadata.attributes["UnitSystem"] == "Imperial" + assert v.metadata.chunk_grid.name == "regular" + assert v.metadata.chunk_grid.configuration.chunk_shape == [20] + assert v.metadata.stats_v1.count == 100 + assert v.metadata.stats_v1.sum == 1215.1 + assert v.metadata.stats_v1.sum_squares == 125.12 + assert v.metadata.stats_v1.min == 5.61 + assert v.metadata.stats_v1.max == 10.84 + assert v.metadata.stats_v1.histogram.bin_centers == [1, 2] + assert v.metadata.stats_v1.histogram.counts == [10, 15] -# # *** We currently do not validate the structure of the value dictionaries *** -# # Pass unit object with invalid structure -# # with pytest.raises( -# # TypeError, match="Invalid value format for key 'unitsV1': {'length': 'm', 'time': 'sec'}. 
"): -# # meta_dict1 = {"unitsV1": {"length": "m", "time": "sec"}} -# # _make_VariableMetadata_from_dict(meta_dict1) diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py index b9343b99..d3df78af 100644 --- a/tests/unit/v1/test_dataset_builder_helpers.py +++ b/tests/unit/v1/test_dataset_builder_helpers.py @@ -10,30 +10,34 @@ import pytest from pydantic import Field -from mdio.schemas.chunk_grid import RegularChunkGrid, RegularChunkShape +from mdio.schemas.chunk_grid import RegularChunkGrid +from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.core import StrictModel from mdio.schemas.dimension import NamedDimension from mdio.schemas.metadata import ChunkGridMetadata -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, _make_coordinate_metadata -from mdio.schemas.v1.dataset_builder import get_dimension_names +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _make_coordinate_metadata from mdio.schemas.v1.dataset_builder import _make_variable_metadata from mdio.schemas.v1.dataset_builder import _to_dictionary from mdio.schemas.v1.dataset_builder import contains_dimension from mdio.schemas.v1.dataset_builder import get_dimension -from mdio.schemas.v1.stats import CenteredBinHistogram, StatisticsMetadata, SummaryStatistics +from mdio.schemas.v1.dataset_builder import get_dimension_names +from mdio.schemas.v1.stats import CenteredBinHistogram +from mdio.schemas.v1.stats import StatisticsMetadata +from mdio.schemas.v1.stats import SummaryStatistics from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel -from mdio.schemas.v1.variable import AllUnits, CoordinateMetadata +from mdio.schemas.v1.variable import AllUnits +from mdio.schemas.v1.variable import CoordinateMetadata from mdio.schemas.v1.variable import UserAttributes from mdio.schemas.v1.variable import VariableMetadata -def test__get_dimension() -> None: +def test__get_dimension_by_name() -> None: """Test getting a dimension by name from the list of dimensions.""" dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] assert get_dimension([], "inline") is None - assert get_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) assert get_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) assert get_dimension(dimensions, "time") is None @@ -70,7 +74,6 @@ def test__contains_dimension() -> None: def test_get_dimension_names() -> None: """Test getting a list of dimension names from list[NamedDimension | str].""" - empty_list = get_dimension_names(None) assert empty_list is not None assert isinstance(empty_list, list) @@ -88,7 +91,7 @@ def test_get_dimension_names() -> None: ]) assert dim_list is not None assert isinstance(dim_list, list) - assert set(dim_list) == set(["inline", "amplitude", "crossline"]) + assert set(dim_list) == {"inline", "amplitude", "crossline"} def test_add_dimensions_if_needed() -> None: From 95c01d8c02b9bf62f237097d86e358ed6a368406 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 30 Jun 2025 16:09:13 -0500 Subject: [PATCH 04/27] Work on build --- src/mdio/schemas/v1/dataset_builder.py | 170 +++++++++++++++--- tests/unit/v1/test_dataset_builder_helpers.py | 8 + 2 files changed, 155 insertions(+), 23 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 2cc65812..7db49947 100644 --- 
a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -7,6 +7,8 @@ from typing import Any from typing import TypeAlias +import xarray as xr + from pydantic import BaseModel from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding # noqa: F401 @@ -23,6 +25,8 @@ from mdio.schemas.v1.variable import CoordinateMetadata from mdio.schemas.v1.variable import Variable from mdio.schemas.v1.variable import VariableMetadata +from mdio.schemas.v1.dataset import Dataset, DatasetInfo +from mdio.schemas.v1.dataset import DatasetMetadata CoordinateMetadataList: TypeAlias = list[AllUnits | UserAttributes] @@ -30,8 +34,8 @@ UserAttributes | ChunkGridMetadata | StatisticsMetadata] - - +DatasetMetadataList: TypeAlias = list[DatasetInfo | + UserAttributes ] class _BuilderState(Enum): """States for the template builder.""" @@ -87,12 +91,14 @@ def get_dimension_names(dimensions: list[NamedDimension | str]) -> list[str]: return names -def _to_dictionary(val: BaseModel) -> dict[str, Any]: - """Convert a pydantic BaseModel to a dictionary.""" - if not isinstance(val, BaseModel): - msg = f"Expected BaseModel, got {type(val).__name__}" - raise TypeError(msg) - return val.model_dump(mode="json", by_alias=True) +def _to_dictionary(val: BaseModel | dict[str, Any]) -> dict[str, Any]: + """Convert a dictionary or pydantic BaseModel to a dictionary.""" + if isinstance(val, BaseModel): + return val.model_dump(mode="json", by_alias=True) + if isinstance(val, dict): + return val + msg = f"Expected BaseModel, got {type(val).__name__}" + raise TypeError(msg) def _make_coordinate_metadata(metadata: CoordinateMetadataList | None) -> CoordinateMetadata | None: @@ -104,11 +110,10 @@ def _make_coordinate_metadata(metadata: CoordinateMetadataList | None) -> Coordi # NOTE: the pydantic attribute names are different from the v1 schema attributes names # 'unitsV1' <-> 'units_v1' if isinstance(md, AllUnits): - val = md.units_v1 - metadata_dict["unitsV1"] = _to_dictionary(val) + metadata_dict["unitsV1"] = _to_dictionary(md.units_v1) elif isinstance(md, UserAttributes): # NOTE: md.attributes is not pydantic type, but a dictionary - metadata_dict["attributes"] = _to_dictionary(md)["attributes"] + metadata_dict["attributes"] = _to_dictionary(md.attributes) else: msg = f"Unsupported metadata type: {type(md)}" raise TypeError(msg) @@ -124,22 +129,38 @@ def _make_variable_metadata(metadata: VariableMetadataList | None) -> VariableMe # NOTE: the pydantic attribute names are different from the v1 schema attributes names # 'statsV1' <-> 'stats_v1', 'unitsV1' <-> 'units_v1', 'chunkGrid' <-> 'chunk_grid' if isinstance(md, AllUnits): - val = md.units_v1 - metadata_dict["unitsV1"] = _to_dictionary(val) + metadata_dict["unitsV1"] = _to_dictionary(md.units_v1) elif isinstance(md, UserAttributes): # NOTE: md.attributes is not pydantic type, but a dictionary - metadata_dict["attributes"] = _to_dictionary(md)["attributes"] + metadata_dict["attributes"] = _to_dictionary(md.attributes) elif isinstance(md, ChunkGridMetadata): - val = md.chunk_grid - metadata_dict["chunkGrid"] = _to_dictionary(val) + metadata_dict["chunkGrid"] = _to_dictionary(md.chunk_grid) elif isinstance(md, StatisticsMetadata): - val = md.stats_v1 - metadata_dict["statsV1"] = _to_dictionary(val) + metadata_dict["statsV1"] = _to_dictionary(md.stats_v1) else: msg = f"Unsupported metadata type: {type(md)}" raise TypeError(msg) return VariableMetadata(**metadata_dict) +def _make_datasetinfo_metadata(metadata: DatasetInfo | None) -> DatasetMetadata | None: 
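+    """Build a DatasetMetadata from a list of DatasetInfo and UserAttributes entries.
+
+    NOTE: the annotation above says 'DatasetInfo | None', but the body iterates
+    over 'metadata', so a DatasetMetadataList is what is effectively expected.
+    """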
+ if metadata is None or not metadata: + return None + + metadata_dict = {} + for md in metadata: + # NOTE: the pydantic attribute names are different from the v1 schema attributes names + # 'apiVersion' <-> 'api_version', 'created_on' <-> 'createdOn' + if isinstance(md, DatasetInfo): + metadata_dict["name"] = md.name + metadata_dict["apiVersion"] = _to_dictionary(md.api_version) + metadata_dict["createdOn"] = _to_dictionary(md.created_on) + elif isinstance(md, UserAttributes): + # NOTE: md.attributes is not pydantic type, but a dictionary + metadata_dict["attributes"] = _to_dictionary(md.attributes) + else: + msg = f"Unsupported metadata type: {type(md)}" + raise TypeError(msg) + return DatasetMetadata(**metadata_dict) class MDIODatasetBuilder: """Builder for creating MDIO datasets with enforced build order. @@ -152,12 +173,14 @@ class MDIODatasetBuilder: 4. Must call build() to create the dataset. """ - def __init__(self, name: str, attributes: dict[str, Any] | None = None): - self.name = name + def __init__(self, name: str, attributes: UserAttributes | None = None): + # TODO(BrianMichell, #0): Pull from package metadata - self.api_version = "1.0.0" - self.created_on = datetime.now(UTC) - self.attributes = attributes + self._info = DatasetInfo( + name=name, + api_version="1.0.0", + created_on= datetime.now(UTC)), + self._attributes = attributes self._dimensions: list[NamedDimension] = [] self._coordinates: list[Coordinate] = [] self._variables: list[Variable] = [] @@ -317,3 +340,104 @@ def add_variable( # noqa: PLR0913 ) self._state = _BuilderState.HAS_VARIABLES return self + + def build(self) -> Dataset: + """Build the final dataset.""" + if self._state == _BuilderState.INITIAL: + msg = "Must add at least one dimension before building" + raise ValueError(msg) + + return Dataset( + variables=self._variables, + metadata=DatasetMetadata(self._info, + self._attributes) + ) + + + # def to_mdio( + # self, + # store: str, + # mode: str = "w", + # compute: bool = False, + # **kwargs: Mapping[str, str | int | float | bool], + # ) -> Dataset: + # """Write the dataset to a Zarr store and return the constructed mdio.Dataset. + + # This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata + # to a Zarr store. The actual data is not written, only the metadata structure is created. + # """ + # return write_mdio_metadata(self.build(), store, mode, compute, **kwargs) + +# def write_mdio_metadata( +# mdio_ds: Dataset, +# store: str, +# mode: str = "w", +# compute: bool = False, +# **kwargs: Mapping[str, str | int | float | bool], +# ) -> mdio.Dataset: +# """Write MDIO metadata to a Zarr store and return the constructed mdio.Dataset. + +# This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata +# to a Zarr store. The actual data is not written, only the metadata structure is created. + +# Args: +# mdio_ds: The MDIO dataset to serialize +# store: Path to the Zarr or .mdio store +# mode: Write mode to pass to to_mdio(), e.g. 'w' or 'a' +# compute: Whether to compute (write) array chunks (True) or only metadata (False) +# **kwargs: Additional arguments to pass to to_mdio() + +# Returns: +# The constructed xarray Dataset with MDIO extensions +# """ +# ds = _construct_mdio_dataset(mdio_ds) + +# def _generate_encodings() -> dict: +# """Generate encodings for each variable in the MDIO dataset. + +# Returns: +# Dictionary mapping variable names to their encoding configurations. 
+#         """
+#         # TODO(Anybody, #10274): Re-enable chunk_key_encoding when supported by xarray
+#         # dimension_separator_encoding = V2ChunkKeyEncoding(separator="/").to_dict()
+
+#         # Collect dimension sizes (same approach as _construct_mdio_dataset)
+#         dims: dict[str, int] = {}
+#         for var in mdio_ds.variables:
+#             for d in var.dimensions:
+#                 if isinstance(d, NamedDimension):
+#                     dims[d.name] = d.size
+
+#         global_encodings = {}
+#         for var in mdio_ds.variables:
+#             fill_value = 0
+#             if isinstance(var.data_type, StructuredType):
+#                 continue
+#             chunks = None
+#             if var.metadata is not None and var.metadata.chunk_grid is not None:
+#                 chunks = var.metadata.chunk_grid.configuration.chunk_shape
+#             else:
+#                 # When no chunk_grid is provided, set chunks to shape to avoid chunking
+#                 dim_names = [d.name if isinstance(d, NamedDimension) else d for d in var.dimensions]
+#                 chunks = tuple(dims[name] for name in dim_names)
+#             global_encodings[var.name] = {
+#                 "chunks": chunks,
+#                 # TODO(Anybody, #10274): Re-enable chunk_key_encoding when supported by xarray
+#                 # "chunk_key_encoding": dimension_separator_encoding,
+#                 "_FillValue": fill_value,
+#                 "dtype": var.data_type,
+#                 "compressors": _convert_compressor(var.compressor),
+#             }
+#         return global_encodings
+
+#     ds.to_mdio(
+#         store,
+#         mode=mode,
+#         zarr_format=2,
+#         consolidated=True,
+#         safe_chunks=False,
+#         compute=compute,
+#         encoding=_generate_encodings(),
+#         **kwargs,
+#     )
+#     return ds
\ No newline at end of file
diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py
index d3df78af..fdaef83d 100644
--- a/tests/unit/v1/test_dataset_builder_helpers.py
+++ b/tests/unit/v1/test_dataset_builder_helpers.py
@@ -214,6 +214,14 @@ class SomeModel(StrictModel):
     assert isinstance(result, dict)
     assert result == {"count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]}
 
+    d = {
+        "count": 3,
+        "samples": [1.0, 2.0, 3.0],
+        "created": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)}
+    result = _to_dictionary(d)
+    assert isinstance(result, dict)
+    assert result == d  # a plain dictionary passes through unchanged
+

From 46f82f01b9f0491d763cded3da1dca289406286a Mon Sep 17 00:00:00 2001
From: Dmitriy Repin
Date: Tue, 1 Jul 2025 16:23:56 +0000
Subject: [PATCH 05/27] Generalize _to_dictionary()

---
 .devcontainer/devcontainer.json               |   3 +-
 src/mdio/schemas/v1/dataset_builder.py        | 105 +++++-------------
 tests/unit/v1/test_dataset_builder_build.py   |  53 +++++++++
 tests/unit/v1/test_dataset_builder_helpers.py | 106 +++++++++---------
 4 files changed, 135 insertions(+), 132 deletions(-)
 create mode 100644 tests/unit/v1/test_dataset_builder_build.py

diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index ea5dc99e..dc80446c 100755
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -29,7 +29,8 @@
         "vscode-icons-team.vscode-icons",
         "wayou.vscode-todo-highlight",
         "streetsidesoftware.code-spell-checker",
-        "eamodio.gitlens"
+        "eamodio.gitlens",
+        "visualstudioexptteam.vscodeintellicode"
       ]
     }
   },
diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py
index 7db49947..8b885b28 100644
--- a/src/mdio/schemas/v1/dataset_builder.py
+++ b/src/mdio/schemas/v1/dataset_builder.py
@@ -28,14 +28,17 @@
 from mdio.schemas.v1.dataset import Dataset, DatasetInfo
 from mdio.schemas.v1.dataset import 
DatasetMetadata -CoordinateMetadataList: TypeAlias = list[AllUnits | - UserAttributes] +AnyMetadataList : TypeAlias = list[AllUnits | + UserAttributes | + ChunkGridMetadata | + StatisticsMetadata | + DatasetInfo] +CoordinateMetadataList: TypeAlias = list[AllUnits | UserAttributes] VariableMetadataList: TypeAlias = list[AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata] -DatasetMetadataList: TypeAlias = list[DatasetInfo | - UserAttributes ] +DatasetMetadataList: TypeAlias = list[DatasetInfo | UserAttributes] class _BuilderState(Enum): """States for the template builder.""" @@ -91,77 +94,24 @@ def get_dimension_names(dimensions: list[NamedDimension | str]) -> list[str]: return names -def _to_dictionary(val: BaseModel | dict[str, Any]) -> dict[str, Any]: - """Convert a dictionary or pydantic BaseModel to a dictionary.""" +def _to_dictionary(val: BaseModel | dict[str, Any] | AnyMetadataList)-> dict[str, Any]: + """Convert a dictionary, list or pydantic BaseModel to a dictionary.""" + if val is None: + return None if isinstance(val, BaseModel): return val.model_dump(mode="json", by_alias=True) if isinstance(val, dict): return val - msg = f"Expected BaseModel, got {type(val).__name__}" + if isinstance(val, list): + metadata_dict = {} + for md in val: + if md is None: + continue + metadata_dict.update(_to_dictionary(md)) + return metadata_dict + msg = f"Expected BaseModel, dict or list, got {type(val).__name__}" raise TypeError(msg) - -def _make_coordinate_metadata(metadata: CoordinateMetadataList | None) -> CoordinateMetadata | None: - if metadata is None or not metadata: - return None - - metadata_dict = {} - for md in metadata: - # NOTE: the pydantic attribute names are different from the v1 schema attributes names - # 'unitsV1' <-> 'units_v1' - if isinstance(md, AllUnits): - metadata_dict["unitsV1"] = _to_dictionary(md.units_v1) - elif isinstance(md, UserAttributes): - # NOTE: md.attributes is not pydantic type, but a dictionary - metadata_dict["attributes"] = _to_dictionary(md.attributes) - else: - msg = f"Unsupported metadata type: {type(md)}" - raise TypeError(msg) - return CoordinateMetadata(**metadata_dict) - - -def _make_variable_metadata(metadata: VariableMetadataList | None) -> VariableMetadata | None: - if metadata is None or not metadata: - return None - - metadata_dict = {} - for md in metadata: - # NOTE: the pydantic attribute names are different from the v1 schema attributes names - # 'statsV1' <-> 'stats_v1', 'unitsV1' <-> 'units_v1', 'chunkGrid' <-> 'chunk_grid' - if isinstance(md, AllUnits): - metadata_dict["unitsV1"] = _to_dictionary(md.units_v1) - elif isinstance(md, UserAttributes): - # NOTE: md.attributes is not pydantic type, but a dictionary - metadata_dict["attributes"] = _to_dictionary(md.attributes) - elif isinstance(md, ChunkGridMetadata): - metadata_dict["chunkGrid"] = _to_dictionary(md.chunk_grid) - elif isinstance(md, StatisticsMetadata): - metadata_dict["statsV1"] = _to_dictionary(md.stats_v1) - else: - msg = f"Unsupported metadata type: {type(md)}" - raise TypeError(msg) - return VariableMetadata(**metadata_dict) - -def _make_datasetinfo_metadata(metadata: DatasetInfo | None) -> DatasetMetadata | None: - if metadata is None or not metadata: - return None - - metadata_dict = {} - for md in metadata: - # NOTE: the pydantic attribute names are different from the v1 schema attributes names - # 'apiVersion' <-> 'api_version', 'created_on' <-> 'createdOn' - if isinstance(md, DatasetInfo): - metadata_dict["name"] = md.name - metadata_dict["apiVersion"] 
= _to_dictionary(md.api_version) - metadata_dict["createdOn"] = _to_dictionary(md.created_on) - elif isinstance(md, UserAttributes): - # NOTE: md.attributes is not pydantic type, but a dictionary - metadata_dict["attributes"] = _to_dictionary(md.attributes) - else: - msg = f"Unsupported metadata type: {type(md)}" - raise TypeError(msg) - return DatasetMetadata(**metadata_dict) - class MDIODatasetBuilder: """Builder for creating MDIO datasets with enforced build order. @@ -175,11 +125,13 @@ class MDIODatasetBuilder: def __init__(self, name: str, attributes: UserAttributes | None = None): - # TODO(BrianMichell, #0): Pull from package metadata - self._info = DatasetInfo( + info = DatasetInfo( name=name, api_version="1.0.0", - created_on= datetime.now(UTC)), + created_on=datetime.now(UTC) + ) + # TODO(BrianMichell, #0): Pull from package metadata + self._info = info self._attributes = attributes self._dimensions: list[NamedDimension] = [] self._coordinates: list[Coordinate] = [] @@ -253,7 +205,7 @@ def add_dimension( # noqa: PLR0913 dataType=var_data_type, compressor=None, coordinates=None, - metadata=_make_variable_metadata(var_metadata_info), + metadata=_to_dictionary(var_metadata_info), ) self._variables.append(dim_var) @@ -293,7 +245,7 @@ def add_coordinate( # noqa: PLR0913 # We ass names: sts, not list[NamedDimension | str] dimensions=dim_names, dataType=data_type, - metadata=_make_coordinate_metadata(metadata_info), + metadata=_to_dictionary(metadata_info), ) ) self._state = _BuilderState.HAS_COORDINATES @@ -335,7 +287,7 @@ def add_variable( # noqa: PLR0913 data_type=data_type, compressor=compressor, coordinates=coordinates, - metadata=_make_variable_metadata(metadata_info), + metadata=_to_dictionary(metadata_info), ) ) self._state = _BuilderState.HAS_VARIABLES @@ -349,8 +301,7 @@ def build(self) -> Dataset: return Dataset( variables=self._variables, - metadata=DatasetMetadata(self._info, - self._attributes) + metadata=_to_dictionary([self._info, self._attributes]) ) diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py new file mode 100644 index 00000000..ab2d64af --- /dev/null +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -0,0 +1,53 @@ +# ruff: noqa: PLR2004 +# PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable +# The above erroneous warning is generated for every numerical assert. +# Thus, disable it for this file +"""Tests the schema v1 dataset_builder.add_coordinate() public API.""" + +from datetime import UTC, datetime +import pytest + +from mdio.schemas.dtype import ScalarType +from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset import DatasetInfo, DatasetMetadata +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _BuilderState +from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.units import LengthUnitEnum +from mdio.schemas.v1.units import LengthUnitModel +from mdio.schemas.v1.variable import VariableMetadata + +def test_build() -> None: + """Test adding coordinates. Check the state transition and validate required parameters.""" + builder = MDIODatasetBuilder("test_dataset") + assert builder._state == _BuilderState.INITIAL + + builder.add_dimension("x", 100) + builder.add_dimension("y", 100) + + builder.build() + +def test_play() -> None: + """Test adding coordinates. 
Check the state transition and validate required parameters.""" + # builder = MDIODatasetBuilder("test_dataset") + + u = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) + + u_dict = u.model_dump(mode="json", by_alias=True) + vm = VariableMetadata(**u_dict) + vm_dict = vm.dict() + vm_json = vm.model_dump(mode="json", by_alias=True) + +# vm = VariableMetadata(u) + +# dInfo = DatasetInfo( +# name="My Dataset", +# api_version="1.0.0", +# created_on= datetime.now(UTC)), + +# d = dict(dInfo.model_dump(mode="json", by_alias=True)) + +# meta = DatasetMetadata(d) +# print(dInfo.dict()) + +# i = 0 \ No newline at end of file diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py index fdaef83d..e5078a5e 100644 --- a/tests/unit/v1/test_dataset_builder_helpers.py +++ b/tests/unit/v1/test_dataset_builder_helpers.py @@ -16,8 +16,6 @@ from mdio.schemas.dimension import NamedDimension from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder -from mdio.schemas.v1.dataset_builder import _make_coordinate_metadata -from mdio.schemas.v1.dataset_builder import _make_variable_metadata from mdio.schemas.v1.dataset_builder import _to_dictionary from mdio.schemas.v1.dataset_builder import contains_dimension from mdio.schemas.v1.dataset_builder import get_dimension @@ -196,7 +194,7 @@ def test__add_dimensions_if_needed_when_one_already_exists() -> None: def test__to_dictionary() -> None: """Test converting a BaseModel to a dictionary.""" - with pytest.raises(TypeError, match="Expected BaseModel, got datetime"): + with pytest.raises(TypeError, match="Expected BaseModel, dict or list, got datetime"): # This should raise an error because datetime is not a BaseModel _to_dictionary(datetime.now(UTC)) @@ -223,54 +221,54 @@ class SomeModel(StrictModel): assert result == {"count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]} -def test__make_coordinate_metadata() -> None: - """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" - units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) - attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) - meta_list = [units, attrs] - - # Assume that multiple attributes are allowed - metadata = _make_coordinate_metadata(meta_list) - assert isinstance(metadata, CoordinateMetadata) - assert metadata.units_v1.length == "ft" - assert metadata.attributes["MGA"] == 51 - assert metadata.attributes["UnitSystem"] == "Imperial" - - meta_list = ["ft"] - with pytest.raises(TypeError, match="Unsupported metadata type: "): - _make_variable_metadata(meta_list) - -def test__make_variable_metadata() -> None: - """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" - units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) - attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) - chgrd = ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[20]))) - stats = StatisticsMetadata( - stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]))) - metadata_info = [units, attrs, chgrd, stats] - metadata = _make_variable_metadata(metadata_info) - assert isinstance(metadata, VariableMetadata) - assert metadata.units_v1.length == "ft" - assert metadata.attributes["MGA"] == 51 - 
assert metadata.attributes["UnitSystem"] == "Imperial" - assert metadata.chunk_grid.name == "regular" - assert metadata.chunk_grid.configuration.chunk_shape == [20] - assert metadata.stats_v1.count == 100 - assert metadata.stats_v1.sum == 1215.1 - assert metadata.stats_v1.sum_squares == 125.12 - assert metadata.stats_v1.min == 5.61 - assert metadata.stats_v1.max == 10.84 - assert metadata.stats_v1.histogram.bin_centers == [1, 2] - assert metadata.stats_v1.histogram.counts == [10, 15] - - meta_list = ["ft"] - with pytest.raises(TypeError, match="Unsupported metadata type: "): - _make_variable_metadata(meta_list) +# def test__make_coordinate_metadata() -> None: +# """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" +# units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) +# attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) +# meta_list = [units, attrs] + +# # Assume that multiple attributes are allowed +# metadata = _make_coordinate_metadata(meta_list) +# assert isinstance(metadata, CoordinateMetadata) +# assert metadata.units_v1.length == "ft" +# assert metadata.attributes["MGA"] == 51 +# assert metadata.attributes["UnitSystem"] == "Imperial" + +# meta_list = ["ft"] +# with pytest.raises(TypeError, match="Expected BaseModel, dict or list, got str"): +# _make_variable_metadata(meta_list) + +# def test__make_variable_metadata() -> None: +# """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" +# units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) +# attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) +# chgrd = ChunkGridMetadata( +# chunk_grid=RegularChunkGrid( +# configuration=RegularChunkShape(chunk_shape=[20]))) +# stats = StatisticsMetadata( +# stats_v1=SummaryStatistics( +# count=100, +# sum=1215.1, +# sumSquares=125.12, +# min=5.61, +# max=10.84, +# histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]))) +# metadata_info = [units, attrs, chgrd, stats] +# metadata = _make_variable_metadata(metadata_info) +# assert isinstance(metadata, VariableMetadata) +# assert metadata.units_v1.length == "ft" +# assert metadata.attributes["MGA"] == 51 +# assert metadata.attributes["UnitSystem"] == "Imperial" +# assert metadata.chunk_grid.name == "regular" +# assert metadata.chunk_grid.configuration.chunk_shape == [20] +# assert metadata.stats_v1.count == 100 +# assert metadata.stats_v1.sum == 1215.1 +# assert metadata.stats_v1.sum_squares == 125.12 +# assert metadata.stats_v1.min == 5.61 +# assert metadata.stats_v1.max == 10.84 +# assert metadata.stats_v1.histogram.bin_centers == [1, 2] +# assert metadata.stats_v1.histogram.counts == [10, 15] + +# meta_list = ["ft"] +# with pytest.raises(TypeError, match="Expected BaseModel, dict or list, got str"): +# _make_variable_metadata(meta_list) From 0dc7cc86856dbdc234760cc36b898a2cb6e9a9e2 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 1 Jul 2025 23:37:52 +0000 Subject: [PATCH 06/27] build --- src/mdio/schemas/v1/dataset_builder.py | 78 ++++-- .../v1/test_dataset_builder_add_coordinate.py | 24 +- .../v1/test_dataset_builder_add_dimension.py | 5 + .../v1/test_dataset_builder_add_variable.py | 4 +- tests/unit/v1/test_dataset_builder_build.py | 258 ++++++++++++++++-- tests/unit/v1/test_dataset_builder_helpers.py | 2 +- 6 files changed, 309 insertions(+), 62 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 
8b885b28..8b169878 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -28,17 +28,19 @@ from mdio.schemas.v1.dataset import Dataset, DatasetInfo from mdio.schemas.v1.dataset import DatasetMetadata -AnyMetadataList : TypeAlias = list[AllUnits | - UserAttributes | - ChunkGridMetadata | - StatisticsMetadata | - DatasetInfo] +AnyMetadataList: TypeAlias = list[AllUnits | + UserAttributes | + ChunkGridMetadata | + StatisticsMetadata | + DatasetInfo] CoordinateMetadataList: TypeAlias = list[AllUnits | UserAttributes] VariableMetadataList: TypeAlias = list[AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata] DatasetMetadataList: TypeAlias = list[DatasetInfo | UserAttributes] + + class _BuilderState(Enum): """States for the template builder.""" @@ -94,7 +96,7 @@ def get_dimension_names(dimensions: list[NamedDimension | str]) -> list[str]: return names -def _to_dictionary(val: BaseModel | dict[str, Any] | AnyMetadataList)-> dict[str, Any]: +def _to_dictionary(val: BaseModel | dict[str, Any] | AnyMetadataList) -> dict[str, Any]: """Convert a dictionary, list or pydantic BaseModel to a dictionary.""" if val is None: return None @@ -112,6 +114,7 @@ def _to_dictionary(val: BaseModel | dict[str, Any] | AnyMetadataList)-> dict[str msg = f"Expected BaseModel, dict or list, got {type(val).__name__}" raise TypeError(msg) + class MDIODatasetBuilder: """Builder for creating MDIO datasets with enforced build order. @@ -194,18 +197,21 @@ def add_dimension( # noqa: PLR0913 msg = "Adding dimension with the same name twice is not allowed" raise ValueError(msg) - added_dims = self._add_dimensions_if_needed( - [NamedDimension(name=name, size=size)]) + added_dims = self._add_dimensions_if_needed([NamedDimension(name=name, size=size)]) if added_dims: + meta_dict = _to_dictionary(var_metadata_info) # Create a variable for the dimension dim_var = Variable( name=name, longName=var_long_name, + # IMPORTANT: we always use NamedDimension here, not the dimension name + # Since the Dataset does not have a dimension list, we need to preserve NamedDimension + # somewhere. 
Namely, in the variable created for the dimension dimensions=added_dims, dataType=var_data_type, compressor=None, coordinates=None, - metadata=_to_dictionary(var_metadata_info), + metadata=meta_dict, ) self._variables.append(dim_var) @@ -217,6 +223,7 @@ def add_coordinate( # noqa: PLR0913 name: str, *, long_name: str = None, + #TODO Only allow adding dimensions by name, not by NamedDimension object dimensions: list[NamedDimension | str], data_type: ScalarType | StructuredType = ScalarType.FLOAT32, metadata_info: CoordinateMetadataList | None = None, ) -> "MDIODatasetBuilder": """Add a coordinate after adding at least one dimension.""" @@ -238,16 +245,33 @@ self._add_dimensions_if_needed(dimensions) dim_names = get_dimension_names(dimensions) - self._coordinates.append( - Coordinate( - name=name, - longName=long_name, - # We ass names: sts, not list[NamedDimension | str] - dimensions=dim_names, - dataType=data_type, - metadata=_to_dictionary(metadata_info), - ) + meta_dict = _to_dictionary(metadata_info) + coord = Coordinate( + name=name, + longName=long_name, + # We pass names: strs, not list[NamedDimension | str] + dimensions=dim_names, + dataType=data_type, + metadata=meta_dict + ) + self._coordinates.append(coord) + + # Add coordinate as variables to the dataset + var_meta_dict = _to_dictionary(coord.metadata) + coord_var = Variable( + name=coord.name, + longName=coord.long_name, + dimensions=coord.dimensions, + dataType=coord.data_type, + compressor=None, + # IMPORTANT: we always use Coordinate here, not the coordinate name + # Since the Dataset does not have a coordinate list, we need to preserve Coordinate + # somewhere. Namely, in the variable created for the coordinate + coordinates=[coord], + metadata=var_meta_dict ) + self._variables.append(coord_var) + self._state = _BuilderState.HAS_COORDINATES return self @@ -256,9 +280,11 @@ def add_variable( # noqa: PLR0913 name: str, *, long_name: str = None, + #TODO Only allow adding dimensions by name, not by NamedDimension object dimensions: list[NamedDimension | str], data_type: ScalarType | StructuredType = ScalarType.FLOAT32, compressor: Blosc | ZFP | None = None, + #TODO Only allow adding coordinates by name, not by Coordinate object coordinates: list[Coordinate | str] | None = None, metadata_info: VariableMetadataList | None = None, ) -> "MDIODatasetBuilder": """Add a variable after adding at least one dimension.""" @@ -279,6 +305,7 @@ self._add_dimensions_if_needed(dimensions) dim_names = get_dimension_names(dimensions) + meta_dict = _to_dictionary(metadata_info) self._variables.append( Variable( name=name, @@ -287,7 +314,7 @@ data_type=data_type, compressor=compressor, coordinates=coordinates, - metadata=_to_dictionary(metadata_info), + metadata=meta_dict, ) ) self._state = _BuilderState.HAS_VARIABLES @@ -299,11 +326,10 @@ msg = "Must add at least one dimension before building" raise ValueError(msg) - return Dataset( - variables=self._variables, - metadata=_to_dictionary([self._info, self._attributes]) - ) + var_meta_dict = _to_dictionary([self._info, self._attributes]) + dataset = Dataset(variables=self._variables, metadata=var_meta_dict) + return dataset # def to_mdio( # self, # store: str, # mode: str = "w", # compute: bool = False, # **kwargs: Mapping[str, str | int | float | bool], # ) -> Dataset: # """Write the dataset to a Zarr store and return the constructed mdio.Dataset. # This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata # to a Zarr store. The actual data is not written, only the metadata structure is created. # """ # return write_mdio_metadata(self.build(), store, mode, compute, **kwargs) # def write_mdio_metadata( # mdio_ds: Dataset, # store: str, # mode: str = "w", # compute: bool = False, # **kwargs: Mapping[str, str | int | float | bool], # ) -> mdio.Dataset: # """Write MDIO metadata to a Zarr store and return the constructed mdio.Dataset. # This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata # to a Zarr store. The actual data is not written, only the metadata structure is created. # Args: # mdio_ds: The MDIO dataset to serialize # store: Path to the Zarr or .mdio store # mode: Write mode to pass to to_mdio(), e.g. 'w' or 'a' # compute: Whether to compute (write) array chunks (True) or only metadata (False) # **kwargs: Additional arguments to pass to to_mdio() # Returns: # The constructed xarray Dataset with MDIO extensions # """ # ds = _construct_mdio_dataset(mdio_ds) # def _generate_encodings() -> dict: # """Generate encodings for each variable in the MDIO dataset. # Returns: # Dictionary mapping variable names to their encoding configurations. @@ -351,14 +377,14 @@ # """ # # TODO(Anybody, #10274): Re-enable chunk_key_encoding when supported by xarray # # dimension_separator_encoding = V2ChunkKeyEncoding(separator="/").to_dict() - + # # Collect dimension sizes (same approach as _construct_mdio_dataset) # dims: dict[str, int] = {} # for var in mdio_ds.variables: # for d in 
var.dimensions: # if isinstance(d, NamedDimension): # dims[d.name] = d.size - + # global_encodings = {} # for var in mdio_ds.variables: # fill_value = 0 @@ -391,4 +417,4 @@ def build(self) -> Dataset: # encoding=_generate_encodings(), # **kwargs, # ) -# return ds \ No newline at end of file +# return ds diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index 659fedac..e6fa4bcf 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -40,8 +40,16 @@ def test_add_coordinate() -> None: builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) assert builder._state == _BuilderState.HAS_COORDINATES assert len(builder._dimensions) == 2 - assert len(builder._variables) == 2 - assert len(builder._coordinates) == 1 + # 2 variables for dimensions, 1 variable for coordinates + assert len(builder._variables) == 3 + assert len(builder._coordinates) == 1 + + # Validate that we created a coordinate variable + var_cdp = next(e for e in builder._variables if e.name == "cdp") + assert var_cdp is not None + assert len(var_cdp.dimensions) == 2 + assert len(var_cdp.coordinates) == 1 + assert next(e for e in var_cdp.coordinates if e.name == "cdp") is not None # Adding coordinate with the same name twice msg="Adding coordinate with the same name twice is not allowed" @@ -56,7 +64,8 @@ def test_add_coordinate_with_defaults() -> None: # Add coordinate using defaults builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) assert len(builder._dimensions) == 2 - assert len(builder._variables) == 2 + # 2 variables for dimensions, 1 variable for coordinates + assert len(builder._variables) == 3 assert len(builder._coordinates) == 1 crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) assert crd0 is not None @@ -78,7 +87,8 @@ def test_coordinate_with_units() -> None: metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT))] ) assert len(builder._dimensions) == 2 - assert len(builder._variables) == 2 + # 2 variables for dimensions, 1 variable for coordinates + assert len(builder._variables) == 3 assert len(builder._coordinates) == 1 crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) assert crd0 is not None @@ -102,7 +112,8 @@ def test_coordinate_with_attributes() -> None: metadata_info=[UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})], ) assert len(builder._dimensions) == 2 - assert len(builder._variables) == 2 + # 2 variables for dimensions, 1 variable for coordinates + assert len(builder._variables) == 3 assert len(builder._coordinates) == 1 # NOTE: add_coordinate() stores dimensions as names crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) @@ -129,7 +140,8 @@ def test_coordinate_with_full_metadata() -> None: UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})] ) assert len(builder._dimensions) == 2 - assert len(builder._variables) == 2 + # 2 variables for dimensions, 1 variable for coordinates + assert len(builder._variables) == 3 assert len(builder._coordinates) == 1 # NOTE: add_coordinate() stores dimensions as names crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index 6ff1f259..a6a12e85 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ 
b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -40,6 +40,11 @@ def test_add_dimension() -> None: assert builder._dimensions[0] == NamedDimension(name="x", size=100) assert len(builder._variables) == 1 + # Validate that we created a dimension variable properly + var_x = next(e for e in builder._variables if e.name == "x") + assert var_x is not None + assert len(var_x.dimensions) == 1 + # Adding dimension with the same name twice msg="Adding dimension with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 7c0b46ba..1181e10d 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -112,7 +112,9 @@ def test_add_variable_full_parameters() -> None: ]) assert len(builder._dimensions) == 3 assert len(builder._coordinates) == 2 - assert len(builder._variables) == 4 + # We expect 6 variables: + # 3 variables for dimensions, 2 variables for coordinates, and 1 variable for seismic_amplitude + assert len(builder._variables) == 6 v = next((v for v in builder._variables if v.name == "seismic_amplitude"), None) assert v is not None assert v.long_name == "Amplitude (dimensionless)" diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index ab2d64af..64dfead0 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -5,18 +5,26 @@ """Tests the schema v1 dataset_builder.add_coordinate() public API.""" from datetime import UTC, datetime +import json +import os +from pathlib import Path import pytest -from mdio.schemas.dtype import ScalarType -from mdio.schemas.metadata import UserAttributes -from mdio.schemas.v1.dataset import DatasetInfo, DatasetMetadata +from mdio.schemas import builder +from mdio.schemas.chunk_grid import RegularChunkGrid, RegularChunkShape +from mdio.schemas.compressors import Blosc +from mdio.schemas.dtype import ScalarType, StructuredType +from mdio.schemas.metadata import ChunkGridMetadata, UserAttributes +from mdio.schemas.v1.dataset import Dataset, DatasetInfo, DatasetMetadata from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_builder import _BuilderState -from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.stats import CenteredBinHistogram, StatisticsMetadata, SummaryStatistics +from mdio.schemas.v1.units import AllUnits, SpeedUnitEnum, SpeedUnitModel from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel from mdio.schemas.v1.variable import VariableMetadata + def test_build() -> None: """Test adding coordinates. Check the state transition and validate required parameters.""" builder = MDIODatasetBuilder("test_dataset") @@ -27,27 +35,221 @@ def test_build() -> None: builder.build() -def test_play() -> None: - """Test adding coordinates. 
Check the state transition and validate required parameters.""" - # builder = MDIODatasetBuilder("test_dataset") - - u = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) - - u_dict = u.model_dump(mode="json", by_alias=True) - vm = VariableMetadata(**u_dict) - vm_dict = vm.dict() - vm_json = vm.model_dump(mode="json", by_alias=True) - -# vm = VariableMetadata(u) - -# dInfo = DatasetInfo( -# name="My Dataset", -# api_version="1.0.0", -# created_on= datetime.now(UTC)), - -# d = dict(dInfo.model_dump(mode="json", by_alias=True)) - -# meta = DatasetMetadata(d) -# print(dInfo.dict()) - -# i = 0 \ No newline at end of file + +def test_build_dataset() -> None: + """Test building a complete dataset.""" + dataset = ( + MDIODatasetBuilder("test_dataset") + .add_dimension("x", 100) + .add_dimension("y", 200) + .add_coordinate("x_coord", dimensions=["x"]) + .add_coordinate("y_coord", dimensions=["y"]) + .add_variable("data", dimensions=["x", "y"], long_name="Test Data") + .build() + ) + assert isinstance(dataset, Dataset) + assert dataset.metadata.name == "test_dataset" + # 2 dimension variables + 1 data variable + 2 coordinate variables + assert len(dataset.variables) == 5 + assert next(v for v in dataset.variables if v.name == "x") is not None + assert next(v for v in dataset.variables if v.name == "y") is not None + var_data = next(v for v in dataset.variables if v.name == "data") + assert var_data is not None + assert var_data.long_name == "Test Data" + assert len(var_data.dimensions) == 2 + +# def test_build_dataset_bad_example() -> None: +# builder = MDIODatasetBuilder("Bad example") +# builder.add_dimension("inline", 256) +# builder.add_dimension("crossline", 512) +# builder.add_dimension("depth", 384) +# builder.add_variable(name="image", dimensions=["inline", "crossline", "depth"]) +# dataset = builder.build() + +# json_str = dataset.model_dump_json() +# file_path = os.path.join(os.path.dirname(__file__), "bad_example.json") +# with open(file_path, 'w') as f: +# f.write(json_str) + +def test_build_campos_3d(tmp_path: Path) -> None: + """Test building a toy dataset with multiple variables and attributes.""" + dataset = make_campos_3d_dataset() + + # Verify dataset structure + assert dataset.metadata.name == "campos_3d" + assert dataset.metadata.api_version == "1.0.0" + assert dataset.metadata.attributes["foo"] == "bar" + assert len(dataset.metadata.attributes["textHeader"]) == 3 + + # Verify variables (including dimension variables) + # 3 dimension variables + 4 data variables + 2 coordinate variables + assert len(dataset.variables) == 9 # noqa: PLR2004 + + # Verify dimension variables + inline_var = next(v for v in dataset.variables if v.name == "inline") + assert inline_var.data_type == ScalarType.UINT32 + assert len(inline_var.dimensions) == 1 + assert inline_var.dimensions[0].name == "inline" + + depth_var = next(v for v in dataset.variables if v.name == "depth") + assert depth_var.data_type == ScalarType.UINT32 + assert depth_var.metadata.units_v1.length == "m" + + # Verify image variable + image = next(v for v in dataset.variables if v.name == "image") + assert image.data_type == ScalarType.FLOAT32 + assert isinstance(image.compressor, Blosc) + assert image.compressor.algorithm == "zstd" + assert image.metadata.stats_v1.count == 100 + + # Verify velocity variable + velocity = next(v for v in dataset.variables if v.name == "velocity") + assert velocity.data_type == ScalarType.FLOAT16 + assert velocity.compressor is None + assert velocity.metadata.units_v1.speed == "m/s" + + # 
Verify image_inline variable + image_inline = next( + v for v in dataset.variables if v.name == "image_inline") + assert image_inline.long_name == "inline optimized version of 3d_stack" + assert isinstance(image_inline.compressor, Blosc) + assert image_inline.compressor.algorithm == "zstd" + + # Verify image_headers variable + headers = next(v for v in dataset.variables if v.name == "image_headers") + assert isinstance(headers.data_type, StructuredType) + assert len(headers.data_type.fields) == 4 + assert headers.data_type.fields[0].name == "cdp-x" + +# def test_build_campos_3d_contract(tmp_path: Path) -> None: +# '''Test building campos_3d dataset and converting to JSON schema format.''' +# dataset = make_campos_3d_dataset() +# # json_str = dataset.model_dump_json() +# # json_sorted_str = json.dumps(json.loads(json_str), sort_keys=True) +# m_dict = dataset.model_dump(mode="json", by_alias=True) +# json_sorted_str = json.dumps(m_dict, sort_keys=True) + +# sorted_contract = load_campos_3d_contract() + +# save(json_sorted_str, sorted_contract) + +# def save(model: str, contract: str) -> None: +# model_file_path = os.path.join(os.path.dirname(__file__), "campos_3d_model.json") +# contract_file_path = os.path.join(os.path.dirname(__file__), "campos_3d_contract.json") +# with open(model_file_path, 'w') as f: +# f.write(model) +# with open(contract_file_path, 'w') as f: +# f.write(contract) + +# def load_campos_3d_contract(): +# file_path = os.path.join(os.path.dirname(__file__), 'test_data/campos_3d_contract.json') +# with open(file_path, 'r') as f: +# contract = json.load(f) +# assert contract is not None +# return json.dumps(contract, sort_keys=True) + +def make_campos_3d_dataset() -> Dataset: + """Create in-memory campos_3d dataset.""" + + ds = MDIODatasetBuilder( + "campos_3d", + attributes=UserAttributes(attributes={ + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... 
", + ], + "foo": "bar" + })) + + # Add dimensions + ds.add_dimension("inline", 256, var_data_type=ScalarType.UINT32) + ds.add_dimension("crossline", 512, var_data_type=ScalarType.UINT32) + ds.add_dimension("depth", 384, var_data_type=ScalarType.UINT32, + var_metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] + ) + # Add coordinates + ds.add_coordinate( + "cdp-x", + dimensions=["inline", "crossline"], + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] + ) + ds.add_coordinate( + "cdp-y", + dimensions=["inline", "crossline"], + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] + # metadata={"unitsV1": {"length": "m"}}, + ) + # Add image variable + ds.add_variable( + name="image", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + ), + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram( + binCenters=[1, 2], counts=[10, 15]), + ) + ), + UserAttributes( + attributes={"fizz": "buzz", "UnitSystem": "Canonical"}), + ]) + # Add velocity variable + ds.add_variable( + name="velocity", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT16, + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + ), + AllUnits(units_v1=SpeedUnitModel( + speed=SpeedUnitEnum.METER_PER_SECOND)), + ], + ) + # Add inline-optimized image variable + ds.add_variable( + name="image_inline", + long_name="inline optimized version of 3d_stack", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[4, 512, 512])) + )] + ) + # Add headers variable with structured dtype + ds.add_variable( + name="image_headers", + dimensions=["inline", "crossline"], + data_type=StructuredType( + fields=[ + {"name": "cdp-x", "format": ScalarType.INT32}, + {"name": "cdp-y", "format": ScalarType.INT32}, + {"name": "elevation", "format": ScalarType.FLOAT16}, + {"name": "some_scalar", "format": ScalarType.FLOAT16}, + ] + ), + coordinates=["inline", "crossline", "cdp-x", "cdp-y"], + ) + return ds.build() diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py index e5078a5e..e6bfda71 100644 --- a/tests/unit/v1/test_dataset_builder_helpers.py +++ b/tests/unit/v1/test_dataset_builder_helpers.py @@ -141,7 +141,7 @@ def test_add_dimensions_if_needed() -> None: # Assert that the number of dimensions has not increased assert len(builder._dimensions) == 3 -def test__add_dimensions_if_needed_when_one_already_exists() -> None: +def test__add_dimensions_if_one_already_exists() -> None: """Test adding existing named dimensions to a dataset.""" builder = MDIODatasetBuilder("Test Dataset Builder") From 79863ac554a4250f86b0712aee810d1eb5e427fb Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 2 Jul 2025 21:49:22 +0000 Subject: [PATCH 07/27] 
Dataset Build - pass one --- src/mdio/schemas/v1/dataset_builder.py | 288 ++++++----------- .../v1/test_dataset_builder_add_coordinate.py | 27 +- .../v1/test_dataset_builder_add_dimension.py | 60 ++-- .../v1/test_dataset_builder_add_variable.py | 29 +- tests/unit/v1/test_dataset_builder_build.py | 143 ++++----- tests/unit/v1/test_dataset_builder_helpers.py | 290 ++++-------------- 6 files changed, 297 insertions(+), 540 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 8b169878..701fdbba 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -7,8 +7,6 @@ from typing import Any from typing import TypeAlias -import xarray as xr - from pydantic import BaseModel from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding # noqa: F401 @@ -19,14 +17,12 @@ from mdio.schemas.dtype import StructuredType from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset import Dataset +from mdio.schemas.v1.dataset import DatasetInfo from mdio.schemas.v1.stats import StatisticsMetadata from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.variable import Coordinate -from mdio.schemas.v1.variable import CoordinateMetadata from mdio.schemas.v1.variable import Variable -from mdio.schemas.v1.variable import VariableMetadata -from mdio.schemas.v1.dataset import Dataset, DatasetInfo -from mdio.schemas.v1.dataset import DatasetMetadata AnyMetadataList: TypeAlias = list[AllUnits | UserAttributes | @@ -50,24 +46,10 @@ class _BuilderState(Enum): HAS_VARIABLES = auto() -def contains_dimension( - dimensions: list[NamedDimension], name_or_dimension: str | NamedDimension -) -> bool: - """Check if a dimension with the given name exists in the list.""" - if isinstance(name_or_dimension, str): - name = name_or_dimension - return get_dimension(dimensions, name) is not None - if isinstance(name_or_dimension, NamedDimension): - dimension = name_or_dimension - return get_dimension(dimensions, dimension.name, dimension.size) is not None - msg = f"Expected str or NamedDimension, got {type(name_or_dimension).__name__}" - raise TypeError(msg) - - -def get_dimension( +def _get_dimension( dimensions: list[NamedDimension], name: str, size: int | None = None ) -> NamedDimension | None: - """Get a dimension by name from the list.""" + """Get a dimension by name and size from the list[NamedDimension] .""" if dimensions is None: return False if not isinstance(name, str): @@ -83,19 +65,6 @@ def get_dimension( return nd -def get_dimension_names(dimensions: list[NamedDimension | str]) -> list[str]: - """Get a dimension by name from the list.""" - names = [] - if dimensions is None: - return names - for dim in dimensions: - if isinstance(dim, NamedDimension): - names.append(dim.name) - elif isinstance(dim, str): - names.append(dim) - return names - - def _to_dictionary(val: BaseModel | dict[str, Any] | AnyMetadataList) -> dict[str, Any]: """Convert a dictionary, list or pydantic BaseModel to a dictionary.""" if val is None: @@ -142,42 +111,18 @@ def __init__(self, name: str, attributes: UserAttributes | None = None): self._state = _BuilderState.INITIAL self._unnamed_variable_counter = 0 - def _add_dimensions_if_needed( - self, dimensions: list[NamedDimension | str] | None - ) -> list[NamedDimension]: - if dimensions is None: - return [] - - added_dims = [] - for dim in dimensions: - if isinstance(dim, str): - if not contains_dimension(self._dimensions, 
dim): - msg = f"Pre-existing dimension named {dim!r} is not found" - raise ValueError(msg) - else: - if not isinstance(dim, NamedDimension): - msg = f"Expected NamedDimension or str, got {type(dim).__name__}" - raise TypeError(msg) - if contains_dimension(self._dimensions, dim): - continue - # Use value instead of a reference - d = NamedDimension(name=dim.name, size=dim.size) - self._dimensions.append(d) - added_dims.append(d) - return added_dims - + def add_dimension( # noqa: PLR0913 self, name: str, size: int, - var_long_name: str = None, var_data_type: ScalarType | StructuredType = ScalarType.INT32, var_metadata_info: VariableMetadataList | None = None, ) -> "MDIODatasetBuilder": """Add a dimension. - This must be called at least once before adding coordinates or variables. - This call will create a variable, if one does not yet exists + This function must be called at least once before adding coordinates or variables. + This call will create a dimension variable, if one does not yet exist Args: name: Name of the dimension @@ -192,28 +137,31 @@ if not name: msg = "'name' must be a non-empty string" raise ValueError(msg) + + # Validate that the dimension is not already defined old_var = next((e for e in self._dimensions if e.name == name), None) if old_var is not None: msg = "Adding dimension with the same name twice is not allowed" raise ValueError(msg) - added_dims = self._add_dimensions_if_needed([NamedDimension(name=name, size=size)]) - if added_dims: - meta_dict = _to_dictionary(var_metadata_info) - # Create a variable for the dimension - dim_var = Variable( - name=name, - longName=var_long_name, - # IMPORTANT: we always use NamedDimension here, not the dimension name - # Since the Dataset does not have a dimension list, we need to preserve NamedDimension - # somewhere. Namely, in the variable created for the dimension - dimensions=added_dims, - dataType=var_data_type, - compressor=None, - coordinates=None, - metadata=meta_dict, - ) - self._variables.append(dim_var) + dim = NamedDimension(name=name, size=size) + self._dimensions.append(dim) + + meta_dict = _to_dictionary(var_metadata_info) + # Create a variable for the dimension + dim_var = Variable( + name=name, + longName=f"'{name}' dimension variable", + # IMPORTANT: we use NamedDimension here, not the dimension name. + # Since the Dataset does not have a dimension list, we need to preserve + # NamedDimension somewhere. Namely, in the variable created for the dimension + dimensions=[dim], + dataType=var_data_type, + compressor=None, + coordinates=None, + metadata=meta_dict, + ) + self._variables.append(dim_var) self._state = _BuilderState.HAS_DIMENSIONS return self @@ -223,12 +171,25 @@ name: str, *, long_name: str = None, - #TODO Only allow adding dimensions by name, not by NamedDimension object - dimensions: list[NamedDimension | str], + dimensions: list[str], data_type: ScalarType | StructuredType = ScalarType.FLOAT32, metadata_info: CoordinateMetadataList | None = None, ) -> "MDIODatasetBuilder": - """Add a coordinate after adding at least one dimension.""" + """Add a coordinate after adding at least one dimension. + + This function must be called after all required dimensions are added via add_dimension(). + This call will create a coordinate variable. 
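+        A minimal usage sketch (names are illustrative; the dimensions must already exist):
+
+            builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"])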
+ + Args: + name: Name of the coordinate + long_name: Optional long name for the coordinate + dimensions: List of dimension names that the coordinate is associated with + data_type: Data type for the coordinate (defaults to FLOAT32) + metadata_info: Optional metadata information for the coordinate + + Returns: + self: Returns self for method chaining + """ if self._state == _BuilderState.INITIAL: msg = "Must add at least one dimension before adding coordinates" raise ValueError(msg) @@ -239,32 +200,38 @@ msg = "'dimensions' must be a non-empty list" raise ValueError(msg) old_var = next((e for e in self._coordinates if e.name == name), None) + + # Validate that the coordinate is not already defined if old_var is not None: msg = "Adding coordinate with the same name twice is not allowed" raise ValueError(msg) + + # Validate that all referenced dimensions are already defined + for dim in dimensions: + if next((d for d in self._dimensions if d.name == dim), None) is None: + msg = f"Pre-existing dimension named {dim!r} is not found" + raise ValueError(msg) - self._add_dimensions_if_needed(dimensions) - dim_names = get_dimension_names(dimensions) meta_dict = _to_dictionary(metadata_info) coord = Coordinate( name=name, longName=long_name, # We pass names: strs, not list[NamedDimension | str] - dimensions=dim_names, + dimensions=dimensions, dataType=data_type, metadata=meta_dict ) self._coordinates.append(coord) - # Add coordinate as variables to the dataset + # Add a coordinate variable to the dataset var_meta_dict = _to_dictionary(coord.metadata) coord_var = Variable( name=coord.name, - longName=coord.long_name, + longName=f"'{coord.name}' coordinate variable", dimensions=coord.dimensions, dataType=coord.data_type, compressor=None, - # IMPORTANT: we always use Coordinate here, not the coordinate name + # IMPORTANT: we always use the Coordinate here, not the coordinate name # Since the Dataset does not have a coordinate list, we need to preserve Coordinate # somewhere. Namely, in the variable created for the coordinate coordinates=[coord], metadata=var_meta_dict ) self._variables.append(coord_var) @@ -280,15 +247,30 @@ def add_variable( # noqa: PLR0913 name: str, *, long_name: str = None, - #TODO Only allow adding dimensions by name, not by NamedDimension object - dimensions: list[NamedDimension | str], + dimensions: list[str], data_type: ScalarType | StructuredType = ScalarType.FLOAT32, compressor: Blosc | ZFP | None = None, - #TODO Only allow adding coordinates by name, not by Coordinate object - coordinates: list[Coordinate | str] | None = None, + coordinates: list[str] | None = None, metadata_info: VariableMetadataList | None = None, ) -> "MDIODatasetBuilder": - """Add a variable after adding at least one dimension.""" + """Add a variable after adding at least one dimension and, optionally, coordinate. + + This function must be called after all required dimensions are added via add_dimension(). + This function must be called after all required coordinates are added via add_coordinate(). + + Args: + name: Name of the variable + long_name: Optional long name for the variable + dimensions: List of dimension names that the variable is associated with + data_type: Data type for the variable (defaults to FLOAT32) + compressor: Compressor used for the variable (defaults to None) + coordinates: List of coordinate names that the variable is associated with + (defaults to None, meaning no coordinates) + metadata_info: Optional metadata information for the variable + + Returns: + self: Returns self for method chaining. 
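+        A minimal usage sketch (names are illustrative; the dimensions and coordinates must already exist):
+
+            builder.add_variable(
+                "amplitude",
+                dimensions=["inline", "crossline", "depth"],
+                coordinates=["cdp-x", "cdp-y"],
+            )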
+ """ if self._state == _BuilderState.INITIAL: msg = "Must add at least one dimension before adding variables" raise ValueError(msg) @@ -298,19 +280,32 @@ def add_variable( # noqa: PLR0913 if dimensions is None or not dimensions: msg = "'dimensions' must be a non-empty list" raise ValueError(msg) + + # Validate that the variable is not already defined old_var = next((e for e in self._variables if e.name == name), None) if old_var is not None: msg = "Adding variable with the same name twice is not allowed" raise ValueError(msg) - self._add_dimensions_if_needed(dimensions) - dim_names = get_dimension_names(dimensions) + # Validate that all referenced dimensions are already defined + for dim in dimensions: + if next((e for e in self._dimensions if e.name == dim), None) is None: + msg = f"Pre-existing dimension named {dim!r} is not found" + raise ValueError(msg) + + # Validate that all referenced coordinates are already defined + if coordinates is not None: + for coord in coordinates: + if next((c for c in self._coordinates if c.name == coord), None) is None: + msg = f"Pre-existing coordinate named {coord!r} is not found" + raise ValueError(msg) + meta_dict = _to_dictionary(metadata_info) self._variables.append( Variable( name=name, long_name=long_name, - dimensions=dim_names, + dimensions=dimensions, data_type=data_type, compressor=compressor, coordinates=coordinates, @@ -321,100 +316,17 @@ def add_variable( # noqa: PLR0913 return self def build(self) -> Dataset: - """Build the final dataset.""" + """Build the final dataset. + + This function must be called after at least one dimension is added via add_dimension(). + It will create a Dataset object with all added dimensions, coordinates, and variables. + + Returns: + Dataset: The built dataset with all added dimensions, coordinates, and variables. + """ if self._state == _BuilderState.INITIAL: msg = "Must add at least one dimension before building" raise ValueError(msg) var_meta_dict = _to_dictionary([self._info, self._attributes]) - dataset = Dataset(variables=self._variables, metadata=var_meta_dict) - - return dataset - - # def to_mdio( - # self, - # store: str, - # mode: str = "w", - # compute: bool = False, - # **kwargs: Mapping[str, str | int | float | bool], - # ) -> Dataset: - # """Write the dataset to a Zarr store and return the constructed mdio.Dataset. - - # This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata - # to a Zarr store. The actual data is not written, only the metadata structure is created. - # """ - # return write_mdio_metadata(self.build(), store, mode, compute, **kwargs) - -# def write_mdio_metadata( -# mdio_ds: Dataset, -# store: str, -# mode: str = "w", -# compute: bool = False, -# **kwargs: Mapping[str, str | int | float | bool], -# ) -> mdio.Dataset: -# """Write MDIO metadata to a Zarr store and return the constructed mdio.Dataset. - -# This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata -# to a Zarr store. The actual data is not written, only the metadata structure is created. - -# Args: -# mdio_ds: The MDIO dataset to serialize -# store: Path to the Zarr or .mdio store -# mode: Write mode to pass to to_mdio(), e.g. 
'w' or 'a' -# compute: Whether to compute (write) array chunks (True) or only metadata (False) -# **kwargs: Additional arguments to pass to to_mdio() - -# Returns: -# The constructed xarray Dataset with MDIO extensions -# """ -# ds = _construct_mdio_dataset(mdio_ds) - -# def _generate_encodings() -> dict: -# """Generate encodings for each variable in the MDIO dataset. - -# Returns: -# Dictionary mapping variable names to their encoding configurations. -# """ -# # TODO(Anybody, #10274): Re-enable chunk_key_encoding when supported by xarray -# # dimension_separator_encoding = V2ChunkKeyEncoding(separator="/").to_dict() - -# # Collect dimension sizes (same approach as _construct_mdio_dataset) -# dims: dict[str, int] = {} -# for var in mdio_ds.variables: -# for d in var.dimensions: -# if isinstance(d, NamedDimension): -# dims[d.name] = d.size - -# global_encodings = {} -# for var in mdio_ds.variables: -# fill_value = 0 -# if isinstance(var.data_type, StructuredType): -# continue -# chunks = None -# if var.metadata is not None and var.metadata.chunk_grid is not None: -# chunks = var.metadata.chunk_grid.configuration.chunk_shape -# else: -# # When no chunk_grid is provided, set chunks to shape to avoid chunking -# dim_names = [d.name if isinstance(d, NamedDimension) else d for d in var.dimensions] -# chunks = tuple(dims[name] for name in dim_names) -# global_encodings[var.name] = { -# "chunks": chunks, -# # TODO(Anybody, #10274): Re-enable chunk_key_encoding when supported by xarray -# # "chunk_key_encoding": dimension_separator_encoding, -# "_FillValue": fill_value, -# "dtype": var.data_type, -# "compressors": _convert_compressor(var.compressor), -# } -# return global_encodings - -# ds.to_mdio( -# store, -# mode=mode, -# zarr_format=2, -# consolidated=True, -# safe_chunks=False, -# compute=compute, -# encoding=_generate_encodings(), -# **kwargs, -# ) -# return ds diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index e6fa4bcf..0adb9240 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -27,17 +27,24 @@ def test_add_coordinate() -> None: builder.add_dimension("inline", 100) builder.add_dimension("crossline", 100) + # Validate required parameters bad_name = None with pytest.raises(ValueError, match="'name' must be a non-empty string"): builder.add_coordinate(bad_name, dimensions=["speed"]) with pytest.raises(ValueError, match="'name' must be a non-empty string"): builder.add_coordinate("", dimensions=["speed"]) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_coordinate("cdp-x", dimensions=None) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_coordinate("cdp-x", dimensions=[]) - builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) + # Add a coordinate using non-existent dimensions + msg="Pre-existing dimension named 'xline' is not found" + with pytest.raises(ValueError, match=msg): + builder.add_coordinate("bad_cdp-x", dimensions=["inline", "xline"]) + + # Validate state transition + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) assert builder._state == _BuilderState.HAS_COORDINATES assert len(builder._dimensions) == 2 # 2 
variable for coordinates @@ -45,16 +52,18 @@ def test_add_coordinate() -> None: assert len(builder._coordinates) == 1 # Validate that we created a coordinate variable - var_cdp = next(e for e in builder._variables if e.name == "cdp") + var_cdp = next(e for e in builder._variables if e.name == "cdp-x") assert var_cdp is not None - assert len(var_cdp.dimensions) == 2 + # Validate that dimensions are stored as names + assert set(var_cdp.dimensions) == {"inline", "crossline"} + # Validate that coordinates are stored as Coordinate assert len(var_cdp.coordinates) == 1 - assert next(e for e in var_cdp.coordinates if e.name == "cdp") is not None + assert next((e for e in var_cdp.coordinates if e.name == "cdp-x"), None) is not None # Adding coordinate with the same name twice msg="Adding coordinate with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): - builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) def test_add_coordinate_with_defaults() -> None: """Test adding coordinates with default arguments.""" @@ -97,7 +106,7 @@ def test_coordinate_with_units() -> None: assert crd0.long_name is None # Default value assert crd0.data_type == ScalarType.FLOAT32 # Default value assert crd0.metadata.attributes is None - assert crd0.metadata.units_v1.length == "ft" + assert crd0.metadata.units_v1.length == LengthUnitEnum.FOOT def test_coordinate_with_attributes() -> None: @@ -151,5 +160,5 @@ def test_coordinate_with_full_metadata() -> None: assert crd0.data_type == ScalarType.FLOAT32 # Default value assert crd0.metadata.attributes["MGA"] == 51 assert crd0.metadata.attributes["UnitSystem"] == "Imperial" - assert crd0.metadata.units_v1.length == "ft" + assert crd0.metadata.units_v1.length == LengthUnitEnum.FOOT diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index a6a12e85..1465a892 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -8,12 +8,12 @@ from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape -from mdio.schemas.dimension import NamedDimension from mdio.schemas.dtype import ScalarType from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_builder import _BuilderState +from mdio.schemas.v1.dataset_builder import _get_dimension from mdio.schemas.v1.stats import CenteredBinHistogram from mdio.schemas.v1.stats import StatisticsMetadata from mdio.schemas.v1.stats import SummaryStatistics @@ -27,6 +27,7 @@ def test_add_dimension() -> None: builder = MDIODatasetBuilder("test_dataset") assert builder._state == _BuilderState.INITIAL + # Validate required parameters bad_name = None with pytest.raises(ValueError, match="'name' must be a non-empty string"): builder.add_dimension(bad_name, 200) @@ -36,14 +37,29 @@ def test_add_dimension() -> None: # First dimension should change state to HAS_DIMENSIONS and create a variable builder.add_dimension("x", 100) assert builder._state == _BuilderState.HAS_DIMENSIONS - assert len(builder._dimensions) == 1 - assert builder._dimensions[0] == NamedDimension(name="x", size=100) - assert len(builder._variables) == 1 + assert len(builder._dimensions) == 1 + assert _get_dimension(builder._dimensions, "x", 100) is not None - # Validate that 
we created a dimension variable properly + # Validate that we have created a dimension variable and + # that variable has the embedded NamedDimension + assert len(builder._variables) == 1 var_x = next(e for e in builder._variables if e.name == "x") assert var_x is not None assert len(var_x.dimensions) == 1 + # Validate that the dimension variable has the NamedDimension + assert _get_dimension(var_x.dimensions, "x", 100) is not None + assert var_x.long_name == "'x' dimension variable" + assert var_x.data_type == ScalarType.INT32 + assert var_x.compressor is None + assert var_x.coordinates is None + assert var_x.metadata is None + + # Validate that we can't add a dimension with the same name twice + with pytest.raises( + ValueError, + match="Adding dimension with the same name twice is not allowed", + ): + builder.add_dimension("x", 200) # Adding dimension with the same name twice msg="Adding dimension with the same name twice is not allowed" @@ -58,15 +74,16 @@ def test_add_dimension_with_defaults() -> None: builder.add_dimension("x", 100) assert builder._state == _BuilderState.HAS_DIMENSIONS assert len(builder._dimensions) == 1 - assert builder._dimensions[0] == NamedDimension(name="x", size=100) - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "x" - assert var0.long_name is None - assert var0.data_type == ScalarType.INT32 - assert var0.compressor is None - assert var0.coordinates is None - assert var0.metadata is None + # Validate that the dimension builder has the NamedDimension + assert _get_dimension(builder._dimensions, "x", 100) is not None + var_x = next((e for e in builder._variables if e.name == "x"), None) + assert var_x is not None + assert var_x.name == "x" + assert var_x.long_name == "'x' dimension variable" + assert var_x.data_type == ScalarType.INT32 + assert var_x.compressor is None + assert var_x.coordinates is None + assert var_x.metadata is None def test_add_dimension_with_units() -> None: """Test adding dimensions with units.""" @@ -82,17 +99,16 @@ def test_add_dimension_with_units() -> None: assert len(builder._variables) == 1 var0 = builder._variables[0] assert var0.name == "length" - assert var0.long_name is None + assert var0.long_name == "'length' dimension variable" assert var0.data_type == ScalarType.FLOAT64 assert var0.compressor is None assert var0.coordinates is None - assert var0.metadata.units_v1.length == "ft" + assert var0.metadata.units_v1.length == LengthUnitEnum.FOOT def test_add_dimension_with_attributes() -> None: """Test adding dimensions with attributes.""" builder = MDIODatasetBuilder("test_dataset") - # Add dimension with strongly-typed attribute list builder.add_dimension( "length", size=100, @@ -111,7 +127,6 @@ def test_add_dimension_with_chunk_grid() -> None: """Test adding dimensions with chunk grid.""" builder = MDIODatasetBuilder("test_dataset") - # Add dimension with strongly-typed chunk grid grid_definition = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) builder.add_dimension( "length", @@ -131,7 +146,6 @@ def test_add_dimension_with_stats() -> None: """Test adding dimensions with stats.""" builder = MDIODatasetBuilder("test_dataset") - # Add dimension with strongly-typed stats builder.add_dimension( "depth", size=100, @@ -162,8 +176,6 @@ def test_add_dimension_with_full_metadata() -> None: """Test adding dimensions with all metadata.""" builder = MDIODatasetBuilder("test_dataset") - - # Add dimension with all strongly-typed metadata builder.add_dimension( "length", size=100, 
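A minimal end-to-end sketch of the build order these tests exercise piecewise (names and sizes are illustrative):

    dataset = (
        MDIODatasetBuilder("example")
        .add_dimension("inline", 256)
        .add_dimension("crossline", 512)
        .add_coordinate("cdp-x", dimensions=["inline", "crossline"])
        .add_variable("image", dimensions=["inline", "crossline"], coordinates=["cdp-x"])
        .build()
    )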
@@ -191,9 +203,9 @@ def test_add_dimension_with_full_metadata() -> None: var0 = builder._variables[0] assert var0.name == "length" assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.units_v1.length == "ft" - assert var0.metadata.attributes["MGA"] == 51 - assert var0.metadata.attributes["UnitSystem"] == "Imperial" + assert var0.metadata.units_v1.length == LengthUnitEnum.FOOT + assert var0.metadata.attributes["MGA"] == 51 + assert var0.metadata.attributes["UnitSystem"] == "Imperial" assert var0.metadata.chunk_grid.name == "regular" assert var0.metadata.chunk_grid.configuration.chunk_shape == [20] assert var0.metadata.stats_v1.count == 100 diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 1181e10d..ca6d8f95 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -36,15 +36,21 @@ def test_add_variable() -> None: builder.add_dimension("crossline", 100) builder.add_dimension("depth", 100) + # Validate required parameters bad_name = None with pytest.raises(ValueError, match="'name' must be a non-empty string"): builder.add_variable(bad_name, dimensions=["speed"]) with pytest.raises(ValueError, match="'name' must be a non-empty string"): builder.add_variable("", dimensions=["speed"]) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_variable("amplitude", dimensions=None) + builder.add_variable("bad_amplitude", dimensions=None) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_variable("amplitude", dimensions=[]) + builder.add_variable("bad_amplitude", dimensions=[]) + + # Add a variable using non-existent dimensions + msg="Pre-existing dimension named 'xline' is not found" + with pytest.raises(ValueError, match=msg): + builder.add_variable("bad_amplitude", dimensions=["inline", "xline", "depth"]) builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"]) assert builder._state == _BuilderState.HAS_VARIABLES @@ -52,10 +58,21 @@ def test_add_variable() -> None: assert len(builder._variables) == 4 assert len(builder._coordinates) == 0 + # Add a variable using non-existent coordinates + msg="Pre-existing coordinate named 'cdp-x' is not found" + with pytest.raises(ValueError, match=msg): + builder.add_variable("bad_amplitude", + dimensions=["inline", "crossline", "depth"], + coordinates=["cdp-x", "cdp-y"]) + + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"]) + # Adding variable with the same name twice msg="Adding variable with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): - builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"]) + builder.add_variable("amplitude", + dimensions=["inline", "crossline", "depth"]) def test_add_variable_with_defaults() -> None: @@ -92,7 +109,7 @@ def test_add_variable_full_parameters() -> None: dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT64, compressor=Blosc(algorithm="zstd"), - coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + coordinates=["cdp-x", "cdp-y"], metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), @@ -124,10 +141,10 @@ def test_add_variable_full_parameters() -> None: assert isinstance(v.compressor, Blosc) assert 
v.compressor.algorithm == "zstd" # NOTE: add_variable() stores coordinates as names - assert set(v.coordinates) == {"inline", "crossline", "depth", "cdp-x", "cdp-y"} + assert set(v.coordinates) == {"cdp-x", "cdp-y"} assert v.metadata.stats_v1.count == 100 assert isinstance(v.metadata, VariableMetadata) - assert v.metadata.units_v1.length == "ft" + assert v.metadata.units_v1.length == LengthUnitEnum.FOOT assert v.metadata.attributes["MGA"] == 51 assert v.metadata.attributes["UnitSystem"] == "Imperial" assert v.metadata.chunk_grid.name == "regular" diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 64dfead0..488a9e6d 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -4,39 +4,28 @@ # Thus, disable it for this file """Tests the schema v1 dataset_builder.add_coordinate() public API.""" -from datetime import UTC, datetime -import json -import os -from pathlib import Path -import pytest - -from mdio.schemas import builder -from mdio.schemas.chunk_grid import RegularChunkGrid, RegularChunkShape + +from mdio.schemas.chunk_grid import RegularChunkGrid +from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.compressors import Blosc -from mdio.schemas.dtype import ScalarType, StructuredType -from mdio.schemas.metadata import ChunkGridMetadata, UserAttributes -from mdio.schemas.v1.dataset import Dataset, DatasetInfo, DatasetMetadata +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredType +from mdio.schemas.metadata import ChunkGridMetadata +from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset import Dataset from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder -from mdio.schemas.v1.dataset_builder import _BuilderState -from mdio.schemas.v1.stats import CenteredBinHistogram, StatisticsMetadata, SummaryStatistics -from mdio.schemas.v1.units import AllUnits, SpeedUnitEnum, SpeedUnitModel +from mdio.schemas.v1.dataset_builder import _get_dimension +from mdio.schemas.v1.stats import CenteredBinHistogram +from mdio.schemas.v1.stats import StatisticsMetadata +from mdio.schemas.v1.stats import SummaryStatistics +from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel -from mdio.schemas.v1.variable import VariableMetadata +from mdio.schemas.v1.units import SpeedUnitEnum +from mdio.schemas.v1.units import SpeedUnitModel def test_build() -> None: - """Test adding coordinates. 
Check the state transition and validate required parameters.""" - builder = MDIODatasetBuilder("test_dataset") - assert builder._state == _BuilderState.INITIAL - - builder.add_dimension("x", 100) - builder.add_dimension("y", 100) - - builder.build() - - -def test_build_dataset() -> None: +def test_build() -> None: """Test building a complete dataset.""" dataset = ( MDIODatasetBuilder("test_dataset") @@ -58,21 +47,9 @@ def test_build_dataset() -> None: assert var_data.long_name == "Test Data" assert len(var_data.dimensions) == 2 -# def test_build_dataset_bad_example() -> None: -# builder = MDIODatasetBuilder("Bad example") -# builder.add_dimension("inline", 256) -# builder.add_dimension("crossline", 512) -# builder.add_dimension("depth", 384) -# builder.add_variable(name="image", dimensions=["inline", "crossline", "depth"]) -# dataset = builder.build() - -# json_str = dataset.model_dump_json() -# file_path = os.path.join(os.path.dirname(__file__), "bad_example.json") -# with open(file_path, 'w') as f: -# f.write(json_str) - -def test_build_campos_3d(tmp_path: Path) -> None: - """Test building a toy dataset with multiple variables and attributes.""" +def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50) + """Test building a Campos 3D dataset with multiple variables and attributes.""" dataset = make_campos_3d_dataset() # Verify dataset structure @@ -83,37 +60,74 @@ # Verify variables (including dimension variables) # 3 dimension variables + 4 data variables + 2 coordinate variables - assert len(dataset.variables) == 9 # noqa: PLR2004 + assert len(dataset.variables) == 9 # Verify dimension variables inline_var = next(v for v in dataset.variables if v.name == "inline") assert inline_var.data_type == ScalarType.UINT32 - assert len(inline_var.dimensions) == 1 - assert inline_var.dimensions[0].name == "inline" + # Dimension variables store dimensions as NamedDimension + assert _get_dimension(inline_var.dimensions, "inline", 256) + + crossline_var = next(v for v in dataset.variables if v.name == "crossline") + assert crossline_var.data_type == ScalarType.UINT32 + # Dimension variables store dimensions as NamedDimension + assert _get_dimension(crossline_var.dimensions, "crossline", 512) depth_var = next(v for v in dataset.variables if v.name == "depth") assert depth_var.data_type == ScalarType.UINT32 - assert depth_var.metadata.units_v1.length == "m" + # Dimension variables store dimensions as NamedDimension + assert _get_dimension(depth_var.dimensions, "depth", 384) + assert depth_var.metadata.units_v1.length == LengthUnitEnum.METER + + # Verify coordinate variables + cdp_x = next(v for v in dataset.variables if v.name == "cdp-x") + assert cdp_x.data_type == ScalarType.FLOAT32 + # Coordinate variables store dimensions as names + assert set(cdp_x.dimensions) == {"inline", "crossline"} + assert cdp_x.metadata.units_v1.length == LengthUnitEnum.METER + + cdp_y = next(v for v in dataset.variables if v.name == "cdp-y") + assert cdp_y.data_type == ScalarType.FLOAT32 + # Coordinate variables store dimensions as names + assert set(cdp_y.dimensions) == {"inline", "crossline"} + assert cdp_y.metadata.units_v1.length == LengthUnitEnum.METER # Verify image variable image = next(v for v in dataset.variables if v.name == "image") + assert set(image.dimensions) == {"inline", "crossline", "depth"} assert image.data_type == ScalarType.FLOAT32 assert isinstance(image.compressor, Blosc) assert image.compressor.algorithm == "zstd" + # Other variables store coordinates as names
+ assert set(image.coordinates) == {"cdp-x", "cdp-y"} + assert isinstance(image.metadata.chunk_grid, RegularChunkGrid) + assert image.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] + assert isinstance(image.metadata.stats_v1, SummaryStatistics) assert image.metadata.stats_v1.count == 100 # Verify velocity variable velocity = next(v for v in dataset.variables if v.name == "velocity") + assert set(velocity.dimensions) == {"inline", "crossline", "depth"} assert velocity.data_type == ScalarType.FLOAT16 assert velocity.compressor is None - assert velocity.metadata.units_v1.speed == "m/s" + # Other variables store coordinates as names + assert set(velocity.coordinates) == {"cdp-x", "cdp-y"} + assert isinstance(velocity.metadata.chunk_grid, RegularChunkGrid) + assert velocity.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] + assert isinstance(velocity.metadata.units_v1, SpeedUnitModel) + assert velocity.metadata.units_v1.speed == SpeedUnitEnum.METER_PER_SECOND # Verify image_inline variable image_inline = next( v for v in dataset.variables if v.name == "image_inline") assert image_inline.long_name == "inline optimized version of 3d_stack" + assert set(image_inline.dimensions) == {"inline", "crossline", "depth"} + assert image_inline.data_type == ScalarType.FLOAT32 assert isinstance(image_inline.compressor, Blosc) assert image_inline.compressor.algorithm == "zstd" + assert set(image_inline.coordinates) == {"cdp-x", "cdp-y"} + assert isinstance(image_inline.metadata.chunk_grid, RegularChunkGrid) + assert image_inline.metadata.chunk_grid.configuration.chunk_shape == [4, 512, 512] # Verify image_headers variable headers = next(v for v in dataset.variables if v.name == "image_headers") @@ -121,36 +135,8 @@ assert len(headers.data_type.fields) == 4 assert headers.data_type.fields[0].name == "cdp-x" -# def test_build_campos_3d_contract(tmp_path: Path) -> None: -# '''Test building campos_3d dataset and converting to JSON schema format.''' -# dataset = make_campos_3d_dataset() -# # json_str = dataset.model_dump_json() -# # json_sorted_str = json.dumps(json.loads(json_str), sort_keys=True) -# m_dict = dataset.model_dump(mode="json", by_alias=True) -# json_sorted_str = json.dumps(m_dict, sort_keys=True) - -# sorted_contract = load_campos_3d_contract() - -# save(json_sorted_str, sorted_contract) - -# def save(model: str, contract: str) -> None: -# model_file_path = os.path.join(os.path.dirname(__file__), "campos_3d_model.json") -# contract_file_path = os.path.join(os.path.dirname(__file__), "campos_3d_contract.json") -# with open(model_file_path, 'w') as f: -# f.write(model) -# with open(contract_file_path, 'w') as f: -# f.write(contract) - -# def load_campos_3d_contract(): -# file_path = os.path.join(os.path.dirname(__file__), 'test_data/campos_3d_contract.json') -# with open(file_path, 'r') as f: -# contract = json.load(f) -# assert contract is not None -# return json.dumps(contract, sort_keys=True) - def make_campos_3d_dataset() -> Dataset: """Create in-memory campos_3d dataset.""" - ds = MDIODatasetBuilder( "campos_3d", attributes=UserAttributes(attributes={ "textHeader": [ "C01 .......................... ", "C02 .......................... ", "C03 .......................... ", ], "foo": "bar" })) # Add dimensions ds.add_dimension("inline", 256, var_data_type=ScalarType.UINT32) ds.add_dimension("crossline", 512, var_data_type=ScalarType.UINT32) ds.add_dimension("depth", 384, var_data_type=ScalarType.UINT32, var_metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] ) # Add coordinates ds.add_coordinate( "cdp-x", dimensions=["inline", "crossline"], metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] ) ds.add_coordinate( "cdp-y", dimensions=["inline", "crossline"], metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] - # metadata={"unitsV1": {"length": "m"}}, ) # Add image variable ds.add_variable( name="image", dimensions=["inline", 
"crossline", "depth"], data_type=ScalarType.FLOAT32, compressor=Blosc(algorithm="zstd"), - coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + coordinates=["cdp-x", "cdp-y"], metadata_info=[ ChunkGridMetadata( chunk_grid=RegularChunkGrid( @@ -214,7 +199,7 @@ def make_campos_3d_dataset() -> Dataset: name="velocity", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT16, - coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + coordinates=["cdp-x", "cdp-y"], metadata_info=[ ChunkGridMetadata( chunk_grid=RegularChunkGrid( @@ -231,7 +216,7 @@ def make_campos_3d_dataset() -> Dataset: dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32, compressor=Blosc(algorithm="zstd"), - coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + coordinates=["cdp-x", "cdp-y"], metadata_info=[ ChunkGridMetadata( chunk_grid=RegularChunkGrid( @@ -250,6 +235,6 @@ def make_campos_3d_dataset() -> Dataset: {"name": "some_scalar", "format": ScalarType.FLOAT16}, ] ), - coordinates=["inline", "crossline", "cdp-x", "cdp-y"], + coordinates=["cdp-x", "cdp-y"], ) return ds.build() diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py index e6bfda71..6fbba42a 100644 --- a/tests/unit/v1/test_dataset_builder_helpers.py +++ b/tests/unit/v1/test_dataset_builder_helpers.py @@ -1,6 +1,6 @@ -# ruff: noqa: PLR2004 +# ruff: noqa: PLR2004 # PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable -# The above erroneous warning is generated for every numerical assert. +# The above erroneous warning is generated for every numerical assert. # Thus, disable it for this file """Tests the schema v1 dataset_builder internal methods.""" @@ -10,265 +10,87 @@ import pytest from pydantic import Field -from mdio.schemas.chunk_grid import RegularChunkGrid -from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.core import StrictModel from mdio.schemas.dimension import NamedDimension -from mdio.schemas.metadata import ChunkGridMetadata -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _get_dimension from mdio.schemas.v1.dataset_builder import _to_dictionary -from mdio.schemas.v1.dataset_builder import contains_dimension -from mdio.schemas.v1.dataset_builder import get_dimension -from mdio.schemas.v1.dataset_builder import get_dimension_names -from mdio.schemas.v1.stats import CenteredBinHistogram -from mdio.schemas.v1.stats import StatisticsMetadata -from mdio.schemas.v1.stats import SummaryStatistics -from mdio.schemas.v1.units import LengthUnitEnum -from mdio.schemas.v1.units import LengthUnitModel -from mdio.schemas.v1.variable import AllUnits -from mdio.schemas.v1.variable import CoordinateMetadata -from mdio.schemas.v1.variable import UserAttributes -from mdio.schemas.v1.variable import VariableMetadata -def test__get_dimension_by_name() -> None: +def test__get_dimension() -> None: """Test getting a dimension by name from the list of dimensions.""" - dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] + dimensions = [NamedDimension(name="inline", size=2), NamedDimension( + name="crossline", size=3)] - assert get_dimension([], "inline") is None - assert get_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) - assert get_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) - assert get_dimension(dimensions, "time") is None + 
assert _get_dimension([], "inline") is None + assert _get_dimension(dimensions, "inline") == NamedDimension( + name="inline", size=2) + assert _get_dimension(dimensions, "crossline") == NamedDimension( + name="crossline", size=3) + assert _get_dimension(dimensions, "time") is None with pytest.raises(TypeError, match="Expected str, got NoneType"): - get_dimension(dimensions, None) + _get_dimension(dimensions, None) with pytest.raises(TypeError, match="Expected str, got int"): - get_dimension(dimensions, 42) + _get_dimension(dimensions, 42) with pytest.raises( ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200" ): - get_dimension(dimensions, "inline", size=200) + _get_dimension(dimensions, "inline", size=200) -def test__contains_dimension() -> None: - """Test if a dimension with a given name exists in the list of dimensions.""" - dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] - - assert contains_dimension([], "inline") is False - - assert contains_dimension(dimensions, "inline") is True - assert contains_dimension(dimensions, "crossline") is True - assert contains_dimension(dimensions, "time") is False - - with pytest.raises(TypeError, match="Expected str or NamedDimension, got NoneType"): - contains_dimension(dimensions, None) - with pytest.raises(TypeError, match="Expected str or NamedDimension, got int"): - contains_dimension(dimensions, 42) - with pytest.raises( - ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200" - ): - contains_dimension(dimensions, NamedDimension(name="inline", size=200)) - - -def test_get_dimension_names() -> None: - """Test getting a list of dimension names from list[NamedDimension | str].""" - empty_list = get_dimension_names(None) - assert empty_list is not None - assert isinstance(empty_list, list) - assert len(empty_list) == 0 - - empty_list = get_dimension_names([]) - assert empty_list is not None - assert isinstance(empty_list, list) - assert len(empty_list) == 0 - - dim_list = get_dimension_names([ - NamedDimension(name="inline", size=2), - "amplitude", - NamedDimension(name="crossline", size=3) - ]) - assert dim_list is not None - assert isinstance(dim_list, list) - assert set(dim_list) == {"inline", "amplitude", "crossline"} - - -def test_add_dimensions_if_needed() -> None: - """Test adding named dimensions to a dataset.""" - builder = MDIODatasetBuilder("Test Dataset Builder") - # - # Validate initial state - # - assert builder._dimensions is not None - assert len(builder._dimensions) == 0 - - # - # Validate that adding empty dimensions does not change the state - # - added_dims = builder._add_dimensions_if_needed(None) - assert len(builder._dimensions) == 0 - assert len(added_dims) == 0 - added_dims = builder._add_dimensions_if_needed([]) - assert len(builder._dimensions) == 0 - assert len(added_dims) == 0 - added_dims = builder._add_dimensions_if_needed({}) - assert len(builder._dimensions) == 0 - assert len(added_dims) == 0 - - # - # Add named dimensions - # - inline_dim = NamedDimension(name="inline", size=2) - added_dims = builder._add_dimensions_if_needed([inline_dim]) - assert len(builder._dimensions) == 1 - assert len(added_dims) == 1 - assert contains_dimension(added_dims, inline_dim) - - crossline_dim = NamedDimension(name="crossline", size=3) - time_dim = NamedDimension(name="time", size=4) - added_dims = builder._add_dimensions_if_needed([crossline_dim, time_dim]) - assert len(builder._dimensions) == 3 - assert 
contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - assert contains_dimension(added_dims, crossline_dim) - assert contains_dimension(added_dims, time_dim) - - # - # Add invalid object type - # - with pytest.raises(TypeError, match="Expected NamedDimension or str, got int"): - builder._add_dimensions_if_needed([42]) - # Assert that the number of dimensions has not increased - assert len(builder._dimensions) == 3 - -def test__add_dimensions_if_one_already_exists() -> None: - """Test adding existing named dimensions to a dataset.""" - builder = MDIODatasetBuilder("Test Dataset Builder") - - inline_dim = NamedDimension(name="inline", size=2) - crossline_dim = NamedDimension(name="crossline", size=3) - time_dim = NamedDimension(name="time", size=4) - # - # Add dimensions with the same names again does nothing - # (make sure we are passing different instances) - # - inline_dim2 = NamedDimension(name=inline_dim.name, size=inline_dim.size) - crossline_dim2 = NamedDimension(name=crossline_dim.name, size=crossline_dim.size) - time_dim2 = NamedDimension(name=time_dim.name, size=time_dim.size) - builder._add_dimensions_if_needed([inline_dim2, crossline_dim2, time_dim2]) - added_dims = builder._add_dimensions_if_needed([inline_dim2, crossline_dim2, time_dim2]) - # Validate that the dimensions and variables are not duplicated - assert len(builder._dimensions) == 3 - assert contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - assert len(added_dims) == 0 - - # Add dimensions with the same name, but different size again - with pytest.raises( - ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200" - ): - builder._add_dimensions_if_needed([NamedDimension(name="inline", size=200)]) - # Assert that the number of dimensions has not increased - assert len(builder._dimensions) == 3 - - # - # Add existing dimension using its name - # - added_dims = builder._add_dimensions_if_needed(["inline", "crossline"]) - assert len(builder._dimensions) == 3 - assert contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - assert len(added_dims) == 0 - - # - # Add non-existing dimension using its name is not allowed - # - with pytest.raises(ValueError, match="Pre-existing dimension named 'offset' is not found"): - builder._add_dimensions_if_needed(["offset"]) - assert len(builder._dimensions) == 3 - assert contains_dimension(builder._dimensions, inline_dim) - assert contains_dimension(builder._dimensions, crossline_dim) - assert contains_dimension(builder._dimensions, time_dim) - def test__to_dictionary() -> None: - """Test converting a BaseModel to a dictionary.""" + """Test converting a dictionary, list or pydantic BaseModel to a dictionary.""" + # Validate inputs with pytest.raises(TypeError, match="Expected BaseModel, dict or list, got datetime"): - # This should raise an error because datetime is not a BaseModel _to_dictionary(datetime.now(UTC)) + # Convert None to None + result = _to_dictionary(None) + assert result is None + + # Validate conversion of a Pydantic BaseModel class SomeModel(StrictModel): count: int = Field(default=None, description="Samples count") - samples: list[float] = Field(default_factory=list, 
description="Samples.") + samples: list[float] = Field( + default_factory=list, description="Samples.") created: datetime = Field( default_factory=datetime.now, description="Creation time with TZ info." ) - - m = SomeModel(count=3, - samples=[1.0, 2.0, 3.0], - created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)) - result = _to_dictionary(m) + md = SomeModel(count=3, + samples=[1.0, 2.0, 3.0], + created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)) + result = _to_dictionary(md) assert isinstance(result, dict) - assert result == {"count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]} + assert result == { + "count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]} - d = { + # Validate conversion of a dictionary + dct = { "count": 3, "samples": [1.0, 2.0, 3.0], "created": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)} - result = _to_dictionary(m) + result = _to_dictionary(dct) assert isinstance(result, dict) - assert result == {"count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]} - - -# def test__make_coordinate_metadata() -> None: -# """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" -# units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) -# attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) -# meta_list = [units, attrs] - -# # Assume that multiple attributes are allowed -# metadata = _make_coordinate_metadata(meta_list) -# assert isinstance(metadata, CoordinateMetadata) -# assert metadata.units_v1.length == "ft" -# assert metadata.attributes["MGA"] == 51 -# assert metadata.attributes["UnitSystem"] == "Imperial" - -# meta_list = ["ft"] -# with pytest.raises(TypeError, match="Expected BaseModel, dict or list, got str"): -# _make_variable_metadata(meta_list) - -# def test__make_variable_metadata() -> None: -# """Test creating VariableMetadata from a strongly-typed list of AllUnits or UserAttributes.""" -# units = AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)) -# attrs = UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}) -# chgrd = ChunkGridMetadata( -# chunk_grid=RegularChunkGrid( -# configuration=RegularChunkShape(chunk_shape=[20]))) -# stats = StatisticsMetadata( -# stats_v1=SummaryStatistics( -# count=100, -# sum=1215.1, -# sumSquares=125.12, -# min=5.61, -# max=10.84, -# histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]))) -# metadata_info = [units, attrs, chgrd, stats] -# metadata = _make_variable_metadata(metadata_info) -# assert isinstance(metadata, VariableMetadata) -# assert metadata.units_v1.length == "ft" -# assert metadata.attributes["MGA"] == 51 -# assert metadata.attributes["UnitSystem"] == "Imperial" -# assert metadata.chunk_grid.name == "regular" -# assert metadata.chunk_grid.configuration.chunk_shape == [20] -# assert metadata.stats_v1.count == 100 -# assert metadata.stats_v1.sum == 1215.1 -# assert metadata.stats_v1.sum_squares == 125.12 -# assert metadata.stats_v1.min == 5.61 -# assert metadata.stats_v1.max == 10.84 -# assert metadata.stats_v1.histogram.bin_centers == [1, 2] -# assert metadata.stats_v1.histogram.counts == [10, 15] - -# meta_list = ["ft"] -# with pytest.raises(TypeError, match="Expected BaseModel, dict or list, got str"): -# _make_variable_metadata(meta_list) + assert result == {"count": 3, + "samples": [1.0, 2.0, 3.0], + "created": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC), } + + # Validate conversion of a dictionary + lst = [None, + SomeModel(count=3, + samples=[1.0, 2.0, 
3.0], + created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)), + { + "count2": 3, + "samples2": [1.0, 2.0, 3.0], + "created2": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC) + }] + result = _to_dictionary(lst) + assert isinstance(result, dict) + assert result == {"count": 3, + "samples": [1.0, 2.0, 3.0], + "created": "2023-10-01T12:00:00Z", + "count2": 3, + "samples2": [1.0, 2.0, 3.0], + "created2": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC) + } From 4b2b1638189bf96299c22c5d1fcbd16322d17aec Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 7 Jul 2025 19:18:45 +0000 Subject: [PATCH 08/27] Revert .container changes --- .devcontainer/Dockerfile.cli | 47 ------------------------- .devcontainer/Dockerfile.dev | 61 --------------------------------- .devcontainer/Dockerfile.nox | 49 -------------------------- .devcontainer/devcontainer.json | 20 +++++------ 4 files changed, 9 insertions(+), 168 deletions(-) delete mode 100644 .devcontainer/Dockerfile.cli delete mode 100644 .devcontainer/Dockerfile.dev delete mode 100644 .devcontainer/Dockerfile.nox diff --git a/.devcontainer/Dockerfile.cli b/.devcontainer/Dockerfile.cli deleted file mode 100644 index 92720e34..00000000 --- a/.devcontainer/Dockerfile.cli +++ /dev/null @@ -1,47 +0,0 @@ -# HOW TO BUILD AND RUN THIS DOCKERFILE -# * Clone mdio-python and build a Docker image: -# git clone https://github.com/TGSAI/mdio-python.git -# cd mdio-python -# docker build -t mdio-cli -f .devcontainer/Dockerfile.cli . -# * Run /bin/bash in the Docker container: -# -# -# USAGE: -# docker run -it --rm --name mdio-cli mdio-cli --version -# docker run -it --rm --name mdio-cli mdio-cli --help -# -# LOCAL_DATA_DIR=$(pwd); \ -# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ -# segy import \ -# /DATA/segy_file.segy \ -# /DATA/mdio_file.mdio \ -# -loc 181,185 \ -# -names inline,crossline -# -# LOCAL_DATA_DIR=$(pwd); \ -# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ -# segy export \ -# /DATA/mdio_file.mdio \ -# /DATA/segy_file_copy.segy -# -FROM python:3.13-bookworm -# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) -ENV USERNAME=python -ENV USER_UID=1000 -ENV USER_GID=$USER_UID -RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME - -# Set the default non-root user -USER $USERNAME - -# Add path to the user-installed packages -ENV PYTHONUSERBASE=/home/$USERNAME/.local -ENV PATH="$PYTHONUSERBASE/bin:$PATH" - -COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python - -WORKDIR /home/$USERNAME/mdio-python -RUN pip install . - -ENTRYPOINT ["mdio"] -CMD ["--version"] diff --git a/.devcontainer/Dockerfile.dev b/.devcontainer/Dockerfile.dev deleted file mode 100644 index 05f13579..00000000 --- a/.devcontainer/Dockerfile.dev +++ /dev/null @@ -1,61 +0,0 @@ -# USAGE: -# This file will be used by the VS Code DevContainer extension -# to create a development environment for the mdio-python project. -# HOW TO RUN TESTS -# 1. Open the project in VS Code. -# 2. Open the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container". -# 3. Once the container is running, open a terminal in VS Code. -# 4. Run the tests using the command: `nox -s test`. -# HOW TO MANUALLY BUILD AND RUN THE CONTAINER -# docker build -t mdio-dev -f .devcontainer/Dockerfile.dev . -# docker run -it --rm --entrypoint /bin/bash --name mdio-dev mdio-dev -# NOTES: -# 1. The container will be run as the non-root user 'vscode' with UID 1000. -# 2. 
The virtual environment will be setup at /home/vscode/venv -# 3. The project source code will be mounted at /workspaces/mdio-python -ARG PYTHON_VERSION="3.13" -ARG LINUX_DISTRO="bookworm" -ARG UV_VERSION="0.6.11" -ARG NOX_VERSION="2025.2.9" -FROM mcr.microsoft.com/devcontainers/python:1-${PYTHON_VERSION}-${LINUX_DISTRO} - -# Install git for nox pre-commit -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - git \ - && rm -rf /var/lib/apt/lists/* - -ENV USERNAME="vscode" -USER $USERNAME - -# # Add path to the user-installed packages -# ENV PYTHONUSERBASE=/home/$USERNAME/.local -# ENV PATH="$PYTHONUSERBASE/bin:$PATH" - -COPY --chown=$USERNAME:$USERNAME ./ /workspaces/mdio-python - -WORKDIR /workspaces/mdio-python - -ARG UV_VERSION -ARG NOX_VERSION -RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel - -# Initialize virtual environement in the container -ENV VIRTUAL_ENV="/home/$USERNAME/venv" -RUN python3 -m venv $VIRTUAL_ENV -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -# installing pytest is required for VS Code Python Testing -RUN pip install pytest pytest-cov pytest-mock pytest-asyncio - -# Install the project in editable mode -# This allows for live reloading of the code during development -RUN pip install -e . - -# RUN uv pip install snakeviz - - - - - - diff --git a/.devcontainer/Dockerfile.nox b/.devcontainer/Dockerfile.nox deleted file mode 100644 index 103673fd..00000000 --- a/.devcontainer/Dockerfile.nox +++ /dev/null @@ -1,49 +0,0 @@ -# HOW TO BUILD AND RUN THIS DOCKERFILE -# 1. Make sure you have Docker installed and running. -# 2. Clone mdio-python and build the Docker image: -# git clone https://github.com/TGSAI/mdio-python.git -# cd mdio-python -# docker build -t mdio-nox -f .devcontainer/Dockerfile.nox . -# 3. Run /bin/bash in the Docker container : -# LOCAL_DATA_DIR=$(pwd); \ -# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --entrypoint /bin/bash --name mdio-nox mdio-nox -# -# USAGE: -# docker run -it --rm mdio-nox --list -# docker run -it --rm mdio-nox -s tests-3.13 -# docker run -it --rm mdio-nox --no-stop-on-first-error -# -# NOTE: nox will fail if run in the directory mounted from the host machine -ARG PYTHON_VERSION="3.13" -ARG LINUX_DISTRO="bookworm" -ARG UV_VERSION="0.6.11" -ARG NOX_VERSION="2025.2.9" -FROM python:${PYTHON_VERSION}-${LINUX_DISTRO} -ARG PYTHON_VERSION -ARG LINUX_DISTRO -RUN echo "Using python:${PYTHON_VERSION}-${LINUX_DISTRO}" -# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) -ENV USERNAME=python -ENV USER_UID=1000 -ENV USER_GID=$USER_UID -RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -# Set the default non-root user -USER $USERNAME - -# Add path to the user-installed packages -ENV PYTHONUSERBASE=/home/$USERNAME/.local -ENV PATH="$PYTHONUSERBASE/bin:$PATH" - -COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python - -WORKDIR /home/$USERNAME/mdio-python -RUN pip install . 
- -# Install UV dependency manager and Nox test automator -ARG UV_VERSION -ARG NOX_VERSION -RUN echo "Using uv: $UV_VERSION and nox: $NOX_VERSION" -RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel - -ENTRYPOINT ["nox"] -CMD ["--list"] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index dc80446c..b618a526 100755 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,12 +2,12 @@ // README at: https://github.com/devcontainers/templates/tree/main/src/python { "build": { - "dockerfile": "Dockerfile.dev", + "dockerfile": "Dockerfile", "context": ".." }, // Use 'postCreateCommand' to run commands after the container is created. "postCreateCommand": { - // "post_create_script": "bash ./.devcontainer/post-install.sh" + "post_create_script": "bash ./.devcontainer/post-install.sh" }, // Forward 8787 to enable us to view dask dashboard "forwardPorts": [8787], @@ -16,9 +16,8 @@ // Configure properties specific to VS Code. "vscode": { "settings": { - "python.testing.pytestArgs": ["tests"], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true + "python.terminal.activateEnvInCurrentTerminal": true, + "python.defaultInterpreterPath": "/opt/venv/bin/python" }, "extensions": [ "ms-python.python", @@ -28,18 +27,17 @@ "ms-toolsai.jupyter-renderers", "vscode-icons-team.vscode-icons", "wayou.vscode-todo-highlight", - "streetsidesoftware.code-spell-checker", - "eamodio.gitlens", - "visualstudioexptteam.vscodeintellicode" + "streetsidesoftware.code-spell-checker" ] } }, // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root", "updateRemoteUserUID": true, - "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/mdio-python,type=bind", - "workspaceFolder": "/workspaces/mdio-python", "mounts": [ - // "source=${localWorkspaceFolder}/../DATA/,target=/DATA/,type=bind,consistency=cached" + // Re-use local Git configuration + "source=${localEnv:HOME}/.gitconfig,target=/home/vscode/.gitconfig_tmp,type=bind,consistency=cached", + "source=${localEnv:HOME}/.gitconfig,target=/root/.gitconfig_tmp,type=bind,consistency=cached", + "source=${localEnv:SCRATCH_DIR}/${localEnv:USER},target=/scratch/,type=bind,consistency=cached" ] } From c532c3b9e914b11f09c550f968c0c7ae350e6147 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 7 Jul 2025 19:24:03 +0000 Subject: [PATCH 09/27] PR review: remove DEVELOPER_NOTES.md --- DEVELOPER_NOTES.md | 136 --------------------------------------------- 1 file changed, 136 deletions(-) delete mode 100644 DEVELOPER_NOTES.md diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md deleted file mode 100644 index dec21dfe..00000000 --- a/DEVELOPER_NOTES.md +++ /dev/null @@ -1,136 +0,0 @@ -# Developer Notes - -## MDIO v1 scope of work - -### TASK 1: Creation an empty MDIO v1 dataset with metadata defined using the v1 schema -#### DESCRIPTION -In the v0, the following code was used to create an empty dataset: - -```Python -grid = Grid([Dimension("inline",...), Dimension("crossline", ...), Dimension("depth", ...)]) -variable = MDIOVariableConfig("stack_amplitude", ...) -create_conf = MDIOCreateConfig(path="demo.mdio", grid=grid, variables=[variable]) -create_empty(config=create_conf) -``` - -In the v1 it is replaced with the following API, which uses v1 schema: - -```Python -builder = MDIODatasetBuilder(...) -builder.add_dimension("inline", ...) -builder.add_dimension("crossline",...) 
-builder.add_dimension("depth", ...) -builder.add_coordinate("cdp_x",...) -builder.add_coordinate("cdp_y",...) -builder.add_variable("stack_amplitude",...) -builder.to_mdio(store="demo.mdio") -``` - -#### DEFINITION OF DONE -* The resulting v1 MDIO control `demo.mdio` file structure must be identical between Python and C++ -* Code coverage 90% -* Code documentation will be updated: - * API doc strings are reviewed - * docs/tutorials/creation.ipynb - current version describes v0 API. Should be updated with v1 API - * docs/api_reference.md - will be updated with new API - -#### ASSUMPTIONS -We expect that the following v0 workflows to keep working with this change -* Populating MDIOs -* Updating File and Checking with MDIOReader -* Write to SEG-Y - -## Overall API design and implementation -We will have only a strongly-typed (see pydantic) API. For example: - -```Python -VariableMetadataList: TypeAlias = list[AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata] -def add_dimension( - self, - name: str, - size: int, - long_name: str = None, - data_type: ScalarType | StructuredType = ScalarType.INT32, - metadata_info: VariableMetadataList | None = None, -) -> "MDIODatasetBuilder": -``` - -Which will be used as following: - -```Python -builder.add_dimension( - "length", - size=100, - data_type=ScalarType.FLOAT32, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), - UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape( - chunk_shape=[20]))), - StatisticsMetadata(stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram( - binCenters=[1, 2], - counts=[10, 15]))) - ] -) -``` -### Notes -* When a coordinate or a variable is created, their schema allows to store their dimensions either as - * list of dimensions name `list[str]`, where the names refer to the dimensions defined in the builder._dimensions - * list of named dimensions `list[NamedDimension]`, which duplicate the dimensions defined in the builder._dimensions - * Mixture of the two above `list[NamedDimension | str]` - - which approach should be used? - - `RESOLUTION: We will be using the first approach.` - - **IMPORTANT: For binary compatibility, We need to ensure that the C++ code follows the same logic** - -* Metadata population from a dictionary in add_coordinate() and add_variable() will not be supported to ensure that the API is strongly-typed. If it is needed, such conversion should be done as a separate step: - ```Python - def make_variable_metadata_list_from_dict(metadata: dict[str, Any]) -> VariableMetadataList: - # Implementation goes here - def make_coordinate_metadata_list_from_dict(metadata: dict[str, Any]) -> CoordinateMetadataList: - # Implementation goes here - ``` - `RESOLUTION: The approach confirmed.` - -## Schema V1 questions - -* add_dimension(): Can a dimension with the same name be added multiple times. Options: - * Allowed: the second request is ignored (current implementation) - * Not Allowed: should it raise an error? - - `RESOLUTION: The dimensions with the same name are not allowed` -* The pydantic attribute names are different from the v1 schema attributes names. What are the repercussions? - ``` - 'statsV1' <-> 'stats_v1' - 'unitsV1' <-> 'units_v1' - 'chunkGrid' <-> 'chunk_grid' - ``` - `Under investigation` -* Should histogram (e.g., SummaryStatistics) have a `histogram_type` attribute? 
- - `Under investigation` -* Units - * Why 'ftUS' is not supported by the schema? U.S. survey foot vs the International Foot: - *"The U.S. survey foot is defined as 1200/3937 meters, while the international foot is defined as exactly 0.3048 meters. - https://www.axiomint.com/survey-foot-versus-international-foot-whats-the-difference/ - "The REAL issue is when ... applied to State Plane coordinates in the N2,000,000 and E6,000,000 range! - This ... moves a State Plane coordinate position 4 feet by 12 feet."* - * Why there are no dimensionless unis (for seismic amplitudes, inlines, etc.) - - `Under investigation` - -## Design suggestions -* Should we rename add_dimension to add_dimension_variable (or similar) to indicate that we not just providing the dimension name, but also creating the dimension variable - - `RESOLUTION: Shorter names are preferable for public API. The function behavior will be described in the docs` - From 08798cd7323bb48653503f6de557619dd7cdb1c5 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 7 Jul 2025 19:36:04 +0000 Subject: [PATCH 10/27] PR Review: add_coordinate() should accept only data_type: ScalarType --- src/mdio/schemas/v1/dataset_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 701fdbba..89711e0c 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -172,7 +172,7 @@ def add_coordinate( # noqa: PLR0913 *, long_name: str = None, dimensions: list[str], - data_type: ScalarType | StructuredType = ScalarType.FLOAT32, + data_type: ScalarType = ScalarType.FLOAT32, metadata_info: CoordinateMetadataList | None = None, ) -> "MDIODatasetBuilder": """Add a coordinate after adding at least one dimension. 
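For orientation between these two review patches: the change above restricts coordinate variables to scalar types, and the patch that follows makes `data_type` a required argument of `add_variable()`. A minimal call-site sketch, assuming the import paths used by the test suite in this series (the dataset name, dimension sizes, and values are illustrative, not taken from the repository):

```Python
from mdio.schemas.dtype import ScalarType
from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder

builder = MDIODatasetBuilder("example")
builder.add_dimension("inline", 256)
builder.add_dimension("crossline", 512)

# After this change, add_coordinate() accepts only ScalarType
# for data_type; StructuredType is no longer part of its signature.
builder.add_coordinate(
    "cdp-x",
    dimensions=["inline", "crossline"],
    data_type=ScalarType.FLOAT32,
)

# Variables still accept ScalarType | StructuredType; data_type is
# passed explicitly here since the next patch removes its default.
builder.add_variable(
    "amplitude",
    dimensions=["inline", "crossline"],
    data_type=ScalarType.FLOAT32,
    coordinates=["cdp-x"],
)
```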
From e8febe4f9749f15da8ae3f063d78898cfde3412b Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 7 Jul 2025 19:50:49 +0000 Subject: [PATCH 11/27] PR review: add_variable() data_type remove default --- src/mdio/schemas/v1/dataset_builder.py | 2 +- .../v1/test_dataset_builder_add_variable.py | 26 ++++++++++++------- tests/unit/v1/test_dataset_builder_build.py | 4 ++- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 89711e0c..dbedd3d6 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -248,7 +248,7 @@ def add_variable( # noqa: PLR0913 *, long_name: str = None, dimensions: list[str], - data_type: ScalarType | StructuredType = ScalarType.FLOAT32, + data_type: ScalarType | StructuredType, compressor: Blosc | ZFP | None = None, coordinates: list[str] | None = None, metadata_info: VariableMetadataList | None = None, diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index ca6d8f95..5484ea92 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -30,7 +30,7 @@ def test_add_variable() -> None: msg = "Must add at least one dimension before adding variables" with pytest.raises(ValueError, match=msg): - builder.add_variable("amplitude", dimensions=["speed"]) + builder.add_variable("amplitude", dimensions=["speed"], data_type = ScalarType.FLOAT32) builder.add_dimension("inline", 100) builder.add_dimension("crossline", 100) @@ -39,20 +39,24 @@ def test_add_variable() -> None: # Validate required parameters bad_name = None with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_variable(bad_name, dimensions=["speed"]) + builder.add_variable(bad_name, dimensions=["speed"], data_type = ScalarType.FLOAT32) with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_variable("", dimensions=["speed"]) + builder.add_variable("", dimensions=["speed"], data_type = ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_variable("bad_amplitude", dimensions=None) + builder.add_variable("bad_amplitude", dimensions=None, data_type = ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_variable("bad_amplitude", dimensions=[]) + builder.add_variable("bad_amplitude", dimensions=[], data_type = ScalarType.FLOAT32) # Add a variable using non-existent dimensions msg="Pre-existing dimension named 'xline' is not found" with pytest.raises(ValueError, match=msg): - builder.add_variable("bad_amplitude", dimensions=["inline", "xline", "depth"]) + builder.add_variable("bad_amplitude", + dimensions=["inline", "xline", "depth"], + data_type = ScalarType.FLOAT32) - builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"]) + builder.add_variable("amplitude", + dimensions=["inline", "crossline", "depth"], + data_type = ScalarType.FLOAT32) assert builder._state == _BuilderState.HAS_VARIABLES assert len(builder._dimensions) == 3 assert len(builder._variables) == 4 @@ -63,6 +67,7 @@ def test_add_variable() -> None: with pytest.raises(ValueError, match=msg): builder.add_variable("bad_amplitude", dimensions=["inline", "crossline", "depth"], + data_type = ScalarType.FLOAT32, coordinates=["cdp-x", "cdp-y"]) builder.add_coordinate("cdp-x", dimensions=["inline", 
"crossline"]) @@ -72,7 +77,8 @@ def test_add_variable() -> None: msg="Adding variable with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): builder.add_variable("amplitude", - dimensions=["inline", "crossline", "depth"]) + dimensions=["inline", "crossline", "depth"], + data_type = ScalarType.FLOAT32) def test_add_variable_with_defaults() -> None: @@ -82,7 +88,9 @@ def test_add_variable_with_defaults() -> None: builder.add_dimension("crossline", 100) builder.add_dimension("depth", 100) # Add variable using defaults - builder.add_variable("seismic_amplitude", dimensions=["inline", "crossline", "depth"]) + builder.add_variable("seismic_amplitude", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32) assert len(builder._dimensions) == 3 assert len(builder._variables) == 4 assert len(builder._coordinates) == 0 diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 488a9e6d..2af86cdf 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -33,7 +33,9 @@ def test_build() -> None: .add_dimension("y", 200) .add_coordinate("x_coord", dimensions=["x"]) .add_coordinate("y_coord", dimensions=["y"]) - .add_variable("data", dimensions=["x", "y"], long_name="Test Data") + .add_variable("data", dimensions=["x", "y"], + long_name="Test Data", + data_type=ScalarType.FLOAT32) .build() ) assert isinstance(dataset, Dataset) From 0a4be3f34dbb10ad5aac6716673fd479d816549c Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 8 Jul 2025 18:24:51 +0000 Subject: [PATCH 12/27] RE review: do not add dimension variable --- src/mdio/schemas/v1/dataset_builder.py | 152 ++++---- .../v1/test_dataset_builder_add_coordinate.py | 143 ++++--- .../v1/test_dataset_builder_add_dimension.py | 167 +-------- .../v1/test_dataset_builder_add_variable.py | 349 +++++++++++++----- tests/unit/v1/test_dataset_builder_build.py | 105 ++++-- tests/unit/v1/test_dataset_builder_helpers.py | 18 +- 6 files changed, 495 insertions(+), 439 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index dbedd3d6..f0621de3 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -46,10 +46,10 @@ class _BuilderState(Enum): HAS_VARIABLES = auto() -def _get_dimension( +def _get_named_dimension( dimensions: list[NamedDimension], name: str, size: int | None = None ) -> NamedDimension | None: - """Get a dimension by name and size from the list[NamedDimension] .""" + """Get a dimension by name and optional size from the list[NamedDimension].""" if dimensions is None: return False if not isinstance(name, str): @@ -111,25 +111,18 @@ def __init__(self, name: str, attributes: UserAttributes | None = None): self._state = _BuilderState.INITIAL self._unnamed_variable_counter = 0 - def add_dimension( # noqa: PLR0913 self, name: str, - size: int, - var_data_type: ScalarType | StructuredType = ScalarType.INT32, - var_metadata_info: VariableMetadataList | None = None, + size: int ) -> "MDIODatasetBuilder": """Add a dimension. This function be called at least once before adding coordinates or variables. 
- This call will create a dimension variable, if one does not yet exists Args: name: Name of the dimension size: Size of the dimension - var_long_name: Optional long name for the dimension variable - var_data_type: Data type for the dimension variable (defaults to INT32) - var_metadata_info: Optional metadata information for the dimension variable Returns: self: Returns self for method chaining @@ -137,7 +130,7 @@ def add_dimension( # noqa: PLR0913 if not name: msg = "'name' must be a non-empty string" raise ValueError(msg) - + # Validate that the dimension is not already defined old_var = next((e for e in self._dimensions if e.name == name), None) if old_var is not None: @@ -146,26 +139,34 @@ def add_dimension( # noqa: PLR0913 dim = NamedDimension(name=name, size=size) self._dimensions.append(dim) - - meta_dict = _to_dictionary(var_metadata_info) - # Create a variable for the dimension - dim_var = Variable( - name=name, - longName=f"'{name}' dimension variable", - # IMPORTANT: we use NamedDimension here, not the dimension name. - # Since the Dataset does not have a dimension list, we need to preserve - # NamedDimension somewhere. Namely, in the variable created for the dimension - dimensions=[dim], - dataType=var_data_type, - compressor=None, - coordinates=None, - metadata=meta_dict, - ) - self._variables.append(dim_var) - self._state = _BuilderState.HAS_DIMENSIONS return self + def _get_coordinate( + self, + coordinates: list[Coordinate] | list[str], + name: str, size: int | None = None + ) -> Coordinate | None: + """Get a coordinate by name from the list[Coordinate] | list[str].""" + if coordinates is None: + return None + + for c in coordinates: + if isinstance(c, str) and c == name: + # The coordinate is stored by name (str). + # Find it in the builder global list and return it. + cc = next((v for v in self._coordinates if v.name == name), None) + if cc is None: + msg = f"Pre-existing coordinate named {name!r} is not found" + raise ValueError(msg) + return cc + if isinstance(c, Coordinate) and c.name == name: + # The coordinate is stored as an embedded Coordinate object. + # Return it. + return c + + return None + def add_coordinate( # noqa: PLR0913 self, name: str, @@ -173,6 +174,7 @@ def add_coordinate( # noqa: PLR0913 long_name: str = None, dimensions: list[str], data_type: ScalarType = ScalarType.FLOAT32, + compressor: Blosc | ZFP | None = None, metadata_info: CoordinateMetadataList | None = None, ) -> "MDIODatasetBuilder": """Add a coordinate after adding at least one dimension. 
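Between the hunks, the net effect of this rework is easiest to see at a call site: `add_dimension()` now only records a `NamedDimension` (no dimension variable is created behind the scenes), and the sampling variable along a dimension is requested explicitly through `add_dimension_coordinate()`, introduced in the next hunk. A minimal sketch, assuming the builder API exactly as amended in this patch (the dimension name and size are illustrative):

```Python
from mdio.schemas.dtype import ScalarType
from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder

builder = MDIODatasetBuilder("example")

# Records the NamedDimension only; no backing variable is added for it.
builder.add_dimension("depth", 384)

# Explicitly creates the coordinate (and its backing variable)
# that represents sampling along the 'depth' dimension.
builder.add_dimension_coordinate("depth", data_type=ScalarType.UINT32)
```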
@@ -200,48 +202,66 @@ def add_coordinate( # noqa: PLR0913 msg = "'dimensions' must be a non-empty list" raise ValueError(msg) old_var = next((e for e in self._coordinates if e.name == name), None) - # Validate that the coordinate is not already defined if old_var is not None: msg = "Adding coordinate with the same name twice is not allowed" raise ValueError(msg) - + # Validate that all referenced dimensions are already defined - for dim in dimensions: - if next((d for d in self._dimensions if d.name == dim), None) is None: - msg = f"Pre-existing dimension named {dim!r} is not found" + named_dimensions = [] + for dim_name in dimensions: + nd = _get_named_dimension(self._dimensions, dim_name) + if nd is None: + msg = f"Pre-existing dimension named {dim_name!r} is not found" raise ValueError(msg) + named_dimensions.append(nd) meta_dict = _to_dictionary(metadata_info) coord = Coordinate( name=name, longName=long_name, - # We ass names: sts, not list[NamedDimension | str] - dimensions=dimensions, + dimensions=named_dimensions, + compressor=compressor, dataType=data_type, metadata=meta_dict ) self._coordinates.append(coord) # Add a coordinate variable to the dataset - var_meta_dict = _to_dictionary(coord.metadata) - coord_var = Variable( + self.add_variable( name=coord.name, - longName=f"'{coord.name}' coordinate variable", - dimensions=coord.dimensions, - dataType=coord.data_type, - compressor=None, - # IMPORTANT: we always use the Coordinate here, not the coordinate name - # Since the Dataset does not have a coordinate list, we need to preserve Coordinate - # somewhere. Namely, in the variable created for the coordinate - coordinates=[coord], - metadata=var_meta_dict + long_name=f"'{coord.name}' coordinate variable", + dimensions=dimensions, # dimension names (list[str]) + data_type=coord.data_type, + compressor=compressor, + coordinates=[name], # Use the coordinate name as a reference + metadata_info=coord.metadata ) - self._variables.append(coord_var) self._state = _BuilderState.HAS_COORDINATES return self + def add_dimension_coordinate( + self, + dimension_name: str, + *, + data_type: ScalarType, + compressor: Blosc | ZFP | None = None, + metadata_info: VariableMetadataList | None = None, + ) -> "MDIODatasetBuilder": + """Add a dimension coordinate variable for a pre-existing dimension. + This is a convenience method to create a coordinate variable + that represents sampling along a dimension. + """ + self.add_coordinate(dimension_name, + long_name=dimension_name, + dimensions=[dimension_name], + data_type=data_type, + compressor=compressor, + metadata_info=_to_dictionary(metadata_info)) + + return self + def add_variable( # noqa: PLR0913 self, name: str, @@ -255,9 +275,13 @@ def add_variable( # noqa: PLR0913 ) -> "MDIODatasetBuilder": """Add a variable after adding at least one dimension and, optionally, coordinate. - This function must be called after all required dimensions are added via add_dimension(). + This function must be called after all required dimensions are added via add_dimension() This function must be called after all required coordinates are added via add_coordinate(). + If this function is called with a single dimension name that matches the variable name, + it will create a dimension variable. Dimension variables are special variables that + represent sampling along a dimension. 
+ Args: name: Name of the variable long_name: Optional long name for the variable @@ -280,7 +304,7 @@ def add_variable( # noqa: PLR0913 if dimensions is None or not dimensions: msg = "'dimensions' must be a non-empty list" raise ValueError(msg) - + # Validate that the variable is not already defined old_var = next((e for e in self._variables if e.name == name), None) if old_var is not None: @@ -288,10 +312,13 @@ def add_variable( # noqa: PLR0913 raise ValueError(msg) # Validate that all referenced dimensions are already defined - for dim in dimensions: - if next((e for e in self._dimensions if e.name == dim), None) is None: - msg = f"Pre-existing dimension named {dim!r} is not found" + named_dimensions = [] + for dim_name in dimensions: + nd = _get_named_dimension(self._dimensions, dim_name) + if nd is None: + msg = f"Pre-existing dimension named {dim_name!r} is not found" raise ValueError(msg) + named_dimensions.append(nd) # Validate that all referenced coordinates are already defined if coordinates is not None: @@ -301,17 +328,16 @@ def add_variable( # noqa: PLR0913 raise ValueError(msg) meta_dict = _to_dictionary(metadata_info) - self._variables.append( - Variable( - name=name, - long_name=long_name, - dimensions=dimensions, - data_type=data_type, - compressor=compressor, - coordinates=coordinates, - metadata=meta_dict, - ) - ) + var = Variable( + name=name, + long_name=long_name, + dimensions=named_dimensions, + data_type=data_type, + compressor=compressor, + coordinates=coordinates, + metadata=meta_dict) + self._variables.append(var) + self._state = _BuilderState.HAS_VARIABLES return self diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index 0adb9240..cae46834 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -6,13 +6,15 @@ import pytest +from mdio.schemas.compressors import Blosc from mdio.schemas.dtype import ScalarType from mdio.schemas.metadata import UserAttributes -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, _get_named_dimension from mdio.schemas.v1.dataset_builder import _BuilderState from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel +from mdio.schemas.v1.variable import VariableMetadata def test_add_coordinate() -> None: @@ -25,7 +27,7 @@ def test_add_coordinate() -> None: builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) builder.add_dimension("inline", 100) - builder.add_dimension("crossline", 100) + builder.add_dimension("crossline", 200) # Validate required parameters bad_name = None @@ -47,18 +49,21 @@ def test_add_coordinate() -> None: builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) assert builder._state == _BuilderState.HAS_COORDINATES assert len(builder._dimensions) == 2 - # 2 variables for dimensions, 1 variable for coordinates - assert len(builder._variables) == 3 + # 1 variable for coordinates + assert len(builder._variables) == 1 assert len(builder._coordinates) == 1 # Validate that we created a coordinate variable var_cdp = next(e for e in builder._variables if e.name == "cdp-x") assert var_cdp is not None - # Validate that dimensions are stored as names - assert set(var_cdp.dimensions) == {"inline", "crossline"} + assert len(var_cdp.dimensions) == 2 + assert _get_named_dimension(var_cdp.dimensions, "inline", 
100) is not None + assert _get_named_dimension(var_cdp.dimensions, "crossline", 200) is not None # Validate that coordinates are stored as Coordinate assert len(var_cdp.coordinates) == 1 - assert next((e for e in var_cdp.coordinates if e.name == "cdp-x"), None) is not None + # No dimensions are stored in coordinates + # Validate that non-dimension coordinates + assert builder._get_coordinate(var_cdp.coordinates, "cdp-x") is not None # Adding coordinate with the same name twice msg="Adding coordinate with the same name twice is not allowed" @@ -69,96 +74,76 @@ def test_add_coordinate_with_defaults() -> None: """Test adding coordinates with default arguments.""" builder = MDIODatasetBuilder("test_dataset") builder.add_dimension("inline", 100) - builder.add_dimension("crossline", 100) + builder.add_dimension("crossline", 200) # Add coordinate using defaults builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) assert len(builder._dimensions) == 2 - # 2 variables for dimensions, 1 variable for coordinates - assert len(builder._variables) == 3 + # 1 variable for coordinates + assert len(builder._variables) == 1 assert len(builder._coordinates) == 1 - crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) - assert crd0 is not None - # NOTE: add_variable() stores dimensions as names - assert set(crd0.dimensions) == {"inline", "crossline"} - assert crd0.long_name is None # Default value - assert crd0.data_type == ScalarType.FLOAT32 # Default value - assert crd0.metadata is None # Default value - -def test_coordinate_with_units() -> None: - """Test adding coordinates with units.""" - builder = MDIODatasetBuilder("test_dataset") - builder.add_dimension("inline", 100) - builder.add_dimension("crossline", 100) - # Add coordinate with units - builder.add_coordinate( - "cdp", - dimensions=["inline", "crossline"], - metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT))] - ) - assert len(builder._dimensions) == 2 - # 2 variables for dimensions, 1 variable for coordinates - assert len(builder._variables) == 3 - assert len(builder._coordinates) == 1 - crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) - assert crd0 is not None - # NOTE: add_coordinate() stores dimensions as names - assert set(crd0.dimensions) == {"inline", "crossline"} - assert crd0.long_name is None # Default value - assert crd0.data_type == ScalarType.FLOAT32 # Default value - assert crd0.metadata.attributes is None - assert crd0.metadata.units_v1.length == LengthUnitEnum.FOOT - -def test_coordinate_with_attributes() -> None: - """Test adding coordinates with attributes.""" - builder = MDIODatasetBuilder("test_dataset") - builder.add_dimension("inline", 100) - builder.add_dimension("crossline", 100) - # Add coordinate with attributes - builder.add_coordinate( - "cdp", - dimensions=["inline", "crossline"], - metadata_info=[UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})], - ) - assert len(builder._dimensions) == 2 - # 2 variables for dimensions, 1 variable for coordinates - assert len(builder._variables) == 3 - assert len(builder._coordinates) == 1 - # NOTE: add_coordinate() stores dimensions as names - crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) - assert crd0 is not None - assert set(crd0.dimensions) == {"inline", "crossline"} - assert crd0.long_name is None # Default value - assert crd0.data_type == ScalarType.FLOAT32 # Default value - assert crd0.metadata.attributes["MGA"] == 51 - assert 
crd0.metadata.attributes["UnitSystem"] == "Imperial" - assert crd0.metadata.units_v1 is None + # Validate: the structure of the coordinate + coord_cdp = next((e for e in builder._coordinates if e.name == "cdp"), None) + assert coord_cdp is not None + assert len(coord_cdp.dimensions) == 2 + assert _get_named_dimension(coord_cdp.dimensions, "inline", 100) is not None + assert _get_named_dimension(coord_cdp.dimensions, "crossline", 200) is not None + assert coord_cdp.long_name is None # Default value + assert coord_cdp.data_type == ScalarType.FLOAT32 # Default value + assert coord_cdp.compressor is None # Default value + assert coord_cdp.metadata is None # Default value -def test_coordinate_with_full_metadata() -> None: +def test_coordinate_with_full_parameters() -> None: """Test adding coordinates with all metadata.""" builder = MDIODatasetBuilder("test_dataset") builder.add_dimension("inline", 100) - builder.add_dimension("crossline", 100) + builder.add_dimension("crossline", 200) # Add coordinate with all metadata builder.add_coordinate( "cdp", + long_name = "Common Depth Point", dimensions=["inline", "crossline"], + data_type = ScalarType.FLOAT16, + compressor = Blosc(algorithm="zstd"), metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})] ) + # Validate: the state of the builder + assert builder._state == _BuilderState.HAS_COORDINATES assert len(builder._dimensions) == 2 - # 2 variables for dimensions, 1 variable for coordinates - assert len(builder._variables) == 3 + # 1 variable for coordinates + assert len(builder._variables) == 1 assert len(builder._coordinates) == 1 - # NOTE: add_coordinate() stores dimensions as names - crd0 = next((e for e in builder._coordinates if e.name == "cdp"), None) - assert crd0 is not None - assert set(crd0.dimensions) == {"inline", "crossline"} - assert crd0.long_name is None # Default value - assert crd0.data_type == ScalarType.FLOAT32 # Default value - assert crd0.metadata.attributes["MGA"] == 51 - assert crd0.metadata.attributes["UnitSystem"] == "Imperial" - assert crd0.metadata.units_v1.length == LengthUnitEnum.FOOT + # Validate: the structure of the coordinate + coord_cdp = next((e for e in builder._coordinates if e.name == "cdp"), None) + assert coord_cdp is not None + assert len(coord_cdp.dimensions) == 2 + assert _get_named_dimension(coord_cdp.dimensions, "inline", 100) is not None + assert _get_named_dimension(coord_cdp.dimensions, "crossline", 200) is not None + assert coord_cdp.long_name == "Common Depth Point" + assert coord_cdp.data_type == ScalarType.FLOAT16 + assert isinstance(coord_cdp.compressor, Blosc) + assert coord_cdp.compressor.algorithm == "zstd" + assert coord_cdp.metadata.attributes["MGA"] == 51 + assert coord_cdp.metadata.attributes["UnitSystem"] == "Imperial" + assert coord_cdp.metadata.units_v1.length == LengthUnitEnum.FOOT + + # Validate: the structure of the created variable + v = next((v for v in builder._variables if v.name == "cdp"), None) + assert v is not None + assert v.long_name == "'cdp' coordinate variable" + assert len(v.dimensions) == 2 + assert _get_named_dimension(v.dimensions, "inline", 100) is not None + assert _get_named_dimension(v.dimensions, "crossline", 200) is not None + assert v.data_type == ScalarType.FLOAT16 + assert isinstance(v.compressor, Blosc) + assert v.compressor.algorithm == "zstd" + assert len(v.coordinates) == 1 + assert builder._get_coordinate(v.coordinates, "cdp") is not None + assert 
isinstance(v.metadata, VariableMetadata) + assert v.metadata.units_v1.length == LengthUnitEnum.FOOT + assert v.metadata.attributes["MGA"] == 51 + assert v.metadata.attributes["UnitSystem"] == "Imperial" diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index 1465a892..b701380f 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -13,7 +13,7 @@ from mdio.schemas.metadata import UserAttributes from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_builder import _BuilderState -from mdio.schemas.v1.dataset_builder import _get_dimension +from mdio.schemas.v1.dataset_builder import _get_named_dimension from mdio.schemas.v1.stats import CenteredBinHistogram from mdio.schemas.v1.stats import StatisticsMetadata from mdio.schemas.v1.stats import SummaryStatistics @@ -38,21 +38,7 @@ def test_add_dimension() -> None: builder.add_dimension("x", 100) assert builder._state == _BuilderState.HAS_DIMENSIONS assert len(builder._dimensions) == 1 - assert _get_dimension(builder._dimensions, "x", 100) is not None - - # Validate that we have created a dimension variable and - # that variable has the embedded NamedDimension - assert len(builder._variables) == 1 - var_x = next(e for e in builder._variables if e.name == "x") - assert var_x is not None - assert len(var_x.dimensions) == 1 - # Validate that the dimension variable has the NamedDimension - assert _get_dimension(var_x.dimensions, "x", 100) is not None - assert var_x.long_name == "'x' dimension variable" - assert var_x.data_type == ScalarType.INT32 - assert var_x.compressor is None - assert var_x.coordinates is None - assert var_x.metadata is None + assert _get_named_dimension(builder._dimensions, "x", 100) is not None # Validate that we can't add a dimension with the same name twice with pytest.raises( @@ -66,152 +52,3 @@ def test_add_dimension() -> None: with pytest.raises(ValueError, match=msg): builder.add_dimension("x", 200) -def test_add_dimension_with_defaults() -> None: - """Test dimension builder state transitions and functionality.""" - builder = MDIODatasetBuilder("test_dataset") - - # First dimension should change state to HAS_DIMENSIONS and create a variable - builder.add_dimension("x", 100) - assert builder._state == _BuilderState.HAS_DIMENSIONS - assert len(builder._dimensions) == 1 - # Validate that the dimension builder has the NamedDimension - assert _get_dimension(builder._dimensions, "x", 100) is not None - var_x = next((e for e in builder._variables if e.name == "x"), None) - assert var_x is not None - assert var_x.name == "x" - assert var_x.long_name == "'x' dimension variable" - assert var_x.data_type == ScalarType.INT32 - assert var_x.compressor is None - assert var_x.coordinates is None - assert var_x.metadata is None - -def test_add_dimension_with_units() -> None: - """Test adding dimensions with units.""" - builder = MDIODatasetBuilder("test_dataset") - - # Add dimension with strongly-typed unit list of single-item - builder.add_dimension( - "length", - size=100, - var_data_type=ScalarType.FLOAT64, - var_metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT))] - ) - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "length" - assert var0.long_name == "'length' dimension variable" - assert var0.data_type == ScalarType.FLOAT64 - assert var0.compressor is None - assert var0.coordinates is None - 
assert var0.metadata.units_v1.length == LengthUnitEnum.FOOT - -def test_add_dimension_with_attributes() -> None: - """Test adding dimensions with attributes.""" - builder = MDIODatasetBuilder("test_dataset") - - builder.add_dimension( - "length", - size=100, - var_data_type=ScalarType.FLOAT32, - var_metadata_info=[UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})], - ) - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "length" - assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.attributes["MGA"] == 51 - assert var0.metadata.attributes["UnitSystem"] == "Imperial" - - -def test_add_dimension_with_chunk_grid() -> None: - """Test adding dimensions with chunk grid.""" - builder = MDIODatasetBuilder("test_dataset") - - grid_definition = RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) - builder.add_dimension( - "length", - size=100, - var_data_type=ScalarType.FLOAT32, - var_metadata_info=[ChunkGridMetadata(chunk_grid=grid_definition)], - ) - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "length" - assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.chunk_grid.name == "regular" - assert var0.metadata.chunk_grid.configuration.chunk_shape == [20] - - -def test_add_dimension_with_stats() -> None: - """Test adding dimensions with stats.""" - builder = MDIODatasetBuilder("test_dataset") - - builder.add_dimension( - "depth", - size=100, - var_data_type=ScalarType.FLOAT32, - var_metadata_info=[ - StatisticsMetadata( - stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - # TODO(DmitriyRepin, #0): Also test EdgeDefinedHistogram - histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), - ) - ) - ], - ) - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "depth" - assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.stats_v1.count == 100 - assert var0.metadata.stats_v1.sum == 1215.1 - - -def test_add_dimension_with_full_metadata() -> None: - """Test adding dimensions with all metadata.""" - builder = MDIODatasetBuilder("test_dataset") - - builder.add_dimension( - "length", - size=100, - var_data_type=ScalarType.FLOAT32, - var_metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), - UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), - ChunkGridMetadata( - chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) - ), - StatisticsMetadata( - stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), - ) - ), - ], - ) - - assert len(builder._variables) == 1 - var0 = builder._variables[0] - assert var0.name == "length" - assert var0.data_type == ScalarType.FLOAT32 - assert var0.metadata.units_v1.length == LengthUnitEnum.FOOT - assert var0.metadata.attributes["MGA"] == 51 - assert var0.metadata.attributes["UnitSystem"] == "Imperial" - assert var0.metadata.chunk_grid.name == "regular" - assert var0.metadata.chunk_grid.configuration.chunk_shape == [20] - assert var0.metadata.stats_v1.count == 100 - assert var0.metadata.stats_v1.sum == 1215.1 - assert var0.metadata.stats_v1.sum_squares == 125.12 - assert var0.metadata.stats_v1.min == 5.61 - assert var0.metadata.stats_v1.max == 10.84 - assert var0.metadata.stats_v1.histogram.bin_centers == [1, 2] - assert 
var0.metadata.stats_v1.histogram.counts == [10, 15]
diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py
index 5484ea92..ad24b9de 100644
--- a/tests/unit/v1/test_dataset_builder_add_variable.py
+++ b/tests/unit/v1/test_dataset_builder_add_variable.py
@@ -6,13 +6,14 @@
 import pytest
 
 from mdio.schemas.chunk_grid import RegularChunkGrid
 from mdio.schemas.chunk_grid import RegularChunkShape
 from mdio.schemas.compressors import Blosc
 from mdio.schemas.dtype import ScalarType
 from mdio.schemas.metadata import ChunkGridMetadata
 from mdio.schemas.metadata import UserAttributes
 from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder
 from mdio.schemas.v1.dataset_builder import _BuilderState
+from mdio.schemas.v1.dataset_builder import _get_named_dimension
 from mdio.schemas.v1.stats import CenteredBinHistogram
 from mdio.schemas.v1.stats import StatisticsMetadata
@@ -23,145 +24,299 @@
 from mdio.schemas.v1.variable import VariableMetadata
 
 
-def test_add_variable() -> None:
-    """Test adding variable. Check the state transition and validate required parameters.."""
+def test_add_variable_no_coords() -> None:
+    """Test adding variable. Check the state transition and validate required parameters."""
     builder = MDIODatasetBuilder("test_dataset")
     assert builder._state == _BuilderState.INITIAL
-
+
+    # Validate: Must add at least one dimension before adding variables
     msg = "Must add at least one dimension before adding variables"
     with pytest.raises(ValueError, match=msg):
-        builder.add_variable("amplitude", dimensions=["speed"], data_type = ScalarType.FLOAT32)
+        builder.add_variable("amplitude", dimensions=["speed"],
+                             data_type=ScalarType.FLOAT32)
 
+    # Add dimensions before we can add a data variable
     builder.add_dimension("inline", 100)
-    builder.add_dimension("crossline", 100)
-    builder.add_dimension("depth", 100)
+    builder.add_dimension("crossline", 200)
+    builder.add_dimension("depth", 300)
 
-    # Validate required parameters
+    # Validate: required parameters must be present
     bad_name = None
     with pytest.raises(ValueError, match="'name' must be a non-empty string"):
-        builder.add_variable(bad_name, dimensions=["speed"], data_type = ScalarType.FLOAT32)
+        builder.add_variable(bad_name, dimensions=["speed"],
+                             data_type=ScalarType.FLOAT32)
     with pytest.raises(ValueError, match="'name' must be a non-empty string"):
-        builder.add_variable("", dimensions=["speed"], data_type = ScalarType.FLOAT32)
+        builder.add_variable("", dimensions=["speed"],
+                             data_type=ScalarType.FLOAT32)
     with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"):
-        builder.add_variable("bad_amplitude", dimensions=None, data_type = ScalarType.FLOAT32)
+        builder.add_variable("bad_amplitude", dimensions=None,
+                             data_type=ScalarType.FLOAT32)
     with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"):
-        builder.add_variable("bad_amplitude", dimensions=[], data_type = ScalarType.FLOAT32)
+        builder.add_variable("bad_amplitude", dimensions=[],
+                             data_type=ScalarType.FLOAT32)
 
-    # Add a variable using non-existent dimensions
-    msg="Pre-existing dimension named 'xline' is not found"
+    # Validate: adding a variable using non-existent dimensions is not allowed
+    msg = "Pre-existing dimension named 'il' is not found"
     with pytest.raises(ValueError, match=msg):
         builder.add_variable("bad_amplitude",
-                             dimensions=["inline", "xline", "depth"],
-                             data_type = ScalarType.FLOAT32)
+                             dimensions=["il", "xl", "depth"],
+                             data_type=ScalarType.FLOAT32)
 
-    builder.add_variable("amplitude",
-                         dimensions=["inline", "crossline", "depth"],
-                         data_type = ScalarType.FLOAT32)
+    # Add a variable without coordinates
+    builder.add_variable("amplitude",
+                         dimensions=["inline", "crossline", "depth"],
+                         data_type=ScalarType.FLOAT32)
 
     assert builder._state == _BuilderState.HAS_VARIABLES
-    assert len(builder._dimensions) == 3
-    assert len(builder._variables) == 4
-    assert len(builder._coordinates) == 0
+    assert len(builder._dimensions) == 3
+    assert len(builder._variables) == 1
+    assert len(builder._coordinates) == 0
+
+    # Validate the structure of the created variable
+    var_ampl = next((e for e in builder._variables if e.name == "amplitude"), None)
+    assert var_ampl is not None
+    # Validate that dimensions are stored as NamedDimensions
+    assert _get_named_dimension(var_ampl.dimensions, "inline", 100) is not None
+    assert _get_named_dimension(var_ampl.dimensions, "crossline", 200) is not None
+    assert _get_named_dimension(var_ampl.dimensions, "depth", 300) is not None
+    # Validate that no coordinates are set
+    assert var_ampl.coordinates is None
 
-    # Add a variable using non-existent coordinates
-    msg="Pre-existing coordinate named 'cdp-x' is not found"
+    # Validate: adding a variable with the same name twice is not allowed
+    msg = "Adding variable with the same name twice is not allowed"
     with pytest.raises(ValueError, match=msg):
-        builder.add_variable("bad_amplitude",
+        builder.add_variable("amplitude",
                              dimensions=["inline", "crossline", "depth"],
-                             data_type = ScalarType.FLOAT32,
-                             coordinates=["cdp-x", "cdp-y"])
+                             data_type=ScalarType.FLOAT32)
 
-    builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"])
-    builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"])
-    # Adding variable with the same name twice
-    msg="Adding variable with the same name twice is not allowed"
+
+
+def test_add_variable_with_coords() -> None:
+    """Test adding a variable with coordinates."""
+    builder = MDIODatasetBuilder("test_dataset")
+    builder.add_dimension("inline", 100)
+    builder.add_dimension("crossline", 200)
+    builder.add_dimension("depth", 300)
+
+    # Add dimension coordinates before we can add a data variable
+    builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32)
+    builder.add_dimension_coordinate("crossline", data_type=ScalarType.INT32)
+
+    # Validate: adding a variable with a coordinate that has not been pre-created is not allowed
+    msg = "Pre-existing coordinate named 'depth' is not found"
+    with pytest.raises(ValueError, match=msg):
+        builder.add_variable("ampl",
+                             dimensions=["inline", "crossline", "depth"],
+                             coordinates=["inline", "crossline", "depth"],
+                             data_type=ScalarType.FLOAT32)
+
+    # Add a variable with pre-defined dimension coordinates
+    builder.add_variable("ampl",
+                         dimensions=["inline", "crossline", "depth"],
+                         coordinates=["inline", "crossline"],
+                         data_type=ScalarType.FLOAT32)
+
+    assert builder._state == _BuilderState.HAS_VARIABLES
+    assert len(builder._dimensions) == 3
+    # 2 dim coordinate variables + 1 data variable
+    assert len(builder._variables) == 3
+    assert len(builder._coordinates) == 2
+
+    # Validate: the structure of the created variable
+    var_ampl = next((e for e in builder._variables if e.name == "ampl"), None)
+    assert var_ampl is not None
+    # Validate: that dimensions are stored as NamedDimensions
+    assert len(var_ampl.dimensions) == 3
+    assert _get_named_dimension(var_ampl.dimensions, "inline", 100) is not None
+    assert _get_named_dimension(var_ampl.dimensions, "crossline", 200) is not None
+    assert _get_named_dimension(var_ampl.dimensions, "depth", 300) is not None
+    assert len(var_ampl.coordinates) == 2
+    # Validate that dim coordinates "inline" and "crossline" are set
+    assert builder._get_coordinate(var_ampl.coordinates, "inline") is not None
+    assert builder._get_coordinate(var_ampl.coordinates, "crossline") is not None
+    # The "depth" coordinate is not set
+
+    # Add non-dim coordinates (e.g., 2D coordinates)
+    builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"])
+    builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"])
+
+    # Add a variable with pre-defined dimension and non-dimension coordinates
+    builder.add_variable("ampl2",
+                         dimensions=["inline", "crossline", "depth"],
+                         coordinates=["inline", "crossline", "cdp-x", "cdp-y"],
+                         data_type=ScalarType.FLOAT32)
+
+    assert builder._state == _BuilderState.HAS_VARIABLES
+    assert len(builder._dimensions) == 3
+    # 2 dim coordinate variables + 2 non-dim coordinate variables + 2 data variables
+    assert len(builder._variables) == 6
+    assert len(builder._coordinates) == 4
+
+    # Validate: the structure of the created variable
+    var_ampl2 = next((e for e in builder._variables if e.name == "ampl2"), None)
+    assert var_ampl2 is not None
+    # Validate: that dimensions are stored as NamedDimensions
+    assert len(var_ampl2.dimensions) == 3
+    assert _get_named_dimension(var_ampl2.dimensions, "inline", 100) is not None
+    assert _get_named_dimension(var_ampl2.dimensions, "crossline", 200) is not None
+    assert _get_named_dimension(var_ampl2.dimensions, "depth", 300) is not None
+    assert len(var_ampl2.coordinates) == 4
+    # Validate that dim coordinates "inline" and "crossline" are set
+    assert builder._get_coordinate(var_ampl2.coordinates, "inline") is not None
+    assert builder._get_coordinate(var_ampl2.coordinates, "crossline") is not None
+    # The "depth" coordinate is not set
+    # Validate that non-dimension coordinates "cdp-x" and "cdp-y" are set
+    assert builder._get_coordinate(var_ampl2.coordinates, "cdp-x") is not None
+    assert builder._get_coordinate(var_ampl2.coordinates, "cdp-y") is not None
+
+
+def test_add_dimension_coordinate() -> None:
+    """Test adding a dimension coordinate variable."""
+    builder = MDIODatasetBuilder("test_dataset")
+    builder.add_dimension("inline", 100)
+
+    builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32)
+
+    # Validate: that the coordinate is stored in the builder global list
+    coord_il = builder._get_coordinate(builder._coordinates, "inline")
+    # Validate: that dimensions are stored as NamedDimensions in the coordinate
+    assert _get_named_dimension(coord_il.dimensions, "inline", 100) is not None
+    # Validate: a dim variable has been created
+    var_il = next((e for e in builder._variables if e.name == "inline"), None)
+    assert var_il is not None
+    # Validate: the variable has the expected properties
+    assert var_il.name == "inline"
+    assert var_il.long_name == "'inline' coordinate variable"
+    assert len(var_il.dimensions) == 1
+    assert _get_named_dimension(var_il.dimensions, "inline", 100) is not None
+    assert var_il.data_type == ScalarType.INT32
+    assert var_il.compressor is None  # Default value
+    assert len(var_il.coordinates) == 1
+    assert builder._get_coordinate(var_il.coordinates, "inline") is not None
+    assert var_il.metadata is None  # Default value
 
 
 def test_add_variable_with_defaults() -> None:
     """Test adding variable with default arguments."""
     builder = MDIODatasetBuilder("test_dataset")
+    # Add dimensions before we can add a data variable
     builder.add_dimension("inline", 100)
-    builder.add_dimension("crossline", 100)
-    builder.add_dimension("depth", 100)
-    # Add variable using defaults
-    builder.add_variable("seismic_amplitude",
+    builder.add_dimension("crossline", 200)
+    builder.add_dimension("depth", 300)
+    # Add dimension coordinates
+    builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32)
+    builder.add_dimension_coordinate("crossline", data_type=ScalarType.INT32)
+    builder.add_dimension_coordinate("depth",
+                                     data_type=ScalarType.FLOAT32,
+                                     metadata_info=[
+                                         AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))])
+
+    # Add data variable using defaults
+    builder.add_variable("ampl",
                          dimensions=["inline", "crossline", "depth"],
                          data_type=ScalarType.FLOAT32)
-    assert len(builder._dimensions) == 3
-    assert len(builder._variables) == 4
-    assert len(builder._coordinates) == 0
-    var0 = next((e for e in builder._variables if e.name == "seismic_amplitude"), None)
-    assert var0 is not None
-    # NOTE: add_variable() stores dimensions as names
-    assert set(var0.dimensions) == {"inline", "crossline", "depth"}
-    assert var0.long_name is None  # Default value
-    assert var0.data_type == ScalarType.FLOAT32  # Default value
-    assert var0.compressor is None  # Default value
-    assert var0.coordinates is None  # Default value
-    assert var0.metadata is None  # Default value
+    assert len(builder._dimensions) == 3
+    # 3 dim coordinate variables + 1 data variable = 4
+    assert len(builder._variables) == 4
+    assert len(builder._coordinates) == 3
+
+    # Validate: the structure of the created variable
+    var_ampl = next((e for e in builder._variables if e.name == "ampl"), None)
+    assert var_ampl is not None
+    assert var_ampl.name == "ampl"
+    assert var_ampl.long_name is None  # Default value
+    # Validate: that dimensions are stored as NamedDimensions
+    assert len(var_ampl.dimensions) == 3
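+    # (each name in 'dimensions' was registered via add_dimension() above, so it should
+    # resolve to a NamedDimension carrying both the name and its declared size)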
+ assert _get_named_dimension(var_ampl.dimensions, "inline", 100) is not None + assert _get_named_dimension(var_ampl.dimensions, "crossline", 200) is not None + assert _get_named_dimension(var_ampl.dimensions, "depth", 300) is not None + assert var_ampl.data_type == ScalarType.FLOAT32 + assert var_ampl.compressor is None # Default value + assert var_ampl.coordinates is None # Default value + # Validate: the variable has the expected properties + assert var_ampl.metadata is None # Default value + def test_add_variable_full_parameters() -> None: """Test adding variable with full parameters.""" builder = MDIODatasetBuilder("test_dataset") + # Add dimensions before we can add a data variables builder.add_dimension("inline", 100) - builder.add_dimension("crossline", 100) - builder.add_dimension("depth", 100) - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) - builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"]) - builder.add_variable("seismic_amplitude", - long_name="Amplitude (dimensionless)", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT64, - compressor=Blosc(algorithm="zstd"), - coordinates=["cdp-x", "cdp-y"], - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), - UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), - ChunkGridMetadata( - chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) - ), - StatisticsMetadata( - stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), - ) - ), - ]) + builder.add_dimension("crossline", 200) + builder.add_dimension("depth", 300) + + # Add dimension coordinates + builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32) + builder.add_dimension_coordinate("crossline", data_type=ScalarType.INT32) + builder.add_dimension_coordinate("depth", data_type=ScalarType.INT32) + + # Add coordinates before we can add a data variable + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) + builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) + + # Add data variable with full parameters + builder.add_variable("ampl", + long_name="Amplitude (dimensionless)", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata_info=[ + AllUnits(units_v1=LengthUnitModel( + length=LengthUnitEnum.FOOT)), + UserAttributes( + attributes={"MGA": 51, "UnitSystem": "Imperial"}), + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[20])) + ), + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram( + binCenters=[1, 2], counts=[10, 15]), + ) + ), + ]) + # Validate: the state of the builder + assert builder._state == _BuilderState.HAS_VARIABLES assert len(builder._dimensions) == 3 - assert len(builder._coordinates) == 2 - # We expect 6 variables: - # 3 variables for dimensions, 2 variables for coordinates, and 1 variable for seismic_amplitude - assert len(builder._variables) == 6 - v = next((v for v in builder._variables if v.name == "seismic_amplitude"), None) + # 3 dim coords + 2 non-dim coords = 5 + assert len(builder._coordinates) == 5 + # 3 dim coord + 2 non-dim 
coords, and 1 data variable + assert len(builder._variables) == 6 + + # Validate: the structure of the created variable + v = next((v for v in builder._variables if v.name == "ampl"), None) assert v is not None + assert v.name == "ampl" assert v.long_name == "Amplitude (dimensionless)" - # NOTE: add_variable() stores dimensions as names - assert set(v.dimensions) == {"inline", "crossline", "depth"} - assert v.data_type == ScalarType.FLOAT64 + assert len(v.dimensions) == 3 + assert _get_named_dimension(v.dimensions, "inline", 100) is not None + assert _get_named_dimension(v.dimensions, "crossline", 200) is not None + assert _get_named_dimension(v.dimensions, "depth", 300) is not None + assert v.data_type == ScalarType.FLOAT32 assert isinstance(v.compressor, Blosc) assert v.compressor.algorithm == "zstd" - # NOTE: add_variable() stores coordinates as names - assert set(v.coordinates) == {"cdp-x", "cdp-y"} - assert v.metadata.stats_v1.count == 100 + assert len(v.coordinates) == 5 + assert builder._get_coordinate(v.coordinates, "inline") is not None + assert builder._get_coordinate(v.coordinates, "crossline") is not None + assert builder._get_coordinate(v.coordinates, "depth") is not None + assert builder._get_coordinate(v.coordinates, "cdp-x") is not None + assert builder._get_coordinate(v.coordinates, "cdp-y") is not None + assert v.metadata.stats_v1.count == 100 assert isinstance(v.metadata, VariableMetadata) assert v.metadata.units_v1.length == LengthUnitEnum.FOOT assert v.metadata.attributes["MGA"] == 51 - assert v.metadata.attributes["UnitSystem"] == "Imperial" + assert v.metadata.attributes["UnitSystem"] == "Imperial" assert v.metadata.chunk_grid.name == "regular" - assert v.metadata.chunk_grid.configuration.chunk_shape == [20] - assert v.metadata.stats_v1.count == 100 - assert v.metadata.stats_v1.sum == 1215.1 - assert v.metadata.stats_v1.sum_squares == 125.12 - assert v.metadata.stats_v1.min == 5.61 - assert v.metadata.stats_v1.max == 10.84 - assert v.metadata.stats_v1.histogram.bin_centers == [1, 2] - assert v.metadata.stats_v1.histogram.counts == [10, 15] - + assert v.metadata.chunk_grid.configuration.chunk_shape == [20] + assert v.metadata.stats_v1.count == 100 + assert v.metadata.stats_v1.sum == 1215.1 + assert v.metadata.stats_v1.sum_squares == 125.12 + assert v.metadata.stats_v1.min == 5.61 + assert v.metadata.stats_v1.max == 10.84 + assert v.metadata.stats_v1.histogram.bin_centers == [1, 2] + assert v.metadata.stats_v1.histogram.counts == [10, 15] diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 2af86cdf..8e06f08d 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -14,7 +14,7 @@ from mdio.schemas.metadata import UserAttributes from mdio.schemas.v1.dataset import Dataset from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder -from mdio.schemas.v1.dataset_builder import _get_dimension +from mdio.schemas.v1.dataset_builder import _get_named_dimension from mdio.schemas.v1.stats import CenteredBinHistogram from mdio.schemas.v1.stats import StatisticsMetadata from mdio.schemas.v1.stats import SummaryStatistics @@ -29,21 +29,32 @@ def test_build() -> None: """Test building a complete dataset.""" dataset = ( MDIODatasetBuilder("test_dataset") - .add_dimension("x", 100) - .add_dimension("y", 200) - .add_coordinate("x_coord", dimensions=["x"]) - .add_coordinate("y_coord", dimensions=["y"]) - .add_variable("data", dimensions=["x", "y"], + .add_dimension("inline", 
100) + .add_dimension("crossline", 200) + # Add a dimension coordinate explicitly using add_coordinate() + .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) + # Add a dimension coordinate using .add_dimension_coordinate() shortcut + .add_dimension_coordinate("crossline", data_type=ScalarType.FLOAT64) + .add_coordinate("x_coord", dimensions=["inline", "crossline"]) + .add_coordinate("y_coord", dimensions=["inline", "crossline"]) + .add_variable("data", long_name="Test Data", + dimensions=["inline", "crossline"], + coordinates=["inline", "crossline", "x_coord", "y_coord"], data_type=ScalarType.FLOAT32) .build() ) + + # TODO: Expand this assert isinstance(dataset, Dataset) assert dataset.metadata.name == "test_dataset" - # 2 dimension variables + 1 data variable + 2 coordinate variables + # 2 dim coord var + 2 non-dim coord var + 1 data variables = 5 variables assert len(dataset.variables) == 5 - assert next(v for v in dataset.variables if v.name == "x") is not None - assert next(v for v in dataset.variables if v.name == "y") is not None + assert next(v for v in dataset.variables if v.name == "inline") is not None + assert next(v for v in dataset.variables if v.name == "crossline") is not None + assert next(v for v in dataset.variables if v.name == "x_coord") is not None + assert next(v for v in dataset.variables if v.name == "y_coord") is not None + assert next(v for v in dataset.variables if v.name == "data") is not None var_data = next(v for v in dataset.variables if v.name == "data") assert var_data is not None assert var_data.long_name == "Test Data" @@ -68,35 +79,40 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 inline_var = next(v for v in dataset.variables if v.name == "inline") assert inline_var.data_type == ScalarType.UINT32 # Dimension variables store dimensions as NamedDimension - assert _get_dimension(inline_var.dimensions, "inline", 256) + assert _get_named_dimension(inline_var.dimensions, "inline", 256) crossline_var = next(v for v in dataset.variables if v.name == "crossline") assert crossline_var.data_type == ScalarType.UINT32 # Dimension variables store dimensions as NamedDimension - assert _get_dimension(crossline_var.dimensions, "crossline", 512) + assert _get_named_dimension(crossline_var.dimensions, "crossline", 512) depth_var = next(v for v in dataset.variables if v.name == "depth") - assert depth_var.data_type == ScalarType.UINT32 + assert depth_var.data_type == ScalarType.FLOAT64 # Dimension variables store dimensions as NamedDimension - assert _get_dimension(depth_var.dimensions, "depth", 384) + assert _get_named_dimension(depth_var.dimensions, "depth", 384) assert depth_var.metadata.units_v1.length == LengthUnitEnum.METER # Verify coordinate variables cdp_x = next(v for v in dataset.variables if v.name == "cdp-x") assert cdp_x.data_type == ScalarType.FLOAT32 - # Coordinates variables store dimensions as names - assert set(cdp_x.dimensions) == {"inline", "crossline"} + assert len(cdp_x.dimensions) == 2 + assert _get_named_dimension(cdp_x.dimensions, "inline", 256) + assert _get_named_dimension(cdp_x.dimensions, "crossline", 512) assert cdp_x.metadata.units_v1.length == LengthUnitEnum.METER cdp_y = next(v for v in dataset.variables if v.name == "cdp-y") assert cdp_y.data_type == ScalarType.FLOAT32 - # Coordinates variables store dimensions as names - assert set(cdp_y.dimensions) == {"inline", "crossline"} + assert len(cdp_y.dimensions) == 2 + assert _get_named_dimension(cdp_y.dimensions, "inline", 256) 
+ assert _get_named_dimension(cdp_y.dimensions, "crossline", 512) assert cdp_y.metadata.units_v1.length == LengthUnitEnum.METER # Verify image variable image = next(v for v in dataset.variables if v.name == "image") - assert set(image.dimensions) == {"inline", "crossline", "depth"} + assert len(image.dimensions) == 3 + assert _get_named_dimension(image.dimensions, "inline", 256) + assert _get_named_dimension(image.dimensions, "crossline", 512) + assert _get_named_dimension(image.dimensions, "depth", 384) assert image.data_type == ScalarType.FLOAT32 assert isinstance(image.compressor, Blosc) assert image.compressor.algorithm == "zstd" @@ -109,7 +125,10 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 # Verify velocity variable velocity = next(v for v in dataset.variables if v.name == "velocity") - assert set(velocity.dimensions) == {"inline", "crossline", "depth"} + assert len(velocity.dimensions) == 3 + assert _get_named_dimension(velocity.dimensions, "inline", 256) + assert _get_named_dimension(velocity.dimensions, "crossline", 512) + assert _get_named_dimension(velocity.dimensions, "depth", 384) assert velocity.data_type == ScalarType.FLOAT16 assert velocity.compressor is None # Other variables store dimensions as names @@ -123,7 +142,10 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 image_inline = next( v for v in dataset.variables if v.name == "image_inline") assert image_inline.long_name == "inline optimized version of 3d_stack" - assert set(image_inline.dimensions) == {"inline", "crossline", "depth"} + assert len(image_inline.dimensions) == 3 + assert _get_named_dimension(image_inline.dimensions, "inline", 256) + assert _get_named_dimension(image_inline.dimensions, "crossline", 512) + assert _get_named_dimension(image_inline.dimensions, "depth", 384) assert image_inline.data_type == ScalarType.FLOAT32 assert isinstance(image_inline.compressor, Blosc) assert image_inline.compressor.algorithm == "zstd" @@ -151,12 +173,15 @@ def make_campos_3d_dataset() -> Dataset: })) # Add dimensions - ds.add_dimension("inline", 256, var_data_type=ScalarType.UINT32) - ds.add_dimension("crossline", 512, var_data_type=ScalarType.UINT32) - ds.add_dimension("depth", 384, var_data_type=ScalarType.UINT32, - var_metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] - ) + ds.add_dimension("inline", 256) + ds.add_dimension("crossline", 512) + ds.add_dimension("depth", 384) + ds.add_dimension_coordinate("inline", data_type=ScalarType.UINT32) + ds.add_dimension_coordinate("crossline", data_type=ScalarType.UINT32) + ds.add_dimension_coordinate("depth", data_type=ScalarType.FLOAT64, + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) + ]) # Add coordinates ds.add_coordinate( "cdp-x", @@ -170,6 +195,7 @@ def make_campos_3d_dataset() -> Dataset: metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] ) + # Add image variable ds.add_variable( name="image", @@ -240,3 +266,30 @@ def make_campos_3d_dataset() -> Dataset: coordinates=["cdp-x", "cdp-y"], ) return ds.build() + +# def test_edge_case_not_used_dimension(): +# builder = MDIODatasetBuilder("test_dataset") + +# builder.add_dimension("inline", 100) +# builder.add_dimension("xline", 200) +# builder.add_dimension("depth", 300) +# builder.add_dimension("time", 600) + +# # Add 'dimension Coordinate' or 'index Coordinates', +# # the coordinates with the same name as a dimension, marked by *) on objects used in 
binary operations. +# builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT32) +# builder.add_coordinate("xline", dimensions=["xline"], data_type=ScalarType.FLOAT32) +# # No 'depth' dimension coordinate is provided + +# # Add 'non-dimension coordinates' before we can add a data variable +# builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) +# builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"]) + +# # Add data variable with full parameters +# builder.add_variable("seismic", +# dimensions=["inline", "crossline", "depth"], +# data_type=ScalarType.FLOAT64, +# coordinates=["cdp-x", "cdp-y"]) + +# # NOTE: The model has separate list list[Coordinate] | list[str] +# # It does not allow mixing names and NamedDimensions \ No newline at end of file diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py index 6fbba42a..aecd5ff6 100644 --- a/tests/unit/v1/test_dataset_builder_helpers.py +++ b/tests/unit/v1/test_dataset_builder_helpers.py @@ -12,30 +12,30 @@ from mdio.schemas.core import StrictModel from mdio.schemas.dimension import NamedDimension -from mdio.schemas.v1.dataset_builder import _get_dimension +from mdio.schemas.v1.dataset_builder import _get_named_dimension from mdio.schemas.v1.dataset_builder import _to_dictionary -def test__get_dimension() -> None: +def test__get_named_dimension() -> None: """Test getting a dimension by name from the list of dimensions.""" dimensions = [NamedDimension(name="inline", size=2), NamedDimension( name="crossline", size=3)] - assert _get_dimension([], "inline") is None - assert _get_dimension(dimensions, "inline") == NamedDimension( + assert _get_named_dimension([], "inline") is None + assert _get_named_dimension(dimensions, "inline") == NamedDimension( name="inline", size=2) - assert _get_dimension(dimensions, "crossline") == NamedDimension( + assert _get_named_dimension(dimensions, "crossline") == NamedDimension( name="crossline", size=3) - assert _get_dimension(dimensions, "time") is None + assert _get_named_dimension(dimensions, "time") is None with pytest.raises(TypeError, match="Expected str, got NoneType"): - _get_dimension(dimensions, None) + _get_named_dimension(dimensions, None) with pytest.raises(TypeError, match="Expected str, got int"): - _get_dimension(dimensions, 42) + _get_named_dimension(dimensions, 42) with pytest.raises( ValueError, match="Dimension 'inline' found but size 2 does not match expected size 200" ): - _get_dimension(dimensions, "inline", size=200) + _get_named_dimension(dimensions, "inline", size=200) def test__to_dictionary() -> None: From 7b25d6bdf041dc6abe69002bdc45838213747eac Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Tue, 8 Jul 2025 19:06:12 +0000 Subject: [PATCH 13/27] PR Review: get api version from the package version --- src/mdio/schemas/v1/dataset_builder.py | 17 ++++++++++++----- tests/unit/v1/test_dataset_builder_build.py | 2 +- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index f0621de3..4b12375e 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -6,6 +6,7 @@ from enum import auto from typing import Any from typing import TypeAlias +from importlib import metadata from pydantic import BaseModel from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding # noqa: F401 @@ -17,7 +18,7 @@ from mdio.schemas.dtype import StructuredType from 
mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes -from mdio.schemas.v1.dataset import Dataset +from mdio.schemas.v1.dataset import Dataset, DatasetMetadata from mdio.schemas.v1.dataset import DatasetInfo from mdio.schemas.v1.stats import StatisticsMetadata from mdio.schemas.v1.units import AllUnits @@ -97,13 +98,16 @@ class MDIODatasetBuilder: def __init__(self, name: str, attributes: UserAttributes | None = None): - info = DatasetInfo( + try: + api_version = metadata.version("multidimio") + except metadata.PackageNotFoundError: + api_version = "unknown" + + self._info = DatasetInfo( name=name, - api_version="1.0.0", + api_version=api_version, created_on=datetime.now(UTC) ) - # TODO(BrianMichell, #0): Pull from package metadata - self._info = info self._attributes = attributes self._dimensions: list[NamedDimension] = [] self._coordinates: list[Coordinate] = [] @@ -252,6 +256,9 @@ def add_dimension_coordinate( """Add a dimension coordinate variable for a pre-existing dimension. This is a convenience method to create a coordinate variable that represents sampling along a dimension. + + The dimension coordinate is a coordinate that has a single dimension and + the name of the coordinate is the same as the name of the dimension """ self.add_coordinate(dimension_name, long_name=dimension_name, diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 8e06f08d..048f640d 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -67,7 +67,7 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 # Verify dataset structure assert dataset.metadata.name == "campos_3d" - assert dataset.metadata.api_version == "1.0.0" + assert dataset.metadata.api_version == "1.0.0a1" assert dataset.metadata.attributes["foo"] == "bar" assert len(dataset.metadata.attributes["textHeader"]) == 3 From 7ca3ed8fbc1b6f47cbf10e01737d4c8a51f6f7ba Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 9 Jul 2025 14:46:20 +0000 Subject: [PATCH 14/27] PR Review: remove add_dimension_coordinate --- .devcontainer/Dockerfile.cli | 47 ++++++++++++++ .devcontainer/Dockerfile.dev | 61 +++++++++++++++++++ .devcontainer/Dockerfile.nox | 49 +++++++++++++++ .devcontainer/devcontainer.json | 20 +++--- src/mdio/schemas/v1/dataset_builder.py | 24 -------- .../v1/test_dataset_builder_add_variable.py | 50 ++++----------- tests/unit/v1/test_dataset_builder_build.py | 16 +++-- 7 files changed, 187 insertions(+), 80 deletions(-) create mode 100644 .devcontainer/Dockerfile.cli create mode 100644 .devcontainer/Dockerfile.dev create mode 100644 .devcontainer/Dockerfile.nox diff --git a/.devcontainer/Dockerfile.cli b/.devcontainer/Dockerfile.cli new file mode 100644 index 00000000..92720e34 --- /dev/null +++ b/.devcontainer/Dockerfile.cli @@ -0,0 +1,47 @@ +# HOW TO BUILD AND RUN THIS DOCKERFILE +# * Clone mdio-python and build a Docker image: +# git clone https://github.com/TGSAI/mdio-python.git +# cd mdio-python +# docker build -t mdio-cli -f .devcontainer/Dockerfile.cli . 
+# * Run /bin/bash in the Docker container: +# +# +# USAGE: +# docker run -it --rm --name mdio-cli mdio-cli --version +# docker run -it --rm --name mdio-cli mdio-cli --help +# +# LOCAL_DATA_DIR=$(pwd); \ +# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ +# segy import \ +# /DATA/segy_file.segy \ +# /DATA/mdio_file.mdio \ +# -loc 181,185 \ +# -names inline,crossline +# +# LOCAL_DATA_DIR=$(pwd); \ +# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ +# segy export \ +# /DATA/mdio_file.mdio \ +# /DATA/segy_file_copy.segy +# +FROM python:3.13-bookworm +# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) +ENV USERNAME=python +ENV USER_UID=1000 +ENV USER_GID=$USER_UID +RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME + +# Set the default non-root user +USER $USERNAME + +# Add path to the user-installed packages +ENV PYTHONUSERBASE=/home/$USERNAME/.local +ENV PATH="$PYTHONUSERBASE/bin:$PATH" + +COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python + +WORKDIR /home/$USERNAME/mdio-python +RUN pip install . + +ENTRYPOINT ["mdio"] +CMD ["--version"] diff --git a/.devcontainer/Dockerfile.dev b/.devcontainer/Dockerfile.dev new file mode 100644 index 00000000..05f13579 --- /dev/null +++ b/.devcontainer/Dockerfile.dev @@ -0,0 +1,61 @@ +# USAGE: +# This file will be used by the VS Code DevContainer extension +# to create a development environment for the mdio-python project. +# HOW TO RUN TESTS +# 1. Open the project in VS Code. +# 2. Open the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container". +# 3. Once the container is running, open a terminal in VS Code. +# 4. Run the tests using the command: `nox -s test`. +# HOW TO MANUALLY BUILD AND RUN THE CONTAINER +# docker build -t mdio-dev -f .devcontainer/Dockerfile.dev . +# docker run -it --rm --entrypoint /bin/bash --name mdio-dev mdio-dev +# NOTES: +# 1. The container will be run as the non-root user 'vscode' with UID 1000. +# 2. The virtual environment will be setup at /home/vscode/venv +# 3. The project source code will be mounted at /workspaces/mdio-python +ARG PYTHON_VERSION="3.13" +ARG LINUX_DISTRO="bookworm" +ARG UV_VERSION="0.6.11" +ARG NOX_VERSION="2025.2.9" +FROM mcr.microsoft.com/devcontainers/python:1-${PYTHON_VERSION}-${LINUX_DISTRO} + +# Install git for nox pre-commit +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + git \ + && rm -rf /var/lib/apt/lists/* + +ENV USERNAME="vscode" +USER $USERNAME + +# # Add path to the user-installed packages +# ENV PYTHONUSERBASE=/home/$USERNAME/.local +# ENV PATH="$PYTHONUSERBASE/bin:$PATH" + +COPY --chown=$USERNAME:$USERNAME ./ /workspaces/mdio-python + +WORKDIR /workspaces/mdio-python + +ARG UV_VERSION +ARG NOX_VERSION +RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel + +# Initialize virtual environement in the container +ENV VIRTUAL_ENV="/home/$USERNAME/venv" +RUN python3 -m venv $VIRTUAL_ENV +ENV PATH="$VIRTUAL_ENV/bin:$PATH" + +# installing pytest is required for VS Code Python Testing +RUN pip install pytest pytest-cov pytest-mock pytest-asyncio + +# Install the project in editable mode +# This allows for live reloading of the code during development +RUN pip install -e . 
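+
+# Optional sanity check that the editable install is visible to Python; the
+# distribution name 'multidimio' is the one dataset_builder queries via
+# importlib.metadata:
+# pip show multidimio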
+ +# RUN uv pip install snakeviz + + + + + + diff --git a/.devcontainer/Dockerfile.nox b/.devcontainer/Dockerfile.nox new file mode 100644 index 00000000..103673fd --- /dev/null +++ b/.devcontainer/Dockerfile.nox @@ -0,0 +1,49 @@ +# HOW TO BUILD AND RUN THIS DOCKERFILE +# 1. Make sure you have Docker installed and running. +# 2. Clone mdio-python and build the Docker image: +# git clone https://github.com/TGSAI/mdio-python.git +# cd mdio-python +# docker build -t mdio-nox -f .devcontainer/Dockerfile.nox . +# 3. Run /bin/bash in the Docker container : +# LOCAL_DATA_DIR=$(pwd); \ +# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --entrypoint /bin/bash --name mdio-nox mdio-nox +# +# USAGE: +# docker run -it --rm mdio-nox --list +# docker run -it --rm mdio-nox -s tests-3.13 +# docker run -it --rm mdio-nox --no-stop-on-first-error +# +# NOTE: nox will fail if run in the directory mounted from the host machine +ARG PYTHON_VERSION="3.13" +ARG LINUX_DISTRO="bookworm" +ARG UV_VERSION="0.6.11" +ARG NOX_VERSION="2025.2.9" +FROM python:${PYTHON_VERSION}-${LINUX_DISTRO} +ARG PYTHON_VERSION +ARG LINUX_DISTRO +RUN echo "Using python:${PYTHON_VERSION}-${LINUX_DISTRO}" +# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) +ENV USERNAME=python +ENV USER_UID=1000 +ENV USER_GID=$USER_UID +RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME +# Set the default non-root user +USER $USERNAME + +# Add path to the user-installed packages +ENV PYTHONUSERBASE=/home/$USERNAME/.local +ENV PATH="$PYTHONUSERBASE/bin:$PATH" + +COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python + +WORKDIR /home/$USERNAME/mdio-python +RUN pip install . + +# Install UV dependency manager and Nox test automator +ARG UV_VERSION +ARG NOX_VERSION +RUN echo "Using uv: $UV_VERSION and nox: $NOX_VERSION" +RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel + +ENTRYPOINT ["nox"] +CMD ["--list"] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index b618a526..dc80446c 100755 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,12 +2,12 @@ // README at: https://github.com/devcontainers/templates/tree/main/src/python { "build": { - "dockerfile": "Dockerfile", + "dockerfile": "Dockerfile.dev", "context": ".." }, // Use 'postCreateCommand' to run commands after the container is created. "postCreateCommand": { - "post_create_script": "bash ./.devcontainer/post-install.sh" + // "post_create_script": "bash ./.devcontainer/post-install.sh" }, // Forward 8787 to enable us to view dask dashboard "forwardPorts": [8787], @@ -16,8 +16,9 @@ // Configure properties specific to VS Code. "vscode": { "settings": { - "python.terminal.activateEnvInCurrentTerminal": true, - "python.defaultInterpreterPath": "/opt/venv/bin/python" + "python.testing.pytestArgs": ["tests"], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true }, "extensions": [ "ms-python.python", @@ -27,17 +28,18 @@ "ms-toolsai.jupyter-renderers", "vscode-icons-team.vscode-icons", "wayou.vscode-todo-highlight", - "streetsidesoftware.code-spell-checker" + "streetsidesoftware.code-spell-checker", + "eamodio.gitlens", + "visualstudioexptteam.vscodeintellicode" ] } }, // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
// "remoteUser": "root", "updateRemoteUserUID": true, + "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/mdio-python,type=bind", + "workspaceFolder": "/workspaces/mdio-python", "mounts": [ - // Re-use local Git configuration - "source=${localEnv:HOME}/.gitconfig,target=/home/vscode/.gitconfig_tmp,type=bind,consistency=cached", - "source=${localEnv:HOME}/.gitconfig,target=/root/.gitconfig_tmp,type=bind,consistency=cached", - "source=${localEnv:SCRATCH_DIR}/${localEnv:USER},target=/scratch/,type=bind,consistency=cached" + // "source=${localWorkspaceFolder}/../DATA/,target=/DATA/,type=bind,consistency=cached" ] } diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 4b12375e..2750e82f 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -245,30 +245,6 @@ def add_coordinate( # noqa: PLR0913 self._state = _BuilderState.HAS_COORDINATES return self - def add_dimension_coordinate( - self, - dimension_name: str, - *, - data_type: ScalarType, - compressor: Blosc | ZFP | None = None, - metadata_info: VariableMetadataList | None = None, - ) -> "MDIODatasetBuilder": - """Add a dimension coordinate variable for a pre-existing dimension. - This is a convenience method to create a coordinate variable - that represents sampling along a dimension. - - The dimension coordinate is a coordinate that has a single dimension and - the name of the coordinate is the same as the name of the dimension - """ - self.add_coordinate(dimension_name, - long_name=dimension_name, - dimensions=[dimension_name], - data_type=data_type, - compressor=compressor, - metadata_info=_to_dictionary(metadata_info)) - - return self - def add_variable( # noqa: PLR0913 self, name: str, diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index ad24b9de..3db68809 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -97,8 +97,8 @@ def test_add_variable_with_coords() -> None: builder.add_dimension("depth", 300) # Add dimension coordinates before we can add a data variable - builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32) - builder.add_dimension_coordinate("crossline", data_type=ScalarType.INT32) + builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) + builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) # Validate: adding a variable with a coordinate that has not been pre-created is not allowed msg = "Pre-existing coordinate named 'depth' is not found" @@ -168,32 +168,6 @@ def test_add_variable_with_coords() -> None: assert builder._get_coordinate(var_ampl2.coordinates, "cdp-y") is not None -def test_add_dimension_coordinate() -> None: - """Test adding dimension variable.""" - builder = MDIODatasetBuilder("test_dataset") - builder.add_dimension("inline", 100) - - builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32) - - # Validate: that coordinate is stored in the builder global list - coord_il = builder._get_coordinate(builder._coordinates, "inline") - # Validate: that dimensions are stored as NamedDimensions in the coordinate - assert _get_named_dimension(coord_il.dimensions, "inline", 100) is not None - # Validate: a dim variable has been created - var_il = next((e for e in builder._variables if e.name == "inline"), None) - assert var_il is not None - # Validate: the variable has the 
expected properties - assert var_il.name == "inline" - assert var_il.long_name == "'inline' coordinate variable" - assert len(var_il.dimensions) == 1 - assert _get_named_dimension(var_il.dimensions, "inline", 100) is not None - assert var_il.data_type == ScalarType.INT32 - assert var_il.compressor is None # Default value - assert len(var_il.coordinates) == 1 - assert builder._get_coordinate(var_il.coordinates, "inline") is not None - assert var_il.metadata is None # Default value - - def test_add_variable_with_defaults() -> None: """Test adding variable with default arguments.""" builder = MDIODatasetBuilder("test_dataset") @@ -202,12 +176,12 @@ def test_add_variable_with_defaults() -> None: builder.add_dimension("crossline", 200) builder.add_dimension("depth", 300) # Add dimension coordinates - builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32) - builder.add_dimension_coordinate("crossline", data_type=ScalarType.INT32) - builder.add_dimension_coordinate("depth", - data_type=ScalarType.FLOAT32, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))]) + builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) + builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + builder.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.UINT32, + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) + ]) # Add data variable using defaults builder.add_variable("ampl", @@ -244,10 +218,10 @@ def test_add_variable_full_parameters() -> None: builder.add_dimension("depth", 300) # Add dimension coordinates - builder.add_dimension_coordinate("inline", data_type=ScalarType.INT32) - builder.add_dimension_coordinate("crossline", data_type=ScalarType.INT32) - builder.add_dimension_coordinate("depth", data_type=ScalarType.INT32) - + builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) + builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + builder.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.UINT32) + # Add coordinates before we can add a data variable builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 048f640d..32285585 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -31,10 +31,8 @@ def test_build() -> None: MDIODatasetBuilder("test_dataset") .add_dimension("inline", 100) .add_dimension("crossline", 200) - # Add a dimension coordinate explicitly using add_coordinate() .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) - # Add a dimension coordinate using .add_dimension_coordinate() shortcut - .add_dimension_coordinate("crossline", data_type=ScalarType.FLOAT64) + .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) .add_coordinate("x_coord", dimensions=["inline", "crossline"]) .add_coordinate("y_coord", dimensions=["inline", "crossline"]) .add_variable("data", @@ -176,12 +174,12 @@ def make_campos_3d_dataset() -> Dataset: ds.add_dimension("inline", 256) ds.add_dimension("crossline", 512) ds.add_dimension("depth", 384) - ds.add_dimension_coordinate("inline", data_type=ScalarType.UINT32) - 
ds.add_dimension_coordinate("crossline", data_type=ScalarType.UINT32) - ds.add_dimension_coordinate("depth", data_type=ScalarType.FLOAT64, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) - ]) + ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) + ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + ds.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.FLOAT64, + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) + ]) # Add coordinates ds.add_coordinate( "cdp-x", From 4d1ec9cc4307d5febfee299ab7cdfeddb3c83725 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 9 Jul 2025 15:27:00 +0000 Subject: [PATCH 15/27] PR Review: add_coordinate() remove data_type default value --- src/mdio/schemas/v1/dataset_builder.py | 2 +- .../v1/test_dataset_builder_add_coordinate.py | 18 +++++----- .../v1/test_dataset_builder_add_variable.py | 4 +-- tests/unit/v1/test_dataset_builder_build.py | 34 +++---------------- 4 files changed, 16 insertions(+), 42 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 2750e82f..80d4b876 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -177,7 +177,7 @@ def add_coordinate( # noqa: PLR0913 *, long_name: str = None, dimensions: list[str], - data_type: ScalarType = ScalarType.FLOAT32, + data_type: ScalarType, compressor: Blosc | ZFP | None = None, metadata_info: CoordinateMetadataList | None = None, ) -> "MDIODatasetBuilder": diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index cae46834..0d4fdc6e 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -24,7 +24,7 @@ def test_add_coordinate() -> None: msg = "Must add at least one dimension before adding coordinates" with pytest.raises(ValueError, match=msg): - builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) @@ -32,21 +32,21 @@ def test_add_coordinate() -> None: # Validate required parameters bad_name = None with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_coordinate(bad_name, dimensions=["speed"]) + builder.add_coordinate(bad_name, dimensions=["speed"], data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_coordinate("", dimensions=["speed"]) + builder.add_coordinate("", dimensions=["speed"], data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_coordinate("cdp-x", dimensions=None) + builder.add_coordinate("cdp-x", dimensions=None, data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_coordinate("cdp-x", dimensions=[]) + builder.add_coordinate("cdp-x", dimensions=[], data_type=ScalarType.FLOAT32) # Add a variable using non-existent dimensions msg="Pre-existing dimension named 'xline' is not found" with pytest.raises(ValueError, match=msg): - builder.add_coordinate("bad_cdp-x", dimensions=["inline", "xline"]) + builder.add_coordinate("bad_cdp-x", dimensions=["inline", "xline"], 
data_type=ScalarType.FLOAT32) # Validate state transition - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) assert builder._state == _BuilderState.HAS_COORDINATES assert len(builder._dimensions) == 2 # 1 variable for coordinates @@ -68,7 +68,7 @@ def test_add_coordinate() -> None: # Adding coordinate with the same name twice msg="Adding coordinate with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) def test_add_coordinate_with_defaults() -> None: """Test adding coordinates with default arguments.""" @@ -76,7 +76,7 @@ def test_add_coordinate_with_defaults() -> None: builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) # Add coordinate using defaults - builder.add_coordinate("cdp", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) assert len(builder._dimensions) == 2 # 1 variable for coordinates assert len(builder._variables) == 1 diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 3db68809..99f2f994 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -135,8 +135,8 @@ def test_add_variable_with_coords() -> None: # "depth" coordinate is not set # Add non-dim coordinates (e.g., 2D coordinates) - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) - builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"]) + builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) # Add a variable with pre-defined dimension and non-dimension coordinates builder.add_variable("ampl2", diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 32285585..b008e062 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -33,8 +33,8 @@ def test_build() -> None: .add_dimension("crossline", 200) .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) - .add_coordinate("x_coord", dimensions=["inline", "crossline"]) - .add_coordinate("y_coord", dimensions=["inline", "crossline"]) + .add_coordinate("x_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + .add_coordinate("y_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) .add_variable("data", long_name="Test Data", dimensions=["inline", "crossline"], @@ -43,7 +43,6 @@ def test_build() -> None: .build() ) - # TODO: Expand this assert isinstance(dataset, Dataset) assert dataset.metadata.name == "test_dataset" # 2 dim coord var + 2 non-dim coord var + 1 data variables = 5 variables @@ -184,12 +183,14 @@ def make_campos_3d_dataset() -> Dataset: ds.add_coordinate( "cdp-x", dimensions=["inline", "crossline"], + data_type=ScalarType.FLOAT32, metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] ) ds.add_coordinate( "cdp-y", dimensions=["inline", "crossline"], + 
data_type=ScalarType.FLOAT32, metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] ) @@ -264,30 +265,3 @@ def make_campos_3d_dataset() -> Dataset: coordinates=["cdp-x", "cdp-y"], ) return ds.build() - -# def test_edge_case_not_used_dimension(): -# builder = MDIODatasetBuilder("test_dataset") - -# builder.add_dimension("inline", 100) -# builder.add_dimension("xline", 200) -# builder.add_dimension("depth", 300) -# builder.add_dimension("time", 600) - -# # Add 'dimension Coordinate' or 'index Coordinates', -# # the coordinates with the same name as a dimension, marked by *) on objects used in binary operations. -# builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT32) -# builder.add_coordinate("xline", dimensions=["xline"], data_type=ScalarType.FLOAT32) -# # No 'depth' dimension coordinate is provided - -# # Add 'non-dimension coordinates' before we can add a data variable -# builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"]) -# builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"]) - -# # Add data variable with full parameters -# builder.add_variable("seismic", -# dimensions=["inline", "crossline", "depth"], -# data_type=ScalarType.FLOAT64, -# coordinates=["cdp-x", "cdp-y"]) - -# # NOTE: The model has separate list list[Coordinate] | list[str] -# # It does not allow mixing names and NamedDimensions \ No newline at end of file From 99fcf4385e5ce6147b37832b498f7d6d5ee3cca3 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 9 Jul 2025 19:10:44 +0000 Subject: [PATCH 16/27] PR Review: improve unit tests by extracting common functionality in validate* functions --- src/mdio/schemas/v1/dataset_builder.py | 35 +-- tests/unit/v1/helpers.py | 243 +++++++++++++++++ .../v1/test_dataset_builder_add_coordinate.py | 113 ++++---- .../v1/test_dataset_builder_add_dimension.py | 4 +- .../v1/test_dataset_builder_add_variable.py | 137 +++------- tests/unit/v1/test_dataset_builder_build.py | 244 ++++++------------ 6 files changed, 412 insertions(+), 364 deletions(-) create mode 100644 tests/unit/v1/helpers.py diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 80d4b876..3bfd914e 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -146,30 +146,6 @@ def add_dimension( # noqa: PLR0913 self._state = _BuilderState.HAS_DIMENSIONS return self - def _get_coordinate( - self, - coordinates: list[Coordinate] | list[str], - name: str, size: int | None = None - ) -> Coordinate | None: - """Get a coordinate by name from the list[Coordinate] | list[str].""" - if coordinates is None: - return None - - for c in coordinates: - if isinstance(c, str) and c == name: - # The coordinate is stored by name (str). - # Find it in the builder global list and return it. - cc = next((v for v in self._coordinates if v.name == name), None) - if cc is None: - msg = f"Pre-existing coordinate named {name!r} is not found" - raise ValueError(msg) - return cc - if isinstance(c, Coordinate) and c.name == name: - # The coordinate is stored as an embedded Coordinate object. - # Return it. 
- return c - - return None def add_coordinate( # noqa: PLR0913 self, @@ -303,13 +279,22 @@ def add_variable( # noqa: PLR0913 raise ValueError(msg) named_dimensions.append(nd) + coordinate_objs: list[Coordinate] = [] # Validate that all referenced coordinates are already defined if coordinates is not None: for coord in coordinates: - if next((c for c in self._coordinates if c.name == coord), None) is None: + c: Coordinate = next((c for c in self._coordinates if c.name == coord), None) + if c is not None: + coordinate_objs.append(c) + else: msg = f"Pre-existing coordinate named {coord!r} is not found" raise ValueError(msg) + if coordinates is not None: + # If this is a dimension coordinate variable, embed the Coordinate into it + if len(coordinates) == 1 and coordinates[0] == name: + coordinates = coordinate_objs + meta_dict = _to_dictionary(metadata_info) var = Variable( name=name, diff --git a/tests/unit/v1/helpers.py b/tests/unit/v1/helpers.py new file mode 100644 index 00000000..814357b4 --- /dev/null +++ b/tests/unit/v1/helpers.py @@ -0,0 +1,243 @@ + +from mdio.schemas.chunk_grid import RegularChunkGrid +from mdio.schemas.chunk_grid import RegularChunkShape +from mdio.schemas.compressors import Blosc +from mdio.schemas.dtype import ScalarType, StructuredField +from mdio.schemas.dtype import StructuredType +from mdio.schemas.metadata import ChunkGridMetadata +from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset import Dataset +from mdio.schemas.v1.dataset_builder import _BuilderState, MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _get_named_dimension +from mdio.schemas.v1.stats import CenteredBinHistogram +from mdio.schemas.v1.stats import StatisticsMetadata +from mdio.schemas.v1.stats import SummaryStatistics +from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.units import LengthUnitEnum +from mdio.schemas.v1.units import LengthUnitModel +from mdio.schemas.v1.units import SpeedUnitEnum +from mdio.schemas.v1.units import SpeedUnitModel +from mdio.schemas.v1.variable import Coordinate, Variable + + +def validate_builder(builder: MDIODatasetBuilder, + state: _BuilderState, + n_dims: int, + n_coords: int, + n_var: int) -> None: + """Validate the state of the builder and the number of dimensions, coordinates, and variables.""" + assert builder._state == state + assert len(builder._dimensions) == n_dims + assert len(builder._coordinates) == n_coords + assert len(builder._variables) == n_var + + +def validate_coordinate(builder: MDIODatasetBuilder, + name: str, + dims: list[tuple[str, int]], + dtype: ScalarType) -> Coordinate: + """Validate existence and the structure of the created coordinate.""" + # Validate that coordinate exists + c = next((c for c in builder._coordinates if c.name == name), None) + assert c is not None + assert isinstance(c, Coordinate) + + # Validate that dimensions are stored as NamedDimensions + for d in dims: + name = d[0] + size = d[1] + assert _get_named_dimension(c.dimensions, name, size) is not None + + assert c.data_type == dtype + return c + + +def validate_variable(container: MDIODatasetBuilder | Dataset, + name: str, + dims: list[tuple[str, int]], + coords: list[str], + dtype: ScalarType) -> Variable: + """Validate existence and the structure of the created variable.""" + + if isinstance(container, MDIODatasetBuilder): + var_list = container._variables + global_coord_list = container._coordinates + elif isinstance(container, Dataset): + var_list = container.variables + global_coord_list = 
_get_all_coordinates(container) + else: + raise TypeError("container must be MDIODatasetBuilder or Dataset") + + # Validate that the variable exists + v = next((e for e in var_list if e.name == name), None) + assert v is not None + assert isinstance(v, Variable) + + # Validate that dimensions are stored as NamedDimensions within the variable + assert len(v.dimensions) == len(dims) + for d in dims: + name = d[0] + size = d[1] + assert _get_named_dimension(v.dimensions, name, size) is not None + + # Validate that coordinates are either embedded or can be resolved from names to Coordinate + if coords is None: + assert v.coordinates is None + else: + assert len(v.coordinates) == len(coords) + for coord_name in coords: + assert _get_coordinate(global_coord_list, v.coordinates, coord_name) is not None + + assert v.data_type == dtype + return v + + +def _get_coordinate( + global_coord_list: list[Coordinate], + coordinates_or_references: list[Coordinate] | list[str], + name: str, size: int | None = None +) -> Coordinate | None: + """Get a coordinate by name from the list[Coordinate] | list[str]. + Validates that the coordinate referenced by the name can be found in the global coordinate list + """ + if coordinates_or_references is None: + return None + + for c in coordinates_or_references: + if isinstance(c, str) and c == name: + # The coordinate is stored by name (str). + cc = None + # Find the Coordinate in the global list and return it. + if global_coord_list is not None: + cc = next((cc for cc in global_coord_list if cc.name == name), None) + if cc is None: + msg = f"Pre-existing coordinate named {name!r} is not found" + raise ValueError(msg) + return cc + if isinstance(c, Coordinate) and c.name == name: + # The coordinate is stored as an embedded Coordinate object. + # Return it. + return c + + return None + + +def _get_all_coordinates(dataset: Dataset) -> list[Coordinate]: + all_coords: dict[str, Coordinate] = {} + for v in dataset.variables: + if v.coordinates is not None: + for c in v.coordinates: + if isinstance(c, Coordinate) and c.name not in all_coords: + all_coords[c.name] = c + return list(all_coords.values()) + + +def make_campos_3d_dataset() -> Dataset: + """Create in-memory campos_3d dataset.""" + ds = MDIODatasetBuilder( + "campos_3d", + attributes=UserAttributes(attributes={ + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... 
", + ], + "foo": "bar" + })) + + # Add dimensions + ds.add_dimension("inline", 256) + ds.add_dimension("crossline", 512) + ds.add_dimension("depth", 384) + ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) + ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + ds.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.FLOAT64, + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) + ]) + # Add coordinates + ds.add_coordinate( + "cdp-x", + dimensions=["inline", "crossline"], + data_type=ScalarType.FLOAT32, + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] + ) + ds.add_coordinate( + "cdp-y", + dimensions=["inline", "crossline"], + data_type=ScalarType.FLOAT32, + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] + ) + + # Add image variable + ds.add_variable( + name="image", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + ), + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram( + binCenters=[1, 2], counts=[10, 15]), + ) + ), + UserAttributes( + attributes={"fizz": "buzz", "UnitSystem": "Canonical"}), + ]) + # Add velocity variable + ds.add_variable( + name="velocity", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT16, + coordinates=["cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + ), + AllUnits(units_v1=SpeedUnitModel( + speed=SpeedUnitEnum.METER_PER_SECOND)), + ], + ) + # Add inline-optimized image variable + ds.add_variable( + name="image_inline", + long_name="inline optimized version of 3d_stack", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[4, 512, 512])) + )] + ) + # Add headers variable with structured dtype + ds.add_variable( + name="image_headers", + dimensions=["inline", "crossline"], + data_type=StructuredType( + fields=[ + StructuredField(name="cdp-x", format=ScalarType.FLOAT32), + StructuredField(name="cdp-y", format=ScalarType.FLOAT32), + StructuredField(name="inline", format=ScalarType.UINT32), + StructuredField(name="crossline", format=ScalarType.UINT32), + ] + ), + coordinates=["cdp-x", "cdp-y"], + ) + return ds.build() diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index 0d4fdc6e..232c525d 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -6,6 +6,7 @@ import pytest +from mdio.schemas import builder from mdio.schemas.compressors import Blosc from mdio.schemas.dtype import ScalarType from mdio.schemas.metadata import UserAttributes @@ -15,7 +16,7 @@ from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel from mdio.schemas.v1.variable import VariableMetadata - +from .helpers import validate_builder, validate_coordinate, 
validate_variable def test_add_coordinate() -> None: """Test adding coordinates. Check the state transition and validate required parameters.""" @@ -47,23 +48,14 @@ def test_add_coordinate() -> None: # Validate state transition builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) - assert builder._state == _BuilderState.HAS_COORDINATES - assert len(builder._dimensions) == 2 - # 1 variable for coordinates - assert len(builder._variables) == 1 - assert len(builder._coordinates) == 1 - - # Validate that we created a coordinate variable - var_cdp = next(e for e in builder._variables if e.name == "cdp-x") - assert var_cdp is not None - assert len(var_cdp.dimensions) == 2 - assert _get_named_dimension(var_cdp.dimensions, "inline", 100) is not None - assert _get_named_dimension(var_cdp.dimensions, "crossline", 200) is not None - # Validate that coordinates are stored as Coordinate - assert len(var_cdp.coordinates) == 1 - # No dimensions are stored in coordinates - # Validate that non-dimension coordinates - assert builder._get_coordinate(var_cdp.coordinates, "cdp-x") is not None + validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) + validate_variable( + builder, + name="cdp-x", + dims=[("inline", 100), ("crossline", 200)], + coords=["cdp-x"], + dtype=ScalarType.FLOAT32 + ) # Adding coordinate with the same name twice msg="Adding coordinate with the same name twice is not allowed" @@ -75,23 +67,26 @@ def test_add_coordinate_with_defaults() -> None: builder = MDIODatasetBuilder("test_dataset") builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) + # Add coordinate using defaults builder.add_coordinate("cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) - assert len(builder._dimensions) == 2 - # 1 variable for coordinates - assert len(builder._variables) == 1 - assert len(builder._coordinates) == 1 - - # Validate: the structure of the coordinate - coord_cdp = next((e for e in builder._coordinates if e.name == "cdp"), None) - assert coord_cdp is not None - assert len(coord_cdp.dimensions) == 2 - assert _get_named_dimension(coord_cdp.dimensions, "inline", 100) is not None - assert _get_named_dimension(coord_cdp.dimensions, "crossline", 200) is not None - assert coord_cdp.long_name is None # Default value - assert coord_cdp.data_type == ScalarType.FLOAT32 # Default value - assert coord_cdp.compressor is None # Default value - assert coord_cdp.metadata is None # Default value + validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) + validate_coordinate( + builder, + name="cdp", + dims=[("inline", 100), ("crossline", 200)], + dtype=ScalarType.FLOAT32 + ) + v = validate_variable( + builder, + name="cdp", + dims=[("inline", 100), ("crossline", 200)], + coords=["cdp"], + dtype=ScalarType.FLOAT32 + ) + assert v.long_name == "'cdp' coordinate variable" # Default value + assert v.compressor is None # Default value + assert v.metadata is None # Default value def test_coordinate_with_full_parameters() -> None: @@ -99,6 +94,7 @@ def test_coordinate_with_full_parameters() -> None: builder = MDIODatasetBuilder("test_dataset") builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) + # Add coordinate with all metadata builder.add_coordinate( "cdp", @@ -110,39 +106,28 @@ def test_coordinate_with_full_parameters() -> None: AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), UserAttributes(attributes={"MGA": 51, "UnitSystem": 
"Imperial"})] ) - # Validate: the state of the builder - assert builder._state == _BuilderState.HAS_COORDINATES - assert len(builder._dimensions) == 2 - # 1 variable for coordinates - assert len(builder._variables) == 1 - assert len(builder._coordinates) == 1 - - # Validate: the structure of the coordinate - coord_cdp = next((e for e in builder._coordinates if e.name == "cdp"), None) - assert coord_cdp is not None - assert len(coord_cdp.dimensions) == 2 - assert _get_named_dimension(coord_cdp.dimensions, "inline", 100) is not None - assert _get_named_dimension(coord_cdp.dimensions, "crossline", 200) is not None - assert coord_cdp.long_name == "Common Depth Point" - assert coord_cdp.data_type == ScalarType.FLOAT16 - assert isinstance(coord_cdp.compressor, Blosc) - assert coord_cdp.compressor.algorithm == "zstd" - assert coord_cdp.metadata.attributes["MGA"] == 51 - assert coord_cdp.metadata.attributes["UnitSystem"] == "Imperial" - assert coord_cdp.metadata.units_v1.length == LengthUnitEnum.FOOT - - # Validate: the structure of the created variable - v = next((v for v in builder._variables if v.name == "cdp"), None) - assert v is not None - assert v.long_name == "'cdp' coordinate variable" - assert len(v.dimensions) == 2 - assert _get_named_dimension(v.dimensions, "inline", 100) is not None - assert _get_named_dimension(v.dimensions, "crossline", 200) is not None - assert v.data_type == ScalarType.FLOAT16 + validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) + c = validate_coordinate( + builder, + name="cdp", + dims=[("inline", 100), ("crossline", 200)], + dtype=ScalarType.FLOAT16 + ) + assert c.long_name == "Common Depth Point" + assert isinstance(c.compressor, Blosc) + assert c.compressor.algorithm == "zstd" + assert c.metadata.attributes["MGA"] == 51 + assert c.metadata.attributes["UnitSystem"] == "Imperial" + assert c.metadata.units_v1.length == LengthUnitEnum.FOOT + v = validate_variable( + builder, + name="cdp", + dims=[("inline", 100), ("crossline", 200)], + coords=["cdp"], + dtype=ScalarType.FLOAT16 + ) assert isinstance(v.compressor, Blosc) assert v.compressor.algorithm == "zstd" - assert len(v.coordinates) == 1 - assert builder._get_coordinate(v.coordinates, "cdp") is not None assert isinstance(v.metadata, VariableMetadata) assert v.metadata.units_v1.length == LengthUnitEnum.FOOT assert v.metadata.attributes["MGA"] == 51 diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index b701380f..5fd70a01 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -20,6 +20,7 @@ from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel +from .helpers import validate_builder def test_add_dimension() -> None: @@ -36,8 +37,7 @@ def test_add_dimension() -> None: # First dimension should change state to HAS_DIMENSIONS and create a variable builder.add_dimension("x", 100) - assert builder._state == _BuilderState.HAS_DIMENSIONS - assert len(builder._dimensions) == 1 + validate_builder(builder, _BuilderState.HAS_DIMENSIONS, n_dims=1, n_coords=0, n_var=0) assert _get_named_dimension(builder._dimensions, "x", 100) is not None # Validate that we can't add a dimension with the same name twice diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 99f2f994..3bd5ef37 100644 --- 
a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -21,13 +21,14 @@ from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel -from mdio.schemas.v1.variable import VariableMetadata - +from mdio.schemas.v1.variable import VariableMetadata, Variable +from .helpers import validate_builder +from .helpers import validate_variable def test_add_variable_no_coords() -> None: """Test adding variable. Check the state transition and validate required parameters..""" builder = MDIODatasetBuilder("test_dataset") - assert builder._state == _BuilderState.INITIAL + validate_builder(builder, _BuilderState.INITIAL, n_dims=0, n_coords=0, n_var=0) # Validate: Must add at least one dimension before adding variables msg = "Must add at least one dimension before adding variables" @@ -66,20 +67,11 @@ def test_add_variable_no_coords() -> None: builder.add_variable("amplitude", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32) - assert builder._state == _BuilderState.HAS_VARIABLES - assert len(builder._dimensions) == 3 - assert len(builder._variables) == 1 - assert len(builder._coordinates) == 0 - - # Validate the structure of the created variable - var_ampl = next((e for e in builder._variables if e.name == "amplitude"), None) - assert var_ampl is not None - # Validate that dimensions are stored as NamedDimensions - assert _get_named_dimension(var_ampl.dimensions, "inline", 100) is not None - assert _get_named_dimension(var_ampl.dimensions, "crossline", 200) is not None - assert _get_named_dimension(var_ampl.dimensions, "depth", 300) is not None - # Validate that no coordinates are set - assert var_ampl.coordinates is None + validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=0, n_var=1) + validate_variable(builder, "amplitude", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=None, + dtype=ScalarType.FLOAT32) # Validate: adding a variable with the same name twice is not allowed msg = "Adding variable with the same name twice is not allowed" @@ -113,26 +105,11 @@ def test_add_variable_with_coords() -> None: dimensions=["inline", "crossline", "depth"], coordinates=["inline", "crossline"], data_type=ScalarType.FLOAT32) - - assert builder._state == _BuilderState.HAS_VARIABLES - assert len(builder._dimensions) == 3 - # 2 dim coordinate variables + 1 data variables - assert len(builder._variables) == 3 - assert len(builder._coordinates) == 2 - - # Validate: the structure of the created variable - var_ampl = next((e for e in builder._variables if e.name == "ampl"), None) - assert var_ampl is not None - # Validate: that dimensions are stored as NamedDimensions - assert len(var_ampl.dimensions) == 3 - assert _get_named_dimension(var_ampl.dimensions, "inline", 100) is not None - assert _get_named_dimension(var_ampl.dimensions, "crossline", 200) is not None - assert _get_named_dimension(var_ampl.dimensions, "depth", 300) is not None - assert len(var_ampl.coordinates) == 2 - # Validate that dim coordinates "inline" and "crossline" are set - assert builder._get_coordinate(var_ampl.coordinates, "inline") is not None - assert builder._get_coordinate(var_ampl.coordinates, "crossline") is not None - # "depth" coordinate is not set + validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=2, n_var=3) + validate_variable(builder, "ampl", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + 
coords=["inline", "crossline"], + dtype=ScalarType.FLOAT32) # Add non-dim coordinates (e.g., 2D coordinates) builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) @@ -143,29 +120,11 @@ def test_add_variable_with_coords() -> None: dimensions=["inline", "crossline", "depth"], coordinates=["inline", "crossline", "cdp-x", "cdp-y"], data_type=ScalarType.FLOAT32) - - assert builder._state == _BuilderState.HAS_VARIABLES - assert len(builder._dimensions) == 3 - # 2 dim coordinate variables + 2 non-dim coordinate variables + 1 data variables - assert len(builder._variables) == 6 - assert len(builder._coordinates) == 4 - - # Validate: the structure of the created variable - var_ampl2 = next((e for e in builder._variables if e.name == "ampl2"), None) - assert var_ampl2 is not None - # Validate: that dimensions are stored as NamedDimensions - assert len(var_ampl2.dimensions) == 3 - assert _get_named_dimension(var_ampl2.dimensions, "inline", 100) is not None - assert _get_named_dimension(var_ampl2.dimensions, "crossline", 200) is not None - assert _get_named_dimension(var_ampl2.dimensions, "depth", 300) is not None - assert len(var_ampl2.coordinates) == 4 - # Validate that dim coordinates "inline" and "crossline" are set - assert builder._get_coordinate(var_ampl2.coordinates, "inline") is not None - assert builder._get_coordinate(var_ampl2.coordinates, "crossline") is not None - # "depth" coordinate is not set - # Validate that non-dimension coordinates "cdp-x" and "cdp-y" - assert builder._get_coordinate(var_ampl2.coordinates, "cdp-x") is not None - assert builder._get_coordinate(var_ampl2.coordinates, "cdp-y") is not None + validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=4, n_var=6) + validate_variable(builder, "ampl2", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=["inline", "crossline", "cdp-x", "cdp-y"], + dtype=ScalarType.FLOAT32) def test_add_variable_with_defaults() -> None: @@ -187,26 +146,15 @@ def test_add_variable_with_defaults() -> None: builder.add_variable("ampl", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32) - assert len(builder._dimensions) == 3 - # 3 dim coordinate variables + 1 data variable = 4 - assert len(builder._variables) == 4 - assert len(builder._coordinates) == 3 - - # Validate: the structure of the created variable - var_ampl = next((e for e in builder._variables if e.name == "ampl"), None) - assert var_ampl is not None - assert var_ampl.name == "ampl" - assert var_ampl.long_name is None # Default value - # Validate: that dimensions are stored as NamedDimensions - assert len(var_ampl.dimensions) == 3 - assert _get_named_dimension(var_ampl.dimensions, "inline", 100) is not None - assert _get_named_dimension(var_ampl.dimensions, "crossline", 200) is not None - assert _get_named_dimension(var_ampl.dimensions, "depth", 300) is not None - assert var_ampl.data_type == ScalarType.FLOAT32 - assert var_ampl.compressor is None # Default value - assert var_ampl.coordinates is None # Default value - # Validate: the variable has the expected properties - assert var_ampl.metadata is None # Default value + validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=3, n_var=4) + v = validate_variable(builder, "ampl", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=None, + dtype=ScalarType.FLOAT32) + assert v.long_name is None # Default value + assert v.compressor is None # Default value + assert v.coordinates is None # Default 
value + assert v.metadata is None # Default value def test_add_variable_full_parameters() -> None: @@ -216,12 +164,10 @@ def test_add_variable_full_parameters() -> None: builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) builder.add_dimension("depth", 300) - # Add dimension coordinates builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) builder.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.UINT32) - # Add coordinates before we can add a data variable builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) @@ -254,32 +200,15 @@ def test_add_variable_full_parameters() -> None: ) ), ]) - # Validate: the state of the builder - assert builder._state == _BuilderState.HAS_VARIABLES - assert len(builder._dimensions) == 3 - # 3 dim coords + 2 non-dim coords = 5 - assert len(builder._coordinates) == 5 - # 3 dim coord + 2 non-dim coords, and 1 data variable - assert len(builder._variables) == 6 - - # Validate: the structure of the created variable - v = next((v for v in builder._variables if v.name == "ampl"), None) - assert v is not None - assert v.name == "ampl" + validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=5, n_var=6) + v = validate_variable(builder, "ampl", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + dtype=ScalarType.FLOAT32) assert v.long_name == "Amplitude (dimensionless)" - assert len(v.dimensions) == 3 - assert _get_named_dimension(v.dimensions, "inline", 100) is not None - assert _get_named_dimension(v.dimensions, "crossline", 200) is not None - assert _get_named_dimension(v.dimensions, "depth", 300) is not None - assert v.data_type == ScalarType.FLOAT32 assert isinstance(v.compressor, Blosc) assert v.compressor.algorithm == "zstd" assert len(v.coordinates) == 5 - assert builder._get_coordinate(v.coordinates, "inline") is not None - assert builder._get_coordinate(v.coordinates, "crossline") is not None - assert builder._get_coordinate(v.coordinates, "depth") is not None - assert builder._get_coordinate(v.coordinates, "cdp-x") is not None - assert builder._get_coordinate(v.coordinates, "cdp-y") is not None assert v.metadata.stats_v1.count == 100 assert isinstance(v.metadata, VariableMetadata) assert v.metadata.units_v1.length == LengthUnitEnum.FOOT diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index b008e062..f2bda0e5 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -8,7 +8,7 @@ from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.compressors import Blosc -from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import ScalarType, StructuredField from mdio.schemas.dtype import StructuredType from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes @@ -23,7 +23,9 @@ from mdio.schemas.v1.units import LengthUnitModel from mdio.schemas.v1.units import SpeedUnitEnum from mdio.schemas.v1.units import SpeedUnitModel +from .helpers import make_campos_3d_dataset +from .helpers import validate_builder, validate_coordinate, validate_variable def 
test_build() -> None: """Test building a complete dataset.""" @@ -52,11 +54,6 @@ def test_build() -> None: assert next(v for v in dataset.variables if v.name == "x_coord") is not None assert next(v for v in dataset.variables if v.name == "y_coord") is not None assert next(v for v in dataset.variables if v.name == "data") is not None - var_data = next(v for v in dataset.variables if v.name == "data") - assert var_data is not None - assert var_data.long_name == "Test Data" - assert len(var_data.dimensions) == 2 - def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50) """Test building a Campos 3D dataset with multiple variables and attributes.""" @@ -72,82 +69,84 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 # 3 dimension variables + 4 data variables + 2 coordinate variables assert len(dataset.variables) == 9 - # Verify dimension variables - inline_var = next(v for v in dataset.variables if v.name == "inline") - assert inline_var.data_type == ScalarType.UINT32 - # Dimension variables store dimensions as NamedDimension - assert _get_named_dimension(inline_var.dimensions, "inline", 256) + # Verify dimension coordinate variables + validate_variable( + dataset, + name="inline", + dims=[("inline", 256)], + coords=["inline"], + dtype=ScalarType.UINT32 + ) - crossline_var = next(v for v in dataset.variables if v.name == "crossline") - assert crossline_var.data_type == ScalarType.UINT32 - # Dimension variables store dimensions as NamedDimension - assert _get_named_dimension(crossline_var.dimensions, "crossline", 512) + validate_variable( + dataset, + name="crossline", + dims=[("crossline", 512)], + coords=["crossline"], + dtype=ScalarType.UINT32 + ) - depth_var = next(v for v in dataset.variables if v.name == "depth") - assert depth_var.data_type == ScalarType.FLOAT64 - # Dimension variables store dimensions as NamedDimension - assert _get_named_dimension(depth_var.dimensions, "depth", 384) - assert depth_var.metadata.units_v1.length == LengthUnitEnum.METER + depth = validate_variable( + dataset, + name="depth", + dims=[("depth", 384)], + coords=["depth"], + dtype=ScalarType.FLOAT64 + ) + assert depth.metadata.units_v1.length == LengthUnitEnum.METER # Verify coordinate variables - cdp_x = next(v for v in dataset.variables if v.name == "cdp-x") - assert cdp_x.data_type == ScalarType.FLOAT32 - assert len(cdp_x.dimensions) == 2 - assert _get_named_dimension(cdp_x.dimensions, "inline", 256) - assert _get_named_dimension(cdp_x.dimensions, "crossline", 512) + cdp_x = validate_variable( + dataset, + name="cdp-x", + dims=[("inline", 256), ("crossline", 512)], + coords=["cdp-x"], + dtype=ScalarType.FLOAT32 + ) assert cdp_x.metadata.units_v1.length == LengthUnitEnum.METER - cdp_y = next(v for v in dataset.variables if v.name == "cdp-y") - assert cdp_y.data_type == ScalarType.FLOAT32 - assert len(cdp_y.dimensions) == 2 - assert _get_named_dimension(cdp_y.dimensions, "inline", 256) - assert _get_named_dimension(cdp_y.dimensions, "crossline", 512) + cdp_y = validate_variable( + dataset, + name="cdp-y", + dims=[("inline", 256), ("crossline", 512)], + coords=["cdp-y"], + dtype=ScalarType.FLOAT32 + ) assert cdp_y.metadata.units_v1.length == LengthUnitEnum.METER - # Verify image variable - image = next(v for v in dataset.variables if v.name == "image") - assert len(image.dimensions) == 3 - assert _get_named_dimension(image.dimensions, "inline", 256) - assert _get_named_dimension(image.dimensions, "crossline", 512) - assert 
_get_named_dimension(image.dimensions, "depth", 384) - assert image.data_type == ScalarType.FLOAT32 - assert isinstance(image.compressor, Blosc) + # Verify data variables + image = validate_variable( + dataset, + name="image", + dims=[("inline", 256), ("crossline", 512), ("depth", 384)], + coords=["cdp-x", "cdp-y"], + dtype=ScalarType.FLOAT32 + ) + assert image.metadata.units_v1 is None # No units defined for image assert image.compressor.algorithm == "zstd" - # Other variables store dimensions as names - assert set(image.coordinates) == {"cdp-x", "cdp-y"} - assert isinstance(image.metadata.chunk_grid, RegularChunkGrid) assert image.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] - assert isinstance(image.metadata.stats_v1, SummaryStatistics) assert image.metadata.stats_v1.count == 100 - # Verify velocity variable - velocity = next(v for v in dataset.variables if v.name == "velocity") - assert len(velocity.dimensions) == 3 - assert _get_named_dimension(velocity.dimensions, "inline", 256) - assert _get_named_dimension(velocity.dimensions, "crossline", 512) - assert _get_named_dimension(velocity.dimensions, "depth", 384) - assert velocity.data_type == ScalarType.FLOAT16 + velocity = validate_variable( + dataset, + name="velocity", + dims=[("inline", 256), ("crossline", 512), ("depth", 384)], + coords=["cdp-x", "cdp-y"], + dtype=ScalarType.FLOAT16 + ) assert velocity.compressor is None - # Other variables store dimensions as names - assert set(velocity.coordinates) == {"cdp-x", "cdp-y"} - assert isinstance(velocity.metadata.chunk_grid, RegularChunkGrid) assert velocity.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] - assert isinstance(velocity.metadata.units_v1, SpeedUnitModel) assert velocity.metadata.units_v1.speed == SpeedUnitEnum.METER_PER_SECOND - # Verify image_inline variable - image_inline = next( - v for v in dataset.variables if v.name == "image_inline") + image_inline = validate_variable( + dataset, + name="image_inline", + dims=[("inline", 256), ("crossline", 512), ("depth", 384)], + coords=["cdp-x", "cdp-y"], + dtype=ScalarType.FLOAT32 + ) assert image_inline.long_name == "inline optimized version of 3d_stack" - assert len(image_inline.dimensions) == 3 - assert _get_named_dimension(image_inline.dimensions, "inline", 256) - assert _get_named_dimension(image_inline.dimensions, "crossline", 512) - assert _get_named_dimension(image_inline.dimensions, "depth", 384) - assert image_inline.data_type == ScalarType.FLOAT32 - assert isinstance(image_inline.compressor, Blosc) assert image_inline.compressor.algorithm == "zstd" - assert set(image_inline.coordinates) == {"cdp-x", "cdp-y"} - assert isinstance(image_inline.metadata.chunk_grid, RegularChunkGrid) assert image_inline.metadata.chunk_grid.configuration.chunk_shape == [4, 512, 512] # Verify image_headers variable @@ -156,112 +155,19 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 assert len(headers.data_type.fields) == 4 assert headers.data_type.fields[0].name == "cdp-x" -def make_campos_3d_dataset() -> Dataset: - """Create in-memory campos_3d dataset.""" - ds = MDIODatasetBuilder( - "campos_3d", - attributes=UserAttributes(attributes={ - "textHeader": [ - "C01 .......................... ", - "C02 .......................... ", - "C03 .......................... 
", - ], - "foo": "bar" - })) - - # Add dimensions - ds.add_dimension("inline", 256) - ds.add_dimension("crossline", 512) - ds.add_dimension("depth", 384) - ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) - ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) - ds.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.FLOAT64, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) - ]) - # Add coordinates - ds.add_coordinate( - "cdp-x", - dimensions=["inline", "crossline"], - data_type=ScalarType.FLOAT32, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] - ) - ds.add_coordinate( - "cdp-y", - dimensions=["inline", "crossline"], - data_type=ScalarType.FLOAT32, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] - ) - - # Add image variable - ds.add_variable( - name="image", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - coordinates=["cdp-x", "cdp-y"], - metadata_info=[ - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) - ), - StatisticsMetadata( - stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram( - binCenters=[1, 2], counts=[10, 15]), - ) - ), - UserAttributes( - attributes={"fizz": "buzz", "UnitSystem": "Canonical"}), - ]) - # Add velocity variable - ds.add_variable( - name="velocity", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT16, - coordinates=["cdp-x", "cdp-y"], - metadata_info=[ - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) - ), - AllUnits(units_v1=SpeedUnitModel( - speed=SpeedUnitEnum.METER_PER_SECOND)), - ], - ) - # Add inline-optimized image variable - ds.add_variable( - name="image_inline", - long_name="inline optimized version of 3d_stack", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - coordinates=["cdp-x", "cdp-y"], - metadata_info=[ - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[4, 512, 512])) - )] - ) - # Add headers variable with structured dtype - ds.add_variable( + headers = validate_variable( + dataset, name="image_headers", - dimensions=["inline", "crossline"], - data_type=StructuredType( + dims=[("inline", 256), ("crossline", 512)], + coords=["cdp-x", "cdp-y"], + dtype=StructuredType( fields=[ - {"name": "cdp-x", "format": ScalarType.INT32}, - {"name": "cdp-y", "format": ScalarType.INT32}, - {"name": "elevation", "format": ScalarType.FLOAT16}, - {"name": "some_scalar", "format": ScalarType.FLOAT16}, + StructuredField(name="cdp-x", format=ScalarType.FLOAT32), + StructuredField(name="cdp-y", format=ScalarType.FLOAT32), + StructuredField(name="inline", format=ScalarType.UINT32), + StructuredField(name="crossline", format=ScalarType.UINT32), ] - ), - coordinates=["cdp-x", "cdp-y"], + ) ) - return ds.build() + + From 0778fdd251bba96595149f19ed675dcf92fa0c55 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 9 Jul 2025 19:17:18 +0000 Subject: [PATCH 17/27] Remove the Dockerfile changes. 
They are not supposed to be a part of this PR --- .devcontainer/Dockerfile.cli | 47 ------------------------- .devcontainer/Dockerfile.dev | 61 --------------------------------- .devcontainer/Dockerfile.nox | 49 -------------------------- .devcontainer/devcontainer.json | 20 +++++------ 4 files changed, 9 insertions(+), 168 deletions(-) delete mode 100644 .devcontainer/Dockerfile.cli delete mode 100644 .devcontainer/Dockerfile.dev delete mode 100644 .devcontainer/Dockerfile.nox diff --git a/.devcontainer/Dockerfile.cli b/.devcontainer/Dockerfile.cli deleted file mode 100644 index 92720e34..00000000 --- a/.devcontainer/Dockerfile.cli +++ /dev/null @@ -1,47 +0,0 @@ -# HOW TO BUILD AND RUN THIS DOCKERFILE -# * Clone mdio-python and build a Docker image: -# git clone https://github.com/TGSAI/mdio-python.git -# cd mdio-python -# docker build -t mdio-cli -f .devcontainer/Dockerfile.cli . -# * Run /bin/bash in the Docker container: -# -# -# USAGE: -# docker run -it --rm --name mdio-cli mdio-cli --version -# docker run -it --rm --name mdio-cli mdio-cli --help -# -# LOCAL_DATA_DIR=$(pwd); \ -# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ -# segy import \ -# /DATA/segy_file.segy \ -# /DATA/mdio_file.mdio \ -# -loc 181,185 \ -# -names inline,crossline -# -# LOCAL_DATA_DIR=$(pwd); \ -# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --name mdio-cli mdio-cli \ -# segy export \ -# /DATA/mdio_file.mdio \ -# /DATA/segy_file_copy.segy -# -FROM python:3.13-bookworm -# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) -ENV USERNAME=python -ENV USER_UID=1000 -ENV USER_GID=$USER_UID -RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME - -# Set the default non-root user -USER $USERNAME - -# Add path to the user-installed packages -ENV PYTHONUSERBASE=/home/$USERNAME/.local -ENV PATH="$PYTHONUSERBASE/bin:$PATH" - -COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python - -WORKDIR /home/$USERNAME/mdio-python -RUN pip install . - -ENTRYPOINT ["mdio"] -CMD ["--version"] diff --git a/.devcontainer/Dockerfile.dev b/.devcontainer/Dockerfile.dev deleted file mode 100644 index 05f13579..00000000 --- a/.devcontainer/Dockerfile.dev +++ /dev/null @@ -1,61 +0,0 @@ -# USAGE: -# This file will be used by the VS Code DevContainer extension -# to create a development environment for the mdio-python project. -# HOW TO RUN TESTS -# 1. Open the project in VS Code. -# 2. Open the Command Palette (Ctrl+Shift+P) and select "Dev Containers: Reopen in Container". -# 3. Once the container is running, open a terminal in VS Code. -# 4. Run the tests using the command: `nox -s test`. -# HOW TO MANUALLY BUILD AND RUN THE CONTAINER -# docker build -t mdio-dev -f .devcontainer/Dockerfile.dev . -# docker run -it --rm --entrypoint /bin/bash --name mdio-dev mdio-dev -# NOTES: -# 1. The container will be run as the non-root user 'vscode' with UID 1000. -# 2. The virtual environment will be setup at /home/vscode/venv -# 3. 
The project source code will be mounted at /workspaces/mdio-python -ARG PYTHON_VERSION="3.13" -ARG LINUX_DISTRO="bookworm" -ARG UV_VERSION="0.6.11" -ARG NOX_VERSION="2025.2.9" -FROM mcr.microsoft.com/devcontainers/python:1-${PYTHON_VERSION}-${LINUX_DISTRO} - -# Install git for nox pre-commit -RUN apt-get update \ - && apt-get install -y --no-install-recommends \ - git \ - && rm -rf /var/lib/apt/lists/* - -ENV USERNAME="vscode" -USER $USERNAME - -# # Add path to the user-installed packages -# ENV PYTHONUSERBASE=/home/$USERNAME/.local -# ENV PATH="$PYTHONUSERBASE/bin:$PATH" - -COPY --chown=$USERNAME:$USERNAME ./ /workspaces/mdio-python - -WORKDIR /workspaces/mdio-python - -ARG UV_VERSION -ARG NOX_VERSION -RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel - -# Initialize virtual environement in the container -ENV VIRTUAL_ENV="/home/$USERNAME/venv" -RUN python3 -m venv $VIRTUAL_ENV -ENV PATH="$VIRTUAL_ENV/bin:$PATH" - -# installing pytest is required for VS Code Python Testing -RUN pip install pytest pytest-cov pytest-mock pytest-asyncio - -# Install the project in editable mode -# This allows for live reloading of the code during development -RUN pip install -e . - -# RUN uv pip install snakeviz - - - - - - diff --git a/.devcontainer/Dockerfile.nox b/.devcontainer/Dockerfile.nox deleted file mode 100644 index 103673fd..00000000 --- a/.devcontainer/Dockerfile.nox +++ /dev/null @@ -1,49 +0,0 @@ -# HOW TO BUILD AND RUN THIS DOCKERFILE -# 1. Make sure you have Docker installed and running. -# 2. Clone mdio-python and build the Docker image: -# git clone https://github.com/TGSAI/mdio-python.git -# cd mdio-python -# docker build -t mdio-nox -f .devcontainer/Dockerfile.nox . -# 3. Run /bin/bash in the Docker container : -# LOCAL_DATA_DIR=$(pwd); \ -# docker run -it --rm -v $LOCAL_DATA_DIR:/DATA --entrypoint /bin/bash --name mdio-nox mdio-nox -# -# USAGE: -# docker run -it --rm mdio-nox --list -# docker run -it --rm mdio-nox -s tests-3.13 -# docker run -it --rm mdio-nox --no-stop-on-first-error -# -# NOTE: nox will fail if run in the directory mounted from the host machine -ARG PYTHON_VERSION="3.13" -ARG LINUX_DISTRO="bookworm" -ARG UV_VERSION="0.6.11" -ARG NOX_VERSION="2025.2.9" -FROM python:${PYTHON_VERSION}-${LINUX_DISTRO} -ARG PYTHON_VERSION -ARG LINUX_DISTRO -RUN echo "Using python:${PYTHON_VERSION}-${LINUX_DISTRO}" -# Create the user (https://code.visualstudio.com/remote/advancedcontainers/add-nonroot-user) -ENV USERNAME=python -ENV USER_UID=1000 -ENV USER_GID=$USER_UID -RUN groupadd --gid $USER_GID $USERNAME && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME -# Set the default non-root user -USER $USERNAME - -# Add path to the user-installed packages -ENV PYTHONUSERBASE=/home/$USERNAME/.local -ENV PATH="$PYTHONUSERBASE/bin:$PATH" - -COPY --chown=$USERNAME:$USERNAME ./ /home/$USERNAME/mdio-python - -WORKDIR /home/$USERNAME/mdio-python -RUN pip install . 
- -# Install UV dependency manager and Nox test automator -ARG UV_VERSION -ARG NOX_VERSION -RUN echo "Using uv: $UV_VERSION and nox: $NOX_VERSION" -RUN python3 -m pip install uv==${UV_VERSION} nox==${NOX_VERSION} msgpack ipykernel - -ENTRYPOINT ["nox"] -CMD ["--list"] \ No newline at end of file diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index dc80446c..b618a526 100755 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -2,12 +2,12 @@ // README at: https://github.com/devcontainers/templates/tree/main/src/python { "build": { - "dockerfile": "Dockerfile.dev", + "dockerfile": "Dockerfile", "context": ".." }, // Use 'postCreateCommand' to run commands after the container is created. "postCreateCommand": { - // "post_create_script": "bash ./.devcontainer/post-install.sh" + "post_create_script": "bash ./.devcontainer/post-install.sh" }, // Forward 8787 to enable us to view dask dashboard "forwardPorts": [8787], @@ -16,9 +16,8 @@ // Configure properties specific to VS Code. "vscode": { "settings": { - "python.testing.pytestArgs": ["tests"], - "python.testing.unittestEnabled": false, - "python.testing.pytestEnabled": true + "python.terminal.activateEnvInCurrentTerminal": true, + "python.defaultInterpreterPath": "/opt/venv/bin/python" }, "extensions": [ "ms-python.python", @@ -28,18 +27,17 @@ "ms-toolsai.jupyter-renderers", "vscode-icons-team.vscode-icons", "wayou.vscode-todo-highlight", - "streetsidesoftware.code-spell-checker", - "eamodio.gitlens", - "visualstudioexptteam.vscodeintellicode" + "streetsidesoftware.code-spell-checker" ] } }, // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. // "remoteUser": "root", "updateRemoteUserUID": true, - "workspaceMount": "source=${localWorkspaceFolder},target=/workspaces/mdio-python,type=bind", - "workspaceFolder": "/workspaces/mdio-python", "mounts": [ - // "source=${localWorkspaceFolder}/../DATA/,target=/DATA/,type=bind,consistency=cached" + // Re-use local Git configuration + "source=${localEnv:HOME}/.gitconfig,target=/home/vscode/.gitconfig_tmp,type=bind,consistency=cached", + "source=${localEnv:HOME}/.gitconfig,target=/root/.gitconfig_tmp,type=bind,consistency=cached", + "source=${localEnv:SCRATCH_DIR}/${localEnv:USER},target=/scratch/,type=bind,consistency=cached" ] } From 7e74567ec615d0982b2239b0e916dbfdf2a5db48 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Wed, 9 Jul 2025 19:38:26 +0000 Subject: [PATCH 18/27] PR Review: run ruff --- src/mdio/schemas/v1/dataset_builder.py | 10 +++--- tests/unit/v1/helpers.py | 34 +++++++++++-------- .../v1/test_dataset_builder_add_coordinate.py | 25 +++++++++----- .../v1/test_dataset_builder_add_dimension.py | 12 +------ .../v1/test_dataset_builder_add_variable.py | 19 +++++++---- tests/unit/v1/test_dataset_builder_build.py | 18 +++------- 6 files changed, 58 insertions(+), 60 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 3bfd914e..5b102ddd 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -4,9 +4,9 @@ from datetime import datetime from enum import Enum from enum import auto +from importlib import metadata from typing import Any from typing import TypeAlias -from importlib import metadata from pydantic import BaseModel from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding # noqa: F401 @@ -18,7 +18,7 @@ from mdio.schemas.dtype import StructuredType from mdio.schemas.metadata import 
ChunkGridMetadata from mdio.schemas.metadata import UserAttributes -from mdio.schemas.v1.dataset import Dataset, DatasetMetadata +from mdio.schemas.v1.dataset import Dataset from mdio.schemas.v1.dataset import DatasetInfo from mdio.schemas.v1.stats import StatisticsMetadata from mdio.schemas.v1.units import AllUnits @@ -167,6 +167,7 @@ def add_coordinate( # noqa: PLR0913 long_name: Optional long name for the coordinate dimensions: List of dimension names that the coordinate is associated with data_type: Data type for the coordinate (defaults to FLOAT32) + compressor: Compressor used for the variable (defaults to None) metadata_info: Optional metadata information for the coordinate Returns: @@ -290,9 +291,8 @@ def add_variable( # noqa: PLR0913 msg = f"Pre-existing coordinate named {coord!r} is not found" raise ValueError(msg) - if coordinates is not None: - # If this is a dimension coordinate variable, embed the Coordinate into it - if len(coordinates) == 1 and coordinates[0] == name: + # If this is a dimension coordinate variable, embed the Coordinate into it + if coordinates is not None and len(coordinates) == 1 and coordinates[0] == name: coordinates = coordinate_objs meta_dict = _to_dictionary(metadata_info) diff --git a/tests/unit/v1/helpers.py b/tests/unit/v1/helpers.py index 814357b4..699425c1 100644 --- a/tests/unit/v1/helpers.py +++ b/tests/unit/v1/helpers.py @@ -1,13 +1,15 @@ - +"""Helper methods used in unit tests.""" from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.compressors import Blosc -from mdio.schemas.dtype import ScalarType, StructuredField +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredField from mdio.schemas.dtype import StructuredType from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes from mdio.schemas.v1.dataset import Dataset -from mdio.schemas.v1.dataset_builder import _BuilderState, MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _BuilderState from mdio.schemas.v1.dataset_builder import _get_named_dimension from mdio.schemas.v1.stats import CenteredBinHistogram from mdio.schemas.v1.stats import StatisticsMetadata @@ -17,7 +19,8 @@ from mdio.schemas.v1.units import LengthUnitModel from mdio.schemas.v1.units import SpeedUnitEnum from mdio.schemas.v1.units import SpeedUnitModel -from mdio.schemas.v1.variable import Coordinate, Variable +from mdio.schemas.v1.variable import Coordinate +from mdio.schemas.v1.variable import Variable def validate_builder(builder: MDIODatasetBuilder, @@ -25,7 +28,7 @@ def validate_builder(builder: MDIODatasetBuilder, n_dims: int, n_coords: int, n_var: int) -> None: - """Validate the state of the builder and the number of dimensions, coordinates, and variables.""" + """Validate the state of the builder, the number of dimensions, coordinates, and variables.""" assert builder._state == state assert len(builder._dimensions) == n_dims assert len(builder._coordinates) == n_coords @@ -58,7 +61,6 @@ def validate_variable(container: MDIODatasetBuilder | Dataset, coords: list[str], dtype: ScalarType) -> Variable: """Validate existence and the structure of the created variable.""" - if isinstance(container, MDIODatasetBuilder): var_list = container._variables global_coord_list = container._coordinates @@ -66,7 +68,8 @@ def validate_variable(container: MDIODatasetBuilder | Dataset, var_list = 
container.variables global_coord_list = _get_all_coordinates(container) else: - raise TypeError("container must be MDIODatasetBuilder or Dataset") + err_msg = f"Expected MDIODatasetBuilder or Dataset, got {type(container)}" + raise TypeError(err_msg) # Validate that the variable exists v = next((e for e in var_list if e.name == name), None) @@ -93,23 +96,26 @@ def validate_variable(container: MDIODatasetBuilder | Dataset, def _get_coordinate( - global_coord_list: list[Coordinate], - coordinates_or_references: list[Coordinate] | list[str], - name: str, size: int | None = None -) -> Coordinate | None: + global_coord_list: list[Coordinate], + coordinates_or_references: list[Coordinate] | list[str], + name: str) -> Coordinate | None: """Get a coordinate by name from the list[Coordinate] | list[str]. - Validates that the coordinate referenced by the name can be found in the global coordinate list + + The function validates that the coordinate referenced by the name can be found + in the global coordinate list. + If the coordinate is stored as a Coordinate object, it is returned directly. """ if coordinates_or_references is None: return None for c in coordinates_or_references: if isinstance(c, str) and c == name: - # The coordinate is stored by name (str). + # The coordinate is stored by name (str). cc = None # Find the Coordinate in the global list and return it. if global_coord_list is not None: - cc = next((cc for cc in global_coord_list if cc.name == name), None) + cc = next( + (cc for cc in global_coord_list if cc.name == name), None) if cc is None: msg = f"Pre-existing coordinate named {name!r} is not found" raise ValueError(msg) diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index 232c525d..334c27b8 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -6,17 +6,20 @@ import pytest -from mdio.schemas import builder from mdio.schemas.compressors import Blosc from mdio.schemas.dtype import ScalarType from mdio.schemas.metadata import UserAttributes -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, _get_named_dimension +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_builder import _BuilderState from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel from mdio.schemas.v1.variable import VariableMetadata -from .helpers import validate_builder, validate_coordinate, validate_variable + +from .helpers import validate_builder +from .helpers import validate_coordinate +from .helpers import validate_variable + def test_add_coordinate() -> None: """Test adding coordinates. 
Check the state transition and validate required parameters.""" @@ -25,7 +28,8 @@ def test_add_coordinate() -> None: msg = "Must add at least one dimension before adding coordinates" with pytest.raises(ValueError, match=msg): - builder.add_coordinate("cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate( + "cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) @@ -42,12 +46,14 @@ def test_add_coordinate() -> None: builder.add_coordinate("cdp-x", dimensions=[], data_type=ScalarType.FLOAT32) # Add a variable using non-existent dimensions - msg="Pre-existing dimension named 'xline' is not found" + msg = "Pre-existing dimension named 'xline' is not found" with pytest.raises(ValueError, match=msg): - builder.add_coordinate("bad_cdp-x", dimensions=["inline", "xline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate( + "bad_cdp-x", dimensions=["inline", "xline"], data_type=ScalarType.FLOAT32) # Validate state transition - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate( + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) validate_variable( builder, @@ -58,9 +64,10 @@ def test_add_coordinate() -> None: ) # Adding coordinate with the same name twice - msg="Adding coordinate with the same name twice is not allowed" + msg = "Adding coordinate with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate( + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) def test_add_coordinate_with_defaults() -> None: """Test adding coordinates with default arguments.""" diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index 5fd70a01..e11a788d 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -6,20 +6,10 @@ import pytest -from mdio.schemas.chunk_grid import RegularChunkGrid -from mdio.schemas.chunk_grid import RegularChunkShape -from mdio.schemas.dtype import ScalarType -from mdio.schemas.metadata import ChunkGridMetadata -from mdio.schemas.metadata import UserAttributes from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_builder import _BuilderState from mdio.schemas.v1.dataset_builder import _get_named_dimension -from mdio.schemas.v1.stats import CenteredBinHistogram -from mdio.schemas.v1.stats import StatisticsMetadata -from mdio.schemas.v1.stats import SummaryStatistics -from mdio.schemas.v1.units import AllUnits -from mdio.schemas.v1.units import LengthUnitEnum -from mdio.schemas.v1.units import LengthUnitModel + from .helpers import validate_builder diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 3bd5ef37..8227be47 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -6,14 +6,13 @@ import pytest -from mdio.schemas import builder from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.compressors import Blosc from 
mdio.schemas.dtype import ScalarType from mdio.schemas.metadata import ChunkGridMetadata from mdio.schemas.metadata import UserAttributes -from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder, _get_named_dimension +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_builder import _BuilderState from mdio.schemas.v1.stats import CenteredBinHistogram from mdio.schemas.v1.stats import StatisticsMetadata @@ -21,10 +20,12 @@ from mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import LengthUnitModel -from mdio.schemas.v1.variable import VariableMetadata, Variable +from mdio.schemas.v1.variable import VariableMetadata + from .helpers import validate_builder from .helpers import validate_variable + def test_add_variable_no_coords() -> None: """Test adding variable. Check the state transition and validate required parameters..""" builder = MDIODatasetBuilder("test_dataset") @@ -112,8 +113,10 @@ def test_add_variable_with_coords() -> None: dtype=ScalarType.FLOAT32) # Add non-dim coordinates (e.g., 2D coordinates) - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) - builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate( + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + builder.add_coordinate( + "cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) # Add a variable with pre-defined dimension and non-dimension coordinates builder.add_variable("ampl2", @@ -169,8 +172,10 @@ def test_add_variable_full_parameters() -> None: builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) builder.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.UINT32) # Add coordinates before we can add a data variable - builder.add_coordinate("cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) - builder.add_coordinate("cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) + builder.add_coordinate( + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) + builder.add_coordinate( + "cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) # Add data variable with full parameters builder.add_variable("ampl", diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index f2bda0e5..d42f29a7 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -5,27 +5,17 @@ """Tests the schema v1 dataset_builder.add_coordinate() public API.""" -from mdio.schemas.chunk_grid import RegularChunkGrid -from mdio.schemas.chunk_grid import RegularChunkShape -from mdio.schemas.compressors import Blosc -from mdio.schemas.dtype import ScalarType, StructuredField +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredField from mdio.schemas.dtype import StructuredType -from mdio.schemas.metadata import ChunkGridMetadata -from mdio.schemas.metadata import UserAttributes from mdio.schemas.v1.dataset import Dataset from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder -from mdio.schemas.v1.dataset_builder import _get_named_dimension -from mdio.schemas.v1.stats import CenteredBinHistogram -from mdio.schemas.v1.stats import StatisticsMetadata -from mdio.schemas.v1.stats import SummaryStatistics -from 
mdio.schemas.v1.units import AllUnits from mdio.schemas.v1.units import LengthUnitEnum -from mdio.schemas.v1.units import LengthUnitModel from mdio.schemas.v1.units import SpeedUnitEnum -from mdio.schemas.v1.units import SpeedUnitModel + from .helpers import make_campos_3d_dataset +from .helpers import validate_variable -from .helpers import validate_builder, validate_coordinate, validate_variable def test_build() -> None: """Test building a complete dataset.""" From 0aaa5f6110eb7afd59a5cb901a747da5fc60cbde Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Thu, 10 Jul 2025 15:10:07 +0000 Subject: [PATCH 19/27] PR Review: fix pre-commit errors --- src/mdio/schemas/v1/dataset_builder.py | 60 +++--- tests/unit/v1/helpers.py | 109 +++++----- .../v1/test_dataset_builder_add_coordinate.py | 44 ++-- .../v1/test_dataset_builder_add_dimension.py | 3 +- .../v1/test_dataset_builder_add_variable.py | 200 ++++++++++-------- tests/unit/v1/test_dataset_builder_build.py | 52 ++--- tests/unit/v1/test_dataset_builder_helpers.py | 68 +++--- 7 files changed, 283 insertions(+), 253 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 5b102ddd..85bae21f 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ b/src/mdio/schemas/v1/dataset_builder.py @@ -25,16 +25,13 @@ from mdio.schemas.v1.variable import Coordinate from mdio.schemas.v1.variable import Variable -AnyMetadataList: TypeAlias = list[AllUnits | - UserAttributes | - ChunkGridMetadata | - StatisticsMetadata | - DatasetInfo] +AnyMetadataList: TypeAlias = list[ + AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata | DatasetInfo +] CoordinateMetadataList: TypeAlias = list[AllUnits | UserAttributes] -VariableMetadataList: TypeAlias = list[AllUnits | - UserAttributes | - ChunkGridMetadata | - StatisticsMetadata] +VariableMetadataList: TypeAlias = list[ + AllUnits | UserAttributes | ChunkGridMetadata | StatisticsMetadata +] DatasetMetadataList: TypeAlias = list[DatasetInfo | UserAttributes] @@ -97,17 +94,12 @@ class MDIODatasetBuilder: """ def __init__(self, name: str, attributes: UserAttributes | None = None): - try: api_version = metadata.version("multidimio") except metadata.PackageNotFoundError: api_version = "unknown" - self._info = DatasetInfo( - name=name, - api_version=api_version, - created_on=datetime.now(UTC) - ) + self._info = DatasetInfo(name=name, api_version=api_version, created_on=datetime.now(UTC)) self._attributes = attributes self._dimensions: list[NamedDimension] = [] self._coordinates: list[Coordinate] = [] @@ -116,9 +108,7 @@ def __init__(self, name: str, attributes: UserAttributes | None = None): self._unnamed_variable_counter = 0 def add_dimension( # noqa: PLR0913 - self, - name: str, - size: int + self, name: str, size: int ) -> "MDIODatasetBuilder": """Add a dimension. @@ -128,6 +118,10 @@ def add_dimension( # noqa: PLR0913 name: Name of the dimension size: Size of the dimension + Raises: + ValueError: If 'name' is not a non-empty string. + if the dimension is already defined. + Returns: self: Returns self for method chaining """ @@ -146,7 +140,6 @@ def add_dimension( # noqa: PLR0913 self._state = _BuilderState.HAS_DIMENSIONS return self - def add_coordinate( # noqa: PLR0913 self, name: str, @@ -170,6 +163,13 @@ def add_coordinate( # noqa: PLR0913 compressor: Compressor used for the variable (defaults to None) metadata_info: Optional metadata information for the coordinate + Raises: + ValueError: If no dimensions have been added yet. 
+ If 'name' is not a non-empty string. + If 'dimensions' is not a non-empty list. + If the coordinate is already defined. + If any referenced dimension is not already defined. + Returns: self: Returns self for method chaining """ @@ -195,7 +195,7 @@ def add_coordinate( # noqa: PLR0913 if nd is None: msg = f"Pre-existing dimension named {dim_name!r} is not found" raise ValueError(msg) - named_dimensions.append(nd) + named_dimensions.append(nd) meta_dict = _to_dictionary(metadata_info) coord = Coordinate( @@ -204,7 +204,7 @@ def add_coordinate( # noqa: PLR0913 dimensions=named_dimensions, compressor=compressor, dataType=data_type, - metadata=meta_dict + metadata=meta_dict, ) self._coordinates.append(coord) @@ -216,7 +216,7 @@ def add_coordinate( # noqa: PLR0913 data_type=coord.data_type, compressor=compressor, coordinates=[name], # Use the coordinate name as a reference - metadata_info=coord.metadata + metadata_info=coord.metadata, ) self._state = _BuilderState.HAS_COORDINATES @@ -252,6 +252,14 @@ def add_variable( # noqa: PLR0913 (defaults to None, meaning no coordinates) metadata_info: Optional metadata information for the variable + Raises: + ValueError: If no dimensions have been added yet. + If 'name' is not a non-empty string. + If 'dimensions' is not a non-empty list. + If the variable is already defined. + If any referenced dimension is not already defined. + If any referenced coordinate is not already defined. + Returns: self: Returns self for method chaining. """ @@ -293,7 +301,7 @@ def add_variable( # noqa: PLR0913 # If this is a dimension coordinate variable, embed the Coordinate into it if coordinates is not None and len(coordinates) == 1 and coordinates[0] == name: - coordinates = coordinate_objs + coordinates = coordinate_objs meta_dict = _to_dictionary(metadata_info) var = Variable( @@ -303,7 +311,8 @@ def add_variable( # noqa: PLR0913 data_type=data_type, compressor=compressor, coordinates=coordinates, - metadata=meta_dict) + metadata=meta_dict, + ) self._variables.append(var) self._state = _BuilderState.HAS_VARIABLES @@ -315,6 +324,9 @@ def build(self) -> Dataset: This function must be called after at least one dimension is added via add_dimension(). It will create a Dataset object with all added dimensions, coordinates, and variables. + Raises: + ValueError: If no dimensions have been added yet. + Returns: Dataset: The built dataset with all added dimensions, coordinates, and variables. 
""" diff --git a/tests/unit/v1/helpers.py b/tests/unit/v1/helpers.py index 699425c1..2058bdd5 100644 --- a/tests/unit/v1/helpers.py +++ b/tests/unit/v1/helpers.py @@ -1,4 +1,5 @@ """Helper methods used in unit tests.""" + from mdio.schemas.chunk_grid import RegularChunkGrid from mdio.schemas.chunk_grid import RegularChunkShape from mdio.schemas.compressors import Blosc @@ -23,11 +24,9 @@ from mdio.schemas.v1.variable import Variable -def validate_builder(builder: MDIODatasetBuilder, - state: _BuilderState, - n_dims: int, - n_coords: int, - n_var: int) -> None: +def validate_builder( + builder: MDIODatasetBuilder, state: _BuilderState, n_dims: int, n_coords: int, n_var: int +) -> None: """Validate the state of the builder, the number of dimensions, coordinates, and variables.""" assert builder._state == state assert len(builder._dimensions) == n_dims @@ -35,10 +34,9 @@ def validate_builder(builder: MDIODatasetBuilder, assert len(builder._variables) == n_var -def validate_coordinate(builder: MDIODatasetBuilder, - name: str, - dims: list[tuple[str, int]], - dtype: ScalarType) -> Coordinate: +def validate_coordinate( + builder: MDIODatasetBuilder, name: str, dims: list[tuple[str, int]], dtype: ScalarType +) -> Coordinate: """Validate existence and the structure of the created coordinate.""" # Validate that coordinate exists c = next((c for c in builder._coordinates if c.name == name), None) @@ -55,11 +53,13 @@ def validate_coordinate(builder: MDIODatasetBuilder, return c -def validate_variable(container: MDIODatasetBuilder | Dataset, - name: str, - dims: list[tuple[str, int]], - coords: list[str], - dtype: ScalarType) -> Variable: +def validate_variable( + container: MDIODatasetBuilder | Dataset, + name: str, + dims: list[tuple[str, int]], + coords: list[str], + dtype: ScalarType, +) -> Variable: """Validate existence and the structure of the created variable.""" if isinstance(container, MDIODatasetBuilder): var_list = container._variables @@ -90,19 +90,20 @@ def validate_variable(container: MDIODatasetBuilder | Dataset, assert len(v.coordinates) == len(coords) for coord_name in coords: assert _get_coordinate(global_coord_list, v.coordinates, coord_name) is not None - + assert v.data_type == dtype return v def _get_coordinate( - global_coord_list: list[Coordinate], - coordinates_or_references: list[Coordinate] | list[str], - name: str) -> Coordinate | None: + global_coord_list: list[Coordinate], + coordinates_or_references: list[Coordinate] | list[str], + name: str, +) -> Coordinate | None: """Get a coordinate by name from the list[Coordinate] | list[str]. - The function validates that the coordinate referenced by the name can be found - in the global coordinate list. + The function validates that the coordinate referenced by the name can be found + in the global coordinate list. If the coordinate is stored as a Coordinate object, it is returned directly. """ if coordinates_or_references is None: @@ -114,8 +115,7 @@ def _get_coordinate( cc = None # Find the Coordinate in the global list and return it. 
if global_coord_list is not None: - cc = next( - (cc for cc in global_coord_list if cc.name == name), None) + cc = next((cc for cc in global_coord_list if cc.name == name), None) if cc is None: msg = f"Pre-existing coordinate named {name!r} is not found" raise ValueError(msg) @@ -142,39 +142,42 @@ def make_campos_3d_dataset() -> Dataset: """Create in-memory campos_3d dataset.""" ds = MDIODatasetBuilder( "campos_3d", - attributes=UserAttributes(attributes={ - "textHeader": [ - "C01 .......................... ", - "C02 .......................... ", - "C03 .......................... ", - ], - "foo": "bar" - })) + attributes=UserAttributes( + attributes={ + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... ", + ], + "foo": "bar", + } + ), + ) # Add dimensions ds.add_dimension("inline", 256) ds.add_dimension("crossline", 512) ds.add_dimension("depth", 384) - ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) - ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) - ds.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.FLOAT64, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) - ]) + ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) + ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + ds.add_coordinate( + "depth", + dimensions=["depth"], + data_type=ScalarType.FLOAT64, + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], + ) # Add coordinates ds.add_coordinate( "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], ) ds.add_coordinate( "cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))] + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], ) # Add image variable @@ -187,7 +190,8 @@ def make_campos_3d_dataset() -> Dataset: metadata_info=[ ChunkGridMetadata( chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + configuration=RegularChunkShape(chunk_shape=[128, 128, 128]) + ) ), StatisticsMetadata( stats_v1=SummaryStatistics( @@ -196,13 +200,12 @@ def make_campos_3d_dataset() -> Dataset: sumSquares=125.12, min=5.61, max=10.84, - histogram=CenteredBinHistogram( - binCenters=[1, 2], counts=[10, 15]), + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), ) ), - UserAttributes( - attributes={"fizz": "buzz", "UnitSystem": "Canonical"}), - ]) + UserAttributes(attributes={"fizz": "buzz", "UnitSystem": "Canonical"}), + ], + ) # Add velocity variable ds.add_variable( name="velocity", @@ -212,10 +215,10 @@ def make_campos_3d_dataset() -> Dataset: metadata_info=[ ChunkGridMetadata( chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[128, 128, 128])) + configuration=RegularChunkShape(chunk_shape=[128, 128, 128]) + ) ), - AllUnits(units_v1=SpeedUnitModel( - speed=SpeedUnitEnum.METER_PER_SECOND)), + AllUnits(units_v1=SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND)), ], ) # Add inline-optimized image variable @@ -229,8 +232,10 @@ def make_campos_3d_dataset() -> Dataset: metadata_info=[ ChunkGridMetadata( chunk_grid=RegularChunkGrid( 
- configuration=RegularChunkShape(chunk_shape=[4, 512, 512])) - )] + configuration=RegularChunkShape(chunk_shape=[4, 512, 512]) + ) + ) + ], ) # Add headers variable with structured dtype ds.add_variable( @@ -240,7 +245,7 @@ def make_campos_3d_dataset() -> Dataset: fields=[ StructuredField(name="cdp-x", format=ScalarType.FLOAT32), StructuredField(name="cdp-y", format=ScalarType.FLOAT32), - StructuredField(name="inline", format=ScalarType.UINT32), + StructuredField(name="inline", format=ScalarType.UINT32), StructuredField(name="crossline", format=ScalarType.UINT32), ] ), diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index 334c27b8..46cb27cd 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -25,11 +25,12 @@ def test_add_coordinate() -> None: """Test adding coordinates. Check the state transition and validate required parameters.""" builder = MDIODatasetBuilder("test_dataset") assert builder._state == _BuilderState.INITIAL - + msg = "Must add at least one dimension before adding coordinates" with pytest.raises(ValueError, match=msg): builder.add_coordinate( - "cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + "cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32 + ) builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) @@ -49,25 +50,29 @@ def test_add_coordinate() -> None: msg = "Pre-existing dimension named 'xline' is not found" with pytest.raises(ValueError, match=msg): builder.add_coordinate( - "bad_cdp-x", dimensions=["inline", "xline"], data_type=ScalarType.FLOAT32) + "bad_cdp-x", dimensions=["inline", "xline"], data_type=ScalarType.FLOAT32 + ) # Validate state transition builder.add_coordinate( - "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32 + ) validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) validate_variable( builder, name="cdp-x", dims=[("inline", 100), ("crossline", 200)], coords=["cdp-x"], - dtype=ScalarType.FLOAT32 + dtype=ScalarType.FLOAT32, ) # Adding coordinate with the same name twice msg = "Adding coordinate with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): builder.add_coordinate( - "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32 + ) + def test_add_coordinate_with_defaults() -> None: """Test adding coordinates with default arguments.""" @@ -79,21 +84,18 @@ def test_add_coordinate_with_defaults() -> None: builder.add_coordinate("cdp", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) validate_coordinate( - builder, - name="cdp", - dims=[("inline", 100), ("crossline", 200)], - dtype=ScalarType.FLOAT32 + builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], dtype=ScalarType.FLOAT32 ) v = validate_variable( builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], coords=["cdp"], - dtype=ScalarType.FLOAT32 + dtype=ScalarType.FLOAT32, ) assert v.long_name == "'cdp' coordinate variable" # Default value - assert v.compressor is None # Default value - assert v.metadata is None # Default value + assert v.compressor is None # Default value + assert v.metadata is None # 
Default value def test_coordinate_with_full_parameters() -> None: @@ -105,20 +107,18 @@ def test_coordinate_with_full_parameters() -> None: # Add coordinate with all metadata builder.add_coordinate( "cdp", - long_name = "Common Depth Point", + long_name="Common Depth Point", dimensions=["inline", "crossline"], - data_type = ScalarType.FLOAT16, - compressor = Blosc(algorithm="zstd"), + data_type=ScalarType.FLOAT16, + compressor=Blosc(algorithm="zstd"), metadata_info=[ AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), - UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"})] + UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), + ], ) validate_builder(builder, _BuilderState.HAS_COORDINATES, n_dims=2, n_coords=1, n_var=1) c = validate_coordinate( - builder, - name="cdp", - dims=[("inline", 100), ("crossline", 200)], - dtype=ScalarType.FLOAT16 + builder, name="cdp", dims=[("inline", 100), ("crossline", 200)], dtype=ScalarType.FLOAT16 ) assert c.long_name == "Common Depth Point" assert isinstance(c.compressor, Blosc) @@ -131,7 +131,7 @@ def test_coordinate_with_full_parameters() -> None: name="cdp", dims=[("inline", 100), ("crossline", 200)], coords=["cdp"], - dtype=ScalarType.FLOAT16 + dtype=ScalarType.FLOAT16, ) assert isinstance(v.compressor, Blosc) assert v.compressor.algorithm == "zstd" diff --git a/tests/unit/v1/test_dataset_builder_add_dimension.py b/tests/unit/v1/test_dataset_builder_add_dimension.py index e11a788d..ddcbd745 100644 --- a/tests/unit/v1/test_dataset_builder_add_dimension.py +++ b/tests/unit/v1/test_dataset_builder_add_dimension.py @@ -38,7 +38,6 @@ def test_add_dimension() -> None: builder.add_dimension("x", 200) # Adding dimension with the same name twice - msg="Adding dimension with the same name twice is not allowed" + msg = "Adding dimension with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): builder.add_dimension("x", 200) - diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 8227be47..84abfe3a 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -34,10 +34,9 @@ def test_add_variable_no_coords() -> None: # Validate: Must add at least one dimension before adding variables msg = "Must add at least one dimension before adding variables" with pytest.raises(ValueError, match=msg): - builder.add_variable("amplitude", dimensions=[ - "speed"], data_type=ScalarType.FLOAT32) + builder.add_variable("amplitude", dimensions=["speed"], data_type=ScalarType.FLOAT32) - # Add dimension before we can add a data variable + # Add dimension before we can add a data variable builder.add_dimension("inline", 100) builder.add_dimension("crossline", 200) builder.add_dimension("depth", 300) @@ -45,41 +44,40 @@ def test_add_variable_no_coords() -> None: # Validate: required parameters must be preset bad_name = None with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_variable(bad_name, dimensions=[ - "speed"], data_type=ScalarType.FLOAT32) + builder.add_variable(bad_name, dimensions=["speed"], data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'name' must be a non-empty string"): - builder.add_variable( - "", dimensions=["speed"], data_type=ScalarType.FLOAT32) + builder.add_variable("", dimensions=["speed"], data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - 
builder.add_variable("bad_amplitude", dimensions=None, - data_type=ScalarType.FLOAT32) + builder.add_variable("bad_amplitude", dimensions=None, data_type=ScalarType.FLOAT32) with pytest.raises(ValueError, match="'dimensions' must be a non-empty list"): - builder.add_variable("bad_amplitude", dimensions=[], - data_type=ScalarType.FLOAT32) + builder.add_variable("bad_amplitude", dimensions=[], data_type=ScalarType.FLOAT32) # Validate: Add a variable using non-existent dimensions is not allowed msg = "Pre-existing dimension named 'il' is not found" with pytest.raises(ValueError, match=msg): - builder.add_variable("bad_amplitude", - dimensions=["il", "xl", "depth"], - data_type=ScalarType.FLOAT32) + builder.add_variable( + "bad_amplitude", dimensions=["il", "xl", "depth"], data_type=ScalarType.FLOAT32 + ) # Add a variable without coordinates - builder.add_variable("amplitude", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32) + builder.add_variable( + "amplitude", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32 + ) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=0, n_var=1) - validate_variable(builder, "amplitude", - dims=[("inline", 100), ("crossline", 200), ("depth", 300)], - coords=None, - dtype=ScalarType.FLOAT32) + validate_variable( + builder, + "amplitude", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=None, + dtype=ScalarType.FLOAT32, + ) # Validate: adding a variable with the same name twice is not allowed msg = "Adding variable with the same name twice is not allowed" with pytest.raises(ValueError, match=msg): - builder.add_variable("amplitude", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32) + builder.add_variable( + "amplitude", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32 + ) def test_add_variable_with_coords() -> None: @@ -96,38 +94,52 @@ def test_add_variable_with_coords() -> None: # Validate: adding a variable with a coordinate that has not been pre-created is not allowed msg = "Pre-existing coordinate named 'depth' is not found" with pytest.raises(ValueError, match=msg): - builder.add_variable("ampl", - dimensions=["inline", "crossline", "depth"], - coordinates=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32) + builder.add_variable( + "ampl", + dimensions=["inline", "crossline", "depth"], + coordinates=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + ) # Add a variable with pre-defined dimension coordinates - builder.add_variable("ampl", - dimensions=["inline", "crossline", "depth"], - coordinates=["inline", "crossline"], - data_type=ScalarType.FLOAT32) + builder.add_variable( + "ampl", + dimensions=["inline", "crossline", "depth"], + coordinates=["inline", "crossline"], + data_type=ScalarType.FLOAT32, + ) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=2, n_var=3) - validate_variable(builder, "ampl", - dims=[("inline", 100), ("crossline", 200), ("depth", 300)], - coords=["inline", "crossline"], - dtype=ScalarType.FLOAT32) + validate_variable( + builder, + "ampl", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=["inline", "crossline"], + dtype=ScalarType.FLOAT32, + ) # Add non-dim coordinates (e.g., 2D coordinates) builder.add_coordinate( - "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32 + ) builder.add_coordinate( - "cdp-y", 
dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) + "cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32 + ) # Add a variable with pre-defined dimension and non-dimension coordinates - builder.add_variable("ampl2", - dimensions=["inline", "crossline", "depth"], - coordinates=["inline", "crossline", "cdp-x", "cdp-y"], - data_type=ScalarType.FLOAT32) + builder.add_variable( + "ampl2", + dimensions=["inline", "crossline", "depth"], + coordinates=["inline", "crossline", "cdp-x", "cdp-y"], + data_type=ScalarType.FLOAT32, + ) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=4, n_var=6) - validate_variable(builder, "ampl2", - dims=[("inline", 100), ("crossline", 200), ("depth", 300)], - coords=["inline", "crossline", "cdp-x", "cdp-y"], - dtype=ScalarType.FLOAT32) + validate_variable( + builder, + "ampl2", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=["inline", "crossline", "cdp-x", "cdp-y"], + dtype=ScalarType.FLOAT32, + ) def test_add_variable_with_defaults() -> None: @@ -140,20 +152,25 @@ def test_add_variable_with_defaults() -> None: # Add dimension coordinates builder.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) builder.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) - builder.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.UINT32, - metadata_info=[ - AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER)) - ]) + builder.add_coordinate( + "depth", + dimensions=["depth"], + data_type=ScalarType.UINT32, + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], + ) # Add data variable using defaults - builder.add_variable("ampl", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32) + builder.add_variable( + "ampl", dimensions=["inline", "crossline", "depth"], data_type=ScalarType.FLOAT32 + ) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=3, n_var=4) - v = validate_variable(builder, "ampl", - dims=[("inline", 100), ("crossline", 200), ("depth", 300)], - coords=None, - dtype=ScalarType.FLOAT32) + v = validate_variable( + builder, + "ampl", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=None, + dtype=ScalarType.FLOAT32, + ) assert v.long_name is None # Default value assert v.compressor is None # Default value assert v.coordinates is None # Default value @@ -173,43 +190,46 @@ def test_add_variable_full_parameters() -> None: builder.add_coordinate("depth", dimensions=["depth"], data_type=ScalarType.UINT32) # Add coordinates before we can add a data variable builder.add_coordinate( - "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) + "cdp-x", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64 + ) builder.add_coordinate( - "cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64) + "cdp-y", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT64 + ) # Add data variable with full parameters - builder.add_variable("ampl", - long_name="Amplitude (dimensionless)", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], - metadata_info=[ - AllUnits(units_v1=LengthUnitModel( - length=LengthUnitEnum.FOOT)), - UserAttributes( - attributes={"MGA": 51, "UnitSystem": "Imperial"}), - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - 
configuration=RegularChunkShape(chunk_shape=[20])) - ), - StatisticsMetadata( - stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram( - binCenters=[1, 2], counts=[10, 15]), - ) - ), - ]) + builder.add_variable( + "ampl", + long_name="Amplitude (dimensionless)", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata_info=[ + AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.FOOT)), + UserAttributes(attributes={"MGA": 51, "UnitSystem": "Imperial"}), + ChunkGridMetadata( + chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[20])) + ), + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), + ) + ), + ], + ) validate_builder(builder, _BuilderState.HAS_VARIABLES, n_dims=3, n_coords=5, n_var=6) - v = validate_variable(builder, "ampl", - dims=[("inline", 100), ("crossline", 200), ("depth", 300)], - coords=["inline", "crossline", "depth", "cdp-x", "cdp-y"], - dtype=ScalarType.FLOAT32) + v = validate_variable( + builder, + "ampl", + dims=[("inline", 100), ("crossline", 200), ("depth", 300)], + coords=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + dtype=ScalarType.FLOAT32, + ) assert v.long_name == "Amplitude (dimensionless)" assert isinstance(v.compressor, Blosc) assert v.compressor.algorithm == "zstd" diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index d42f29a7..aa5fad55 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -4,7 +4,6 @@ # Thus, disable it for this file """Tests the schema v1 dataset_builder.add_coordinate() public API.""" - from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredField from mdio.schemas.dtype import StructuredType @@ -23,15 +22,17 @@ def test_build() -> None: MDIODatasetBuilder("test_dataset") .add_dimension("inline", 100) .add_dimension("crossline", 200) - .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) - .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) + .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) + .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) .add_coordinate("x_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) .add_coordinate("y_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) - .add_variable("data", - long_name="Test Data", - dimensions=["inline", "crossline"], - coordinates=["inline", "crossline", "x_coord", "y_coord"], - data_type=ScalarType.FLOAT32) + .add_variable( + "data", + long_name="Test Data", + dimensions=["inline", "crossline"], + coordinates=["inline", "crossline", "x_coord", "y_coord"], + data_type=ScalarType.FLOAT32, + ) .build() ) @@ -45,7 +46,8 @@ def test_build() -> None: assert next(v for v in dataset.variables if v.name == "y_coord") is not None assert next(v for v in dataset.variables if v.name == "data") is not None -def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50) + +def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50) """Test building a Campos 3D dataset with 
multiple variables and attributes.""" dataset = make_campos_3d_dataset() @@ -61,11 +63,7 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 # Verify dimension coordinate variables validate_variable( - dataset, - name="inline", - dims=[("inline", 256)], - coords=["inline"], - dtype=ScalarType.UINT32 + dataset, name="inline", dims=[("inline", 256)], coords=["inline"], dtype=ScalarType.UINT32 ) validate_variable( @@ -73,15 +71,11 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 name="crossline", dims=[("crossline", 512)], coords=["crossline"], - dtype=ScalarType.UINT32 + dtype=ScalarType.UINT32, ) depth = validate_variable( - dataset, - name="depth", - dims=[("depth", 384)], - coords=["depth"], - dtype=ScalarType.FLOAT64 + dataset, name="depth", dims=[("depth", 384)], coords=["depth"], dtype=ScalarType.FLOAT64 ) assert depth.metadata.units_v1.length == LengthUnitEnum.METER @@ -91,7 +85,7 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 name="cdp-x", dims=[("inline", 256), ("crossline", 512)], coords=["cdp-x"], - dtype=ScalarType.FLOAT32 + dtype=ScalarType.FLOAT32, ) assert cdp_x.metadata.units_v1.length == LengthUnitEnum.METER @@ -100,17 +94,17 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 name="cdp-y", dims=[("inline", 256), ("crossline", 512)], coords=["cdp-y"], - dtype=ScalarType.FLOAT32 + dtype=ScalarType.FLOAT32, ) assert cdp_y.metadata.units_v1.length == LengthUnitEnum.METER # Verify data variables image = validate_variable( - dataset, + dataset, name="image", dims=[("inline", 256), ("crossline", 512), ("depth", 384)], coords=["cdp-x", "cdp-y"], - dtype=ScalarType.FLOAT32 + dtype=ScalarType.FLOAT32, ) assert image.metadata.units_v1 is None # No units defined for image assert image.compressor.algorithm == "zstd" @@ -122,7 +116,7 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 name="velocity", dims=[("inline", 256), ("crossline", 512), ("depth", 384)], coords=["cdp-x", "cdp-y"], - dtype=ScalarType.FLOAT16 + dtype=ScalarType.FLOAT16, ) assert velocity.compressor is None assert velocity.metadata.chunk_grid.configuration.chunk_shape == [128, 128, 128] @@ -133,7 +127,7 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 name="image_inline", dims=[("inline", 256), ("crossline", 512), ("depth", 384)], coords=["cdp-x", "cdp-y"], - dtype=ScalarType.FLOAT32 + dtype=ScalarType.FLOAT32, ) assert image_inline.long_name == "inline optimized version of 3d_stack" assert image_inline.compressor.algorithm == "zstd" @@ -154,10 +148,8 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50 fields=[ StructuredField(name="cdp-x", format=ScalarType.FLOAT32), StructuredField(name="cdp-y", format=ScalarType.FLOAT32), - StructuredField(name="inline", format=ScalarType.UINT32), + StructuredField(name="inline", format=ScalarType.UINT32), StructuredField(name="crossline", format=ScalarType.UINT32), ] - ) + ), ) - - diff --git a/tests/unit/v1/test_dataset_builder_helpers.py b/tests/unit/v1/test_dataset_builder_helpers.py index aecd5ff6..b1ab9964 100644 --- a/tests/unit/v1/test_dataset_builder_helpers.py +++ b/tests/unit/v1/test_dataset_builder_helpers.py @@ -18,14 +18,11 @@ def test__get_named_dimension() -> None: """Test getting a dimension by name from the list of dimensions.""" - dimensions = [NamedDimension(name="inline", size=2), NamedDimension( - name="crossline", size=3)] + 
dimensions = [NamedDimension(name="inline", size=2), NamedDimension(name="crossline", size=3)] assert _get_named_dimension([], "inline") is None - assert _get_named_dimension(dimensions, "inline") == NamedDimension( - name="inline", size=2) - assert _get_named_dimension(dimensions, "crossline") == NamedDimension( - name="crossline", size=3) + assert _get_named_dimension(dimensions, "inline") == NamedDimension(name="inline", size=2) + assert _get_named_dimension(dimensions, "crossline") == NamedDimension(name="crossline", size=3) assert _get_named_dimension(dimensions, "time") is None with pytest.raises(TypeError, match="Expected str, got NoneType"): @@ -51,46 +48,51 @@ def test__to_dictionary() -> None: # Validate conversion of a Pydantic BaseModel class SomeModel(StrictModel): count: int = Field(default=None, description="Samples count") - samples: list[float] = Field( - default_factory=list, description="Samples.") + samples: list[float] = Field(default_factory=list, description="Samples.") created: datetime = Field( default_factory=datetime.now, description="Creation time with TZ info." ) - md = SomeModel(count=3, - samples=[1.0, 2.0, 3.0], - created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)) + + md = SomeModel( + count=3, samples=[1.0, 2.0, 3.0], created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC) + ) result = _to_dictionary(md) assert isinstance(result, dict) - assert result == { - "count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]} + assert result == {"count": 3, "created": "2023-10-01T12:00:00Z", "samples": [1.0, 2.0, 3.0]} # Validate conversion of a dictionary dct = { "count": 3, "samples": [1.0, 2.0, 3.0], - "created": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)} + "created": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC), + } result = _to_dictionary(dct) assert isinstance(result, dict) - assert result == {"count": 3, - "samples": [1.0, 2.0, 3.0], - "created": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC), } + assert result == { + "count": 3, + "samples": [1.0, 2.0, 3.0], + "created": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC), + } # Validate conversion of a dictionary - lst = [None, - SomeModel(count=3, - samples=[1.0, 2.0, 3.0], - created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC)), - { - "count2": 3, - "samples2": [1.0, 2.0, 3.0], - "created2": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC) - }] + lst = [ + None, + SomeModel( + count=3, samples=[1.0, 2.0, 3.0], created=datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC) + ), + { + "count2": 3, + "samples2": [1.0, 2.0, 3.0], + "created2": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC), + }, + ] result = _to_dictionary(lst) assert isinstance(result, dict) - assert result == {"count": 3, - "samples": [1.0, 2.0, 3.0], - "created": "2023-10-01T12:00:00Z", - "count2": 3, - "samples2": [1.0, 2.0, 3.0], - "created2": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC) - } + assert result == { + "count": 3, + "samples": [1.0, 2.0, 3.0], + "created": "2023-10-01T12:00:00Z", + "count2": 3, + "samples2": [1.0, 2.0, 3.0], + "created2": datetime(2023, 10, 1, 12, 0, 0, tzinfo=UTC), + } From 1904deefe7dd58f3d4cb24cbbc682d93f0064926 Mon Sep 17 00:00:00 2001 From: Altay Sansal Date: Thu, 10 Jul 2025 10:45:04 -0500 Subject: [PATCH 20/27] remove some noqa overrides --- src/mdio/schemas/v1/dataset_builder.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_builder.py b/src/mdio/schemas/v1/dataset_builder.py index 85bae21f..facd0dc8 100644 --- a/src/mdio/schemas/v1/dataset_builder.py +++ 
b/src/mdio/schemas/v1/dataset_builder.py @@ -9,7 +9,6 @@ from typing import TypeAlias from pydantic import BaseModel -from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding # noqa: F401 from mdio.schemas.compressors import ZFP from mdio.schemas.compressors import Blosc @@ -107,9 +106,7 @@ def __init__(self, name: str, attributes: UserAttributes | None = None): self._state = _BuilderState.INITIAL self._unnamed_variable_counter = 0 - def add_dimension( # noqa: PLR0913 - self, name: str, size: int - ) -> "MDIODatasetBuilder": + def add_dimension(self, name: str, size: int) -> "MDIODatasetBuilder": """Add a dimension. This function be called at least once before adding coordinates or variables. From 4c7c8337f16d3d1ee5602f54df66a3fcc0dc1581 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Thu, 10 Jul 2025 23:34:39 +0000 Subject: [PATCH 21/27] Writing XArray / Zarr --- _dev/DEVELOPERS_NOTES.md | 10 + _dev/zmetadata.cpp.json | 294 ++++++++++++++++++ _dev/zmetadata.python.json | 282 +++++++++++++++++ src/mdio/schemas/v1/dataset_builder.py | 2 +- src/mdio/schemas/v1/dataset_serializer.py | 224 +++++++++++++ tests/unit/v1/helpers.py | 122 ++++++++ .../v1/test_dataset_builder_add_coordinate.py | 2 +- .../v1/test_dataset_builder_add_variable.py | 2 +- tests/unit/v1/test_dataset_builder_build.py | 2 +- tests/unit/v1/test_dataset_serializer.py | 60 ++++ 10 files changed, 996 insertions(+), 4 deletions(-) create mode 100644 _dev/DEVELOPERS_NOTES.md create mode 100644 _dev/zmetadata.cpp.json create mode 100644 _dev/zmetadata.python.json create mode 100644 src/mdio/schemas/v1/dataset_serializer.py create mode 100644 tests/unit/v1/test_dataset_serializer.py diff --git a/_dev/DEVELOPERS_NOTES.md b/_dev/DEVELOPERS_NOTES.md new file mode 100644 index 00000000..2e4718a4 --- /dev/null +++ b/_dev/DEVELOPERS_NOTES.md @@ -0,0 +1,10 @@ +# Wring empty XArray / Zarr to a local storage + +src/mdio/schemas/v1/dataset_serializer.py + +## Issues encountered + +1. Non-zero size of the serialized data files +2. Not clear how to properly set `compressor`, `dimension_separator`, and `fill_value` + * Should `fill_value` be a part f the model? +3. For image_inline chunks[2] are somehow different? \ No newline at end of file diff --git a/_dev/zmetadata.cpp.json b/_dev/zmetadata.cpp.json new file mode 100644 index 00000000..40a12d6d --- /dev/null +++ b/_dev/zmetadata.cpp.json @@ -0,0 +1,294 @@ +{ + "metadata": { + ".zattrs": { + "apiVersion": "1.0.0", + "attributes": { + "foo": "bar", + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... " + ] + }, + "createdOn": "2023-12-12T15:02:06.413469-06:00", + "name": "campos_3d" + }, + ".zgroup": { + "zarr_format": 2 + }, + "cdp-x/.zarray": { + "chunks": [ + 256, + 512 + ], + "compressor": null, + "dimension_separator": "/", + "dtype": " dict[str, NamedDimension]: + all_named_dims: dict[str, NamedDimension] = {} + for v in dataset.variables: + if v.dimensions is not None: + for d in v.dimensions: + if isinstance(d, NamedDimension): + all_named_dims[d.name] = d + else: + # Skip, if this is a named reference + # This should not be ever a case for the dataset generated with the dataset builder + warnings.warn( + f"Unsupported dimension type: {type(d)} in variable {v.name}. " + "Expected NamedDimension." 
+                    )
+                    pass
+    return all_named_dims
+
+def _get_all_coordinates(dataset: Dataset) -> dict[str, Coordinate]:
+    all_coords: dict[str, Coordinate] = {}
+    for v in dataset.variables:
+        if v.coordinates is not None:
+            for c in v.coordinates:
+                if isinstance(c, Coordinate) and c.name not in all_coords:
+                    all_coords[c.name] = c
+    return all_coords
+
+def _get_dimension_names(var: Variable) -> list[str]:
+    dim_names: list[str] = []
+    if var.dimensions is not None:
+        for d in var.dimensions:
+            if isinstance(d, NamedDimension):
+                dim_names.append(d.name)
+            elif isinstance(d, str):
+                # This should not be ever a case for the dataset generated with the dataset builder
+                warnings.warn(f"Unsupported dimension type: 'str' in variable {var.name}. "
+                    "Expected NamedDimension."
+                )
+                dim_names.append(d)
+            else:
+                err = f"Unsupported dimension type: {type(d)} in variable {var.name}"
+                raise TypeError(err)
+    return dim_names
+
+def _get_coord_names(var: Variable) -> list[str]:
+    coord_names: list[str] = []
+    if var.coordinates is not None:
+        for c in var.coordinates:
+            if isinstance(c, Coordinate):
+                coord_names.append(c.name)
+            elif isinstance(c, str):
+                coord_names.append(c)
+            else:
+                err = f"Unsupported coordinate type: {type(c)} in variable {var.name}"
+                raise TypeError(err)
+    return coord_names
+
+def _get_np_datatype(var: Variable) -> np.dtype:
+    data_type = var.data_type
+    if isinstance(data_type, ScalarType):
+        return np.dtype(data_type.value)
+    elif isinstance(data_type, StructuredType):
+        return np.dtype([(f.name, f.format.value) for f in data_type.fields])
+    else:
+        raise TypeError(f"Unsupported data_type: {data_type}")
+
+def _get_zarr_shape(var: Variable) -> tuple[int, ...]:
+    # NOTE: This assumes that the variable dimensions are all NamedDimension
+    return tuple(dim.size for dim in var.dimensions)
+
+def _get_zarr_chunks(var: Variable) -> tuple[int, ...]:
+    """Get the chunk shape for a variable, defaulting to its shape if no chunk grid is defined."""
+    if var.metadata is not None and var.metadata.chunk_grid is not None:
+        return var.metadata.chunk_grid.configuration.chunk_shape
+    else:
+        # Default to full shape if no chunk grid is defined
+        return _get_zarr_shape(var)
+
+def to_xarray_dataset(ds: Dataset) -> xr.Dataset:  # noqa: PLR0912
+    """Build an xarray dataset with correct dimensions and dtypes.
+
+    This internal function constructs the underlying data structure for an MDIO dataset,
+    handling dimension mapping, data types, and metadata organization.
+
+    Args:
+        ds: The source MDIO dataset to construct from.
+
+    Returns:
+        The constructed dataset with proper MDIO structure and metadata.
+
+    Raises:
+        TypeError: If an unsupported data type is encountered.
+    """
+
+    # See the xarray tutorial for more details on how to create datasets:
+    # https://tutorial.xarray.dev/fundamentals/01.1_creating_data_structures.html
+
+    # all_dims = _get_all_named_dimensions(ds)
+    # all_coords = _get_all_coordinates(ds)
+
+    # Build all variables
+    data_arrays: dict[str, xr.DataArray] = {}
+    for v in ds.variables:
+
+        # Use dask array instead of numpy array for lazy evaluation
+        shape = _get_zarr_shape(v)
+        dtype = _get_np_datatype(v)
+        chunks = _get_zarr_chunks(v)
+        arr = dask.array.zeros(shape, dtype=dtype, chunks=chunks)
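+        # NOTE (illustrative sketch, assuming a local path "empty.zarr"): since
+        # ``arr`` is dask-backed, no chunk data exists in memory yet, so
+        #     xr.DataArray(arr, dims=_get_dimension_names(v)).to_dataset(
+        #         name=v.name).to_zarr("empty.zarr", mode="w", compute=False)
+        # would store only the Zarr metadata and return a dask.Delayed for the
+        # deferred chunk writes.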
+
+        # Create a DataArray for the variable. We will set coords in the second pass
+        dim_names = _get_dimension_names(v)
+        data_array = xr.DataArray(arr, dims=dim_names)
+
+        # https://docs.xarray.dev/en/stable/internals/zarr-encoding-spec.html#zarr-encoding
+        # If you don't explicitly specify a compressor when creating a Zarr array,
+        # Zarr will use a default compressor based on the Zarr format version and the data type of your array.
+        # Zarr V2 (Default is Blosc)
+        # data_array.encoding.compressor = None
+        # TODO: begin the part that does not work
+        data_array.encoding["fill_value"] = 0.0
+        data_array.encoding["dimension_separator"] = "/"  # Does not work
+        if v.compressor is not None:
+            compressor = _to_dictionary(v.compressor)
+            data_array.encoding["compressor"] = compressor
+        else:
+            data_array.encoding["compressor"] = None
+        # TODO: end the part that does not work
+
+        # Add array attributes
+        if v.metadata is not None:
+            meta_dict = _to_dictionary(v.metadata)
+            # Exclude chunk_grid
+            del meta_dict["chunkGrid"]
+            # Remove empty attributes
+            meta_dict = {k: v for k, v in meta_dict.items() if v is not None}
+            # Add metadata to the data array attributes
+            data_array.attrs.update(meta_dict)
+            pass
+        if v.long_name:
+            data_array.attrs["long_name"] = v.long_name
+
+        # Let's store the data array for the second pass
+        data_arrays[v.name] = data_array
+
+    # Add non-dimension coordinates to the data arrays
+    for v in ds.variables:
+        da = data_arrays[v.name]
+        non_dim_coords_names = set(_get_coord_names(v)) - set(_get_dimension_names(v)) - {v.name}
+        # Create and populate a dictionary {coord_name: DataArray for the coordinate}
+        non_dim_coords_dict: dict[str, xr.DataArray] = {}
+        for name in non_dim_coords_names:
+            non_dim_coords_dict[name] = data_arrays[name]
+        if non_dim_coords_dict:
+            # NOTE: here is a gotcha: assign_coords() does not update in-place,
+            # but returns an updated instance!
+ data_arrays[v.name] = da.assign_coords(non_dim_coords_dict) + pass + + # Now let's create a dataset with all data arrays + xr_ds = xr.Dataset(data_arrays) + # Attach dataset metadata + if ds.metadata is not None: + xr_ds.attrs["apiVersion"] = ds.metadata.api_version + xr_ds.attrs["createdOn"] = str(ds.metadata.created_on) + xr_ds.attrs["name"] = ds.metadata.name + if ds.metadata.attributes: + xr_ds.attrs["attributes"] = ds.metadata.attributes + + return xr_ds + + +def to_zarr(dataset: xr.Dataset, + store: str | None = None, + *args: str | int | float | bool, + **kwargs: Mapping[str, str | int | float | bool], +) -> None: + """Alias for `.to_zarr()`.""" + # Ensure zarr_format=2 by default unless explicitly overridden + zarr_format = kwargs.get("zarr_format", 2) + if zarr_format != 2: # noqa: PLR2004 + msg = "MDIO only supports zarr_format=2" + raise ValueError(msg) + + # ds.to_zarr("foo.zarr", consolidated=False, encoding={"foo": {"compressors": [compressor]}}) + # Define compressor + # compressor = zarr.Blosc(cname="zstd", clevel=5, shuffle=2) + # # Define encoding + # encoding = { + # "foo": {"compressor": compressor}, + # "bar": {"compressor": compressor}, + # } + # kwargs["encoding"] = encoding + encoding = {} + for key in dataset.data_vars.keys(): + c = dataset[key].encoding.get("compressors", None) + encoding[key] = {"compressors": c} + kwargs["encoding"] = encoding + + kwargs["zarr_format"] = zarr_format + + + + + return dataset.to_zarr(*args, store=store, **kwargs) + +# https://docs.xarray.dev/en/stable/user-guide/io.html +# ds.to_zarr("path/to/directory.zarr", zarr_format=2, consolidated=False) \ No newline at end of file diff --git a/tests/unit/v1/helpers.py b/tests/unit/v1/helpers.py index 2058bdd5..8a01a966 100644 --- a/tests/unit/v1/helpers.py +++ b/tests/unit/v1/helpers.py @@ -138,6 +138,128 @@ def _get_all_coordinates(dataset: Dataset) -> list[Coordinate]: return list(all_coords.values()) +def make_campos_3d_acceptance_dataset() -> Dataset: + """Create in-memory campos_3d dataset.""" + ds = MDIODatasetBuilder( + "campos_3d", + attributes=UserAttributes( + attributes={ + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... 
", + ], + "foo": "bar", + } + ), + ) + + # Add dimensions + ds.add_dimension("inline", 256) + ds.add_dimension("crossline", 512) + ds.add_dimension("depth", 384) # data_type=ScalarType.UINT32, LengthUnitEnum.METER + ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) + ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) + ds.add_coordinate( + "depth", + dimensions=["depth"], + data_type=ScalarType.UINT32, + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], + ) + # Add coordinates + ds.add_coordinate( + "cdp-x", + dimensions=["inline", "crossline"], + data_type=ScalarType.FLOAT32, + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], + ) + ds.add_coordinate( + "cdp-y", + dimensions=["inline", "crossline"], + data_type=ScalarType.FLOAT32, + metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], + ) + + # Add image variable + ds.add_variable( + name="image", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[128, 128, 128]) + ) + ), + StatisticsMetadata( + stats_v1=SummaryStatistics( + count=100, + sum=1215.1, + sumSquares=125.12, + min=5.61, + max=10.84, + histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), + ) + ), + UserAttributes(attributes={"fizz": "buzz"}), + ], + ) + # Add velocity variable + ds.add_variable( + name="velocity", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT16, + coordinates=["cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[128, 128, 128]) + ) + ), + AllUnits(units_v1=SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND)), + ], + ) + # Add inline-optimized image variable + ds.add_variable( + name="image_inline", + long_name="inline optimized version of 3d_stack", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["cdp-x", "cdp-y"], + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[4, 512, 512]) + ) + ) + ], + ) + # Add headers variable with structured dtype + ds.add_variable( + name="image_headers", + dimensions=["inline", "crossline"], + data_type=StructuredType( + fields=[ + StructuredField(name="cdp-x", format=ScalarType.INT32), + StructuredField(name="cdp-y", format=ScalarType.INT32), + StructuredField(name="elevation", format=ScalarType.FLOAT16), + StructuredField(name="some_scalar", format=ScalarType.FLOAT16), + ] + ), + metadata_info=[ + ChunkGridMetadata( + chunk_grid=RegularChunkGrid( + configuration=RegularChunkShape(chunk_shape=[128, 128]) + ) + ) + ], + coordinates=["cdp-x", "cdp-y"], + ) + return ds.build() + def make_campos_3d_dataset() -> Dataset: """Create in-memory campos_3d dataset.""" ds = MDIODatasetBuilder( diff --git a/tests/unit/v1/test_dataset_builder_add_coordinate.py b/tests/unit/v1/test_dataset_builder_add_coordinate.py index 46cb27cd..20f68602 100644 --- a/tests/unit/v1/test_dataset_builder_add_coordinate.py +++ b/tests/unit/v1/test_dataset_builder_add_coordinate.py @@ -93,7 +93,7 @@ def test_add_coordinate_with_defaults() -> None: coords=["cdp"], dtype=ScalarType.FLOAT32, ) - assert v.long_name == "'cdp' 
coordinate variable" # Default value + assert v.long_name is None # Default value assert v.compressor is None # Default value assert v.metadata is None # Default value diff --git a/tests/unit/v1/test_dataset_builder_add_variable.py b/tests/unit/v1/test_dataset_builder_add_variable.py index 84abfe3a..6f46db50 100644 --- a/tests/unit/v1/test_dataset_builder_add_variable.py +++ b/tests/unit/v1/test_dataset_builder_add_variable.py @@ -2,7 +2,7 @@ # PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable # The above erroneous warning is generated for every numerical assert. # Thus, disable it for this file -"""Tests the schema v1 Variable public API.""" +"""Tests the schema v1 dataset_builder.add_variable() public API.""" import pytest diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index aa5fad55..832f624c 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -2,7 +2,7 @@ # PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable # The above erroneous warning is generated for every numerical assert. # Thus, disable it for this file -"""Tests the schema v1 dataset_builder.add_coordinate() public API.""" +"""Tests the schema v1 dataset_builder.build() public API.""" from mdio.schemas.dtype import ScalarType from mdio.schemas.dtype import StructuredField diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py new file mode 100644 index 00000000..ddd43563 --- /dev/null +++ b/tests/unit/v1/test_dataset_serializer.py @@ -0,0 +1,60 @@ +# ruff: noqa: PLR2004 +# PLR2004 Magic value used in comparison, consider replacing `3` with a constant variable +# The above erroneous warning is generated for every numerical assert. 
+# Thus, disable it for this file +"""Tests the schema v1 dataset_serializer public API.""" + +import dask +import xarray as xr +import numpy as np + +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredField +from mdio.schemas.dtype import StructuredType +from mdio.schemas.v1.dataset import Dataset +from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_serializer import to_xarray_dataset, to_zarr +from mdio.schemas.v1.units import LengthUnitEnum +from mdio.schemas.v1.units import SpeedUnitEnum + +from .helpers import make_campos_3d_acceptance_dataset, make_campos_3d_dataset +from .helpers import validate_variable + + +# def test_to_xarray_dataset(capsys) -> None: +# """Test building a complete dataset.""" +# dataset = ( +# MDIODatasetBuilder("test_dataset") +# .add_dimension("inline", 100) +# .add_dimension("crossline", 200) +# .add_dimension("depth", 300) +# .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) +# .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) +# .add_coordinate("x_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) +# .add_coordinate("y_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) +# .add_variable( +# "data", +# long_name="Test Data", +# dimensions=["inline", "crossline", "depth"], +# coordinates=["inline", "crossline", "x_coord", "y_coord"], +# data_type=ScalarType.FLOAT32, +# ) +# .build() +# ) + +# # with capsys.disabled(): +# xds : xr.Dataset = to_xarray_dataset(dataset) + +# to_zarr(xds, f"test-dataset-{xds.attrs["name"]}.zarr", mode="w") + + +def test_campos_3d_acceptance_to_xarray_dataset(capsys) -> None: + """Test building a complete dataset.""" + dataset = make_campos_3d_acceptance_dataset() + + + xds : xr.Dataset = to_xarray_dataset(dataset) + + # file_name = "XYZ" + file_name = f"{xds.attrs['name']}" + to_zarr(xds, f"test-data/{file_name}.zarr", mode="w") \ No newline at end of file From 4b39ffa844367df72e6f9ef2facb4dcd5ea701f0 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Thu, 10 Jul 2025 23:37:17 +0000 Subject: [PATCH 22/27] gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index bfdc38f2..643394a4 100644 --- a/.gitignore +++ b/.gitignore @@ -153,3 +153,5 @@ mdio1/* pytest-of-* tmp debugging/* + +test-data/* From cea73086ec1754dfc2a87fc0b768f42767e306b8 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Fri, 11 Jul 2025 20:13:09 +0000 Subject: [PATCH 23/27] to_zarr() fix compression --- _dev/DEVELOPERS_NOTES.md | 13 +- _dev/zmetadata.cpp.json | 483 +++++++++------------- _dev/zmetadata.python.json | 473 +++++++++------------ src/mdio/schemas/v1/dataset_serializer.py | 248 +++++++---- tests/unit/v1/helpers.py | 7 +- tests/unit/v1/test_dataset_serializer.py | 75 ++-- 6 files changed, 592 insertions(+), 707 deletions(-) diff --git a/_dev/DEVELOPERS_NOTES.md b/_dev/DEVELOPERS_NOTES.md index 2e4718a4..8c575025 100644 --- a/_dev/DEVELOPERS_NOTES.md +++ b/_dev/DEVELOPERS_NOTES.md @@ -4,7 +4,12 @@ src/mdio/schemas/v1/dataset_serializer.py ## Issues encountered -1. Non-zero size of the serialized data files -2. Not clear how to properly set `compressor`, `dimension_separator`, and `fill_value` - * Should `fill_value` be a part f the model? -3. For image_inline chunks[2] are somehow different? \ No newline at end of file +1. FIXED: Non-zero size of the serialized data files +2. 
FIXED: Not clear how to properly set `compressor`, `dimension_separator`, and `fill_value` +3. FIXED: For image_inline chunks[2] are somehow different? + +4. `fill_value` for StructuredType is set to null, but "AAAAAAAAAAAAAAAA" is expected + +## TO DO: +* Add more unit tests for internal functions +* Add a trest comparing expected and actual .zmetadata for the serialized dataset diff --git a/_dev/zmetadata.cpp.json b/_dev/zmetadata.cpp.json index 40a12d6d..394de1a8 100644 --- a/_dev/zmetadata.cpp.json +++ b/_dev/zmetadata.cpp.json @@ -1,294 +1,195 @@ { - "metadata": { - ".zattrs": { - "apiVersion": "1.0.0", - "attributes": { - "foo": "bar", - "textHeader": [ - "C01 .......................... ", - "C02 .......................... ", - "C03 .......................... " - ] - }, - "createdOn": "2023-12-12T15:02:06.413469-06:00", - "name": "campos_3d" - }, - ".zgroup": { - "zarr_format": 2 - }, - "cdp-x/.zarray": { - "chunks": [ - 256, - 512 - ], - "compressor": null, - "dimension_separator": "/", - "dtype": " dict[str, NamedDimension]: f"Unsupported dimension type: {type(d)} in variable {v.name}. " "Expected NamedDimension." ) - pass return all_named_dims + def _get_all_coordinates(dataset: Dataset) -> dict[str, Coordinate]: all_coords: dict[str, Coordinate] = {} for v in dataset.variables: @@ -41,6 +53,7 @@ def _get_all_coordinates(dataset: Dataset) -> dict[str, Coordinate]: all_coords[c.name] = c return all_coords + def _get_dimension_names(var: Variable) -> list[str]: dim_names: list[str] = [] if var.dimensions is not None: @@ -49,15 +62,17 @@ def _get_dimension_names(var: Variable) -> list[str]: dim_names.append(d.name) elif isinstance(d, str): # This should not be ever a case for the dataset generated with the dataset builder - warnings.warn(f"Unsupported dimension type: 'str' in variable {var.name}. " - "Expected NamedDimension." - ) + warnings.warn( + f"Unsupported dimension type: 'str' in variable {var.name}. " + "Expected NamedDimension." 
+ ) dim_names.append(d) else: err = f"Unsupported dimension type: {type(d)} in variable {var.name}" raise TypeError(err) return dim_names - + + def _get_coord_names(var: Variable) -> list[str]: coord_names: list[str] = [] if var.coordinates is not None: @@ -71,31 +86,99 @@ def _get_coord_names(var: Variable) -> list[str]: raise TypeError(err) return coord_names + def _get_np_datatype(var: Variable) -> np.dtype: data_type = var.data_type if isinstance(data_type, ScalarType): return np.dtype(data_type.value) - elif isinstance(data_type, StructuredType): + if isinstance(data_type, StructuredType): return np.dtype([(f.name, f.format.value) for f in data_type.fields]) - else: - raise TypeError(f"Unsupported data_type: {data_type}") + raise TypeError(f"Unsupported data_type: {data_type}") + def _get_zarr_shape(var: Variable) -> tuple[int, ...]: # NOTE: This assumes that the variable dimensions are all NamedDimension return tuple(dim.size for dim in var.dimensions) + def _get_zarr_chunks(var: Variable) -> tuple[int, ...]: """Get the chunk shape for a variable, defaulting to its shape if no chunk grid is defined.""" if var.metadata is not None and var.metadata.chunk_grid is not None: return var.metadata.chunk_grid.configuration.chunk_shape - else: - # Default to full shape if no chunk grid is defined - return _get_zarr_shape(var) + # Default to full shape if no chunk grid is defined + return _get_zarr_shape(var) + + +def _convert_compressor( + compressor: mdio_Blosc | mdio_ZFP | None, +) -> numcodecs.Blosc | zfpy_ZFPY | None: + """Convert a compressor to a numcodecs compatible format.""" + if compressor is None: + return None + + if isinstance(compressor, mdio_Blosc): + return numcodecs.Blosc( + cname=compressor.algorithm.value, + clevel=compressor.level, + shuffle=compressor.shuffle.value, + blocksize=compressor.blocksize if compressor.blocksize > 0 else 0, + ) + + if isinstance(compressor, mdio_ZFP): + if zfpy_ZFPY is None: + msg = "zfpy and numcodecs are required to use ZFP compression" + raise ImportError(msg) + return zfpy_ZFPY( + mode=compressor.mode.value, + tolerance=compressor.tolerance, + rate=compressor.rate, + precision=compressor.precision, + ) + + msg = f"Unsupported compressor model: {type(compressor)}" + raise TypeError(msg) + + +# Do we already have it somewhere in the codebase? I could not find it. +fill_value_map = { + ScalarType.BOOL: None, + ScalarType.FLOAT16: np.nan, + ScalarType.FLOAT32: np.nan, + ScalarType.FLOAT64: np.nan, + ScalarType.UINT8: 2**8 - 1, # Max value for uint8 + ScalarType.UINT16: 2**16 - 1, # Max value for uint16 + ScalarType.UINT32: 2**32 - 1, # Max value for uint32 + ScalarType.UINT64: 2**64 - 1, # Max value for uint64 + ScalarType.INT8: 2**7 - 1, # Max value for int8 + ScalarType.INT16: 2**15 - 1, # Max value for int16 + ScalarType.INT32: 2**31 - 1, # Max value for int32 + ScalarType.INT64: 2**63 - 1, # Max value for int64 + ScalarType.COMPLEX64: complex(np.nan, np.nan), + ScalarType.COMPLEX128: complex(np.nan, np.nan), + ScalarType.COMPLEX256: complex(np.nan, np.nan), +} + + +def get_fill_value(data_type: ScalarType | StructuredType | str) -> any: + """Get the fill value for a given data type. 
+ The Zarr fill_value is a scalar value providing the default value to use for + uninitialized portions of the array, or null if no fill_value is to be used + https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html + """ + if isinstance(data_type, ScalarType): + return fill_value_map.get(data_type) + if isinstance(data_type, StructuredType): + return "AAAAAAAAAAAAAAAA" # BUG: this does not work!!! + if isinstance(data_type, str): + return "" + # If we do not have a fill value for this type, use None + return None + -def to_xarray_dataset(ds: Dataset) -> xr.DataArray: # noqa: PLR0912 - """Build an MDIO dataset with correct dimensions and dtypes. +def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 + """Build an XArray dataset with correct dimensions and dtypes. - This internal function constructs the underlying data structure for an MDIO dataset, + This function constructs the underlying data structure for an XArray dataset, handling dimension mapping, data types, and metadata organization. Args: @@ -107,17 +190,15 @@ def to_xarray_dataset(ds: Dataset) -> xr.DataArray: # noqa: PLR0912 Raises: TypeError: If an unsupported data type is encountered. """ - # See the xarray tutorial for more details on how to create datasets: - # https://tutorial.xarray.dev/fundamentals/01.1_creating_data_structures.html + # https://tutorial.xarray.dev/fundamentals/01.1_creating_data_structures.html # all_dims = _get_all_named_dimensions(ds) # all_coords = _get_all_coordinates(ds) - # Build all variables + # First pass: Build all variables data_arrays: dict[str, xr.DataArray] = {} - for v in ds.variables: - + for v in mdio_ds.variables: # Use dask array instead of numpy array for lazy evaluation shape = _get_zarr_shape(v) dtype = _get_np_datatype(v) @@ -126,99 +207,92 @@ def to_xarray_dataset(ds: Dataset) -> xr.DataArray: # noqa: PLR0912 # Create a DataArray for the variable. We will set coords in the second pass dim_names = _get_dimension_names(v) - data_array = xr.DataArray(arr, dims=dim_names) - - # https://docs.xarray.dev/en/stable/internals/zarr-encoding-spec.html#zarr-encoding - # If you don't explicitly specify a compressor when creating a Zarr array, Z - # arr will use a default compressor based on the Zarr format version and the data type of your array. - # Zarr V2 (Default is Blosc) - # data_array.encoding.compressor = None - #TODO: beging the par that does not work - data_array.encoding["fill_value"] = 0.0 - data_array.encoding["dimension_separator"] = "/" # Does not work - if v.compressor is not None: - compressor = _to_dictionary(v.compressor) - data_array.encoding["compressor"] = compressor - else: - data_array.encoding["compressor"] = None - #TODO: end the part that does not work + data_array = xr.DataArray(arr, dims=dim_names) # Add array attributes if v.metadata is not None: meta_dict = _to_dictionary(v.metadata) # Exclude chunk_grid - del meta_dict["chunkGrid"] + del meta_dict["chunkGrid"] # Remove empty attributes meta_dict = {k: v for k, v in meta_dict.items() if v is not None} # Add metadata to the data array attributes data_array.attrs.update(meta_dict) - pass if v.long_name: data_array.attrs["long_name"] = v.long_name + # Compression: + # https://docs.xarray.dev/en/stable/internals/zarr-encoding-spec.html#zarr-encoding + # If you don't explicitly specify a compressor when creating a Zarr array, Z + # arr will use a default compressor based on the Zarr format version and the data type of your array. + # Zarr V2 (Default is Blosc). 
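# A hedged illustration of where the expected "AAAAAAAAAAAAAAAA" token for
# structured dtypes likely comes from: Zarr v2 base64-encodes the raw bytes of
# one zero-filled structured element as the fill_value. The 12-byte header
# dtype below is illustrative; 12 zero bytes encode to exactly sixteen 'A's.
import base64
import numpy as np

header_dtype = np.dtype([("cdp-x", "int32"), ("cdp-y", "int32"), ("elevation", "float32")])
zero_element = np.zeros((), dtype=header_dtype)
print(base64.b64encode(zero_element.tobytes()).decode())  # -> "AAAAAAAAAAAAAAAA"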
+ # Thus, if there is no compressor, we will explicitly set "compressor" to None. + # + # Create a custom chunk key encoding with "/" as separator + from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding + chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict() + encoding = { + "fill_value": get_fill_value(v.data_type), + "chunks": chunks, + "chunk_key_encoding": chunk_key_encoding, + # I was hoping the following would work, but it does not. + # I see: + # >_compressor = parse_compressor(compressor[0]) + # > return numcodecs.get_codec(data) + # E - numcodecs.errors.UnknownCodecError: codec not available: 'None'" + # Example: https://zarr.readthedocs.io/en/stable/user-guide/arrays.html#compressors + # from zarr.codecs import BloscCodec + # compressor = BloscCodec(cname="zstd", clevel=3, shuffle="shuffle") + # + # "compressor": _to_dictionary(v.compressor) + # Thus, we will call the conversion function: + "compressor": _convert_compressor(v.compressor), + } + data_array.encoding = encoding + # Let's store the data array for the second pass data_arrays[v.name] = data_array - # Add non-dimension coordinates to the data arrays - for v in ds.variables: + # Second pass: Add non-dimension coordinates to the data arrays + for v in mdio_ds.variables: da = data_arrays[v.name] non_dim_coords_names = set(_get_coord_names(v)) - set(_get_dimension_names(v)) - {v.name} - # Create a populate a dictionary {coord_name: DataArray for the coordinate} - non_dim_coords_dict : dict[str, xr.DataArray] = {} + # Create and populate a dictionary {coord_name: DataArray for the coordinate} + non_dim_coords_dict: dict[str, xr.DataArray] = {} for name in non_dim_coords_names: non_dim_coords_dict[name] = data_arrays[name] if non_dim_coords_dict: - # NOTE: here is a gotcha: assign_coords() does not update in-place, + # NOTE: here is a gotcha: assign_coords() does not update in-place, # but returns an updated instance! 
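# A self-contained demonstration of the gotcha noted above, assuming only numpy
# and xarray: assign_coords() returns a new DataArray and leaves the original
# untouched, which is why the result must be stored back into data_arrays.
import numpy as np
import xarray as xr

demo = xr.DataArray(np.zeros((2, 3)), dims=["inline", "crossline"])
updated = demo.assign_coords(inline=[10, 20])
assert "inline" not in demo.coords  # the original is unchanged
assert "inline" in updated.coords  # the returned copy carries the coordinate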
data_arrays[v.name] = da.assign_coords(non_dim_coords_dict) - pass # Now let's create a dataset with all data arrays xr_ds = xr.Dataset(data_arrays) # Attach dataset metadata - if ds.metadata is not None: - xr_ds.attrs["apiVersion"] = ds.metadata.api_version - xr_ds.attrs["createdOn"] = str(ds.metadata.created_on) - xr_ds.attrs["name"] = ds.metadata.name - if ds.metadata.attributes: - xr_ds.attrs["attributes"] = ds.metadata.attributes + if mdio_ds.metadata is not None: + xr_ds.attrs["apiVersion"] = mdio_ds.metadata.api_version + xr_ds.attrs["createdOn"] = str(mdio_ds.metadata.created_on) + xr_ds.attrs["name"] = mdio_ds.metadata.name + if mdio_ds.metadata.attributes: + xr_ds.attrs["attributes"] = mdio_ds.metadata.attributes return xr_ds -def to_zarr(dataset: xr.Dataset, +def to_zarr( + dataset: xr.Dataset, store: str | None = None, *args: str | int | float | bool, **kwargs: Mapping[str, str | int | float | bool], ) -> None: - """Alias for `.to_zarr()`.""" - # Ensure zarr_format=2 by default unless explicitly overridden - zarr_format = kwargs.get("zarr_format", 2) - if zarr_format != 2: # noqa: PLR2004 - msg = "MDIO only supports zarr_format=2" - raise ValueError(msg) - - # ds.to_zarr("foo.zarr", consolidated=False, encoding={"foo": {"compressors": [compressor]}}) - # Define compressor - # compressor = zarr.Blosc(cname="zstd", clevel=5, shuffle=2) - # # Define encoding - # encoding = { - # "foo": {"compressor": compressor}, - # "bar": {"compressor": compressor}, - # } - # kwargs["encoding"] = encoding - encoding = {} - for key in dataset.data_vars.keys(): - c = dataset[key].encoding.get("compressors", None) - encoding[key] = {"compressors": c} - kwargs["encoding"] = encoding - - kwargs["zarr_format"] = zarr_format - - - - + """Write an XArray dataset to Zarr format.""" + # MDIO only supports zarr_format=2 + kwargs["zarr_format"] = 2 + # compute: default: True) – If True write array data immediately, + # otherwise return a dask.delayed.Delayed object that can be computed + # to write array data later. + # *** Metadata is always updated eagerly. 
*** + kwargs["compute"] = False + # https://docs.xarray.dev/en/stable/user-guide/io.html + # https://docs.xarray.dev/en/latest/generated/xarray.DataArray.to_zarr.html return dataset.to_zarr(*args, store=store, **kwargs) - -# https://docs.xarray.dev/en/stable/user-guide/io.html -# ds.to_zarr("path/to/directory.zarr", zarr_format=2, consolidated=False) \ No newline at end of file diff --git a/tests/unit/v1/helpers.py b/tests/unit/v1/helpers.py index 8a01a966..336f2f0d 100644 --- a/tests/unit/v1/helpers.py +++ b/tests/unit/v1/helpers.py @@ -157,7 +157,7 @@ def make_campos_3d_acceptance_dataset() -> Dataset: # Add dimensions ds.add_dimension("inline", 256) ds.add_dimension("crossline", 512) - ds.add_dimension("depth", 384) # data_type=ScalarType.UINT32, LengthUnitEnum.METER + ds.add_dimension("depth", 384) ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) ds.add_coordinate( @@ -251,15 +251,14 @@ def make_campos_3d_acceptance_dataset() -> Dataset: ), metadata_info=[ ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[128, 128]) - ) + chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[128, 128])) ) ], coordinates=["cdp-x", "cdp-y"], ) return ds.build() + def make_campos_3d_dataset() -> Dataset: """Create in-memory campos_3d dataset.""" ds = MDIODatasetBuilder( diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py index ddd43563..1bee02e3 100644 --- a/tests/unit/v1/test_dataset_serializer.py +++ b/tests/unit/v1/test_dataset_serializer.py @@ -4,57 +4,50 @@ # Thus, disable it for this file """Tests the schema v1 dataset_serializer public API.""" -import dask import xarray as xr -import numpy as np from mdio.schemas.dtype import ScalarType -from mdio.schemas.dtype import StructuredField -from mdio.schemas.dtype import StructuredType -from mdio.schemas.v1.dataset import Dataset from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder -from mdio.schemas.v1.dataset_serializer import to_xarray_dataset, to_zarr -from mdio.schemas.v1.units import LengthUnitEnum -from mdio.schemas.v1.units import SpeedUnitEnum - -from .helpers import make_campos_3d_acceptance_dataset, make_campos_3d_dataset -from .helpers import validate_variable - - -# def test_to_xarray_dataset(capsys) -> None: -# """Test building a complete dataset.""" -# dataset = ( -# MDIODatasetBuilder("test_dataset") -# .add_dimension("inline", 100) -# .add_dimension("crossline", 200) -# .add_dimension("depth", 300) -# .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64) -# .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64) -# .add_coordinate("x_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) -# .add_coordinate("y_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32) -# .add_variable( -# "data", -# long_name="Test Data", -# dimensions=["inline", "crossline", "depth"], -# coordinates=["inline", "crossline", "x_coord", "y_coord"], -# data_type=ScalarType.FLOAT32, -# ) -# .build() -# ) - -# # with capsys.disabled(): -# xds : xr.Dataset = to_xarray_dataset(dataset) - -# to_zarr(xds, f"test-dataset-{xds.attrs["name"]}.zarr", mode="w") +from mdio.schemas.v1.dataset_serializer import to_xarray_dataset +from mdio.schemas.v1.dataset_serializer import to_zarr + +from .helpers import make_campos_3d_acceptance_dataset + + +def 
test_to_xarray_dataset(capsys) -> None:
+    """Test building a complete dataset."""
+    dataset = (
+        MDIODatasetBuilder("test_dataset")
+        .add_dimension("inline", 100)
+        .add_dimension("crossline", 200)
+        .add_dimension("depth", 300)
+        .add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.FLOAT64)
+        .add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.FLOAT64)
+        .add_coordinate("x_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32)
+        .add_coordinate("y_coord", dimensions=["inline", "crossline"], data_type=ScalarType.FLOAT32)
+        .add_variable(
+            "data",
+            long_name="Test Data",
+            dimensions=["inline", "crossline", "depth"],
+            coordinates=["inline", "crossline", "x_coord", "y_coord"],
+            data_type=ScalarType.FLOAT32,
+        )
+        .build()
+    )
+
+    # with capsys.disabled():
+    xds: xr.Dataset = to_xarray_dataset(dataset)
+
+    file_name = "sample_dataset"
+    to_zarr(xds, f"test-data/{file_name}.zarr", mode="w")
 
 
 def test_campos_3d_acceptance_to_xarray_dataset(capsys) -> None:
     """Test building a complete dataset."""
     dataset = make_campos_3d_acceptance_dataset()
 
-
-    xds : xr.Dataset = to_xarray_dataset(dataset)
+    xds: xr.Dataset = to_xarray_dataset(dataset)
 
     # file_name = "XYZ"
     file_name = f"{xds.attrs['name']}"
-    to_zarr(xds, f"test-data/{file_name}.zarr", mode="w")
\ No newline at end of file
+    to_zarr(xds, f"test-data/{file_name}.zarr", mode="w")

From 850135ea3d0f21f9a04b4ede6d7888c46e2e36a1 Mon Sep 17 00:00:00 2001
From: Dmitriy Repin
Date: Fri, 11 Jul 2025 20:55:45 +0000
Subject: [PATCH 24/27] Fix precommit issues

---
 _dev/DEVELOPERS_NOTES.md                  |  5 +-
 src/mdio/schemas/v1/dataset_serializer.py | 89 ++++++-------
 tests/unit/v1/test_dataset_serializer.py  | 17 ++---
 3 files changed, 50 insertions(+), 61 deletions(-)

diff --git a/_dev/DEVELOPERS_NOTES.md b/_dev/DEVELOPERS_NOTES.md
index 8c575025..a9df833e 100644
--- a/_dev/DEVELOPERS_NOTES.md
+++ b/_dev/DEVELOPERS_NOTES.md
@@ -11,5 +11,6 @@ src/mdio/schemas/v1/dataset_serializer.py
 4. `fill_value` for StructuredType is set to null, but "AAAAAAAAAAAAAAAA" is expected
 
 ## TO DO:
-* Add more unit tests for internal functions
-* Add a trest comparing expected and actual .zmetadata for the serialized dataset
+
+- Add more unit tests for internal functions
+- Add a test comparing expected and actual .zmetadata for the serialized dataset
diff --git a/src/mdio/schemas/v1/dataset_serializer.py b/src/mdio/schemas/v1/dataset_serializer.py
index 1592e93b..10e9f462 100644
--- a/src/mdio/schemas/v1/dataset_serializer.py
+++ b/src/mdio/schemas/v1/dataset_serializer.py
@@ -1,22 +1,23 @@
-import warnings
+"""Convert an MDIO v1 schema Dataset to an Xarray Dataset and write it to Zarr."""
+
 from collections.abc import Mapping
 
-import dask as dask
-import numcodecs as numcodecs
-import numpy as np
-import xarray as xr
-import zarr as zarr
+from dask import array as dask_array
+from numcodecs import Blosc as nc_Blosc
+from numpy import dtype as np_dtype
+from numpy import nan as np_nan
+from xarray import DataArray as xr_DataArray
+from xarray import Dataset as xr_Dataset
+from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding
 
 try:
     # zfpy is an optional dependency for ZFP compression
     # It is not installed by default, so we check for its presence and import it only if available.
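# A hedged sketch of the ".zmetadata" comparison test suggested in the TO DO
# list above. It assumes a consolidated Zarr v2 store already written via
# to_zarr(); the store path and "expected_zmetadata.json" are hypothetical.
import json
from pathlib import Path

store = Path("test-data/campos_3d.zarr")
actual = json.loads((store / ".zmetadata").read_text())["metadata"]
expected = json.loads(Path("expected_zmetadata.json").read_text())["metadata"]
assert actual.keys() == expected.keys()  # same arrays and attribute entries
assert actual[".zattrs"]["name"] == expected[".zattrs"]["name"]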
- import zfpy as zfpy - from zfpy import ZFPY as zfpy_ZFPY + from zfpy import ZFPY as zfpy_ZFPY # noqa: N811 except ImportError: - zfpy = None - zfpy_ZFPY = None + zfpy_ZFPY = None # noqa: N816 -from mdio.schemas.compressors import ZFP as mdio_ZFP +from mdio.schemas.compressors import ZFP as mdio_ZFP # noqa: N811 from mdio.schemas.compressors import Blosc as mdio_Blosc from mdio.schemas.dimension import NamedDimension from mdio.schemas.dtype import ScalarType @@ -35,12 +36,8 @@ def _get_all_named_dimensions(dataset: Dataset) -> dict[str, NamedDimension]: if isinstance(d, NamedDimension): all_named_dims[d.name] = d else: - # Skip, if this is a named reference - # This should not be ever a case for the dataset generated with the dataset builder - warnings.warn( - f"Unsupported dimension type: {type(d)} in variable {v.name}. " - "Expected NamedDimension." - ) + # Never happens for the dataset generated with the dataset builder + pass return all_named_dims @@ -61,11 +58,7 @@ def _get_dimension_names(var: Variable) -> list[str]: if isinstance(d, NamedDimension): dim_names.append(d.name) elif isinstance(d, str): - # This should not be ever a case for the dataset generated with the dataset builder - warnings.warn( - f"Unsupported dimension type: 'str' in variable {var.name}. " - "Expected NamedDimension." - ) + # Never happens for the dataset generated with the dataset builder dim_names.append(d) else: err = f"Unsupported dimension type: {type(d)} in variable {var.name}" @@ -87,13 +80,14 @@ def _get_coord_names(var: Variable) -> list[str]: return coord_names -def _get_np_datatype(var: Variable) -> np.dtype: +def _get_np_datatype(var: Variable) -> np_dtype: data_type = var.data_type if isinstance(data_type, ScalarType): - return np.dtype(data_type.value) + return np_dtype(data_type.value) if isinstance(data_type, StructuredType): - return np.dtype([(f.name, f.format.value) for f in data_type.fields]) - raise TypeError(f"Unsupported data_type: {data_type}") + return np_dtype([(f.name, f.format.value) for f in data_type.fields]) + err = f"Unsupported data type: {type(data_type)} in variable {var.name}" + raise TypeError(err) def _get_zarr_shape(var: Variable) -> tuple[int, ...]: @@ -111,13 +105,13 @@ def _get_zarr_chunks(var: Variable) -> tuple[int, ...]: def _convert_compressor( compressor: mdio_Blosc | mdio_ZFP | None, -) -> numcodecs.Blosc | zfpy_ZFPY | None: +) -> nc_Blosc | zfpy_ZFPY | None: """Convert a compressor to a numcodecs compatible format.""" if compressor is None: return None if isinstance(compressor, mdio_Blosc): - return numcodecs.Blosc( + return nc_Blosc( cname=compressor.algorithm.value, clevel=compressor.level, shuffle=compressor.shuffle.value, @@ -142,9 +136,9 @@ def _convert_compressor( # Do we already have it somewhere in the codebase? I could not find it. 
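# A brief usage sketch for _convert_compressor() as refactored above. Building
# mdio_Blosc with only an algorithm mirrors the helpers in this patch series;
# the byte round-trip uses the real numcodecs.Blosc encode/decode API.
import numpy as np
from mdio.schemas.compressors import Blosc as mdio_Blosc

codec = _convert_compressor(mdio_Blosc(algorithm="zstd"))  # -> numcodecs.Blosc
raw = np.arange(16, dtype="float32").tobytes()
assert codec.decode(codec.encode(raw)) == raw  # lossless round-trip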
fill_value_map = { ScalarType.BOOL: None, - ScalarType.FLOAT16: np.nan, - ScalarType.FLOAT32: np.nan, - ScalarType.FLOAT64: np.nan, + ScalarType.FLOAT16: np_nan, + ScalarType.FLOAT32: np_nan, + ScalarType.FLOAT64: np_nan, ScalarType.UINT8: 2**8 - 1, # Max value for uint8 ScalarType.UINT16: 2**16 - 1, # Max value for uint16 ScalarType.UINT32: 2**32 - 1, # Max value for uint32 @@ -153,14 +147,15 @@ def _convert_compressor( ScalarType.INT16: 2**15 - 1, # Max value for int16 ScalarType.INT32: 2**31 - 1, # Max value for int32 ScalarType.INT64: 2**63 - 1, # Max value for int64 - ScalarType.COMPLEX64: complex(np.nan, np.nan), - ScalarType.COMPLEX128: complex(np.nan, np.nan), - ScalarType.COMPLEX256: complex(np.nan, np.nan), + ScalarType.COMPLEX64: complex(np_nan, np_nan), + ScalarType.COMPLEX128: complex(np_nan, np_nan), + ScalarType.COMPLEX256: complex(np_nan, np_nan), } -def get_fill_value(data_type: ScalarType | StructuredType | str) -> any: +def _get_fill_value(data_type: ScalarType | StructuredType | str) -> any: """Get the fill value for a given data type. + The Zarr fill_value is a scalar value providing the default value to use for uninitialized portions of the array, or null if no fill_value is to be used https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html @@ -175,7 +170,7 @@ def get_fill_value(data_type: ScalarType | StructuredType | str) -> any: return None -def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 +def to_xarray_dataset(mdio_ds: Dataset) -> xr_DataArray: # noqa: PLR0912 """Build an XArray dataset with correct dimensions and dtypes. This function constructs the underlying data structure for an XArray dataset, @@ -186,9 +181,6 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 Returns: The constructed dataset with proper MDIO structure and metadata. - - Raises: - TypeError: If an unsupported data type is encountered. """ # See the xarray tutorial for more details on how to create datasets: # https://tutorial.xarray.dev/fundamentals/01.1_creating_data_structures.html @@ -197,17 +189,17 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 # all_coords = _get_all_coordinates(ds) # First pass: Build all variables - data_arrays: dict[str, xr.DataArray] = {} + data_arrays: dict[str, xr_DataArray] = {} for v in mdio_ds.variables: # Use dask array instead of numpy array for lazy evaluation shape = _get_zarr_shape(v) dtype = _get_np_datatype(v) chunks = _get_zarr_chunks(v) - arr = dask.array.zeros(shape, dtype=dtype, chunks=chunks) + arr = dask_array.zeros(shape, dtype=dtype, chunks=chunks) # Create a DataArray for the variable. We will set coords in the second pass dim_names = _get_dimension_names(v) - data_array = xr.DataArray(arr, dims=dim_names) + data_array = xr_DataArray(arr, dims=dim_names) # Add array attributes if v.metadata is not None: @@ -224,15 +216,14 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 # Compression: # https://docs.xarray.dev/en/stable/internals/zarr-encoding-spec.html#zarr-encoding # If you don't explicitly specify a compressor when creating a Zarr array, Z - # arr will use a default compressor based on the Zarr format version and the data type of your array. - # Zarr V2 (Default is Blosc). + # arr will use a default compressor based on the Zarr format version and the data + # type of your array. Zarr V2 (Default is Blosc). # Thus, if there is no compressor, we will explicitly set "compressor" to None. 
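# A small sanity sketch of the chunk-key encoding dict built below, assuming
# zarr-python's V2ChunkKeyEncoding imported at the top of this module; the
# printed structure is the expected shape, not verified output.
demo_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict()
print(demo_key_encoding)  # roughly {"name": "v2", "configuration": {"separator": "/"}}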
# # Create a custom chunk key encoding with "/" as separator - from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict() encoding = { - "fill_value": get_fill_value(v.data_type), + "fill_value": _get_fill_value(v.data_type), "chunks": chunks, "chunk_key_encoding": chunk_key_encoding, # I was hoping the following would work, but it does not. @@ -258,7 +249,7 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 da = data_arrays[v.name] non_dim_coords_names = set(_get_coord_names(v)) - set(_get_dimension_names(v)) - {v.name} # Create and populate a dictionary {coord_name: DataArray for the coordinate} - non_dim_coords_dict: dict[str, xr.DataArray] = {} + non_dim_coords_dict: dict[str, xr_DataArray] = {} for name in non_dim_coords_names: non_dim_coords_dict[name] = data_arrays[name] if non_dim_coords_dict: @@ -267,7 +258,7 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 data_arrays[v.name] = da.assign_coords(non_dim_coords_dict) # Now let's create a dataset with all data arrays - xr_ds = xr.Dataset(data_arrays) + xr_ds = xr_Dataset(data_arrays) # Attach dataset metadata if mdio_ds.metadata is not None: xr_ds.attrs["apiVersion"] = mdio_ds.metadata.api_version @@ -280,7 +271,7 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr.DataArray: # noqa: PLR0912 def to_zarr( - dataset: xr.Dataset, + dataset: xr_Dataset, store: str | None = None, *args: str | int | float | bool, **kwargs: Mapping[str, str | int | float | bool], diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py index 1bee02e3..d1ec69a3 100644 --- a/tests/unit/v1/test_dataset_serializer.py +++ b/tests/unit/v1/test_dataset_serializer.py @@ -4,8 +4,6 @@ # Thus, disable it for this file """Tests the schema v1 dataset_serializer public API.""" -import xarray as xr - from mdio.schemas.dtype import ScalarType from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder from mdio.schemas.v1.dataset_serializer import to_xarray_dataset @@ -14,7 +12,7 @@ from .helpers import make_campos_3d_acceptance_dataset -def test_to_xarray_dataset(capsys) -> None: +def test_to_xarray_dataset() -> None: """Test building a complete dataset.""" dataset = ( MDIODatasetBuilder("test_dataset") @@ -35,19 +33,18 @@ def test_to_xarray_dataset(capsys) -> None: .build() ) - # with capsys.disabled(): - xds: xr.Dataset = to_xarray_dataset(dataset) + xr_ds = to_xarray_dataset(dataset) file_name = "sample_dataset" - to_zarr(xds, f"test-data/{file_name}.zarr", mode="w") + to_zarr(xr_ds, f"test-data/{file_name}.zarr", mode="w") -def test_campos_3d_acceptance_to_xarray_dataset(capsys) -> None: +def test_campos_3d_acceptance_to_xarray_dataset() -> None: """Test building a complete dataset.""" dataset = make_campos_3d_acceptance_dataset() - xds: xr.Dataset = to_xarray_dataset(dataset) + xr_ds = to_xarray_dataset(dataset) # file_name = "XYZ" - file_name = f"{xds.attrs['name']}" - to_zarr(xds, f"test-data/{file_name}.zarr", mode="w") + file_name = f"{xr_ds.attrs['name']}" + to_zarr(xr_ds, f"test-data/{file_name}.zarr", mode="w") From 82f1960da9939222c2d44077624a6ae126812ee4 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Fri, 11 Jul 2025 22:26:34 +0000 Subject: [PATCH 25/27] Use only make_campos_3d_acceptance_dataset --- tests/unit/v1/helpers.py | 117 +------------------- tests/unit/v1/test_dataset_builder_build.py | 15 +-- 2 files changed, 9 insertions(+), 123 deletions(-) diff --git a/tests/unit/v1/helpers.py 
b/tests/unit/v1/helpers.py index 336f2f0d..ccd21289 100644 --- a/tests/unit/v1/helpers.py +++ b/tests/unit/v1/helpers.py @@ -241,6 +241,7 @@ def make_campos_3d_acceptance_dataset() -> Dataset: ds.add_variable( name="image_headers", dimensions=["inline", "crossline"], + coordinates=["cdp-x", "cdp-y"], data_type=StructuredType( fields=[ StructuredField(name="cdp-x", format=ScalarType.INT32), @@ -254,122 +255,6 @@ def make_campos_3d_acceptance_dataset() -> Dataset: chunk_grid=RegularChunkGrid(configuration=RegularChunkShape(chunk_shape=[128, 128])) ) ], - coordinates=["cdp-x", "cdp-y"], ) return ds.build() - -def make_campos_3d_dataset() -> Dataset: - """Create in-memory campos_3d dataset.""" - ds = MDIODatasetBuilder( - "campos_3d", - attributes=UserAttributes( - attributes={ - "textHeader": [ - "C01 .......................... ", - "C02 .......................... ", - "C03 .......................... ", - ], - "foo": "bar", - } - ), - ) - - # Add dimensions - ds.add_dimension("inline", 256) - ds.add_dimension("crossline", 512) - ds.add_dimension("depth", 384) - ds.add_coordinate("inline", dimensions=["inline"], data_type=ScalarType.UINT32) - ds.add_coordinate("crossline", dimensions=["crossline"], data_type=ScalarType.UINT32) - ds.add_coordinate( - "depth", - dimensions=["depth"], - data_type=ScalarType.FLOAT64, - metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], - ) - # Add coordinates - ds.add_coordinate( - "cdp-x", - dimensions=["inline", "crossline"], - data_type=ScalarType.FLOAT32, - metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], - ) - ds.add_coordinate( - "cdp-y", - dimensions=["inline", "crossline"], - data_type=ScalarType.FLOAT32, - metadata_info=[AllUnits(units_v1=LengthUnitModel(length=LengthUnitEnum.METER))], - ) - - # Add image variable - ds.add_variable( - name="image", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - coordinates=["cdp-x", "cdp-y"], - metadata_info=[ - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[128, 128, 128]) - ) - ), - StatisticsMetadata( - stats_v1=SummaryStatistics( - count=100, - sum=1215.1, - sumSquares=125.12, - min=5.61, - max=10.84, - histogram=CenteredBinHistogram(binCenters=[1, 2], counts=[10, 15]), - ) - ), - UserAttributes(attributes={"fizz": "buzz", "UnitSystem": "Canonical"}), - ], - ) - # Add velocity variable - ds.add_variable( - name="velocity", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT16, - coordinates=["cdp-x", "cdp-y"], - metadata_info=[ - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[128, 128, 128]) - ) - ), - AllUnits(units_v1=SpeedUnitModel(speed=SpeedUnitEnum.METER_PER_SECOND)), - ], - ) - # Add inline-optimized image variable - ds.add_variable( - name="image_inline", - long_name="inline optimized version of 3d_stack", - dimensions=["inline", "crossline", "depth"], - data_type=ScalarType.FLOAT32, - compressor=Blosc(algorithm="zstd"), - coordinates=["cdp-x", "cdp-y"], - metadata_info=[ - ChunkGridMetadata( - chunk_grid=RegularChunkGrid( - configuration=RegularChunkShape(chunk_shape=[4, 512, 512]) - ) - ) - ], - ) - # Add headers variable with structured dtype - ds.add_variable( - name="image_headers", - dimensions=["inline", "crossline"], - data_type=StructuredType( - fields=[ - StructuredField(name="cdp-x", format=ScalarType.FLOAT32), - StructuredField(name="cdp-y", 
format=ScalarType.FLOAT32), - StructuredField(name="inline", format=ScalarType.UINT32), - StructuredField(name="crossline", format=ScalarType.UINT32), - ] - ), - coordinates=["cdp-x", "cdp-y"], - ) - return ds.build() diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index 832f624c..d726c32d 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -12,7 +12,7 @@ from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import SpeedUnitEnum -from .helpers import make_campos_3d_dataset +from .helpers import make_campos_3d_acceptance_dataset from .helpers import validate_variable @@ -49,7 +49,7 @@ def test_build() -> None: def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50) """Test building a Campos 3D dataset with multiple variables and attributes.""" - dataset = make_campos_3d_dataset() + dataset = make_campos_3d_acceptance_dataset() # Verify dataset structure assert dataset.metadata.name == "campos_3d" @@ -75,7 +75,7 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 5 ) depth = validate_variable( - dataset, name="depth", dims=[("depth", 384)], coords=["depth"], dtype=ScalarType.FLOAT64 + dataset, name="depth", dims=[("depth", 384)], coords=["depth"], dtype=ScalarType.UINT32 ) assert depth.metadata.units_v1.length == LengthUnitEnum.METER @@ -146,10 +146,11 @@ def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 5 coords=["cdp-x", "cdp-y"], dtype=StructuredType( fields=[ - StructuredField(name="cdp-x", format=ScalarType.FLOAT32), - StructuredField(name="cdp-y", format=ScalarType.FLOAT32), - StructuredField(name="inline", format=ScalarType.UINT32), - StructuredField(name="crossline", format=ScalarType.UINT32), + StructuredField(name="cdp-x", format=ScalarType.INT32), + StructuredField(name="cdp-y", format=ScalarType.INT32), + StructuredField(name="elevation", format=ScalarType.FLOAT16), + StructuredField(name="some_scalar", format=ScalarType.FLOAT16), ] ), ) + assert headers.metadata.chunk_grid.configuration.chunk_shape == [128, 128] From b5ee31e90e366395df26674a4f8407c0343ae580 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 14 Jul 2025 21:51:54 +0000 Subject: [PATCH 26/27] PR Review: address the review comments --- .gitignore | 2 - _dev/DEVELOPERS_NOTES.md | 16 - _dev/zmetadata.cpp.json | 195 ----------- _dev/zmetadata.python.json | 195 ----------- src/mdio/constants.py | 61 ++-- src/mdio/schemas/v1/dataset_serializer.py | 153 ++++----- tests/unit/v1/helpers.py | 22 +- tests/unit/v1/test_dataset_builder_build.py | 8 +- tests/unit/v1/test_dataset_serializer.py | 350 +++++++++++++++++++- 9 files changed, 483 insertions(+), 519 deletions(-) delete mode 100644 _dev/DEVELOPERS_NOTES.md delete mode 100644 _dev/zmetadata.cpp.json delete mode 100644 _dev/zmetadata.python.json diff --git a/.gitignore b/.gitignore index 643394a4..bfdc38f2 100644 --- a/.gitignore +++ b/.gitignore @@ -153,5 +153,3 @@ mdio1/* pytest-of-* tmp debugging/* - -test-data/* diff --git a/_dev/DEVELOPERS_NOTES.md b/_dev/DEVELOPERS_NOTES.md deleted file mode 100644 index a9df833e..00000000 --- a/_dev/DEVELOPERS_NOTES.md +++ /dev/null @@ -1,16 +0,0 @@ -# Wring empty XArray / Zarr to a local storage - -src/mdio/schemas/v1/dataset_serializer.py - -## Issues encountered - -1. FIXED: Non-zero size of the serialized data files -2. 
FIXED: Not clear how to properly set `compressor`, `dimension_separator`, and `fill_value` -3. FIXED: For image_inline chunks[2] are somehow different? - -4. `fill_value` for StructuredType is set to null, but "AAAAAAAAAAAAAAAA" is expected - -## TO DO: - -- Add more unit tests for internal functions -- Add a trest comparing expected and actual .zmetadata for the serialized dataset diff --git a/_dev/zmetadata.cpp.json b/_dev/zmetadata.cpp.json deleted file mode 100644 index 394de1a8..00000000 --- a/_dev/zmetadata.cpp.json +++ /dev/null @@ -1,195 +0,0 @@ -{ - "metadata": { - ".zattrs": { - "apiVersion": "1.0.0", - "attributes": { - "foo": "bar", - "textHeader": [ - "C01 .......................... ", - "C02 .......................... ", - "C03 .......................... " - ] - }, - "createdOn": "2023-12-12T15:02:06.413469-06:00", - "name": "campos_3d" - }, - ".zgroup": { - "zarr_format": 2 - }, - "cdp-x/.zarray": { - "chunks": [256, 512], - "compressor": null, - "dimension_separator": "/", - "dtype": " dict[str, NamedDimension]: + """Get all NamedDimensions from the dataset variables. + + This function returns a dictionary of NamedDimensions, but if some dimensions + are not resolvable, they will not be included in the result. + + Args: + dataset: The MDIO Dataset to extract NamedDimensions from. + + Note: + The Dataset Builder ensures that all dimensions are resolvable by always embedding + dimensions as NamedDimension and never as str. + If the dataset is created in a different way, some dimensions may be specified as + dimension names (str) instead of NamedDimension. In this case, we will try to resolve + them to NamedDimension, but if the dimension is not found, it will be skipped. + It is the responsibility of the Dataset creator to ensure that all dimensions are + resolvable at the Dataset level. + + Returns: + A dictionary mapping dimension names to NamedDimension instances. + """ all_named_dims: dict[str, NamedDimension] = {} for v in dataset.variables: if v.dimensions is not None: @@ -36,37 +56,30 @@ def _get_all_named_dimensions(dataset: Dataset) -> dict[str, NamedDimension]: if isinstance(d, NamedDimension): all_named_dims[d.name] = d else: - # Never happens for the dataset generated with the dataset builder pass return all_named_dims -def _get_all_coordinates(dataset: Dataset) -> dict[str, Coordinate]: - all_coords: dict[str, Coordinate] = {} - for v in dataset.variables: - if v.coordinates is not None: - for c in v.coordinates: - if isinstance(c, Coordinate) and c.name not in all_coords: - all_coords[c.name] = c - return all_coords - - def _get_dimension_names(var: Variable) -> list[str]: + """Get the names of dimensions for a variable. + + Note: + We expect that Datasets produced by DatasetBuilder has all dimensions + embedded as NamedDimension, but we also support dimension name strings for + compatibility with Dataset produced in a different way. 
+ """ dim_names: list[str] = [] if var.dimensions is not None: for d in var.dimensions: if isinstance(d, NamedDimension): dim_names.append(d.name) elif isinstance(d, str): - # Never happens for the dataset generated with the dataset builder dim_names.append(d) - else: - err = f"Unsupported dimension type: {type(d)} in variable {var.name}" - raise TypeError(err) return dim_names def _get_coord_names(var: Variable) -> list[str]: + """Get the names of coordinates for a variable.""" coord_names: list[str] = [] if var.coordinates is not None: for c in var.coordinates: @@ -74,13 +87,11 @@ def _get_coord_names(var: Variable) -> list[str]: coord_names.append(c.name) elif isinstance(c, str): coord_names.append(c) - else: - err = f"Unsupported coordinate type: {type(c)} in variable {var.name}" - raise TypeError(err) return coord_names def _get_np_datatype(var: Variable) -> np_dtype: + """Get the numpy dtype for a variable.""" data_type = var.data_type if isinstance(data_type, ScalarType): return np_dtype(data_type.value) @@ -90,17 +101,33 @@ def _get_np_datatype(var: Variable) -> np_dtype: raise TypeError(err) -def _get_zarr_shape(var: Variable) -> tuple[int, ...]: - # NOTE: This assumes that the variable dimensions are all NamedDimension - return tuple(dim.size for dim in var.dimensions) +def _get_zarr_shape(var: Variable, all_named_dims: dict[str, NamedDimension]) -> tuple[int, ...]: + """Get the shape of a variable for Zarr storage. - -def _get_zarr_chunks(var: Variable) -> tuple[int, ...]: + Note: + We expect that Datasets produced by DatasetBuilder has all dimensions + embedded as NamedDimension, but we also support dimension name strings for + compatibility with Dataset produced in a different way. + """ + shape: list[int] = [] + for dim in var.dimensions: + if isinstance(dim, NamedDimension): + shape.append(dim.size) + if isinstance(dim, str): + named_dim = all_named_dims.get(dim) + if named_dim is None: + err = f"Dimension named '{dim}' can't be resolved to a NamedDimension." + raise ValueError(err) + shape.append(named_dim.size) + return tuple(shape) + + +def _get_zarr_chunks(var: Variable, all_named_dims: dict[str, NamedDimension]) -> tuple[int, ...]: """Get the chunk shape for a variable, defaulting to its shape if no chunk grid is defined.""" if var.metadata is not None and var.metadata.chunk_grid is not None: - return var.metadata.chunk_grid.configuration.chunk_shape + return tuple(var.metadata.chunk_grid.configuration.chunk_shape) # Default to full shape if no chunk grid is defined - return _get_zarr_shape(var) + return _get_zarr_shape(var, all_named_dims=all_named_dims) def _convert_compressor( @@ -133,26 +160,6 @@ def _convert_compressor( raise TypeError(msg) -# Do we already have it somewhere in the codebase? I could not find it. 
-fill_value_map = { - ScalarType.BOOL: None, - ScalarType.FLOAT16: np_nan, - ScalarType.FLOAT32: np_nan, - ScalarType.FLOAT64: np_nan, - ScalarType.UINT8: 2**8 - 1, # Max value for uint8 - ScalarType.UINT16: 2**16 - 1, # Max value for uint16 - ScalarType.UINT32: 2**32 - 1, # Max value for uint32 - ScalarType.UINT64: 2**64 - 1, # Max value for uint64 - ScalarType.INT8: 2**7 - 1, # Max value for int8 - ScalarType.INT16: 2**15 - 1, # Max value for int16 - ScalarType.INT32: 2**31 - 1, # Max value for int32 - ScalarType.INT64: 2**63 - 1, # Max value for int64 - ScalarType.COMPLEX64: complex(np_nan, np_nan), - ScalarType.COMPLEX128: complex(np_nan, np_nan), - ScalarType.COMPLEX256: complex(np_nan, np_nan), -} - - def _get_fill_value(data_type: ScalarType | StructuredType | str) -> any: """Get the fill value for a given data type. @@ -185,16 +192,15 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_DataArray: # noqa: PLR0912 # See the xarray tutorial for more details on how to create datasets: # https://tutorial.xarray.dev/fundamentals/01.1_creating_data_structures.html - # all_dims = _get_all_named_dimensions(ds) - # all_coords = _get_all_coordinates(ds) + all_named_dims = _get_all_named_dimensions(mdio_ds) # First pass: Build all variables data_arrays: dict[str, xr_DataArray] = {} for v in mdio_ds.variables: # Use dask array instead of numpy array for lazy evaluation - shape = _get_zarr_shape(v) + shape = _get_zarr_shape(v, all_named_dims=all_named_dims) dtype = _get_np_datatype(v) - chunks = _get_zarr_chunks(v) + chunks = _get_zarr_chunks(v, all_named_dims=all_named_dims) arr = dask_array.zeros(shape, dtype=dtype, chunks=chunks) # Create a DataArray for the variable. We will set coords in the second pass @@ -213,30 +219,12 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_DataArray: # noqa: PLR0912 if v.long_name: data_array.attrs["long_name"] = v.long_name - # Compression: - # https://docs.xarray.dev/en/stable/internals/zarr-encoding-spec.html#zarr-encoding - # If you don't explicitly specify a compressor when creating a Zarr array, Z - # arr will use a default compressor based on the Zarr format version and the data - # type of your array. Zarr V2 (Default is Blosc). - # Thus, if there is no compressor, we will explicitly set "compressor" to None. - # # Create a custom chunk key encoding with "/" as separator chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict() encoding = { "fill_value": _get_fill_value(v.data_type), "chunks": chunks, "chunk_key_encoding": chunk_key_encoding, - # I was hoping the following would work, but it does not. - # I see: - # >_compressor = parse_compressor(compressor[0]) - # > return numcodecs.get_codec(data) - # E - numcodecs.errors.UnknownCodecError: codec not available: 'None'" - # Example: https://zarr.readthedocs.io/en/stable/user-guide/arrays.html#compressors - # from zarr.codecs import BloscCodec - # compressor = BloscCodec(cname="zstd", clevel=3, shuffle="shuffle") - # - # "compressor": _to_dictionary(v.compressor) - # Thus, we will call the conversion function: "compressor": _convert_compressor(v.compressor), } data_array.encoding = encoding @@ -276,14 +264,27 @@ def to_zarr( *args: str | int | float | bool, **kwargs: Mapping[str, str | int | float | bool], ) -> None: - """Write an XArray dataset to Zarr format.""" - # MDIO only supports zarr_format=2 + """Write an XArray dataset to Zarr format. + + Args: + dataset: The XArray dataset to write. + store: The Zarr store to write to. If None, defaults to in-memory store. 
+ *args: Additional positional arguments for the Zarr store. + **kwargs: Additional keyword arguments for the Zarr store. + + Notes: + It sets the zarr_format to 2, which is the default for XArray datasets. + Since we set kwargs["compute"], this method will return a dask.delayed.Delayed object + and the arrays will not be immediately written. + + References: + https://docs.xarray.dev/en/stable/user-guide/io.html + https://docs.xarray.dev/en/latest/generated/xarray.DataArray.to_zarr.html + + Returns: + None: The function writes the dataset as dask.delayed.Delayed object to the + specified Zarr store. + """ kwargs["zarr_format"] = 2 - # compute: default: True) – If True write array data immediately, - # otherwise return a dask.delayed.Delayed object that can be computed - # to write array data later. - # *** Metadata is always updated eagerly. *** kwargs["compute"] = False - # https://docs.xarray.dev/en/stable/user-guide/io.html - # https://docs.xarray.dev/en/latest/generated/xarray.DataArray.to_zarr.html return dataset.to_zarr(*args, store=store, **kwargs) diff --git a/tests/unit/v1/helpers.py b/tests/unit/v1/helpers.py index ccd21289..d0ebe5d7 100644 --- a/tests/unit/v1/helpers.py +++ b/tests/unit/v1/helpers.py @@ -129,6 +129,7 @@ def _get_coordinate( def _get_all_coordinates(dataset: Dataset) -> list[Coordinate]: + """Get all coordinates from the dataset.""" all_coords: dict[str, Coordinate] = {} for v in dataset.variables: if v.coordinates is not None: @@ -138,8 +139,24 @@ def _get_all_coordinates(dataset: Dataset) -> list[Coordinate]: return list(all_coords.values()) -def make_campos_3d_acceptance_dataset() -> Dataset: - """Create in-memory campos_3d dataset.""" +def output_path(file_dir: str, file_name: str, debugging: bool = False) -> str: + """Generate the output path for the test file-system output. + + Note: + Use debugging=True, if you need to retain the created files for debugging + purposes. Otherwise, the files will be created in-memory and not saved to disk. 
+ """ + if debugging: + # Use the following for debugging: + file_path = f"{file_dir}/mdio-tests/{file_name}.zarr" + else: + # Use the following for normal runs: + file_path = f"memory://path_to_zarr/mdio-tests/{file_name}.zarr" + return file_path + + +def make_seismic_poststack_3d_acceptance_dataset() -> Dataset: + """Create in-memory Seismic PostStack 3D Acceptance dataset.""" ds = MDIODatasetBuilder( "campos_3d", attributes=UserAttributes( @@ -257,4 +274,3 @@ def make_campos_3d_acceptance_dataset() -> Dataset: ], ) return ds.build() - diff --git a/tests/unit/v1/test_dataset_builder_build.py b/tests/unit/v1/test_dataset_builder_build.py index d726c32d..2a68c833 100644 --- a/tests/unit/v1/test_dataset_builder_build.py +++ b/tests/unit/v1/test_dataset_builder_build.py @@ -12,7 +12,7 @@ from mdio.schemas.v1.units import LengthUnitEnum from mdio.schemas.v1.units import SpeedUnitEnum -from .helpers import make_campos_3d_acceptance_dataset +from .helpers import make_seismic_poststack_3d_acceptance_dataset from .helpers import validate_variable @@ -47,9 +47,9 @@ def test_build() -> None: assert next(v for v in dataset.variables if v.name == "data") is not None -def test_build_campos_3d() -> None: # noqa: PLR0915 Too many statements (57 > 50) - """Test building a Campos 3D dataset with multiple variables and attributes.""" - dataset = make_campos_3d_acceptance_dataset() +def test_build_seismic_poststack_3d_acceptance_dataset() -> None: # noqa: PLR0915 Too many statements (57 > 50) + """Test building a Seismic PostStack 3D Acceptance dataset.""" + dataset = make_seismic_poststack_3d_acceptance_dataset() # Verify dataset structure assert dataset.metadata.name == "campos_3d" diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py index d1ec69a3..19e3db50 100644 --- a/tests/unit/v1/test_dataset_serializer.py +++ b/tests/unit/v1/test_dataset_serializer.py @@ -4,15 +4,348 @@ # Thus, disable it for this file """Tests the schema v1 dataset_serializer public API.""" +import pytest +from numpy import dtype as np_dtype +from numpy import isnan as np_isnan + +from mdio.constants import fill_value_map +from mdio.schemas.chunk_grid import RegularChunkGrid +from mdio.schemas.chunk_grid import RegularChunkShape +from mdio.schemas.dimension import NamedDimension from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredField +from mdio.schemas.dtype import StructuredType +from mdio.schemas.metadata import ChunkGridMetadata +from mdio.schemas.v1.dataset import Dataset +from mdio.schemas.v1.dataset import DatasetInfo from mdio.schemas.v1.dataset_builder import MDIODatasetBuilder +from mdio.schemas.v1.dataset_builder import _to_dictionary +from mdio.schemas.v1.dataset_serializer import _convert_compressor +from mdio.schemas.v1.dataset_serializer import _get_all_named_dimensions +from mdio.schemas.v1.dataset_serializer import _get_coord_names +from mdio.schemas.v1.dataset_serializer import _get_dimension_names +from mdio.schemas.v1.dataset_serializer import _get_fill_value +from mdio.schemas.v1.dataset_serializer import _get_np_datatype +from mdio.schemas.v1.dataset_serializer import _get_zarr_chunks +from mdio.schemas.v1.dataset_serializer import _get_zarr_shape from mdio.schemas.v1.dataset_serializer import to_xarray_dataset from mdio.schemas.v1.dataset_serializer import to_zarr +from mdio.schemas.v1.variable import Coordinate +from mdio.schemas.v1.variable import Variable + +from .helpers import make_seismic_poststack_3d_acceptance_dataset +from 
.helpers import output_path + +try: + from zfpy import ZFPY as zfpy_ZFPY # noqa: N811 + + HAS_ZFPY = True +except ImportError: + zfpy_ZFPY = None # noqa: N816 + HAS_ZFPY = False + +from numcodecs import Blosc as nc_Blosc + +from mdio.schemas.compressors import ZFP as mdio_ZFP # noqa: N811 +from mdio.schemas.compressors import Blosc as mdio_Blosc +from mdio.schemas.compressors import BloscAlgorithm as mdio_BloscAlgorithm +from mdio.schemas.compressors import BloscShuffle as mdio_BloscShuffle +from mdio.schemas.compressors import ZFPMode as mdio_ZFPMode + + +def test__get_all_named_dimensions() -> None: + """Test _get_all_named_dimensions function.""" + dim1 = NamedDimension(name="inline", size=100) + dim2 = NamedDimension(name="crossline", size=200) + dim3 = NamedDimension(name="depth", size=300) + v1 = Variable(name="named_dims", data_type=ScalarType.FLOAT32, dimensions=[dim1, dim2, dim3]) + v2 = Variable( + name="string_dims", + data_type=ScalarType.FLOAT32, + dimensions=["inline", "crossline", "depth"], + ) + v3 = Variable(name="unresolved_dims", data_type=ScalarType.FLOAT32, dimensions=["x", "y", "z"]) + ds = Dataset( + variables=[v1, v2, v3], + metadata=_to_dictionary( + [ + DatasetInfo( + name="test_dataset", api_version="1.0.0", created_on="2023-10-01T00:00:00Z" + ) + ] + ), + ) + + all_dims = _get_all_named_dimensions(ds) + # Only 3 named dimensions could be resolved. + # The dimension names "x", "y', "z" are unresolvable. + assert set(all_dims) == {"inline", "crossline", "depth"} + + +def test__get_dimension_names() -> None: + """Test _get_dimension_names function with various dimension types.""" + dim1 = NamedDimension(name="inline", size=100) + dim2 = NamedDimension(name="crossline", size=200) + + # Test case 1: Variable with NamedDimension + var_named_dims = Variable( + name="Variable with NamedDimension dimensions", + data_type=ScalarType.FLOAT32, + dimensions=[dim1, dim2], + ) + assert set(_get_dimension_names(var_named_dims)) == {"inline", "crossline"} + + # Test case 2: Variable with string dimensions + var_string_dims = Variable( + name="Variable with string dimensions", + data_type=ScalarType.FLOAT32, + dimensions=["x", "y", "z"], + ) + assert set(_get_dimension_names(var_string_dims)) == {"x", "y", "z"} + + # Test case 3: Mixed NamedDimension and string dimensions + # NOTE: mixing NamedDimension and string dimensions is not allowed by the Variable schema + + +def test__get_coord_names() -> None: + """Comprehensive test for _get_coord_names function covering all scenarios.""" + dim1 = NamedDimension(name="inline", size=100) + dim2 = NamedDimension(name="crossline", size=200) + + # Test 1: Variable with Coordinate objects + coord1 = Coordinate(name="x_coord", dimensions=[dim1, dim2], data_type=ScalarType.FLOAT32) + coord2 = Coordinate(name="y_coord", dimensions=[dim1, dim2], data_type=ScalarType.FLOAT64) + variable_coords = Variable( + name="Variable with Coordinate objects", + data_type=ScalarType.FLOAT32, + dimensions=[dim1, dim2], + coordinates=[coord1, coord2], + ) + assert set(_get_coord_names(variable_coords)) == {"x_coord", "y_coord"} + + # Test 2: Variable with string coordinates + variable_strings = Variable( + name="Variable with string coordinates", + data_type=ScalarType.FLOAT32, + dimensions=[dim1, dim2], + coordinates=["lat", "lon", "time"], + ) + assert set(_get_coord_names(variable_strings)) == {"lat", "lon", "time"} + + # Test 3: Variable with mixed coordinate types + # NOTE: mixing Coordinate objects and coordinate name strings is not allowed by the + # 
+
+
+def test__get_np_datatype() -> None:
+    """Comprehensive test for _get_np_datatype function."""
+    # Test 1: ScalarType cases - all supported scalar types
+    scalar_type_tests = [
+        (ScalarType.FLOAT32, "float32"),
+        (ScalarType.FLOAT64, "float64"),
+        (ScalarType.INT8, "int8"),
+        (ScalarType.INT16, "int16"),
+        (ScalarType.INT32, "int32"),
+        (ScalarType.INT64, "int64"),
+        (ScalarType.UINT8, "uint8"),
+        (ScalarType.UINT16, "uint16"),
+        (ScalarType.UINT32, "uint32"),
+        (ScalarType.UINT64, "uint64"),
+        (ScalarType.COMPLEX64, "complex64"),
+        (ScalarType.COMPLEX128, "complex128"),
+        (ScalarType.BOOL, "bool"),
+    ]
+
+    for scalar_type, expected_numpy_type in scalar_type_tests:
+        variable = Variable(name="test_var", dimensions=[], data_type=scalar_type)
+
+        result = _get_np_datatype(variable)
+        expected = np_dtype(expected_numpy_type)
+
+        assert result == expected
+        assert isinstance(result, np_dtype)
+        assert result.name == expected.name
+
+    # Test 2: StructuredType with multiple fields
+    multi_fields = [
+        StructuredField(name="x", format=ScalarType.FLOAT64),
+        StructuredField(name="y", format=ScalarType.FLOAT64),
+        StructuredField(name="z", format=ScalarType.FLOAT64),
+        StructuredField(name="id", format=ScalarType.INT32),
+        StructuredField(name="valid", format=ScalarType.BOOL),
+    ]
+    structured_multi = StructuredType(fields=multi_fields)
+
+    variable_multi_struct = Variable(
+        name="multi_struct_var", dimensions=[], data_type=structured_multi
+    )
+
+    result_multi = _get_np_datatype(variable_multi_struct)
+    expected_multi = np_dtype(
+        [("x", "float64"), ("y", "float64"), ("z", "float64"), ("id", "int32"), ("valid", "bool")]
+    )
+
+    assert result_multi == expected_multi
+    assert isinstance(result_multi, np_dtype)
+    assert len(result_multi.names) == 5
+    assert set(result_multi.names) == {"x", "y", "z", "id", "valid"}
+
+
+def test__get_zarr_shape() -> None:
+    """Test for _get_zarr_shape function."""
+    d1 = NamedDimension(name="inline", size=100)
+    d2 = NamedDimension(name="crossline", size=200)
+    d3 = NamedDimension(name="depth", size=300)
+
+    v = Variable(name="seismic 3d var", data_type=ScalarType.FLOAT32, dimensions=[d1, d2, d3])
+    assert _get_zarr_shape(v, all_named_dims=[d1, d2, d3]) == (100, 200, 300)
+
+
+def test__get_zarr_chunks() -> None:
+    """Test for _get_zarr_chunks function."""
+    d1 = NamedDimension(name="inline", size=100)
+    d2 = NamedDimension(name="crossline", size=200)
+    d3 = NamedDimension(name="depth", size=300)
+
+    # Test 1: Variable with chunk defined in metadata
+    v = Variable(
+        name="seismic 3d var",
+        data_type=ScalarType.FLOAT32,
+        dimensions=[d1, d2, d3],
+        metadata=_to_dictionary(
+            ChunkGridMetadata(
+                chunk_grid=RegularChunkGrid(
+                    configuration=RegularChunkShape(chunk_shape=[10, 20, 30])
+                )
+            )
+        ),
+    )
+    assert _get_zarr_chunks(v, all_named_dims=[d1, d2, d3]) == (10, 20, 30)
+
+    # Test 2: Variable with no chunks defined
+    v = Variable(name="seismic 3d var", data_type=ScalarType.FLOAT32, dimensions=[d1, d2, d3])
+    assert _get_zarr_chunks(v, all_named_dims=[d1, d2, d3]) == (100, 200, 300)
+
+
+def test__get_fill_value() -> None:
+    """Test for _get_fill_value function."""
+    # Test 1: ScalarType cases - should return values from fill_value_map
+    scalar_types = [
+        ScalarType.BOOL,
+    ]
+    for scalar_type in scalar_types:
+        assert _get_fill_value(scalar_type) is None
+
+    scalar_types = [
+        ScalarType.FLOAT16,
+        ScalarType.FLOAT32,
+        ScalarType.FLOAT64,
+    ]
+    for scalar_type in scalar_types:
+        assert np_isnan(_get_fill_value(scalar_type))
+
+    scalar_types = [
+        ScalarType.UINT8,
+        ScalarType.UINT16,
+        ScalarType.UINT32,
+        ScalarType.INT8,
+        ScalarType.INT16,
+        ScalarType.INT32,
+    ]
+    for scalar_type in scalar_types:
+        assert fill_value_map[scalar_type] == _get_fill_value(scalar_type)
+
+    scalar_types = [
+        ScalarType.COMPLEX64,
+        ScalarType.COMPLEX128,
+        ScalarType.COMPLEX256,
+    ]
+    for scalar_type in scalar_types:
+        val = _get_fill_value(scalar_type)
+        assert isinstance(val, complex)
+        assert np_isnan(val.real)
+        assert np_isnan(val.imag)
+
+    # Test 2: StructuredType - should return "AAAAAAAAAAAAAAAA"
+    field = StructuredField(name="test_field", format=ScalarType.FLOAT32)
+    structured_type = StructuredType(fields=[field])
+    result_structured = _get_fill_value(structured_type)
+    assert result_structured == "AAAAAAAAAAAAAAAA"
+
+    # Test 3: String type - should return empty string
+    result_string = _get_fill_value("string_type")
+    assert result_string == ""
+
+    # Test 4: Unknown type - should return None
+    result_none = _get_fill_value(42)  # Invalid type
+    assert result_none is None
+
+    # Test 5: None input - should return None
+    result_none_input = _get_fill_value(None)
+    assert result_none_input is None
+
+
+def test__convert_compressor() -> None:
+    """Simple test for _convert_compressor function covering basic scenarios."""
+    # Test 1: None input - should return None
+    result_none = _convert_compressor(None)
+    assert result_none is None
+
+    # Test 2: mdio_Blosc compressor - should return nc_Blosc
+    result_blosc = _convert_compressor(
+        mdio_Blosc(
+            algorithm=mdio_BloscAlgorithm.LZ4,
+            level=5,
+            shuffle=mdio_BloscShuffle.AUTOSHUFFLE,
+            blocksize=1024,
+        )
+    )
+    assert isinstance(result_blosc, nc_Blosc)
+    assert result_blosc.cname == "lz4"  # BloscAlgorithm.LZ4.value
+    assert result_blosc.clevel == 5
+    assert result_blosc.shuffle == -1  # BloscShuffle.AUTOSHUFFLE = -1
+    assert result_blosc.blocksize == 1024
+
+    # Test 3: mdio_Blosc with blocksize 0 - should use 0 as blocksize
+    result_blosc_zero = _convert_compressor(
+        mdio_Blosc(
+            algorithm=mdio_BloscAlgorithm.ZSTD,
+            level=3,
+            shuffle=mdio_BloscShuffle.AUTOSHUFFLE,
+            blocksize=0,
+        )
+    )
+    assert isinstance(result_blosc_zero, nc_Blosc)
+    assert result_blosc_zero.blocksize == 0
+
+    # Test 4: mdio_ZFP compressor - should return zfpy_ZFPY if available
+    zfp_compressor = mdio_ZFP(mode=mdio_ZFPMode.FIXED_RATE, tolerance=0.01, rate=8.0, precision=16)
+
+    if HAS_ZFPY:
+        result_zfp = _convert_compressor(zfp_compressor)
+        assert isinstance(result_zfp, zfpy_ZFPY)
+        assert result_zfp.mode == 1  # numeric zfpy mode corresponding to ZFPMode.FIXED_RATE
+        assert result_zfp.tolerance == 0.01
+        assert result_zfp.rate == 8.0
+        assert result_zfp.precision == 16
+    else:
+        # Test 5: mdio_ZFP without zfpy installed - should raise ImportError
+        with pytest.raises(ImportError) as exc_info:
+            _convert_compressor(zfp_compressor)
+
+        error_message = str(exc_info.value)
+        assert "zfpy and numcodecs are required to use ZFP compression" in error_message
-from .helpers import make_campos_3d_acceptance_dataset
+
+    # Test 6: Unsupported compressor type - should raise TypeError
+    unsupported_compressor = "invalid_compressor"
+    with pytest.raises(TypeError) as exc_info:
+        _convert_compressor(unsupported_compressor)
+    error_message = str(exc_info.value)
+    assert "Unsupported compressor model" in error_message
+    assert "<class 'str'>" in error_message  # the message names the offending type
 
 
-def test_to_xarray_dataset() -> None:
+def test_to_xarray_dataset(tmp_path) -> None:  # noqa: ANN001 - tmp_path is a pytest fixture
     """Test building a complete dataset."""
     dataset = (
MDIODatasetBuilder("test_dataset") @@ -35,16 +368,15 @@ def test_to_xarray_dataset() -> None: xr_ds = to_xarray_dataset(dataset) - file_name = "sample_dataset" - to_zarr(xr_ds, f"test-data/{file_name}.zarr", mode="w") + file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False) + to_zarr(xr_ds, file_path, mode="w") -def test_campos_3d_acceptance_to_xarray_dataset() -> None: +def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path) -> None: # noqa: ANN001 """Test building a complete dataset.""" - dataset = make_campos_3d_acceptance_dataset() + dataset = make_seismic_poststack_3d_acceptance_dataset() xr_ds = to_xarray_dataset(dataset) - # file_name = "XYZ" - file_name = f"{xr_ds.attrs['name']}" - to_zarr(xr_ds, f"test-data/{file_name}.zarr", mode="w") + file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False) + to_zarr(xr_ds, file_path, mode="w") From 7b3ba70ecc508cc28fd472390254ca47e37c1d62 Mon Sep 17 00:00:00 2001 From: Dmitriy Repin Date: Mon, 14 Jul 2025 23:12:56 +0000 Subject: [PATCH 27/27] Update _get_fill_value for StructuredType --- src/mdio/schemas/v1/dataset_serializer.py | 10 ++++++++-- tests/unit/v1/test_dataset_serializer.py | 18 +++++++++++------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/mdio/schemas/v1/dataset_serializer.py b/src/mdio/schemas/v1/dataset_serializer.py index a8c871ac..932816f2 100644 --- a/src/mdio/schemas/v1/dataset_serializer.py +++ b/src/mdio/schemas/v1/dataset_serializer.py @@ -170,8 +170,8 @@ def _get_fill_value(data_type: ScalarType | StructuredType | str) -> any: if isinstance(data_type, ScalarType): return fill_value_map.get(data_type) if isinstance(data_type, StructuredType): - return "AAAAAAAAAAAAAAAA" # BUG: this does not work!!! - if isinstance(data_type, str): + return tuple(fill_value_map.get(field.format) for field in data_type.fields) + if isinstance(data_type, str): return "" # If we do not have a fill value for this type, use None return None @@ -222,6 +222,12 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_DataArray: # noqa: PLR0912 # Create a custom chunk key encoding with "/" as separator chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict() encoding = { + # Is this a bug in Zarr? 
diff --git a/src/mdio/schemas/v1/dataset_serializer.py b/src/mdio/schemas/v1/dataset_serializer.py
index a8c871ac..932816f2 100644
--- a/src/mdio/schemas/v1/dataset_serializer.py
+++ b/src/mdio/schemas/v1/dataset_serializer.py
@@ -170,8 +170,8 @@ def _get_fill_value(data_type: ScalarType | StructuredType | str) -> any:
     if isinstance(data_type, ScalarType):
         return fill_value_map.get(data_type)
     if isinstance(data_type, StructuredType):
-        return "AAAAAAAAAAAAAAAA"  # BUG: this does not work!!!
-    if isinstance(data_type, str):
+        return tuple(fill_value_map.get(field.format) for field in data_type.fields)
+    if isinstance(data_type, str):
         return ""
     # If we do not have a fill value for this type, use None
     return None
@@ -222,6 +222,12 @@ def to_xarray_dataset(mdio_ds: Dataset) -> xr_DataArray:  # noqa: PLR0912
     # Create a custom chunk key encoding with "/" as separator
     chunk_key_encoding = V2ChunkKeyEncoding(separator="/").to_dict()
     encoding = {
+        # Is this a bug in Zarr? For datatype:
+        #   dtype([('cdp-x', '<i4'), ('cdp-y', '<i4'), ('elevation', '<f2'), ('some_scalar', '<f2')])
diff --git a/tests/unit/v1/test_dataset_serializer.py b/tests/unit/v1/test_dataset_serializer.py
--- a/tests/unit/v1/test_dataset_serializer.py
+++ b/tests/unit/v1/test_dataset_serializer.py
@@ -7,6 +7,7 @@
 import pytest
 from numpy import dtype as np_dtype
 from numpy import isnan as np_isnan
+from numpy import nan as np_nan
 
 from mdio.constants import fill_value_map
 from mdio.schemas.chunk_grid import RegularChunkGrid
@@ -300,14 +301,17 @@ def test__get_fill_value() -> None:
     for scalar_type in scalar_types:
         val = _get_fill_value(scalar_type)
         assert isinstance(val, complex)
         assert np_isnan(val.real)
-        assert np_isnan(val.imag)
-
-    # Test 2: StructuredType - should return "AAAAAAAAAAAAAAAA"
-    field = StructuredField(name="test_field", format=ScalarType.FLOAT32)
-    structured_type = StructuredType(fields=[field])
+        assert np_isnan(val.imag)
+
+    # Test 2: StructuredType
+    f1 = StructuredField(name="cdp-x", format=ScalarType.INT32)
+    f2 = StructuredField(name="cdp-y", format=ScalarType.INT32)
+    f3 = StructuredField(name="elevation", format=ScalarType.FLOAT16)
+    f4 = StructuredField(name="some_scalar", format=ScalarType.FLOAT16)
+    structured_type = StructuredType(fields=[f1, f2, f3, f4])
     result_structured = _get_fill_value(structured_type)
-    assert result_structured == "AAAAAAAAAAAAAAAA"
+    assert result_structured == (2147483647, 2147483647, np_nan, np_nan)
 
     # Test 3: String type - should return empty string
     result_string = _get_fill_value("string_type")
@@ -378,5 +382,5 @@ def test_seismic_poststack_3d_acceptance_to_xarray_dataset(tmp_path) -> None:  #
 
     xr_ds = to_xarray_dataset(dataset)
 
-    file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=False)
+    file_path = output_path(tmp_path, f"{xr_ds.attrs['name']}", debugging=True)
     to_zarr(xr_ds, file_path, mode="w")
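
For reference, a minimal numcodecs sketch (not part of the patch series; only
the public numcodecs Blosc API is assumed) of the codec that
test__convert_compressor above expects _convert_compressor to produce from the
MDIO Blosc model with algorithm=LZ4, level=5, shuffle=AUTOSHUFFLE,
blocksize=1024:

    from numcodecs import Blosc

    codec = Blosc(cname="lz4", clevel=5, shuffle=Blosc.AUTOSHUFFLE, blocksize=1024)
    assert codec.cname == "lz4"
    assert codec.shuffle == -1  # numcodecs encodes AUTOSHUFFLE as -1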