diff --git a/.gitignore b/.gitignore
index bfdc38f2..e77a2883 100644
--- a/.gitignore
+++ b/.gitignore
@@ -153,3 +153,6 @@ mdio1/*
 pytest-of-*
 tmp
 debugging/*
+
+# Docs
+docs/tutorials/output/*
diff --git a/docs/tutorials/builder.ipynb b/docs/tutorials/builder.ipynb
new file mode 100644
index 00000000..30b397cf
--- /dev/null
+++ b/docs/tutorials/builder.ipynb
@@ -0,0 +1,816 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9fd6d920",
+   "metadata": {},
+   "source": [
+    "# Constructing a v1 Dataset with the MDIODatasetBuilder\n",
+    "\n",
+    "In this notebook, we demonstrate how to use the `MDIODatasetBuilder` to build and write a post-stack depth-migrated (PSDM) seismic dataset using the MDIO v1 schema."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1240095a",
+   "metadata": {},
+   "source": [
+    "## Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "1c00c220",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Auxiliary import for formatting and pretty printing\n",
+    "from rich import print as rprint\n",
+    "\n",
+    "from mdio.core.v1.builder import MDIODatasetBuilder\n",
+    "from mdio.schemas.compressors import Blosc\n",
+    "from mdio.schemas.dtype import ScalarType"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a9432bdc",
+   "metadata": {},
+   "source": [
+    "## 1. Create the Builder\n",
+    "First, instantiate the builder with a name and optional global attributes. The builder provides a chainable interface for constructing bespoke Dataset contracts that may not exist in the factory.\n",
+    "\n",
+    "Attributes are free-form and intended to describe the overall dataset, data provenance, processing steps, or any other information that would enrich the Dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "35505bee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize builder for PSDM stack\n",
+    "builder = MDIODatasetBuilder(\n",
+    "    name=\"psdm_stack_example\", attributes={\"description\": \"Example PSDM stack\"}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1befa778",
+   "metadata": {},
+   "source": [
+    "## 2. Add Dimensions\n",
+    "\n",
+    "The Dimensions represent the core grid of the Dataset.\n",
+    "\n",
+    "They are one-dimensional tick labels that may be populated with values, enabling both value-based and index-based access to the Dataset, or left unpopulated, allowing index-based access only.\n",
+    "\n",
+    "Fully populating the dimensions is generally recommended, but doing so is beyond the scope of this example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd9df8ca",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Add core dimensions: inline, crossline, depth\n",
+    "builder.add_dimension(\"inline\", 256, long_name=\"Inline Number\").add_dimension(\n",
+    "    \"crossline\", 512, long_name=\"Crossline Number\"\n",
+    ").add_dimension(\"depth\", 384, long_name=\"Depth Sample\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4ac0a62e",
+   "metadata": {},
+   "source": [
+    "## 3. Add CDP Coordinates (UTM Easting/Northing)\n",
+    "\n",
+    "Coordinates are N-dimensional arrays that enrich the dataset by providing auxiliary coordinate systems.\n",
+    "\n",
+    "In this example, our Dataset contract specifies that inline and crossline indices can be translated into real-world coordinate values in Map Grid of Australia [Zone 51](https://epsg.io/28351)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7d2da0c3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# CDP X and Y on inline-crossline grid\n",
+    "builder.add_coordinate(\n",
+    "    name=\"cdp_x\",\n",
+    "    dimensions=[\"inline\", \"crossline\"],\n",
+    "    long_name=\"CDP X (UTM Easting)\",\n",
+    "    data_type=ScalarType.FLOAT64,\n",
+    "    metadata={\"unitsV1\": {\"length\": \"m\"}, \"attributes\": {\"MGA\": 51}},\n",
+    ").add_coordinate(\n",
+    "    name=\"cdp_y\",\n",
+    "    dimensions=[\"inline\", \"crossline\"],\n",
+    "    long_name=\"CDP Y (UTM Northing)\",\n",
+    "    data_type=ScalarType.FLOAT64,\n",
+    "    metadata={\"unitsV1\": {\"length\": \"m\"}, \"attributes\": {\"MGA\": 51}},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "45954756",
+   "metadata": {},
+   "source": [
+    "## 4. Add Post-Stack Amplitude Volume Variable"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b4c8aa7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "builder.add_variable(\n",
+    "    name=\"stack_amplitude\",\n",
+    "    dimensions=[\"inline\", \"crossline\", \"depth\"],\n",
+    "    data_type=ScalarType.FLOAT32,\n",
+    "    compressor=Blosc(algorithm=\"zstd\", level=3),\n",
+    "    coordinates=[\"inline\", \"crossline\", \"cdp_x\", \"cdp_y\"],\n",
+    "    metadata={\"chunkGrid\": {\"name\": \"regular\", \"configuration\": {\"chunkShape\": [64, 64, 64]}}},\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0ed7500f",
+   "metadata": {},
+   "source": [
+    "## 5. Build and Write"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "d7df200f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.MDIODataset> Size: 203MB\n",
+       "Dimensions:          (inline: 256, crossline: 512, depth: 384)\n",
+       "Coordinates:\n",
+       "  * inline           (inline) int32 1kB 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0\n",
+       "  * crossline        (crossline) int32 2kB 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0\n",
+       "  * depth            (depth) int32 2kB 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0\n",
+       "Data variables:\n",
+       "    stack_amplitude  (inline, crossline, depth) float32 201MB 0.0 0.0 ... 0.0\n",
+       "    cdp_x            (inline, crossline) float64 1MB 0.0 0.0 0.0 ... 0.0 0.0 0.0\n",
+       "    cdp_y            (inline, crossline) float64 1MB 0.0 0.0 0.0 ... 0.0 0.0 0.0\n",
+       "Attributes:\n",
+       "    apiVersion:  1.0.0\n",
+       "    createdOn:   2025-06-02 13:40:42.724997+00:00\n",
+       "    name:        psdm_stack_example\n",
+       "    attributes:  {'description': 'Example PSDM stack'}
" + ], + "text/plain": [ + " Size: 203MB\n", + "Dimensions: (inline: 256, crossline: 512, depth: 384)\n", + "Coordinates:\n", + " * inline (inline) int32 1kB 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0\n", + " * crossline (crossline) int32 2kB 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0\n", + " * depth (depth) int32 2kB 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0\n", + "Data variables:\n", + " stack_amplitude (inline, crossline, depth) float32 201MB 0.0 0.0 ... 0.0\n", + " cdp_x (inline, crossline) float64 1MB 0.0 0.0 0.0 ... 0.0 0.0 0.0\n", + " cdp_y (inline, crossline) float64 1MB 0.0 0.0 0.0 ... 0.0 0.0 0.0\n", + "Attributes:\n", + " apiVersion: 1.0.0\n", + " createdOn: 2025-06-02 13:40:42.724997+00:00\n", + " name: psdm_stack_example\n", + " attributes: {'description': 'Example PSDM stack'}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Write only metadata to .mdio store and build the interactable Dataset object\n", + "ds = builder.to_mdio(store=\"output/psdm_stack_example.mdio\")\n", + "\n", + "# Display the interactable Dataset\n", + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "9efbeb0b", + "metadata": {}, + "source": [ + "# Build and view the Dataset contract" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "bbcca480", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
{\n",
+       "    'metadata': {\n",
+       "        'name': 'psdm_stack_example',\n",
+       "        'apiVersion': '1.0.0',\n",
+       "        'createdOn': '2025-06-02T13:40:42.724997Z',\n",
+       "        'attributes': {'description': 'Example PSDM stack'}\n",
+       "    },\n",
+       "    'variables': [\n",
+       "        {\n",
+       "            'dataType': 'int32',\n",
+       "            'dimensions': [{'name': 'inline', 'size': 256}],\n",
+       "            'name': 'inline',\n",
+       "            'longName': 'Inline Number'\n",
+       "        },\n",
+       "        {\n",
+       "            'dataType': 'int32',\n",
+       "            'dimensions': [{'name': 'crossline', 'size': 512}],\n",
+       "            'name': 'crossline',\n",
+       "            'longName': 'Crossline Number'\n",
+       "        },\n",
+       "        {\n",
+       "            'dataType': 'int32',\n",
+       "            'dimensions': [{'name': 'depth', 'size': 384}],\n",
+       "            'name': 'depth',\n",
+       "            'longName': 'Depth Sample'\n",
+       "        },\n",
+       "        {\n",
+       "            'dataType': 'float32',\n",
+       "            'dimensions': [\n",
+       "                {'name': 'inline', 'size': 256},\n",
+       "                {'name': 'crossline', 'size': 512},\n",
+       "                {'name': 'depth', 'size': 384}\n",
+       "            ],\n",
+       "            'compressor': {'name': 'blosc', 'algorithm': 'zstd', 'level': 3, 'shuffle': 1, 'blocksize': 0},\n",
+       "            'name': 'stack_amplitude',\n",
+       "            'coordinates': ['inline', 'crossline', 'cdp_x', 'cdp_y'],\n",
+       "            'metadata': {'chunkGrid': {'name': 'regular', 'configuration': {'chunkShape': [64, 64, 64]}}}\n",
+       "        },\n",
+       "        {\n",
+       "            'dataType': 'float64',\n",
+       "            'dimensions': [{'name': 'inline', 'size': 256}, {'name': 'crossline', 'size': 512}],\n",
+       "            'name': 'cdp_x',\n",
+       "            'longName': 'CDP X (UTM Easting)',\n",
+       "            'metadata': {'unitsV1': {'length': 'm'}, 'attributes': {'attributes': {'MGA': 51}}}\n",
+       "        },\n",
+       "        {\n",
+       "            'dataType': 'float64',\n",
+       "            'dimensions': [{'name': 'inline', 'size': 256}, {'name': 'crossline', 'size': 512}],\n",
+       "            'name': 'cdp_y',\n",
+       "            'longName': 'CDP Y (UTM Northing)',\n",
+       "            'metadata': {'unitsV1': {'length': 'm'}, 'attributes': {'attributes': {'MGA': 51}}}\n",
+       "        }\n",
+       "    ]\n",
+       "}\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m{\u001b[0m\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n", + " \u001b[32m'name'\u001b[0m: \u001b[32m'psdm_stack_example'\u001b[0m,\n", + " \u001b[32m'apiVersion'\u001b[0m: \u001b[32m'1.0.0'\u001b[0m,\n", + " \u001b[32m'createdOn'\u001b[0m: \u001b[32m'2025-06-02T13:40:42.724997Z'\u001b[0m,\n", + " \u001b[32m'attributes'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'description'\u001b[0m: \u001b[32m'Example PSDM stack'\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[32m'variables'\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'dataType'\u001b[0m: \u001b[32m'int32'\u001b[0m,\n", + " \u001b[32m'dimensions'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'inline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m256\u001b[0m\u001b[1m}\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[32m'name'\u001b[0m: \u001b[32m'inline'\u001b[0m,\n", + " \u001b[32m'longName'\u001b[0m: \u001b[32m'Inline Number'\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'dataType'\u001b[0m: \u001b[32m'int32'\u001b[0m,\n", + " \u001b[32m'dimensions'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'crossline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m512\u001b[0m\u001b[1m}\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[32m'name'\u001b[0m: \u001b[32m'crossline'\u001b[0m,\n", + " \u001b[32m'longName'\u001b[0m: \u001b[32m'Crossline Number'\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'dataType'\u001b[0m: \u001b[32m'int32'\u001b[0m,\n", + " \u001b[32m'dimensions'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'depth'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m384\u001b[0m\u001b[1m}\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[32m'name'\u001b[0m: \u001b[32m'depth'\u001b[0m,\n", + " \u001b[32m'longName'\u001b[0m: \u001b[32m'Depth Sample'\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'dataType'\u001b[0m: \u001b[32m'float32'\u001b[0m,\n", + " \u001b[32m'dimensions'\u001b[0m: \u001b[1m[\u001b[0m\n", + " \u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'inline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m256\u001b[0m\u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'crossline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m512\u001b[0m\u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'depth'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m384\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m,\n", + " \u001b[32m'compressor'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'blosc'\u001b[0m, \u001b[32m'algorithm'\u001b[0m: \u001b[32m'zstd'\u001b[0m, \u001b[32m'level'\u001b[0m: \u001b[1;36m3\u001b[0m, \u001b[32m'shuffle'\u001b[0m: \u001b[1;36m1\u001b[0m, \u001b[32m'blocksize'\u001b[0m: \u001b[1;36m0\u001b[0m\u001b[1m}\u001b[0m,\n", + " \u001b[32m'name'\u001b[0m: \u001b[32m'stack_amplitude'\u001b[0m,\n", + " \u001b[32m'coordinates'\u001b[0m: \u001b[1m[\u001b[0m\u001b[32m'inline'\u001b[0m, \u001b[32m'crossline'\u001b[0m, \u001b[32m'cdp_x'\u001b[0m, \u001b[32m'cdp_y'\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'chunkGrid'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'regular'\u001b[0m, \u001b[32m'configuration'\u001b[0m: 
\u001b[1m{\u001b[0m\u001b[32m'chunkShape'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1;36m64\u001b[0m, \u001b[1;36m64\u001b[0m, \u001b[1;36m64\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'dataType'\u001b[0m: \u001b[32m'float64'\u001b[0m,\n", + " \u001b[32m'dimensions'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'inline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m256\u001b[0m\u001b[1m}\u001b[0m, \u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'crossline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m512\u001b[0m\u001b[1m}\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[32m'name'\u001b[0m: \u001b[32m'cdp_x'\u001b[0m,\n", + " \u001b[32m'longName'\u001b[0m: \u001b[32m'CDP X \u001b[0m\u001b[32m(\u001b[0m\u001b[32mUTM Easting\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'unitsV1'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'length'\u001b[0m: \u001b[32m'm'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'attributes'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'attributes'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'MGA'\u001b[0m: \u001b[1;36m51\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m,\n", + " \u001b[1m{\u001b[0m\n", + " \u001b[32m'dataType'\u001b[0m: \u001b[32m'float64'\u001b[0m,\n", + " \u001b[32m'dimensions'\u001b[0m: \u001b[1m[\u001b[0m\u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'inline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m256\u001b[0m\u001b[1m}\u001b[0m, \u001b[1m{\u001b[0m\u001b[32m'name'\u001b[0m: \u001b[32m'crossline'\u001b[0m, \u001b[32m'size'\u001b[0m: \u001b[1;36m512\u001b[0m\u001b[1m}\u001b[0m\u001b[1m]\u001b[0m,\n", + " \u001b[32m'name'\u001b[0m: \u001b[32m'cdp_y'\u001b[0m,\n", + " \u001b[32m'longName'\u001b[0m: \u001b[32m'CDP Y \u001b[0m\u001b[32m(\u001b[0m\u001b[32mUTM Northing\u001b[0m\u001b[32m)\u001b[0m\u001b[32m'\u001b[0m,\n", + " \u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'unitsV1'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'length'\u001b[0m: \u001b[32m'm'\u001b[0m\u001b[1m}\u001b[0m, \u001b[32m'attributes'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'attributes'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'MGA'\u001b[0m: \u001b[1;36m51\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m\n", + " \u001b[1m}\u001b[0m\n", + " \u001b[1m]\u001b[0m\n", + "\u001b[1m}\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Build our Dataset model from the builder\n", + "dataset = builder.build()\n", + "\n", + "# Serialize the Dataset model to JSON\n", + "contract = json.loads(dataset.json())\n", + "\n", + "# Reorder the contract so that metadata is displayed first\n", + "ordered_contract = {\n", + " \"metadata\": contract[\"metadata\"],\n", + " \"variables\": contract[\"variables\"],\n", + "}\n", + "\n", + "rprint(ordered_contract)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "jupyter", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/tutorials/builder.md b/docs/tutorials/builder.md new file mode 100644 
index 00000000..9fddf377
--- /dev/null
+++ b/docs/tutorials/builder.md
@@ -0,0 +1,146 @@
+# Constructing a v1 Dataset with the MDIODatasetBuilder
+
+In this tutorial, we'll walk through how to use the `MDIODatasetBuilder` class to programmatically construct an MDIO v1 dataset. The builder enforces a specific build order to ensure a valid dataset:
+
+1. Add dimensions via `add_dimension()`
+2. (Optional) Add coordinates via `add_coordinate()`
+3. Add variables via `add_variable()`
+4. Call `build()` to finalize the dataset
+
+## Importing the Builder
+
+```python
+from mdio.core.v1.builder import MDIODatasetBuilder, write_mdio_metadata
+from mdio.schemas.dtype import ScalarType, StructuredType
+from mdio.schemas.compressors import Blosc, ZFP
+```
+
+## Creating the Builder
+
+First, create a builder instance with a name and optional global attributes:
+
+```python
+builder = MDIODatasetBuilder(
+    name="example_dataset",
+    attributes={
+        "description": "An example MDIO v1 dataset",
+        "creator": "Your Name",
+    },
+)
+```
+
+## Adding Dimensions
+
+Dimensions define the axes of your dataset. You must add at least one dimension before adding coordinates or variables:
+
+```python
+builder = (
+    builder
+    .add_dimension(name="inline", size=256, long_name="Inline Number")
+    .add_dimension(name="crossline", size=512, long_name="Crossline Number")
+    .add_dimension(name="depth", size=384, long_name="Depth Sample")
+)
+```
+
+## Adding Coordinates (Optional)
+
+Coordinates map grid indices to real-world positions (e.g., UTM coordinates on the inline–crossline plane):
+
+```python
+builder = (
+    builder
+    .add_coordinate(
+        name="cdp_x",
+        dimensions=["inline", "crossline"],
+        long_name="CDP X (UTM Easting)",
+        data_type=ScalarType.FLOAT64,
+        metadata={"unitsV1": {"length": "m"}},
+    )
+    .add_coordinate(
+        name="cdp_y",
+        dimensions=["inline", "crossline"],
+        long_name="CDP Y (UTM Northing)",
+        data_type=ScalarType.FLOAT64,
+        metadata={"unitsV1": {"length": "m"}},
+    )
+)
+```
+
+If you omit `name`, the builder auto-generates names like `coord_0`. If you omit `dimensions`, it uses all defined dimensions.
+
+## Adding Variables
+
+Add one or more seismic data variables (e.g., post-stack amplitude volumes). Variables can have compressors, statistics, and more:
+
+```python
+builder = builder.add_variable(
+    name="stack_amplitude",
+    dimensions=["inline", "crossline", "depth"],
+    data_type=ScalarType.FLOAT32,
+    compressor=Blosc(algorithm="zstd", level=3),
+    coordinates=["inline", "crossline", "cdp_x", "cdp_y"],
+    metadata={
+        "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [64, 64, 64]}}
+    },
+)
+```
+
+For structured dtypes, use `StructuredType`. Note that the dimensions must already exist on the builder, so this example reuses `inline` and `crossline`:
+
+```python
+from mdio.schemas.dtype import StructuredType, ScalarType
+
+structured_dtype = StructuredType(
+    fields=[
+        {"name": "flag", "format": ScalarType.INT8},
+        {"name": "value", "format": ScalarType.FLOAT32},
+    ]
+)
+
+builder = builder.add_variable(
+    name="metadata",
+    dimensions=["inline", "crossline"],
+    data_type=structured_dtype,
+)
+```
+
+## Building the Dataset
+
+After adding all components, call:
+
+```python
+dataset = builder.build()
+```
+
+This returns a `Dataset` object conforming to the MDIO v1 schema.
+
+## Writing Metadata and Data
+
+The `.build()` method returns an in-memory Pydantic `Dataset` model (MDIO v1 schema).
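+
+Before serializing anything, you can sanity-check the contract the builder produced. The snippet below is a minimal sketch, assuming the `dataset` built above; it round-trips the Pydantic model through JSON the same way the companion notebook does:
+
+```python
+import json
+
+# Serialize the model to its JSON contract and inspect a few fields
+contract = json.loads(dataset.json())
+print(contract["metadata"]["name"])  # "example_dataset"
+print(len(contract["variables"]))    # dimension, coordinate, and data variables
+```
+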
To serialize this model to disk, use the following approaches: + +- **Metadata only** (no array values written): + + ```python + # Write metadata structure only (no data arrays) + mds = write_mdio_metadata( + dataset, + store="path/to/output.mdio" + ) + ``` + + This writes only the metadata to the `.mdio` store and returns an `mdio.Dataset` (an xarray.Dataset subclass) with placeholder arrays. + +- **Write actual data** (array values): + + After writing metadata, call `to_mdio()` on the returned `mdio.Dataset` with `compute=True` to write the actual data arrays: + + ```python + # Write data arrays into the existing store + mds.to_mdio( + store="path/to/output.mdio", + mode="a", + compute=True, + ) + ``` + + Alternatively, skip `write_mdio_metadata()` and write both metadata and data in one call by invoking `to_mdio()` directly on the `mdio.Dataset` produced by `_construct_mdio_dataset`, if you have it available. diff --git a/noxfile.py b/noxfile.py index 281ec525..421cf931 100644 --- a/noxfile.py +++ b/noxfile.py @@ -200,6 +200,7 @@ def tests(session: Session) -> None: "pygments", "pytest-dependency", "s3fs", + "zfpy", # TODO(BrianMichell): #0 Ensure this is pulling from the pyproject.toml ], ) diff --git a/pyproject.toml b/pyproject.toml index 7e01894e..d5605839 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,8 +34,8 @@ dependencies = [ "rich (>=13.9.4,<14.0.0)", "segy (>=0.4.0,<0.5.0)", "tqdm (>=4.67.0,<5.0.0)", - "xarray>=2025.3.1", "zarr (>=3.0.8,<4.0.0)", + "xarray (>=2025.4.0)", ] [project.optional-dependencies] diff --git a/src/mdio/core/v1/__init__.py b/src/mdio/core/v1/__init__.py new file mode 100644 index 00000000..d8ec5b85 --- /dev/null +++ b/src/mdio/core/v1/__init__.py @@ -0,0 +1,26 @@ +"""MDIO core v1 package initialization. + +Exposes the MDIO overloads and core v1 functionality. +""" + +from ._overloads import mdio +from ._serializer import make_coordinate +from ._serializer import make_dataset +from ._serializer import make_dataset_metadata +from ._serializer import make_named_dimension +from ._serializer import make_variable +from .builder import MDIODatasetBuilder +from .factory import SCHEMA_TEMPLATE_MAP +from .factory import MDIOSchemaType + +__all__ = [ + "MDIODatasetBuilder", + "make_coordinate", + "make_dataset", + "make_dataset_metadata", + "make_named_dimension", + "make_variable", + "mdio", + "MDIOSchemaType", + "SCHEMA_TEMPLATE_MAP", +] diff --git a/src/mdio/core/v1/_overloads.py b/src/mdio/core/v1/_overloads.py new file mode 100644 index 00000000..a07353b1 --- /dev/null +++ b/src/mdio/core/v1/_overloads.py @@ -0,0 +1,93 @@ +"""Overloads for xarray. + +The intent of overloading here is: +1. To provide a consistent mdio.* naming scheme. 
+""" + +from collections.abc import Mapping + +import xarray as xr +from xarray import DataArray as _DataArray +from xarray import Dataset as _Dataset + + +class MDIODataset(_Dataset): + """xarray.Dataset subclass with MDIO v1 extensions.""" + + __slots__ = () + + def to_mdio( + self, + store: str | None = None, + *args: str | int | float | bool, + **kwargs: Mapping[str, str | int | float | bool], + ) -> None: + """Alias for `.to_zarr()`.""" + # Ensure zarr_format=2 by default unless explicitly overridden + zarr_format = kwargs.get("zarr_format", 2) + if zarr_format != 2: # noqa: PLR2004 + msg = "MDIO only supports zarr_format=2" + raise ValueError(msg) + kwargs["zarr_format"] = zarr_format + return super().to_zarr(*args, store=store, **kwargs) + + +class MDIODataArray(_DataArray): + """xarray.DataArray subclass with MDIO v1 extensions.""" + + __slots__ = () + + def to_mdio( + self, + store: str | None = None, + *args: str | int | float | bool, + **kwargs: Mapping[str, str | int | float | bool], + ) -> None: + """Alias for `.to_zarr()`, and writes to Zarr store.""" + # Ensure zarr_format=2 by default unless explicitly overridden + zarr_format = kwargs.get("zarr_format", 2) + if zarr_format != 2: # noqa: PLR2004 + msg = "MDIO only supports zarr_format=2" + raise ValueError(msg) + kwargs["zarr_format"] = zarr_format + return super().to_zarr(*args, store=store, **kwargs) + + +class MDIO: + """MDIO namespace for overloaded types and functions.""" + + Dataset = MDIODataset + DataArray = MDIODataArray + + @staticmethod + def open( + store: str, + *args: str | int | float | bool, + engine: str = "zarr", + consolidated: bool = False, + **kwargs: Mapping[str, str | int | float | bool], + ) -> MDIODataset: + """Open a Zarr store as an MDIODataset. + + Casts the returned xarray.Dataset (and its variables) to the MDIO subclasses. + """ + ds = xr.open_dataset( + store, + *args, + engine=engine, + consolidated=consolidated, + **kwargs, + ) + # Cast Dataset to MDIODataset + ds.__class__ = MDIODataset + # Cast each DataArray in data_vars and coords + + for _name, var in ds.data_vars.items(): # noqa: PERF102 .values() failed tests + var.__class__ = MDIODataArray + for _name, coord in ds.coords.items(): # noqa: PERF102 .values() failed tests + coord.__class__ = MDIODataArray + return ds + + +# Create module-level MDIO namespace +mdio = MDIO() diff --git a/src/mdio/core/v1/_serializer.py b/src/mdio/core/v1/_serializer.py new file mode 100644 index 00000000..d7c6656d --- /dev/null +++ b/src/mdio/core/v1/_serializer.py @@ -0,0 +1,293 @@ +"""Internal serialization module for MDIO v1 datasets. + +This module contains internal implementation details for serializing MDIO schema models +to Zarr storage. This API is not considered stable and may change without notice. 
+""" + +from datetime import datetime +from typing import Any + +import numpy as np +from numcodecs import Blosc as NumcodecsBlosc + +from mdio.core.v1._overloads import mdio +from mdio.schemas.chunk_grid import * # noqa: F403 +from mdio.schemas.compressors import ZFP +from mdio.schemas.compressors import Blosc +from mdio.schemas.dimension import NamedDimension +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredType +from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset import Dataset as MDIODataset +from mdio.schemas.v1.dataset import DatasetMetadata +from mdio.schemas.v1.stats import * # noqa: F403 +from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.variable import Coordinate +from mdio.schemas.v1.variable import Variable +from mdio.schemas.v1.variable import VariableMetadata + +try: + import zfpy as zfpy_base # Base library + from numcodecs import ZFPY # Codec +except ImportError: + zfpy_base = None + ZFPY = None + + +def make_named_dimension(name: str, size: int) -> NamedDimension: + """Create a NamedDimension with the given name and size.""" + return NamedDimension(name=name, size=size) + + +def make_coordinate( + name: str, + dimensions: list[NamedDimension | str], + data_type: ScalarType | StructuredType, + long_name: str = None, + metadata: list[AllUnits | UserAttributes] | dict[str, Any] | None = None, +) -> Coordinate: + """Create a Coordinate with the given name, dimensions, data_type, and metadata.""" + # Build metadata list of AllUnits or UserAttributes to satisfy Coordinate.schema + coord_meta_list: list[AllUnits | UserAttributes] | None = None + if metadata is not None: + items: list[AllUnits | UserAttributes] = [] + # single dict input + if isinstance(metadata, dict): + if "unitsV1" in metadata: + items.append(AllUnits(unitsV1=metadata["unitsV1"])) + if "attributes" in metadata: + items.append(UserAttributes(attributes=metadata["attributes"])) + # list input may contain dict or model instances + elif isinstance(metadata, list): + for md in metadata: + if isinstance(md, AllUnits) or isinstance(md, UserAttributes): # noqa: SIM101 + items.append(md) + elif isinstance(md, dict): + if "unitsV1" in md: + items.append(AllUnits(unitsV1=md["unitsV1"])) + if "attributes" in md: + items.append(UserAttributes(attributes=md["attributes"])) + else: + msg = f"Unsupported metadata element type for coordinate: {type(md)}" + raise TypeError(msg) + else: + msg = f"Unsupported metadata type for coordinate: {type(metadata)}" + raise TypeError(msg) + coord_meta_list = items or None + return Coordinate( + name=name, + longName=long_name, + dimensions=dimensions, + dataType=data_type, + metadata=coord_meta_list, + ) + + +def make_variable( # noqa: PLR0913 PLR0912 + name: str, + dimensions: list[NamedDimension | str], + data_type: ScalarType | StructuredType, + long_name: str = None, + compressor: Blosc | ZFP | None = None, + coordinates: list[Coordinate | str] | None = None, + metadata: list[AllUnits | UserAttributes] | dict[str, Any] | VariableMetadata | None = None, +) -> Variable: + """Create a Variable with the given parameters. + + Args: + name: Name of the variable + dimensions: List of dimensions + data_type: Data type of the variable + long_name: Optional long name + compressor: Optional compressor + coordinates: Optional list of coordinates + metadata: Optional metadata + + Returns: + Variable: A Variable instance with the specified parameters. + + Raises: + TypeError: If the metadata type is not supported. 
+ """ + + def _to_serializable(val: object) -> dict[str, Any] | object: + return val.model_dump(mode="json", by_alias=True) if hasattr(val, "model_dump") else val + + var_metadata = None + if metadata: + if isinstance(metadata, list): + metadata_dict = {} + for md in metadata: + if isinstance(md, AllUnits): + val = md.units_v1 + if isinstance(val, list) and len(val) == 1: + val = val[0] + metadata_dict["unitsV1"] = val + elif isinstance(md, UserAttributes): + attrs = _to_serializable(md) + metadata_dict["attributes"] = ( + attrs[0] if isinstance(attrs, list) and len(attrs) == 1 else attrs + ) + var_metadata = VariableMetadata(**metadata_dict) + + elif isinstance(metadata, dict): + converted_dict = {} + for key, value in metadata.items(): + if key == "unitsV1": + val = value[0] if isinstance(value, list) and len(value) == 1 else value + converted_dict["unitsV1"] = _to_serializable(val) + else: + converted_dict[key] = value + var_metadata = VariableMetadata(**converted_dict) + + elif isinstance(metadata, VariableMetadata): + # Flatten any single-element list fields in metadata + md = metadata.model_dump(by_alias=True, exclude_none=True) + for key, value in list(md.items()): + if isinstance(value, list) and len(value) == 1: + md[key] = value[0] + var_metadata = VariableMetadata(**md) + + else: + msg = f"Unsupported metadata type: {type(metadata)}" + raise TypeError(msg) + + return Variable( + name=name, + longName=long_name, + dimensions=dimensions, + dataType=data_type, + compressor=compressor, + coordinates=coordinates, + metadata=var_metadata, + ) + + +def make_dataset_metadata( + name: str, + api_version: str, + created_on: datetime, + attributes: dict[str, Any] | None = None, +) -> DatasetMetadata: + """Create a DatasetMetadata with name, api_version, created_on, and optional attributes.""" + dataset_metadata_dict = { + "name": name, + "apiVersion": api_version, + "createdOn": created_on, + "attributes": attributes, + } + return DatasetMetadata(**dataset_metadata_dict) + + +def make_dataset( + variables: list[Variable], + metadata: DatasetMetadata, +) -> MDIODataset: + """Create a Dataset with the given variables and metadata.""" + return MDIODataset( + variables=variables, + metadata=metadata, + ) + + +def _convert_compressor( + model: Blosc | ZFP | None, +) -> NumcodecsBlosc | ZFPY | None: + if isinstance(model, Blosc): + return NumcodecsBlosc( + cname=model.algorithm.value, + clevel=model.level, + shuffle=model.shuffle.value, + blocksize=model.blocksize if model.blocksize > 0 else 0, + ) + if isinstance(model, ZFP): + if zfpy_base is None or ZFPY is None: + msg = "zfpy and numcodecs are required to use ZFP compression" + raise ImportError(msg) + return ZFPY( + mode=model.mode.value, + tolerance=model.tolerance, + rate=model.rate, + precision=model.precision, + ) + if model is None: + return None + msg = f"Unsupported compressor model: {type(model)}" + raise TypeError(msg) + + +def _construct_mdio_dataset(mdio_ds: MDIODataset) -> mdio.Dataset: # noqa: PLR0912 + """Build an MDIO dataset with correct dimensions and dtypes. + + This internal function constructs the underlying data structure for an MDIO dataset, + handling dimension mapping, data types, and metadata organization. + + Args: + mdio_ds: The source MDIO dataset to construct from. + + Returns: + The constructed dataset with proper MDIO structure and metadata. + + Raises: + TypeError: If an unsupported data type is encountered. 
+ """ + # Collect dimension sizes + dims: dict[str, int] = {} + for var in mdio_ds.variables: + for d in var.dimensions: + if isinstance(d, NamedDimension): + dims[d.name] = d.size + + # Build data variables + data_vars: dict[str, mdio.DataArray] = {} + for var in mdio_ds.variables: + dim_names = [d.name if isinstance(d, NamedDimension) else d for d in var.dimensions] + shape = tuple(dims[name] for name in dim_names) + dt = var.data_type + if isinstance(dt, ScalarType): + dtype = np.dtype(dt.value) + elif isinstance(dt, StructuredType): + dtype = np.dtype([(f.name, f.format.value) for f in dt.fields]) + else: + msg = f"Unsupported data_type: {dt}" + raise TypeError(msg) + arr = np.zeros(shape, dtype=dtype) + data_array = mdio.DataArray(arr, dims=dim_names) + data_array.encoding["fill_value"] = 0.0 + + # Set long_name if present + if var.long_name is not None: + data_array.attrs["long_name"] = var.long_name + + # Set coordinates if present, excluding dimension names + if var.coordinates is not None: + dim_set = set(dim_names) + coord_names = [ + c.name if isinstance(c, Coordinate) else c + for c in var.coordinates + if (c.name if isinstance(c, Coordinate) else c) not in dim_set + ] + if coord_names: + data_array.attrs["coordinates"] = " ".join(coord_names) + + # Attach variable metadata into DataArray attributes + if var.metadata is not None: + md = var.metadata.model_dump( + by_alias=True, + exclude_none=True, + exclude={"chunk_grid"}, + ) + for key, value in md.items(): + if isinstance(value, list) and len(value) == 1: + md[key] = value[0] + data_array.attrs.update(md) + data_vars[var.name] = data_array + + ds = mdio.Dataset(data_vars) + # Attach dataset metadata + ds.attrs["apiVersion"] = mdio_ds.metadata.api_version + ds.attrs["createdOn"] = str(mdio_ds.metadata.created_on) + ds.attrs["name"] = mdio_ds.metadata.name + if mdio_ds.metadata.attributes: + ds.attrs["attributes"] = mdio_ds.metadata.attributes + return ds diff --git a/src/mdio/core/v1/builder.py b/src/mdio/core/v1/builder.py new file mode 100644 index 00000000..8ffef60f --- /dev/null +++ b/src/mdio/core/v1/builder.py @@ -0,0 +1,299 @@ +"""Builder pattern implementation for MDIO v1 schema models.""" + +from collections.abc import Mapping +from datetime import UTC +from datetime import datetime +from enum import Enum +from enum import auto +from typing import Any + +from zarr.core.chunk_key_encodings import V2ChunkKeyEncoding # noqa: F401 + +from mdio.core.v1._overloads import mdio +from mdio.schemas.compressors import ZFP +from mdio.schemas.compressors import Blosc +from mdio.schemas.dimension import NamedDimension +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredType +from mdio.schemas.metadata import UserAttributes +from mdio.schemas.v1.dataset import Dataset +from mdio.schemas.v1.units import AllUnits +from mdio.schemas.v1.variable import Coordinate +from mdio.schemas.v1.variable import Variable +from mdio.schemas.v1.variable import VariableMetadata + +# Import factory functions from serializer module +from ._serializer import _construct_mdio_dataset +from ._serializer import _convert_compressor +from ._serializer import make_coordinate +from ._serializer import make_dataset +from ._serializer import make_dataset_metadata +from ._serializer import make_named_dimension +from ._serializer import make_variable + + +class _BuilderState(Enum): + """States for the template builder.""" + + INITIAL = auto() + HAS_DIMENSIONS = auto() + HAS_COORDINATES = auto() + HAS_VARIABLES = auto() + + +class 
MDIODatasetBuilder: + """Builder for creating MDIO datasets with enforced build order. + + This builder implements the builder pattern to create MDIO datasets with a v1 schema. + It enforces a specific build order to ensure valid dataset construction: + 1. Must add dimensions first via add_dimension() + 2. Can optionally add coordinates via add_coordinate() + 3. Must add variables via add_variable() + 4. Must call build() to create the dataset. + """ + + def __init__(self, name: str, attributes: dict[str, Any] | None = None): + self.name = name + self.api_version = "1.0.0" # TODO(BrianMichell, #0): Pull from package metadata + self.created_on = datetime.now(UTC) + self.attributes = attributes + self._dimensions: list[NamedDimension] = [] + self._coordinates: list[Coordinate] = [] + self._variables: list[Variable] = [] + self._state = _BuilderState.INITIAL + self._unnamed_variable_counter = 0 + + def add_dimension( # noqa: PLR0913 + self, + name: str, + size: int, + long_name: str = None, + data_type: ScalarType | StructuredType = ScalarType.INT32, + metadata: list[AllUnits | UserAttributes] | None | dict[str, Any] = None, + ) -> "MDIODatasetBuilder": + """Add a dimension. + + This must be called at least once before adding coordinates or variables. + + Args: + name: Name of the dimension + size: Size of the dimension + long_name: Optional long name for the dimension variable + data_type: Data type for the dimension variable (defaults to INT32) + metadata: Optional metadata for the dimension variable + + Returns: + self: Returns self for method chaining + """ + # Create the dimension + dimension = make_named_dimension(name, size) + self._dimensions.append(dimension) + + # Create a variable for the dimension + dim_var = make_variable( + name=name, + long_name=long_name, + dimensions=[dimension], + data_type=data_type, + metadata=metadata, + ) + self._variables.append(dim_var) + + self._state = _BuilderState.HAS_DIMENSIONS + return self + + def add_coordinate( # noqa: PLR0913 + self, + name: str = "", + *, + long_name: str = None, + dimensions: list[NamedDimension | str] | None = None, + data_type: ScalarType | StructuredType = ScalarType.FLOAT32, + metadata: list[AllUnits | UserAttributes] | None | dict[str, Any] = None, + ) -> "MDIODatasetBuilder": + """Add a coordinate after adding at least one dimension.""" + if self._state == _BuilderState.INITIAL: + msg = "Must add at least one dimension before adding coordinates" + raise ValueError(msg) + + if name == "": + name = f"coord_{len(self._coordinates)}" + if dimensions is None: + dimensions = self._dimensions + if isinstance(metadata, dict): + metadata = [metadata] + + # Convert string dimension names to NamedDimension objects + dim_objects = [] + for dim in dimensions: + if isinstance(dim, str): + dim_obj = next((d for d in self._dimensions if d.name == dim), None) + if dim_obj is None: + msg = f"Dimension {dim!r} not found" + raise ValueError(msg) + dim_objects.append(dim_obj) + else: + dim_objects.append(dim) + + self._coordinates.append( + make_coordinate( + name=name, + long_name=long_name, + dimensions=dim_objects, + data_type=data_type, + metadata=metadata, + ) + ) + self._state = _BuilderState.HAS_COORDINATES + return self + + def add_variable( # noqa: PLR0913 + self, + name: str = "", + *, + long_name: str = None, + dimensions: list[NamedDimension | str] | None = None, + data_type: ScalarType | StructuredType = ScalarType.FLOAT32, + compressor: Blosc | ZFP | None = None, + coordinates: list[Coordinate | str] | None = None, + 
metadata: VariableMetadata | None = None, + ) -> "MDIODatasetBuilder": + """Add a variable after adding at least one dimension.""" + if self._state == _BuilderState.INITIAL: + msg = "Must add at least one dimension before adding variables" + raise ValueError(msg) + + if name == "": + name = f"var_{self._unnamed_variable_counter}" + self._unnamed_variable_counter += 1 + if dimensions is None: + dimensions = self._dimensions + + # Convert string dimension names to NamedDimension objects + dim_objects = [] + for dim in dimensions: + if isinstance(dim, str): + dim_obj = next((d for d in self._dimensions if d.name == dim), None) + if dim_obj is None: + msg = f"Dimension {dim!r} not found" + raise ValueError(msg) + dim_objects.append(dim_obj) + else: + dim_objects.append(dim) + + self._variables.append( + make_variable( + name=name, + long_name=long_name, + dimensions=dim_objects, + data_type=data_type, + compressor=compressor, + coordinates=coordinates, + metadata=metadata, + ) + ) + self._state = _BuilderState.HAS_VARIABLES + return self + + def build(self) -> Dataset: + """Build the final dataset.""" + if self._state == _BuilderState.INITIAL: + msg = "Must add at least one dimension before building" + raise ValueError(msg) + + metadata = make_dataset_metadata( + self.name, self.api_version, self.created_on, self.attributes + ) + + # Add coordinates as variables to the dataset + # We make a copy so that coordinates are not duplicated if the builder is reused + all_variables = self._variables.copy() + for coord in self._coordinates: + # Convert coordinate to variable + coord_var = make_variable( + name=coord.name, + long_name=coord.long_name, + dimensions=coord.dimensions, + data_type=coord.data_type, + metadata=coord.metadata, + ) + all_variables.append(coord_var) + + return make_dataset(all_variables, metadata) + + def to_mdio( + self, + store: str, + mode: str = "w", + compute: bool = False, + **kwargs: Mapping[str, str | int | float | bool], + ) -> Dataset: + """Write the dataset to a Zarr store and return the constructed mdio.Dataset. + + This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata + to a Zarr store. The actual data is not written, only the metadata structure is created. + """ + return write_mdio_metadata(self.build(), store, mode, compute, **kwargs) + + +def write_mdio_metadata( + mdio_ds: Dataset, + store: str, + mode: str = "w", + compute: bool = False, + **kwargs: Mapping[str, str | int | float | bool], +) -> mdio.Dataset: + """Write MDIO metadata to a Zarr store and return the constructed mdio.Dataset. + + This function constructs an mdio.Dataset from the MDIO dataset and writes its metadata + to a Zarr store. The actual data is not written, only the metadata structure is created. + + Args: + mdio_ds: The MDIO dataset to serialize + store: Path to the Zarr or .mdio store + mode: Write mode to pass to to_mdio(), e.g. 'w' or 'a' + compute: Whether to compute (write) array chunks (True) or only metadata (False) + **kwargs: Additional arguments to pass to to_mdio() + + Returns: + The constructed xarray Dataset with MDIO extensions + """ + ds = _construct_mdio_dataset(mdio_ds) + + def _generate_encodings() -> dict: + """Generate encodings for each variable in the MDIO dataset. + + Returns: + Dictionary mapping variable names to their encoding configurations. 
+ """ + # TODO(Anybody, #10274): Re-enable chunk_key_encoding when supported by xarray + # dimension_separator_encoding = V2ChunkKeyEncoding(separator="/").to_dict() + global_encodings = {} + for var in mdio_ds.variables: + fill_value = 0 + if isinstance(var.data_type, StructuredType): + continue + chunks = None + if var.metadata is not None and var.metadata.chunk_grid is not None: + chunks = var.metadata.chunk_grid.configuration.chunk_shape + global_encodings[var.name] = { + "chunks": chunks, + # TODO(Anybody, #10274): Re-enable chunk_key_encoding when supported by xarray + # "chunk_key_encoding": dimension_separator_encoding, + "_FillValue": fill_value, + "dtype": var.data_type, + "compressors": _convert_compressor(var.compressor), + } + return global_encodings + + ds.to_mdio( + store, + mode=mode, + zarr_format=2, + consolidated=True, + safe_chunks=False, + compute=compute, + encoding=_generate_encodings(), + **kwargs, + ) + return ds diff --git a/src/mdio/core/v1/factory.py b/src/mdio/core/v1/factory.py new file mode 100644 index 00000000..3cc7ced7 --- /dev/null +++ b/src/mdio/core/v1/factory.py @@ -0,0 +1,259 @@ +"""MDIO factories for seismic data.""" + +# TODO(BrianMichell, #535): Add implementations for other canonical datasets. + +from __future__ import annotations + +from enum import Enum +from enum import auto +from typing import TYPE_CHECKING +from typing import Any + +from mdio.core.v1.builder import MDIODatasetBuilder +from mdio.schemas.compressors import Blosc +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredType + +if TYPE_CHECKING: + from mdio.schemas.v1.dataset import Dataset + +import json + +from pydantic import ValidationError + + +def from_contract(store: str, contract: str | dict) -> Dataset: + """Creates an MDIO Dataset from the contract and writes the metadata to the store. + + Args: + store: The store to write the metadata to. + contract: The contract to create the dataset from. + + Raises: + ValueError: If the contract cannot be validated successfully. + + Returns: + The created MDIO Dataset. 
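+
+    Example:
+        A hypothetical call, assuming ``contract`` is a dict (or JSON string) that
+        already satisfies the MDIO v1 Dataset schema::
+
+            ds = from_contract("output/example.mdio", contract)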
+ """ + from mdio.core.v1._serializer import _construct_mdio_dataset + from mdio.schemas.v1 import Dataset as V1Dataset + + if isinstance(contract, str): + contract = json.loads(contract) + + try: + V1Dataset.model_validate(contract) + except ValidationError as e: + msg = f"Failed to validate the input contract: {e}" + raise ValueError(msg) from e + + ds = _construct_mdio_dataset(contract) + return ds.to_mdio(store) + + +class MDIOSchemaType(Enum): + """MDIO templates for specific data types.""" + + SEISMIC_3D_POST_STACK_GENERIC = auto() + SEISMIC_3D_POST_STACK_TIME = auto() + SEISMIC_3D_POST_STACK_DEPTH = auto() + SEISMIC_3D_PRE_STACK_CDP_TIME = auto() + SEISMIC_3D_PRE_STACK_CDP_DEPTH = auto() + + +class Seismic3DPostStackGeneric: + """Generic 3D seismic post stack dataset.""" + + def __init__(self) -> None: + self._dim_names = ["inline", "crossline", "sample"] + self._chunks = [128, 128, 128] # 8 mb + self._coords = { + "cdp-x": ("float64", {"unitsV1": {"length": "m"}}, self._dim_names[:-1]), + "cdp-y": ("float64", {"unitsV1": {"length": "m"}}, self._dim_names[:-1]), + } + + def create( # noqa: PLR0913 + self, + name: str, + shape: list[int], + header_fields: dict[str, str], + create_coords: bool = False, + sample_format: str | None = None, + chunks: list[int] | None = None, + sample_units: dict[str, str] | None = None, + z_units: dict[str, str] | None = None, + attributes: dict[str, Any] | None = None, + ) -> Dataset: + """Create a generic seismic dataset schema. + + Args: + name: Name of the dataset + shape: Shape of the dataset + header_fields: Header fields to include as a dict of field_name: dtype + create_coords: Whether to create coordinates + sample_format: Format of the samples + chunks: Chunk sizes + sample_units: Units for samples + z_units: Units for z-axis + attributes: Additional attributes to include in the dataset metadata + + Returns: + Dataset: The created dataset + """ + chunks = chunks or self._chunks + sample_format = sample_format or "float32" + + builder = MDIODatasetBuilder( + name=name, + attributes=attributes, + ) + + # Add dimensions + for dim_name, dim_size in zip(self._dim_names, shape, strict=True): + builder.add_dimension( + name=dim_name, + size=dim_size, + data_type=ScalarType.UINT32, + metadata=z_units if dim_name == "sample" else None, + ) + + # Add coordinates if requested + if create_coords: + for coord_name, (format_, unit, coord_dims) in self._coords.items(): + builder.add_coordinate( + name=coord_name, + data_type=ScalarType(format_), + dimensions=coord_dims, + metadata=unit, + ) + + # Add seismic variable + builder.add_variable( + name="seismic", + data_type=ScalarType(sample_format), + dimensions=self._dim_names, + compressor=Blosc(name="blosc", algorithm="zstd"), + metadata=sample_units, + ) + + # Add header variable with structured dtype + header_dtype = StructuredType( + fields=[ + {"name": field_name, "format": field_type} + for field_name, field_type in header_fields.items() + ] + ) + builder.add_variable( + name="headers", + data_type=header_dtype, + dimensions=self._dim_names[:-1], + compressor=Blosc(name="blosc"), + ) + + # Add trace mask + builder.add_variable( + name="trace_mask", + data_type=ScalarType.BOOL, + dimensions=self._dim_names[:-1], + compressor=Blosc(name="blosc"), + ) + + return builder.build() + + +class Seismic3DPostStack(Seismic3DPostStackGeneric): + """3D seismic post stack dataset with domain-specific attributes.""" + + def __init__(self, domain: str) -> None: + super().__init__() + self._dim_names = ["inline", 
"crossline", domain] + + def create( # noqa: PLR0913 + self, + name: str, + shape: list[int], + header_fields: dict[str, str], + create_coords: bool = False, + sample_format: str | None = None, + chunks: list[int] | None = None, + sample_units: dict[str, str] | None = None, + z_units: dict[str, str] | None = None, + attributes: dict[str, Any] | None = None, + ) -> Dataset: + """Create a seismic dataset schema with domain-specific attributes.""" + # Add seismic-specific attributes + seismic_attrs = { + "surveyDimensionality": "3D", + "ensembleType": "line", + "processingStage": "post-stack", + } + if attributes: + seismic_attrs.update(attributes) + + return super().create( + name=name, + shape=shape, + header_fields=header_fields, + create_coords=create_coords, + sample_format=sample_format, + chunks=chunks, + sample_units=sample_units, + z_units=z_units, + attributes=seismic_attrs, + ) + + +class Seismic3DPreStack(Seismic3DPostStackGeneric): + """3D seismic pre stack dataset.""" + + def __init__(self, domain: str) -> None: + super().__init__() + self._dim_names = ["inline", "crossline", "offset", domain] + self._chunks = [1, 1, 512, 4096] # 8 mb + self._coords = { + "cdp-x": ("float64", {"length": "m"}, self._dim_names[:-2]), + "cdp-y": ("float64", {"length": "m"}, self._dim_names[:-2]), + } + + def create( # noqa: PLR0913 + self, + name: str, + shape: list[int], + header_fields: dict[str, str], + create_coords: bool = False, + sample_format: str | None = None, + chunks: list[int] | None = None, + sample_units: dict[str, str] | None = None, + z_units: dict[str, str] | None = None, + attributes: dict[str, Any] | None = None, + ) -> Dataset: + """Create a seismic dataset schema with pre-stack attributes.""" + # Add seismic-specific attributes + seismic_attrs = { + "surveyDimensionality": "3D", + "ensembleType": "cdp", + "processingStage": "pre-stack", + } + if attributes: + seismic_attrs.update(attributes) + + return super().create( + name=name, + shape=shape, + header_fields=header_fields, + create_coords=create_coords, + sample_format=sample_format, + chunks=chunks, + sample_units=sample_units, + z_units=z_units, + attributes=seismic_attrs, + ) + + +SCHEMA_TEMPLATE_MAP = { + MDIOSchemaType.SEISMIC_3D_POST_STACK_GENERIC: Seismic3DPostStackGeneric(), + MDIOSchemaType.SEISMIC_3D_POST_STACK_TIME: Seismic3DPostStack("time"), + MDIOSchemaType.SEISMIC_3D_POST_STACK_DEPTH: Seismic3DPostStack("depth"), + MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_TIME: Seismic3DPreStack("time"), + MDIOSchemaType.SEISMIC_3D_PRE_STACK_CDP_DEPTH: Seismic3DPreStack("depth"), +} diff --git a/src/mdio/schemas/builder.py b/src/mdio/schemas/builder.py deleted file mode 100644 index 40908ff0..00000000 --- a/src/mdio/schemas/builder.py +++ /dev/null @@ -1,154 +0,0 @@ -"""Schema builders.""" - -from __future__ import annotations - -from typing import Any - -from mdio.schemas import NamedDimension -from mdio.schemas.v1.dataset import Dataset -from mdio.schemas.v1.dataset import DatasetMetadata -from mdio.schemas.v1.variable import Variable -from mdio.schemas.v1.variable import VariableMetadata - - -class VariableBuilder: - """Dataset builder.""" - - def __init__(self) -> None: - self.name = None - self.long_name = None - self.dtype = None - self.chunks = None - self.dims = None - self.coords = None - self.compressor = None - self.meta_dict = None - - def set_name(self, name: str) -> VariableBuilder: - """Set variable name.""" - self.name = name - return self - - def set_long_name(self, long_name: str) -> VariableBuilder: - """Add 
long, descriptive name to the variable.""" - self.long_name = long_name - return self - - def set_compressor(self, compressor: dict[str, Any]) -> VariableBuilder: - """Add long, descriptive name to the variable.""" - self.compressor = compressor - return self - - def add_dimension(self, *dimensions: str | dict[str, int]) -> VariableBuilder: - """Add a dimension to the dataset.""" - if self.dims is None: - self.dims = [] - - if isinstance(dimensions[0], str): - dims = list(dimensions) - elif isinstance(dimensions[0], dict): - dims = [ - NamedDimension(name=name, size=size) - for dim in dimensions - for name, size in dim.items() - ] - else: - raise NotImplementedError - - self.dims.extend(dims) - return self - - def add_coordinate(self, *names: str) -> VariableBuilder: - """Add a coordinate to the variable.""" - if self.coords is None: - self.coords = [] - - self.coords.extend(names) - return self - - def set_format(self, format_: str | dict[str, str]) -> VariableBuilder: - """Set variable format.""" - if isinstance(format_, dict): - fields = [{"name": n, "format": f} for n, f in format_.items()] - format_ = {"fields": fields} - - self.dtype = format_ - return self - - def set_chunks(self, chunks: list[int]) -> VariableBuilder: - """Set variable chunks.""" - if self.meta_dict is None: - self.meta_dict = {} - - self.meta_dict["chunkGrid"] = {"configuration": {"chunkShape": chunks}} - return self - - def set_units(self, units: dict[str, str]) -> VariableBuilder: - """Set variable units.""" - if self.meta_dict is None: - self.meta_dict = {} - - self.meta_dict["unitsV1"] = units - return self - - def add_attribute(self, key: str, value: Any) -> VariableBuilder: # noqa: ANN401 - """Add a user attribute to the variable metadata.""" - if self.meta_dict is None: - self.meta_dict = {} - - self.meta_dict["attributes"] = {key: value} - return self - - def build(self) -> Variable: - """Build the dataset model.""" - if self.chunks is not None and len(self.chunks) != len(self.dims): - msg = "Variable chunks must have same number of dimensions." 
- raise ValueError(msg) - - var_kwargs = {} - - if self.meta_dict is not None: - var_kwargs["metadata"] = VariableMetadata.model_validate(self.meta_dict) - - return Variable( - name=self.name, - long_name=self.long_name, - data_type=self.dtype, - dimensions=self.dims, - coordinates=self.coords, - compressor=self.compressor, - **var_kwargs, - ) - - -class DatasetBuilder: - """Dataset builder.""" - - def __init__(self) -> None: - self.variables = [] - self.name = None - self.metadata = None - - def set_name(self, name: str) -> DatasetBuilder: - """Set dataset name.""" - self.name = name - return self - - def add_variable(self, variable: Variable) -> DatasetBuilder: - """Add a variable to the dataset.""" - self.variables.append(variable) - return self - - def add_variables(self, variables: list[Variable]) -> DatasetBuilder: - """Add multiple variables to the dataset.""" - [self.add_variable(variable) for variable in variables] - return self - - def set_metadata(self, metadata: DatasetMetadata) -> DatasetBuilder: - """Add a metadata to the dataset.""" - self.metadata = metadata - return self - - def build(self) -> Dataset: - """Build the dataset model.""" - return Dataset(variables=self.variables, metadata=self.metadata) diff --git a/src/mdio/schemas/core.py b/src/mdio/schemas/core.py index 34a09066..7768be06 100644 --- a/src/mdio/schemas/core.py +++ b/src/mdio/schemas/core.py @@ -3,10 +3,10 @@ from __future__ import annotations from typing import Any -from typing import get_type_hints from pydantic import BaseModel from pydantic import ConfigDict +from pydantic import Field from pydantic.alias_generators import to_camel @@ -28,12 +28,16 @@ def model_fields(model: type[BaseModel]) -> dict[str, tuple[Any, Any]]: >>> model_fields(MyModel) {'name': (str, ), 'age': (int, 0)} """ - annotations = get_type_hints(model) - fields = {} - for field_name, field in model.model_fields.items(): - fields[field_name] = (annotations[field_name], field) - + for field_name, field_info in model.model_fields.items(): + annotated_type = field_info.annotation + if field_info.is_required(): + fields[field_name] = (annotated_type, ...) 
+ else: + fields[field_name] = ( + annotated_type, + Field(field_info.default, description=field_info.description), + ) return fields @@ -46,4 +50,25 @@ class StrictModel(BaseModel): class CamelCaseStrictModel(StrictModel): """A model with forbidden extras and camel case aliases.""" - model_config = ConfigDict(alias_generator=to_camel) + model_config = ConfigDict( + extra="forbid", + populate_by_name=False, + alias_generator=to_camel, + ser_json_by_alias=True, + ) + + def model_dump_json(self, *args, **kwargs) -> str: # noqa: ANN201 ANN001 ANN002 ANN003 + """Dump JSON using camelCase aliases and excluding None values by default.""" + # Ensure camelCase aliases + if "by_alias" not in kwargs: + kwargs["by_alias"] = True + # Exclude None fields to avoid nulls in output + if "exclude_none" not in kwargs: + kwargs["exclude_none"] = True + return super().model_dump_json(*args, **kwargs) + + def json(self, *args, **kwargs) -> str: # noqa: ANN201 ANN001 ANN002 ANN003 + """Dump JSON using camelCase aliases and excluding None values by default.""" + if "by_alias" not in kwargs: + kwargs["by_alias"] = True + return self.model_dump_json(*args, **kwargs) diff --git a/src/mdio/schemas/v1/units.py b/src/mdio/schemas/v1/units.py index 1913ff2e..f96b7491 100644 --- a/src/mdio/schemas/v1/units.py +++ b/src/mdio/schemas/v1/units.py @@ -12,7 +12,7 @@ from mdio.schemas.units import create_unit_model ureg = UnitRegistry() -ureg.default_format = "~C" # compact, abbreviated (symbol). +ureg.formatter.default_format = "~C" # compact, abbreviated (symbol). class LengthUnitEnum(UnitEnum): diff --git a/tests/integration/test_v1_serialization.py b/tests/integration/test_v1_serialization.py new file mode 100644 index 00000000..3b16388c --- /dev/null +++ b/tests/integration/test_v1_serialization.py @@ -0,0 +1,184 @@ +"""Integration test for MDIO v1 Xarray Zarr constructor.""" + +from datetime import datetime +from pathlib import Path + +import numpy as np + +from mdio.core.v1._overloads import MDIODataset +from mdio.core.v1._serializer import make_dataset +from mdio.core.v1._serializer import make_dataset_metadata +from mdio.core.v1._serializer import make_named_dimension +from mdio.core.v1._serializer import make_variable +from mdio.core.v1.builder import write_mdio_metadata +from mdio.schemas.compressors import ZFP +from mdio.schemas.compressors import Blosc +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredType + + +def build_toy_dataset() -> MDIODataset: + """Build a toy dataset for testing.""" + # core dimensions + inline = make_named_dimension("inline", 256) + crossline = make_named_dimension("crossline", 512) + depth = make_named_dimension("depth", 384) + + # Create dataset metadata + created = datetime.fromisoformat("2023-12-12T15:02:06.413469-06:00") + meta = make_dataset_metadata( + name="campos_3d", + api_version="1.0.0", + created_on=created, + attributes={ + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... 
", + ], + "foo": "bar", + }, + ) + + # Image variable + image = make_variable( + name="image", + dimensions=[inline, crossline, depth], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata={ + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [128, 128, 128]}, + }, + "statsV1": { + "count": 100, + "sum": 1215.1, + "sumSquares": 125.12, + "min": 5.61, + "max": 10.84, + "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, + }, + "attributes": {"fizz": "buzz"}, + }, + ) + + # Velocity variable + velocity = make_variable( + name="velocity", + dimensions=[inline, crossline, depth], + data_type=ScalarType.FLOAT16, + compressor=None, + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata={ + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [128, 128, 128]}, + }, + "unitsV1": {"speed": "m/s"}, + }, + ) + + # Inline-optimized image variable + image_inline = make_variable( + name="image_inline", + dimensions=[inline, crossline, depth], + data_type=ScalarType.FLOAT32, + compressor=ZFP(mode="fixed_accuracy", tolerance=0.05), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata={ + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [4, 512, 512]}, + } + }, + ) + + # Headers variable with structured dtype + headers_dtype = StructuredType( + fields=[ + {"name": "cdp-x", "format": ScalarType.INT32}, + {"name": "cdp-y", "format": ScalarType.INT32}, + {"name": "elevation", "format": ScalarType.FLOAT16}, + {"name": "some_scalar", "format": ScalarType.FLOAT16}, + ] + ) + image_headers = make_variable( + name="image_headers", + dimensions=[inline, crossline], + data_type=headers_dtype, + compressor=None, + coordinates=["inline", "crossline", "cdp-x", "cdp-y"], + metadata={ + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [128, 128]}, + } + }, + ) + + # Standalone dimension variables + inline_var = make_variable( + name="inline", dimensions=[inline], data_type=ScalarType.UINT32, compressor=None + ) + crossline_var = make_variable( + name="crossline", + dimensions=[crossline], + data_type=ScalarType.UINT32, + compressor=None, + ) + depth_var = make_variable( + name="depth", + dimensions=[depth], + data_type=ScalarType.UINT32, + compressor=None, + metadata={"unitsV1": {"length": "m"}}, + ) + cdp_x = make_variable( + name="cdp-x", + dimensions=[inline, crossline], + data_type=ScalarType.FLOAT32, + compressor=None, + metadata={"unitsV1": {"length": "m"}}, + ) + cdp_y = make_variable( + name="cdp-y", + dimensions=[inline, crossline], + data_type=ScalarType.FLOAT32, + compressor=None, + metadata={"unitsV1": {"length": "m"}}, + ) + + # Compose full dataset + return make_dataset( + [ + image, + velocity, + image_inline, + image_headers, + inline_var, + crossline_var, + depth_var, + cdp_x, + cdp_y, + ], + meta, + ) + + +def test_to_mdio_writes_and_returns_mdio(tmp_path: Path) -> None: + """Test that to_mdio writes and returns an mdio.Dataset.""" + ds_in = build_toy_dataset() + store_path = tmp_path / "toy.mdio" + # write to Zarr and get back xarray.Dataset + ds_out = write_mdio_metadata(ds_in, str(store_path)) + # global attributes should be present on the returned Dataset + assert ds_out.attrs["apiVersion"] == ds_in.metadata.api_version + assert ds_out.attrs["createdOn"] == str(ds_in.metadata.created_on) + if ds_in.metadata.attributes: + assert ds_out.attrs["attributes"] == 
ds_in.metadata.attributes + # verify the DataArray exists with correct shape and dtype + arr = ds_out["image"] + assert arr.shape == (256, 512, 384) + assert arr.dtype == np.dtype("float32") diff --git a/tests/unit/schema/v1/test_template_builder.py b/tests/unit/schema/v1/test_template_builder.py new file mode 100644 index 00000000..83ac3b11 --- /dev/null +++ b/tests/unit/schema/v1/test_template_builder.py @@ -0,0 +1,543 @@ +"""Unit tests for MDIO v1 schema builder.""" + +from datetime import datetime +from pathlib import Path + +import pytest + +from mdio.core.v1.builder import MDIODatasetBuilder +from mdio.core.v1.builder import _BuilderState +from mdio.core.v1.builder import write_mdio_metadata +from mdio.schemas.compressors import Blosc +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredType +from mdio.schemas.v1.dataset import Dataset + + +def test_builder_initialization() -> None: + """Test basic builder initialization.""" + builder = MDIODatasetBuilder("test_dataset") + assert builder.name == "test_dataset" + assert builder.api_version == "1.0.0" + assert isinstance(builder.created_on, datetime) + assert len(builder._dimensions) == 0 + assert len(builder._coordinates) == 0 + assert len(builder._variables) == 0 + assert builder._state == _BuilderState.INITIAL + + +def test_dimension_builder_state() -> None: + """Test dimension builder state transitions and functionality.""" + builder = MDIODatasetBuilder("test_dataset") + + # First dimension should change state to HAS_DIMENSIONS and create a variable + builder.add_dimension("x", 100, long_name="X Dimension") + assert builder._state == _BuilderState.HAS_DIMENSIONS + assert len(builder._dimensions) == 1 # noqa: PLR2004 + assert len(builder._variables) == 1 # noqa: PLR2004 + assert builder._dimensions[0].name == "x" + assert builder._dimensions[0].size == 100 # noqa: PLR2004 + assert builder._variables[0].name == "x" + assert builder._variables[0].long_name == "X Dimension" + assert builder._variables[0].data_type == ScalarType.INT32 + assert builder._variables[0].dimensions[0].name == "x" + + # Adding another dimension should maintain state and create another variable + builder.add_dimension("y", 200, data_type=ScalarType.UINT32) + assert builder._state == _BuilderState.HAS_DIMENSIONS + assert len(builder._dimensions) == 2 # noqa: PLR2004 + assert len(builder._variables) == 2 # noqa: PLR2004 + assert builder._dimensions[1].name == "y" + assert builder._dimensions[1].size == 200 # noqa: PLR2004 + assert builder._variables[1].name == "y" + assert builder._variables[1].data_type == ScalarType.UINT32 + assert builder._variables[1].dimensions[0].name == "y" + + +def test_dimension_with_units() -> None: + """Test adding dimensions with custom metadata.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with custom metadata + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={"unitsV1": {"length": "m"}}, + ) + + assert len(builder._variables) == 1 + depth_var = builder._variables[0] + assert depth_var.name == "depth" + assert depth_var.data_type == ScalarType.FLOAT32 + assert depth_var.metadata.units_v1.length == "m" + + +def test_dimension_with_attributes() -> None: + """Test adding dimensions with attributes.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with attributes + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={"attributes": {"MGA": 51}}, + ) + + assert len(builder._variables) 
== 1 + depth_var = builder._variables[0] + assert depth_var.name == "depth" + assert depth_var.data_type == ScalarType.FLOAT32 + assert depth_var.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + + +def test_dimension_with_chunk_grid() -> None: + """Test adding dimensions with chunk grid.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with chunk grid + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={"chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}}, + ) + + assert len(builder._variables) == 1 + depth_var = builder._variables[0] + assert depth_var.name == "depth" + assert depth_var.data_type == ScalarType.FLOAT32 + assert depth_var.metadata.chunk_grid.name == "regular" + assert depth_var.metadata.chunk_grid.configuration.chunk_shape == [20] + + +def test_dimension_with_stats() -> None: + """Test adding dimensions with stats.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with stats + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={ + "statsV1": { + "count": 100, + "sum": 1215.1, + "sumSquares": 125.12, + "min": 5.61, + "max": 10.84, + "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, + } + }, + ) + + assert len(builder._variables) == 1 + depth_var = builder._variables[0] + assert depth_var.name == "depth" + assert depth_var.data_type == ScalarType.FLOAT32 + assert depth_var.metadata.stats_v1.count == 100 # noqa: PLR2004 + assert depth_var.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 + + +def test_dimension_with_full_metadata() -> None: + """Test adding dimensions with all metadata.""" + builder = MDIODatasetBuilder("test_dataset") + + # Add dimension with all metadata + builder.add_dimension( + "depth", + size=100, + data_type=ScalarType.FLOAT32, + metadata={ + "unitsV1": {"length": "m"}, + "attributes": {"MGA": 51}, + "chunkGrid": {"name": "regular", "configuration": {"chunkShape": [20]}}, + "statsV1": { + "count": 100, + "sum": 1215.1, + "sumSquares": 125.12, + "min": 5.61, + "max": 10.84, + "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, + }, + }, + ) + + assert len(builder._variables) == 1 + depth_var = builder._variables[0] + assert depth_var.name == "depth" + assert depth_var.data_type == ScalarType.FLOAT32 + assert depth_var.metadata.units_v1.length == "m" + assert depth_var.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert depth_var.metadata.chunk_grid.name == "regular" + assert depth_var.metadata.chunk_grid.configuration.chunk_shape == [20] # noqa: PLR2004 + assert depth_var.metadata.stats_v1.count == 100 # noqa: PLR2004 + assert depth_var.metadata.stats_v1.sum == 1215.1 # noqa: PLR2004 + assert depth_var.metadata.stats_v1.sum_squares == 125.12 # noqa: PLR2004 + assert depth_var.metadata.stats_v1.min == 5.61 # noqa: PLR2004 + assert depth_var.metadata.stats_v1.max == 10.84 # noqa: PLR2004 + assert depth_var.metadata.stats_v1.histogram.bin_centers == [1, 2] # noqa: PLR2004 + assert depth_var.metadata.stats_v1.histogram.counts == [10, 15] # noqa: PLR2004 + + j = builder.build().json() + print(j) + + +def test_coordinate_with_units() -> None: + """Test adding coordinates with units.""" + builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + + # Add coordinate with units + builder.add_coordinate( + "cdp", dimensions=["inline", "crossline"], metadata={"unitsV1": {"length": "m"}} + ) + + assert len(builder._variables) == 2 # noqa:
PLR2004 + assert len(builder._coordinates) == 1 # noqa: PLR2004 + cdp_var = builder._coordinates[0] + assert cdp_var.name == "cdp" + assert cdp_var.data_type == ScalarType.FLOAT32 + # assert cdp_var.metadata.units_v1.length == "m" + assert cdp_var.metadata[0].units_v1.length == "m" + + +def test_coordinate_with_attributes() -> None: + """Test adding coordinates with attributes.""" + builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + + # Add coordinate with attributes + builder.add_coordinate( + "cdp", dimensions=["inline", "crossline"], metadata={"attributes": {"MGA": 51}} + ) + + assert len(builder._variables) == 2 # noqa: PLR2004 + assert len(builder._coordinates) == 1 # noqa: PLR2004 + cdp_var = builder._coordinates[0] + assert cdp_var.name == "cdp" + assert cdp_var.data_type == ScalarType.FLOAT32 + # assert cdp_var.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert cdp_var.metadata[0].attributes["MGA"] == 51 # noqa: PLR2004 + + +def test_coordinate_with_full_metadata() -> None: + """Test adding coordinates with all metadata.""" + builder = MDIODatasetBuilder("test_dataset") + builder.add_dimension("inline", 100) + builder.add_dimension("crossline", 100) + + # Add coordinate with all metadata + builder.add_coordinate( + "cdp", + dimensions=["inline", "crossline"], + metadata={"unitsV1": {"length": "m"}, "attributes": {"MGA": 51}}, + ) + + assert len(builder._variables) == 2 # noqa: PLR2004 + assert len(builder._coordinates) == 1 # noqa: PLR2004 + cdp_var = builder._coordinates[0] + assert cdp_var.name == "cdp" + assert cdp_var.data_type == ScalarType.FLOAT32 + + # TODO(BrianMichell): #553 - If this PR is merged, we can remove the subscripting + + # assert cdp_var.metadata.units_v1.length == "m" + assert cdp_var.metadata[0].units_v1.length == "m" + # assert cdp_var.metadata.attributes["MGA"] == 51 # noqa: PLR2004 + assert cdp_var.metadata[1].attributes["MGA"] == 51 # noqa: PLR2004 + + +def test_coordinate_builder_state() -> None: + """Test coordinate builder state transitions and functionality.""" + builder = MDIODatasetBuilder("test_dataset") + + # Should not be able to add coordinates before dimensions + with pytest.raises( + ValueError, match="Must add at least one dimension before adding coordinates" + ): + builder.add_coordinate("x_coord", dimensions=["x"]) + + # Add dimensions first + builder = builder.add_dimension("x", 100) + builder = builder.add_dimension("y", 200) + + # Adding coordinate should change state to HAS_COORDINATES + builder = builder.add_coordinate("x_coord", dimensions=["x"], long_name="X Coordinate") + assert builder._state == _BuilderState.HAS_COORDINATES + assert len(builder._coordinates) == 1 # noqa: PLR2004 + assert builder._coordinates[0].name == "x_coord" + assert builder._coordinates[0].long_name == "X Coordinate" + assert builder._coordinates[0].dimensions[0].name == "x" + + # Adding another coordinate should maintain state + builder = builder.add_coordinate("y_coord", dimensions=["y"]) + assert builder._state == _BuilderState.HAS_COORDINATES + assert len(builder._coordinates) == 2 # noqa: PLR2004 + assert builder._coordinates[1].name == "y_coord" + assert builder._coordinates[1].dimensions[0].name == "y" + + +def test_variable_builder_state() -> None: + """Test variable builder state transitions and functionality.""" + builder = MDIODatasetBuilder("test_dataset") + + # Should not be able to add variables before dimensions + with pytest.raises(ValueError, match="Must add 
at least one dimension before adding variables"): + builder.add_variable("data", dimensions=["x"]) + + # Add dimension first + builder = builder.add_dimension("x", 100) + + # Adding variable should change state to HAS_VARIABLES + builder = builder.add_variable("data", dimensions=["x"], long_name="Data Variable") + assert builder._state == _BuilderState.HAS_VARIABLES + # One for dimension, one for variable + assert len(builder._variables) == 2 # noqa: PLR2004 + assert builder._variables[1].name == "data" + assert builder._variables[1].long_name == "Data Variable" + assert builder._variables[1].dimensions[0].name == "x" + + # Adding another variable should maintain state + builder = builder.add_variable("data2", dimensions=["x"]) + assert builder._state == _BuilderState.HAS_VARIABLES + # One for dimension, two for variables + assert len(builder._variables) == 3 # noqa: PLR2004 + assert builder._variables[2].name == "data2" + assert builder._variables[2].dimensions[0].name == "x" + + +def test_build_dataset() -> None: + """Test building a complete dataset.""" + dataset = ( + MDIODatasetBuilder("test_dataset") + .add_dimension("x", 100) + .add_dimension("y", 200) + .add_coordinate("x_coord", dimensions=["x"]) + .add_coordinate("y_coord", dimensions=["y"]) + .add_variable("data", dimensions=["x", "y"], long_name="Test Data") + .build() + ) + + assert isinstance(dataset, Dataset) + assert dataset.metadata.name == "test_dataset" + # Two dimension variables + one data variable + two coordinate variables + assert len(dataset.variables) == 5 # noqa: PLR2004 + assert dataset.variables[0].name == "x" + assert dataset.variables[1].name == "y" + assert dataset.variables[2].name == "data" + assert dataset.variables[2].long_name == "Test Data" + assert len(dataset.variables[2].dimensions) == 2 # noqa: PLR2004 + + +def test_auto_naming() -> None: + """Test automatic naming of coordinates and variables.""" + dataset = ( + MDIODatasetBuilder("test_dataset") + .add_dimension("x", 100) + .add_coordinate() # Should be named "coord_0" + .add_coordinate() # Should be named "coord_1" + .add_variable() # Should be named "var_0" + .add_variable() # Should be named "var_1" + .build() + ) + + assert dataset.variables[0].name == "x" # Dimension variable + assert dataset.variables[1].name == "var_0" + assert dataset.variables[2].name == "var_1" + + +def test_default_dimensions() -> None: + """Test that coordinates and variables use all dimensions by default.""" + dataset = ( + MDIODatasetBuilder("test_dataset") + .add_dimension("x", 100) + .add_dimension("y", 200) + .add_coordinate() # Should use both x and y dimensions + .add_variable() # Should use both x and y dimensions + .build() + ) + + # Two dimension variables + one data variable + one coordinate variable + assert len(dataset.variables) == 4 # noqa: PLR2004 + assert dataset.variables[2].name == "var_0" + assert len(dataset.variables[2].dimensions) == 2 # noqa: PLR2004 + assert dataset.variables[2].dimensions[0].name == "x" + assert dataset.variables[2].dimensions[1].name == "y" + + +def test_build_order_enforcement() -> None: + """Test that the builder enforces the correct build order.""" + builder = MDIODatasetBuilder("test_dataset") + + # Should not be able to add coordinates before dimensions + with pytest.raises( + ValueError, match="Must add at least one dimension before adding coordinates" + ): + builder.add_coordinate("x_coord", dimensions=["x"]) + + # Should not be able to add variables before dimensions + with pytest.raises(ValueError, match="Must add 
at least one dimension before adding variables"): + builder.add_variable("data", dimensions=["x"]) + + # Should not be able to build without dimensions + with pytest.raises(ValueError, match="Must add at least one dimension before building"): + builder.build() + + +def test_toy_example(tmp_path: Path) -> None: + """Test building a toy dataset with multiple variables and attributes.""" + dataset = ( + MDIODatasetBuilder( + "campos_3d", + attributes={ + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... ", + ], + "foo": "bar", + }, + ) + # Add dimensions + .add_dimension("inline", 256, data_type=ScalarType.UINT32) + .add_dimension("crossline", 512, data_type=ScalarType.UINT32) + .add_dimension( + "depth", + 384, + data_type=ScalarType.UINT32, + metadata={"unitsV1": {"length": "m"}}, + ) + # Add coordinates + .add_coordinate( + "cdp-x", + dimensions=["inline", "crossline"], + metadata={"unitsV1": {"length": "m"}}, + ) + .add_coordinate( + "cdp-y", + dimensions=["inline", "crossline"], + metadata={"unitsV1": {"length": "m"}}, + ) + # Add image variable + .add_variable( + name="image", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata={ + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [128, 128, 128]}, + }, + "statsV1": { + "count": 100, + "sum": 1215.1, + "sumSquares": 125.12, + "min": 5.61, + "max": 10.84, + "histogram": {"binCenters": [1, 2], "counts": [10, 15]}, + }, + "attributes": {"fizz": "buzz"}, + }, + ) + # Add velocity variable + .add_variable( + name="velocity", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT16, + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata={ + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [128, 128, 128]}, + }, + "unitsV1": {"speed": "m/s"}, + }, + ) + # Add inline-optimized image variable + .add_variable( + name="image_inline", + long_name="inline optimized version of 3d_stack", + dimensions=["inline", "crossline", "depth"], + data_type=ScalarType.FLOAT32, + compressor=Blosc(algorithm="zstd"), + coordinates=["inline", "crossline", "depth", "cdp-x", "cdp-y"], + metadata={ + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [4, 512, 512]}, + } + }, + ) + # Add headers variable with structured dtype + .add_variable( + name="image_headers", + dimensions=["inline", "crossline"], + data_type=StructuredType( + fields=[ + {"name": "cdp-x", "format": ScalarType.INT32}, + {"name": "cdp-y", "format": ScalarType.INT32}, + {"name": "elevation", "format": ScalarType.FLOAT16}, + {"name": "some_scalar", "format": ScalarType.FLOAT16}, + ] + ), + coordinates=["inline", "crossline", "cdp-x", "cdp-y"], + ) + .build() + ) + + # print(dataset.model_dump_json(indent=2)) + + path = tmp_path / "toy.mdio" + write_mdio_metadata(dataset, path) + + # Verify dataset structure + assert dataset.metadata.name == "campos_3d" + assert dataset.metadata.api_version == "1.0.0" + assert dataset.metadata.attributes["foo"] == "bar" + assert len(dataset.metadata.attributes["textHeader"]) == 3 # noqa: PLR2004 + + # Verify variables (including dimension variables) + # 3 dimension variables + 4 data variables + 2 coordinate variables + assert len(dataset.variables) == 9 # noqa: PLR2004 + + # Verify dimension variables + inline_var = next(v for v in dataset.variables if 
v.name == "inline") + assert inline_var.data_type == ScalarType.UINT32 + assert len(inline_var.dimensions) == 1 + assert inline_var.dimensions[0].name == "inline" + + depth_var = next(v for v in dataset.variables if v.name == "depth") + assert depth_var.data_type == ScalarType.UINT32 + assert depth_var.metadata.units_v1.length == "m" + + # Verify image variable + image = next(v for v in dataset.variables if v.name == "image") + assert image.data_type == ScalarType.FLOAT32 + assert isinstance(image.compressor, Blosc) + assert image.compressor.algorithm == "zstd" + assert image.metadata.stats_v1.count == 100 # noqa: PLR2004 + + # Verify velocity variable + velocity = next(v for v in dataset.variables if v.name == "velocity") + assert velocity.data_type == ScalarType.FLOAT16 + assert velocity.compressor is None + assert velocity.metadata.units_v1.speed == "m/s" + + # Verify image_inline variable + image_inline = next(v for v in dataset.variables if v.name == "image_inline") + assert image_inline.long_name == "inline optimized version of 3d_stack" + assert isinstance(image_inline.compressor, Blosc) + assert image_inline.compressor.algorithm == "zstd" + + # Verify image_headers variable + headers = next(v for v in dataset.variables if v.name == "image_headers") + assert isinstance(headers.data_type, StructuredType) + assert len(headers.data_type.fields) == 4 # noqa: PLR2004 + assert headers.data_type.fields[0].name == "cdp-x" diff --git a/tests/unit/test_schema.py b/tests/unit/test_schema.py index 0b2a9f54..e862624f 100644 --- a/tests/unit/test_schema.py +++ b/tests/unit/test_schema.py @@ -1,47 +1,54 @@ """Test the schema for the v1 dataset.""" +import copy +import json +from pathlib import Path + +import pytest +from pydantic import ValidationError + from mdio.schemas.v1 import Dataset as V1Dataset TEST_SCHEMA = { "metadata": { "name": "test_dataset", - "api_version": "1.0.0", - "created_on": "2023-01-01T00:00:00Z", + "apiVersion": "1.0.0", + "createdOn": "2023-01-01T00:00:00Z", }, "variables": [ { "name": "actual_variable", - "data_type": "float32", + "dataType": "float32", "dimensions": ["dim0", "dim1"], "compressor": {"name": "blosc", "level": 3}, "coordinates": ["coord"], "metadata": { - "chunk_grid": { + "chunkGrid": { "name": "regular", - "configuration": {"chunk_shape": [10, 20]}, + "configuration": {"chunkShape": [10, 20]}, }, }, }, { "name": "coord", - "data_type": "float32", + "dataType": "float32", "dimensions": ["dim0", "dim1"], "metadata": { - "chunk_grid": { + "chunkGrid": { "name": "regular", - "configuration": {"chunk_shape": [10, 20]}, + "configuration": {"chunkShape": [10, 20]}, }, - "units_v1": {"length": "m"}, + "unitsV1": {"length": "m"}, }, }, { "name": "dim0", - "data_type": "int32", + "dataType": "int32", "dimensions": [{"name": "dim0", "size": 100}], }, { "name": "dim1", - "data_type": "int32", + "dataType": "int32", "dimensions": [{"name": "dim1", "size": 200}], }, ], @@ -51,3 +58,521 @@ def test_dataset_schema_validation() -> None: """Test that the dataset schema validates correctly.""" V1Dataset.model_validate(TEST_SCHEMA) + + +class TestV1DatasetJSONSerialization: + """Test JSON serialization capabilities of V1Dataset using Pydantic methods.""" + + @pytest.fixture + def sample_dataset(self) -> V1Dataset: + """Create a sample V1Dataset for testing.""" + # Use a deep copy to avoid test interference + return V1Dataset.model_validate(copy.deepcopy(TEST_SCHEMA)) + + def test_model_dump_json_default_camel_case(self, sample_dataset: V1Dataset) -> None: + """Test that JSON 
serialization uses camelCase by default.""" + json_str = sample_dataset.model_dump_json(by_alias=True) + + print(json_str) + + # Should be valid JSON + parsed = json.loads(json_str) + assert isinstance(parsed, dict) + + # Should contain expected top-level keys + assert "metadata" in parsed + assert "variables" in parsed + + # Metadata should have expected fields + assert parsed["metadata"]["name"] == "test_dataset" + assert parsed["metadata"]["apiVersion"] == "1.0.0" + assert parsed["metadata"]["createdOn"] == "2023-01-01T00:00:00Z" + + # Should have 4 variables + assert len(parsed["variables"]) == 4 # noqa: PLR2004 + + def test_model_dump_json_exclude_none(self, sample_dataset: V1Dataset) -> None: + """Test JSON serialization excluding None values.""" + json_str = sample_dataset.model_dump_json(exclude_none=True) + parsed = json.loads(json_str) # noqa: F841 + + # Should not contain null values in the JSON + json_str_lower = json_str.lower() + assert "null" not in json_str_lower + + def test_model_validate_json_basic(self) -> None: + """Test basic JSON deserialization using model_validate_json.""" + json_str = json.dumps(TEST_SCHEMA) + dataset = V1Dataset.model_validate_json(json_str) + + assert dataset.metadata.name == "test_dataset" + assert dataset.metadata.api_version == "1.0.0" + assert len(dataset.variables) == 4 # noqa: PLR2004 + + # Check first variable + var = dataset.variables[0] + assert var.name == "actual_variable" + assert var.data_type.value == "float32" + assert var.dimensions == ["dim0", "dim1"] + + def test_model_validate_json_invalid(self) -> None: + """Test JSON deserialization with invalid data.""" + invalid_json = '{"metadata": {"name": "test"}, "variables": []}' + + with pytest.raises(ValidationError) as exc_info: + V1Dataset.model_validate_json(invalid_json) + + # Should have validation errors + errors = exc_info.value.errors() + assert len(errors) > 0 + + def test_model_validate_json_malformed(self) -> None: + """Test JSON deserialization with malformed JSON.""" + malformed_json = '{"metadata": {"name": "test"' # Missing closing braces + + with pytest.raises(ValidationError): + V1Dataset.model_validate_json(malformed_json) + + def test_json_schema_generation(self) -> None: + """Test JSON schema generation using model_json_schema.""" + schema = V1Dataset.model_json_schema() + + # Should be a valid JSON schema + assert isinstance(schema, dict) + assert schema["type"] == "object" + assert "properties" in schema + + # Should have metadata and variables properties + properties = schema["properties"] + assert "metadata" in properties + assert "variables" in properties + + # Should have required fields + assert "required" in schema + required = schema["required"] + assert "metadata" in required + assert "variables" in required + + def test_json_schema_with_mode(self) -> None: + """Test JSON schema generation with different modes.""" + # Test validation mode (default) + validation_schema = V1Dataset.model_json_schema(mode="validation") + assert "properties" in validation_schema + + # Test serialization mode + serialization_schema = V1Dataset.model_json_schema(mode="serialization") + assert "properties" in serialization_schema + + def test_round_trip_consistency_default(self, sample_dataset: V1Dataset) -> None: + """Test that serialization -> deserialization preserves data (default camelCase).""" + # Export to JSON (default camelCase) + json_str = sample_dataset.model_dump_json() + + # Import from JSON + restored_dataset = V1Dataset.model_validate_json(json_str) + + # 
Export again + json_str2 = restored_dataset.model_dump_json() + + # Should be identical + assert json_str == json_str2 + + # Key properties should match + assert sample_dataset.metadata.name == restored_dataset.metadata.name + assert sample_dataset.metadata.api_version == restored_dataset.metadata.api_version + assert len(sample_dataset.variables) == len(restored_dataset.variables) + + # Variables should match + for orig_var, restored_var in zip( + sample_dataset.variables, restored_dataset.variables, strict=False + ): + assert orig_var.name == restored_var.name + assert orig_var.data_type == restored_var.data_type + assert orig_var.dimensions == restored_var.dimensions + + def test_round_trip_with_aliases(self, sample_dataset: V1Dataset) -> None: + """Test round-trip consistency when using aliases.""" + # Export with aliases (should be default now) + json_str = sample_dataset.model_dump_json() + + # Import (should handle aliases automatically) + restored_dataset = V1Dataset.model_validate_json(json_str) + + # Should preserve data + assert sample_dataset.metadata.name == restored_dataset.metadata.name + assert len(sample_dataset.variables) == len(restored_dataset.variables) + + def test_json_file_operations(self, sample_dataset: V1Dataset, tmp_path: Path) -> None: + """Test JSON serialization to/from files.""" + json_file = tmp_path / "test_dataset.json" + + # Write to file (using default camelCase) + json_str = sample_dataset.model_dump_json(indent=2) + json_file.write_text(json_str, encoding="utf-8") + + # Verify file exists and has content + assert json_file.exists() + assert json_file.stat().st_size > 0 + + # Read from file + file_content = json_file.read_text(encoding="utf-8") + restored_dataset = V1Dataset.model_validate_json(file_content) + + # Should match original + assert sample_dataset.metadata.name == restored_dataset.metadata.name + assert len(sample_dataset.variables) == len(restored_dataset.variables) + + def test_json_validation_without_instantiation(self) -> None: + """Test JSON validation without creating a dataset instance.""" + valid_json = json.dumps(TEST_SCHEMA) + + # This should not raise an exception + try: + V1Dataset.model_validate_json(valid_json) + validation_passed = True + except ValidationError: + validation_passed = False + + assert validation_passed + + def test_partial_json_validation(self) -> None: + """Test validation of partial/incomplete JSON data.""" + # Missing required fields + incomplete_schema = { + "metadata": { + "name": "test_dataset", + # Missing apiVersion and createdOn + }, + "variables": [], + } + + with pytest.raises(ValidationError) as exc_info: + V1Dataset.model_validate_json(json.dumps(incomplete_schema)) + + errors = exc_info.value.errors() + # Should have errors for missing required fields + error_fields = {error["loc"][-1] for error in errors} + assert "apiVersion" in error_fields or "api_version" in error_fields + + def test_json_with_extra_fields(self) -> None: + """Test JSON deserialization with extra fields.""" + # Use a copy to avoid modifying the global TEST_SCHEMA + schema_with_extra = copy.deepcopy(TEST_SCHEMA) + schema_with_extra["extra_field"] = "should_be_ignored" + schema_with_extra["metadata"]["extra_metadata"] = "also_ignored" + + # Should raise ValidationError because extra fields are forbidden + with pytest.raises(ValidationError) as exc_info: + V1Dataset.model_validate_json(json.dumps(schema_with_extra)) + + # Should have error about extra fields + errors = exc_info.value.errors() + assert any("extra_forbidden" in 
str(error) for error in errors) + + def test_json_schema_contains_examples(self) -> None: + """Test that generated JSON schema contains useful information.""" + schema = V1Dataset.model_json_schema() + + # Should have descriptions for properties + properties = schema.get("properties", {}) + if "metadata" in properties: + # Check if metadata has some schema information + metadata_schema = properties["metadata"] + assert isinstance(metadata_schema, dict) + + if "variables" in properties: + # Check if variables has some schema information + variables_schema = properties["variables"] + assert isinstance(variables_schema, dict) + assert variables_schema.get("type") == "array" + + def test_json_serialization_performance(self, sample_dataset: V1Dataset) -> None: + """Test that JSON serialization is reasonably performant.""" + import time + + # Time multiple serializations + start_time = time.time() + for _ in range(100): + json_str = sample_dataset.model_dump_json() + end_time = time.time() + + # Should complete 100 serializations in reasonable time (< 1 second) + elapsed = end_time - start_time + assert elapsed < 1.0 + + # Verify the JSON is still valid + parsed = json.loads(json_str) + assert parsed["metadata"]["name"] == "test_dataset" + + +class TestPydanticMDIORoundTrip: + """Test round-trip conversions between JSON and MDIO datasets using to_mdio.""" + + def test_json_to_mdio_dataset(self, tmp_path: Path) -> None: + """Test converting TEST_SCHEMA JSON to an MDIO dataset using to_mdio.""" + from mdio.core.v1._serializer import _construct_mdio_dataset + + output_path = tmp_path / "from_json.mdio" + # output_path = "test_mdio_from_json.mdio" + + # Step 1: Validate the TEST_SCHEMA JSON with Pydantic + dataset = V1Dataset.model_validate(TEST_SCHEMA) + + # Step 2: Convert to MDIO dataset using the internal constructor + mdio_dataset = _construct_mdio_dataset(dataset) + + # Step 3: Use to_mdio to save the dataset + mdio_dataset.to_mdio(store=str(output_path)) + + # Verify the dataset was created + assert output_path.exists() + + # Verify we can read it back + from mdio.core.v1 import mdio + + with mdio.open(str(output_path)) as reader: + assert "actual_variable" in reader + assert "coord" in reader + assert "dim0" in reader.coords + assert "dim1" in reader.coords + assert reader.attrs["name"] == "test_dataset" + + def test_mdio_dataset_to_json(self, tmp_path: Path) -> None: + """Test converting an MDIO dataset back to JSON (camelCase).""" + from mdio.core.v1 import mdio + from mdio.core.v1._serializer import _construct_mdio_dataset + + # Step 1: Create MDIO dataset from TEST_SCHEMA + dataset = V1Dataset.model_validate(TEST_SCHEMA) + mdio_dataset = _construct_mdio_dataset(dataset) + + mdio_path = tmp_path / "test_dataset.mdio" + mdio_dataset.to_mdio(store=str(mdio_path)) + + # Step 2: Read back the MDIO dataset + with mdio.open(str(mdio_path)) as reader: + # Step 3: Extract information to reconstruct Pydantic model + variables = [] + + # Add dimension variables + for dim_name in ["dim0", "dim1"]: + if dim_name in reader.coords: + coord = reader.coords[dim_name] + var_dict = { + "name": dim_name, + "dataType": str(coord.dtype), + "dimensions": [{"name": dim_name, "size": reader.dims[dim_name]}], + } + variables.append(var_dict) + + # Add data variables with their metadata + for var_name in reader.data_vars: + var = reader[var_name] + var_dict = { + "name": var_name, + "dataType": str(var.dtype), + "dimensions": list(var.dims), + } + + # Reconstruct metadata based on original TEST_SCHEMA + if 
var_name == "coord": + var_dict["metadata"] = { + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [10, 20]}, + }, + "unitsV1": {"length": "m"}, + } + elif var_name == "actual_variable": + var_dict["compressor"] = {"name": "blosc", "level": 3} + var_dict["coordinates"] = ["coord"] + var_dict["metadata"] = { + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [10, 20]}, + }, + } + variables.append(var_dict) + + # Step 4: Create Pydantic model data (camelCase) + dataset_data = { + "metadata": { + "name": reader.attrs.get("name"), + "apiVersion": reader.attrs.get("apiVersion", "1.0.0"), + "createdOn": reader.attrs.get("createdOn", "2023-01-01T00:00:00Z"), + }, + "variables": variables, + } + + # Step 5: Validate with Pydantic and serialize to JSON using by_alias=True + pydantic_dataset = V1Dataset.model_validate(dataset_data) + json_str = pydantic_dataset.model_dump_json(by_alias=True) + + # Verify it's valid JSON and camelCase + parsed = json.loads(json_str) + + print(parsed) + + assert "apiVersion" in parsed["metadata"] + assert "createdOn" in parsed["metadata"] + assert "dataType" in parsed["variables"][0] + + # Verify the conversion preserved data + assert pydantic_dataset.metadata.name == "test_dataset" + + def test_full_round_trip_json_mdio_json(self, tmp_path: Path) -> None: + """Test full round-trip: TEST_SCHEMA JSON -> MDIO -> JSON using to_mdio.""" + from mdio.core.v1 import mdio + from mdio.core.v1._serializer import _construct_mdio_dataset + + # Step 1: Start with TEST_SCHEMA (input JSON) + original_dataset = V1Dataset.model_validate(TEST_SCHEMA) + original_json = original_dataset.model_dump_json(by_alias=True) + original_parsed = json.loads(original_json) + + # Verify original is camelCase + assert "apiVersion" in original_parsed["metadata"] + assert "createdOn" in original_parsed["metadata"] + + # Step 2: Convert to MDIO dataset and save + mdio_dataset = _construct_mdio_dataset(original_dataset) + mdio_path = tmp_path / "round_trip.mdio" + mdio_dataset.to_mdio(store=str(mdio_path)) + + # Step 3: Read back from MDIO and convert to JSON + with mdio.open(str(mdio_path)) as reader: + # Reconstruct the schema structure + variables = [] + + # Add dimension variables + for dim_name in ["dim0", "dim1"]: + if dim_name in reader.coords: + coord = reader.coords[dim_name] + var_dict = { + "name": dim_name, + "dataType": str(coord.dtype), + "dimensions": [{"name": dim_name, "size": reader.dims[dim_name]}], + } + variables.append(var_dict) + + # Add coordinate variables that are not dimensions + for coord_name, coord in reader.coords.items(): + if coord_name not in ["dim0", "dim1"]: # Skip dimension coordinates + var_dict = { + "name": coord_name, + "dataType": str(coord.dtype), + "dimensions": list(coord.dims), + } + + # Add metadata for coord variable from original TEST_SCHEMA + if coord_name == "coord": + var_dict["metadata"] = { + "chunkGrid": { + "name": "regular", + "configuration": {"chunkShape": [10, 20]}, + }, + "unitsV1": {"length": "m"}, + } + variables.append(var_dict) + + # Add data variables with original metadata + for var_name in reader.data_vars: + var = reader[var_name] + var_dict = { + "name": var_name, + "dataType": str(var.dtype), + "dimensions": list(var.dims), + } + + # Add original metadata back from TEST_SCHEMA + if var_name == "actual_variable": + var_dict["compressor"] = {"name": "blosc", "level": 3} + var_dict["coordinates"] = ["coord"] + var_dict["metadata"] = { + "chunkGrid": { + "name": "regular", + "configuration": 
{"chunkShape": [10, 20]}, + }, + } + variables.append(var_dict) + + # Create final dataset + final_data = { + "metadata": { + "name": reader.attrs.get("name", "test_dataset"), + "apiVersion": reader.attrs.get("apiVersion", "1.0.0"), + "createdOn": reader.attrs.get("createdOn", "2023-01-01T00:00:00Z"), + }, + "variables": variables, + } + + final_dataset = V1Dataset.model_validate(final_data) + final_json = final_dataset.model_dump_json(by_alias=True) + final_parsed = json.loads(final_json) + + # Step 4: Verify round-trip integrity + assert final_parsed["metadata"]["name"] == original_parsed["metadata"]["name"] + assert ( + final_parsed["metadata"]["apiVersion"] == original_parsed["metadata"]["apiVersion"] + ) + + # Verify camelCase is preserved + assert "apiVersion" in final_parsed["metadata"] + assert "createdOn" in final_parsed["metadata"] + assert "dataType" in final_parsed["variables"][0] + + # Verify variable structure is preserved + original_var_names = {v["name"] for v in original_parsed["variables"]} + final_var_names = {v["name"] for v in final_parsed["variables"]} + + print(original_var_names) + print("=================================") + print(final_var_names) + + assert original_var_names == final_var_names + + def test_invalid_snake_case_json_fails(self) -> None: + """Test that snake_case JSON fails validation (negative test).""" + # Create snake_case version of TEST_SCHEMA (should fail) + invalid_snake_case_schema = { + "metadata": { + "name": "test_dataset", + "api_version": "1.0.0", # snake_case should fail + "created_on": "2023-01-01T00:00:00Z", # snake_case should fail + }, + "variables": [ + { + "name": "test_var", + "data_type": "float32", # snake_case should fail + "dimensions": ["dim0"], + } + ], + } + + # This should fail validation + with pytest.raises(ValidationError): + V1Dataset.model_validate(invalid_snake_case_schema) + + def test_camel_case_serialization_only(self) -> None: + """Test that serialization only produces camelCase output.""" + dataset = V1Dataset.model_validate(TEST_SCHEMA) + json_str = dataset.model_dump_json() + parsed = json.loads(json_str) + + # Verify camelCase fields are present + assert "apiVersion" in parsed["metadata"] + assert "createdOn" in parsed["metadata"] + + # Verify snake_case fields are NOT present + assert "api_version" not in parsed["metadata"] + assert "created_on" not in parsed["metadata"] + + # Check variables use camelCase + for var in parsed["variables"]: + assert "dataType" in var + assert "data_type" not in var + + # Check nested metadata if present + if "metadata" in var and "chunkGrid" in var["metadata"]: + assert "chunkGrid" in var["metadata"] + assert "chunk_grid" not in var["metadata"] diff --git a/tests/unit/test_template_factory.py b/tests/unit/test_template_factory.py new file mode 100644 index 00000000..5dc6ffa0 --- /dev/null +++ b/tests/unit/test_template_factory.py @@ -0,0 +1,172 @@ +"""Unit tests for MDIO v1 factory.""" + +# TODO(BrianMichell, #535): Update this to use canonical factory functions. 
+ +from datetime import UTC +from datetime import datetime +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from mdio.core.v1._serializer import make_coordinate +from mdio.core.v1._serializer import make_dataset +from mdio.core.v1._serializer import make_dataset_metadata +from mdio.core.v1._serializer import make_named_dimension +from mdio.core.v1._serializer import make_variable +from mdio.core.v1.builder import write_mdio_metadata +from mdio.core.v1.factory import SCHEMA_TEMPLATE_MAP +from mdio.core.v1.factory import MDIOSchemaType +from mdio.schemas.compressors import Blosc +from mdio.schemas.dtype import ScalarType +from mdio.schemas.dtype import StructuredType + + +def test_make_toy_dataset(tmp_path: Path) -> None: + """Test that make_toy_dataset returns a Dataset object using the factory pattern.""" + # Create dataset using factory + template = SCHEMA_TEMPLATE_MAP[MDIOSchemaType.SEISMIC_3D_POST_STACK_GENERIC] + ds = template.create( + name="campos_3d", + shape=[256, 512, 384], # inline, crossline, time + header_fields={ + "cdp-x": "int32", + "cdp-y": "int32", + "elevation": "float16", + "some_scalar": "float16", + }, + create_coords=True, + sample_format="float32", + chunks=[128, 128, 128], + z_units={"unitsV1": {"time": "ms"}}, + attributes={ + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... ", + ], + "foo": "bar", + }, + ) + + # Print the JSON representation of the dataset schema + print("\nDataset Schema JSON:") + print(ds.model_dump_json(indent=2)) + + mdio_path = tmp_path / "test_toy_dataset.mdio" + write_mdio_metadata(ds, str(mdio_path)) + + # Verify metadata + assert ds.metadata.name == "campos_3d" + assert ds.metadata.api_version == "1.0.0" + assert ds.metadata.attributes == { + "textHeader": [ + "C01 .......................... ", + "C02 .......................... ", + "C03 .......................... 
", + ], + "foo": "bar", + } + + # Verify variables, coordinates, and dimensions + assert len(ds.variables) == 8 # noqa: PLR2004 + + # Find seismic variable + seismic = next(v for v in ds.variables if v.name == "seismic") + assert seismic.data_type == ScalarType.FLOAT32 + assert seismic.dimensions[0].name == "inline" + assert seismic.dimensions[1].name == "crossline" + assert seismic.dimensions[2].name == "sample" + assert seismic.compressor == Blosc(name="blosc", algorithm="zstd") + + # Find headers variable + headers = next(v for v in ds.variables if v.name == "headers") + assert isinstance(headers.data_type, StructuredType) + assert len(headers.data_type.fields) == 4 # noqa: PLR2004 + assert headers.dimensions[0].name == "inline" + assert headers.dimensions[1].name == "crossline" + assert headers.compressor == Blosc(name="blosc") + + # Find trace mask + mask = next(v for v in ds.variables if v.name == "trace_mask") + assert mask.data_type == ScalarType.BOOL + assert mask.dimensions[0].name == "inline" + assert mask.dimensions[1].name == "crossline" + assert mask.compressor == Blosc(name="blosc") + + # Find coordinates + cdp_x = next(v for v in ds.variables if v.name == "cdp-x") + assert cdp_x.data_type == ScalarType.FLOAT64 + assert cdp_x.dimensions[0].name == "inline" + assert cdp_x.dimensions[1].name == "crossline" + assert cdp_x.metadata.units_v1.length == "m" + + cdp_y = next(v for v in ds.variables if v.name == "cdp-y") + assert cdp_y.data_type == ScalarType.FLOAT64 + assert cdp_y.dimensions[0].name == "inline" + assert cdp_y.dimensions[1].name == "crossline" + assert cdp_y.metadata.units_v1.length == "m" + + +def test_named_dimension_invalid_size() -> None: + """Test that make_named_dimension raises a ValidationError for invalid size.""" + with pytest.raises(ValidationError): + make_named_dimension("dim", 0) + with pytest.raises(ValidationError): + make_named_dimension("dim", -1) + + +def test_make_coordinate_invalid_types() -> None: + """Test that make_coordinate raises a ValidationError for invalid types.""" + # dimensions must be a list of NamedDimension or str + with pytest.raises(ValidationError): + make_coordinate(name="coord", dimensions="notalist", data_type=ScalarType.FLOAT32) + # data_type must be a valid ScalarType + with pytest.raises(ValidationError): + make_coordinate(name="coord", dimensions=["x"], data_type="notatype") + + +def test_make_variable_invalid_args() -> None: + """Test that make_variable raises a ValidationError for invalid types.""" + # compressor must be Blosc, ZFP or None + with pytest.raises(ValidationError): + make_variable( + name="var", + dimensions=["x"], + data_type=ScalarType.FLOAT32, + compressor="notacompressor", + ) + # metadata dict must match VariableMetadata schema + with pytest.raises(ValidationError): + make_variable( + name="var", + dimensions=["x"], + data_type=ScalarType.FLOAT32, + compressor=None, + metadata={"foo": "bar"}, + ) + + +def test_make_dataset_metadata_invalid_created_on() -> None: + """Test that make_dataset_metadata raises a ValidationError for invalid created_on.""" + # created_on must be an aware datetime + with pytest.raises(ValidationError): + make_dataset_metadata(name="ds", api_version="1", created_on="not-a-date") + + +def test_make_dataset_invalid_variables_and_metadata_types() -> None: + """Test that make_dataset raises a ValidationError.""" + ts = datetime.now(UTC) + meta = make_dataset_metadata(name="ds", api_version="1", created_on=ts) + var = make_variable( + name="var", + dimensions=["x"], + 
data_type=ScalarType.FLOAT32, + compressor=None, + ) + # variables must be a list of Variable objects + with pytest.raises(ValidationError): + make_dataset(variables="notalist", metadata=meta) + # metadata must be a DatasetMetadata instance + with pytest.raises(ValidationError): + make_dataset(variables=[var], metadata={"foo": "bar"}) diff --git a/uv.lock b/uv.lock index de95c4e6..a81fb7b0 100644 --- a/uv.lock +++ b/uv.lock @@ -1811,7 +1811,7 @@ requires-dist = [ { name = "s3fs", marker = "extra == 'cloud'", specifier = "==2024.12.0" }, { name = "segy", specifier = ">=0.4.0,<0.5.0" }, { name = "tqdm", specifier = ">=4.67.0,<5.0.0" }, - { name = "xarray", specifier = ">=2025.3.1" }, + { name = "xarray", specifier = ">=2025.4.0" }, { name = "zarr", specifier = ">=3.0.8,<4.0.0" }, { name = "zfpy", marker = "extra == 'lossy'", specifier = ">=1.0.1,<2.0.0" }, ]