Chunked live mask (#527)

BrianMichell · tasansal · web-flow · commit 7a8b751a4767 · 2025-04-11T14:20:05.000-05:00
* Add minimal support for live mask configuration

* formatting

* Formatting

* Reduce possibility of integer overflow

* Automatic chunking of live_mask for grids that exceed blosc's maximum elements.

* Remove old test

* Formatting

* Resolve pre-commit issues

* Helps protect against integer overflow and logs when type promotion is performed.

* linting

* Use number of samples instead of live samples

Co-authored-by: Altay Sansal <tasansal@users.noreply.github.com>

* Explicitly use the live mask shape for base case

* Remove integer as a return type hint

* Rework chunking computation and expand test coverage

* Update to use Dask's chunk generation and ~500MiB chunk sizes

* Mock input arrays to increase speed and avoid pipeline OOM

* Cleanup, mocking, and relocation of autochunking

* Linting

* Clean tests up

* Use auto-chunking for live mask in factory.

* Fix import

* Update to handle live_mask not existing in Grid

* Use numpy empty instead of mocking class. Add test for Grid without live_mask.

* Linting

* Avoid bare except

* Mock the grid array info.

* simplify dtype determination logic

* use numpy instead of hand coding

* refactor auto chunking and optimize live mask and grid map creation

* consolidate tests and adjust for refactor

* only compare grid dims, because live mask is expected to be different

* move types to type checking block

* remove unnecessary comment

* remove comment repeating var name

* Use the pre-calculated constant

* Avoid magic numbers

* fix broken comment

---------

Co-authored-by: Altay Sansal <tasansal@users.noreply.github.com>
diff --git a/src/mdio/constants.py b/src/mdio/constants.py
@@ -12,20 +12,26 @@
 FLOAT64_MIN = np.finfo("float64").min
 FLOAT64_MAX = np.finfo("float64").max
 
-INT8_MIN = -0x80
-INT8_MAX = 0x7F
+INT8_MIN = np.iinfo("int8").min
+INT8_MAX = np.iinfo("int8").max
 
-INT16_MIN = -0x8000
-INT16_MAX = 0x7FFF
+INT16_MIN = np.iinfo("int16").min
+INT16_MAX = np.iinfo("int16").max
 
-INT32_MIN = -0x80000000
-INT32_MAX = 0x7FFFFFFF
+INT32_MIN = np.iinfo("int32").min
+INT32_MAX = np.iinfo("int32").max
 
-UINT8_MIN = 0x0
-UINT8_MAX = 0xFF
+INT64_MIN = np.iinfo("int64").min
+INT64_MAX = np.iinfo("int64").max
 
-UINT16_MIN = 0x0
-UINT16_MAX = 0xFFFF
+UINT8_MIN = 0
+UINT8_MAX = np.iinfo("uint8").max
 
-UINT32_MIN = 0x0
-UINT32_MAX = 0xFFFFFFFF
+UINT16_MIN = 0
+UINT16_MAX = np.iinfo("uint16").max
+
+UINT32_MIN = 0
+UINT32_MAX = np.iinfo("uint32").max
+
+UINT64_MIN = 0
+UINT64_MAX = np.iinfo("uint64").max
diff --git a/src/mdio/core/factory.py b/src/mdio/core/factory.py
@@ -34,6 +34,7 @@
 from mdio import MDIOWriter
 from mdio.api.io_utils import process_url
 from mdio.core import Grid
+from mdio.core.utils_write import get_live_mask_chunksize
 from mdio.core.utils_write import write_attribute
 from mdio.segy.helpers_segy import create_zarr_hierarchy
 
@@ -145,10 +146,12 @@ def create_empty(
     write_attribute(name="text_header", zarr_group=meta_group, attribute=DEFAULT_TEXT)
     write_attribute(name="binary_header", zarr_group=meta_group, attribute={})
 
+    live_shape = config.grid.shape[:-1]
+    live_chunks = get_live_mask_chunksize(live_shape)
     meta_group.create_dataset(
         name="live_mask",
-        shape=config.grid.shape[:-1],
-        chunks=-1,
+        shape=live_shape,
+        chunks=live_chunks,
         dtype="bool",
         dimension_separator="/",
     )
diff --git a/src/mdio/core/grid.py b/src/mdio/core/grid.py
@@ -4,13 +4,20 @@
 
 import inspect
 from dataclasses import dataclass
+from typing import TYPE_CHECKING
 
 import numpy as np
 import zarr
 
 from mdio.constants import UINT32_MAX
 from mdio.core import Dimension
 from mdio.core.serialization import Serializer
+from mdio.core.utils_write import get_constrained_chunksize
+
+
+if TYPE_CHECKING:
+    from segy.arrays import HeaderArray
+    from zarr import Array as ZarrArray
 
 
 @dataclass
@@ -25,10 +32,14 @@ class Grid:
 
     Args:
         dims: List of dimension instances.
-
     """
 
     dims: list[Dimension]
+    map: ZarrArray | None = None
+    live_mask: ZarrArray | None = None
+
+    _TARGET_MEMORY_PER_BATCH = 1 * 1024**3  # 1 GiB target per batch in build_map
+    _INTERNAL_CHUNK_SIZE_TARGET = 10 * 1024**2  # 10 MiB target for internal chunks
 
     def __post_init__(self):
         """Initialize convenience properties."""
@@ -77,23 +88,51 @@ def from_zarr(cls, zarr_root: zarr.Group):
 
         return cls(dims_list)
 
-    def build_map(self, index_headers):
+    def build_map(self, index_headers: HeaderArray) -> None:
         """Build a map for live traces based on `index_headers`.
 
         Args:
             index_headers: Headers to be normalized (indexed)
         """
-        live_dim_indices = tuple()
-        for dim in self.dims[:-1]:
-            dim_hdr = index_headers[dim.name]
-            live_dim_indices += (np.searchsorted(dim, dim_hdr),)
-
-        # We set dead traces to uint32 max. Should be far away from actual trace counts.
-        self.map = zarr.full(self.shape[:-1], dtype="uint32", fill_value=UINT32_MAX)
-        self.map.vindex[live_dim_indices] = range(len(live_dim_indices[0]))
-
-        self.live_mask = zarr.zeros(self.shape[:-1], dtype="bool")
-        self.live_mask.vindex[live_dim_indices] = 1
+        # Determine data type for the map based on grid size
+        grid_size = np.prod(self.shape[:-1])
+        map_dtype = "uint64" if grid_size > UINT32_MAX else "uint32"
+        fill_value = np.iinfo(map_dtype).max
+
+        # Initialize Zarr arrays for the map and live mask
+        live_shape = self.shape[:-1]
+        chunks = get_constrained_chunksize(
+            shape=live_shape,
+            dtype=map_dtype,
+            max_bytes=self._INTERNAL_CHUNK_SIZE_TARGET,
+        )
+        # Temporary zarrs for ingestion.
+        self.map = zarr.full(live_shape, fill_value, dtype=map_dtype, chunks=chunks)
+        self.live_mask = zarr.zeros(live_shape, dtype="bool", chunks=chunks)
+
+        # Calculate batch size for processing
+        memory_per_trace_index = index_headers.itemsize
+        batch_size = int(self._TARGET_MEMORY_PER_BATCH / memory_per_trace_index)
+        total_live_traces = index_headers.size
+
+        # Process live traces in batches
+        for start in range(0, total_live_traces, batch_size):
+            end = min(start + batch_size, total_live_traces)
+
+            # Compute indices for the current batch
+            live_dim_indices = []
+            for dim in self.dims[:-1]:
+                dim_hdr = index_headers[dim.name][start:end]
+                indices = np.searchsorted(dim, dim_hdr).astype(np.uint32)
+                live_dim_indices.append(indices)
+            live_dim_indices = tuple(live_dim_indices)
+
+            # Generate trace indices for the batch
+            trace_indices = np.arange(start, end, dtype=np.uint64)
+
+            # Update Zarr arrays for the batch
+            self.map.vindex[live_dim_indices] = trace_indices
+            self.live_mask.vindex[live_dim_indices] = True
 
 
 class GridSerializer(Serializer):
diff --git a/src/mdio/core/utils_write.py b/src/mdio/core/utils_write.py
@@ -1,11 +1,21 @@
 """Convenience utilities for writing to Zarr."""
 
+from typing import TYPE_CHECKING
 from typing import Any
 
-import zarr
+from dask.array.core import normalize_chunks
+from dask.array.rechunk import _balance_chunksizes
 
 
-def write_attribute(name: str, attribute: Any, zarr_group: zarr.Group) -> None:
+if TYPE_CHECKING:
+    from numpy.typing import DTypeLike
+    from zarr import Group
+
+
+MAX_SIZE_LIVE_MASK = 512 * 1024**2
+
+
+def write_attribute(name: str, attribute: Any, zarr_group: "Group") -> None:
     """Write a mappable to Zarr array or group attribute.
 
     Args:
@@ -14,3 +24,34 @@ def write_attribute(name: str, attribute: Any, zarr_group: zarr.Group) -> None:
         zarr_group: Output group or array.
     """
     zarr_group.attrs[name] = attribute
+
+
+def get_constrained_chunksize(
+    shape: tuple[int, ...],
+    dtype: "DTypeLike",
+    max_bytes: int,
+) -> tuple[int]:
+    """Calculate the optimal chunk size for N-D array based on max_bytes.
+
+    Args:
+        shape: The shape of the array.
+        dtype: The data dtype to be used in calculation.
+        max_bytes: The maximum allowed number of bytes per chunk.
+
+    Returns:
+        A sequence of integers of calculated chunk sizes.
+    """
+    chunks = normalize_chunks("auto", shape, dtype=dtype, limit=max_bytes)
+    return tuple(_balance_chunksizes(chunk)[0] for chunk in chunks)
+
+
+def get_live_mask_chunksize(shape: tuple[int, ...]) -> tuple[int]:
+    """Given a live_mask shape, calculate the optimal write chunk size.
+
+    Args:
+        shape: The shape of the array.
+
+    Returns:
+        A sequence of integers of calculated chunk sizes.
+    """
+    return get_constrained_chunksize(shape, "bool", MAX_SIZE_LIVE_MASK)
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,11 +1,21 @@
 """Test configuration before everything runs."""
 
+import warnings
 from os import path
 from urllib.request import urlretrieve
 
 import pytest
 
 
+# Suppress Dask's chunk balancing warning
+warnings.filterwarnings(
+    "ignore",
+    message="Could not balance chunks to be equal",
+    category=UserWarning,
+    module="dask.array.rechunk",
+)
+
+
 @pytest.fixture(scope="session")
 def fake_segy_tmp(tmp_path_factory):
     """Make a temp file for the fake SEG-Y files we are going to create."""
diff --git a/tests/unit/test_auto_chunking.py b/tests/unit/test_auto_chunking.py
@@ -0,0 +1,86 @@
+"""Test live mask chunk size calculation."""
+
+from typing import TYPE_CHECKING
+
+import numpy as np
+import pytest
+
+from mdio.core.utils_write import MAX_SIZE_LIVE_MASK
+from mdio.core.utils_write import get_constrained_chunksize
+from mdio.core.utils_write import get_live_mask_chunksize
+
+
+if TYPE_CHECKING:
+    from numpy.typing import DTypeLike
+
+
+@pytest.mark.parametrize(
+    ("shape", "dtype", "limit", "expected_chunks"),
+    [
+        ((100,), "int8", 100, (100,)),  # 1D full chunk
+        ((8, 6), "int8", 20, (4, 4)),  # 2D adjusted int8
+        ((6, 8), "int16", 96, (6, 8)),  # 2D small int16
+        ((9, 6, 4), "int8", 100, (5, 5, 4)),  # 3D adjusted
+        ((4, 5), "int32", 4, (1, 1)),  # test minimum edge case
+        ((10, 10), "int8", 1000, (10, 10)),  # big limit
+        ((7, 5), "int8", 35, (7, 5)),  # test full primes
+        ((7, 5), "int8", 23, (4, 4)),  # test adjusted primes
+    ],
+)
+def test_auto_chunking(
+    shape: tuple[int, ...],
+    dtype: "DTypeLike",
+    limit: int,
+    expected_chunks: tuple[int, ...],
+) -> None:
+    """Test automatic chunking based on size limit and an array spec."""
+    result = get_constrained_chunksize(shape, dtype, limit)
+    assert result == expected_chunks
+
+
+class TestAutoChunkLiveMask:
+    """Test class for live mask auto chunking."""
+
+    @pytest.mark.parametrize(
+        ("shape", "expected_chunks"),
+        [
+            ((100,), (100,)),  # small 1d
+            ((100, 100), (100, 100)),  # small 2d
+            ((50000, 50000), (25000, 25000)),  # large 2d
+            ((1500, 1500, 1500), (750, 750, 750)),  # large 3d
+            ((1000, 1000, 100, 36), (334, 334, 100, 36)),  # large 4d
+        ],
+    )
+    def test_auto_chunk_live_mask(
+        self,
+        shape: tuple[int, ...],
+        expected_chunks: tuple[int, ...],
+    ) -> None:
+        """Test auto chunked live mask yields the expected chunk shape."""
+        result = get_live_mask_chunksize(shape)
+        assert result == expected_chunks
+
+    @pytest.mark.parametrize(
+        "shape",
+        [
+            # All exceed MAX_SIZE_LIVE_MASK (512 MiB). Smaller ones tested above
+            (32768, 32768),
+            (46341, 46341),
+            (86341, 96341),
+            (55000, 97500),
+            (100000, 100000),
+            (1024, 1024, 1024),
+            (215, 215, 215, 215),
+            (512, 216, 512, 400),
+            (74, 74, 74, 74, 74),
+            (512, 17, 43, 200, 50),
+        ],
+    )
+    def test_auto_chunk_live_mask_nbytes(self, shape: tuple[int, ...]) -> None:
+        """Test auto chunked live mask is within expected number of bytes."""
+        result = get_live_mask_chunksize(shape)
+        chunk_elements = np.prod(result)
+
+        # Chunk element count must be within 25% of MAX_SIZE_LIVE_MASK (512 MiB)
+        assert chunk_elements > MAX_SIZE_LIVE_MASK * 0.75
+        assert chunk_elements < MAX_SIZE_LIVE_MASK * 1.25
diff --git a/tests/unit/test_factory.py b/tests/unit/test_factory.py
@@ -20,7 +20,8 @@ def test_create_empty_like(mock_reader: MDIOReader):
 
     source_reader = mock_reader
     dest_reader = MDIOReader(dest_path)
-    assert source_reader.grid == dest_reader.grid
+    assert source_reader.grid.dims == dest_reader.grid.dims
+    assert source_reader.live_mask != dest_reader.grid.live_mask
 
     source_traces = source_reader._traces
     dest_traces = dest_reader._traces