Skip to content

Commit 8e4af5b

Browse files
authored
Support for MDIO Origination from Scratch or Existing Dataset (#529)
* Introduce MDIO dataset factory for streamlined creation. * Refactor metadata handling and improve getter/setters. * Refactor ZFPY import to handle missing dependency * Add notebook in docs for creating and populating MDIO files * update headings for docs rendering * Refactor test fixtures for creating mock MDIO data. * update test based on new mock mdio * title case * correct live_mask explanation * update plot axes * Use dynamic package version for API version metadata * fix docstring consolidate * fix write docstring * promote stats keys to class attribute for reusability * add `create_empty_like` function. * tests for `create_empty_like` * expose create_empty_like at root module level * add `create_empty_like` demo. * remove empty cell * use Codec ABC instead of specific compressors
1 parent 1dc2364 commit 8e4af5b

File tree

10 files changed

+1296
-223
lines changed

10 files changed

+1296
-223
lines changed

docs/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ maxdepth: 1
1717
---
1818
installation
1919
notebooks/quickstart
20+
notebooks/creation
2021
notebooks/compression
2122
notebooks/rechunking
2223
usage

docs/notebooks/creation.ipynb

Lines changed: 815 additions & 0 deletions
Large diffs are not rendered by default.

src/mdio/__init__.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,12 @@
77
from mdio.api.convenience import copy_mdio
88
from mdio.converters import mdio_to_segy
99
from mdio.converters import segy_to_mdio
10+
from mdio.core.dimension import Dimension
11+
from mdio.core.factory import MDIOCreateConfig
12+
from mdio.core.factory import MDIOVariableConfig
13+
from mdio.core.factory import create_empty
14+
from mdio.core.factory import create_empty_like
15+
from mdio.core.grid import Grid
1016

1117

1218
__all__ = [
@@ -15,6 +21,12 @@
1521
"copy_mdio",
1622
"mdio_to_segy",
1723
"segy_to_mdio",
24+
"Dimension",
25+
"MDIOCreateConfig",
26+
"MDIOVariableConfig",
27+
"create_empty",
28+
"create_empty_like",
29+
"Grid",
1830
]
1931

2032

src/mdio/api/accessor.py

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ class MDIOAccessor:
129129
we can directly unpack it and use it further down our code.
130130
"""
131131

132+
_stats_keys = {"mean", "std", "rms", "min", "max"}
133+
132134
_array_load_function_mapper = {
133135
"zarr": open_zarr_array,
134136
"dask": open_zarr_array_dask,
@@ -154,16 +156,13 @@ def __init__(
154156

155157
# Set private attributes for public interface.
156158
# Pep8 complains because they are defined outside __init__
157-
self._binary_header = None
158159
self._chunks = None
159160
self._live_mask = None
160161
self._root = None
161162
self._n_dim = None
162163
self._orig_chunks = None
163164
self._store = None
164165
self._shape = None
165-
self._stats = None
166-
self._text_header = None
167166
self._trace_count = None
168167

169168
# Private attributes
@@ -212,19 +211,17 @@ def _connect(self):
212211
)
213212
raise MDIONotFoundError(msg) from e
214213

214+
def _consolidate_metadata(self) -> None:
215+
"""Flush optimized MDIO metadata, run after modifying it."""
216+
zarr.consolidate_metadata(self.root.store)
217+
215218
def _deserialize_grid(self):
216219
"""Deserialize grid from Zarr metadata."""
217220
self.grid = Grid.from_zarr(self.root)
218221

219222
def _set_attributes(self):
220223
"""Deserialize attributes from Zarr metadata."""
221224
self.trace_count = self.root.attrs["trace_count"]
222-
self.stats = {
223-
key: self.root.attrs[key] for key in ["mean", "std", "rms", "min", "max"]
224-
}
225-
226-
self.text_header = self._metadata_group.attrs["text_header"]
227-
self.binary_header = self._metadata_group.attrs["binary_header"]
228225

229226
# Grid based attributes
230227
self.shape = self.grid.shape
@@ -331,26 +328,28 @@ def trace_count(self, value: int) -> None:
331328
@property
332329
def text_header(self) -> list:
333330
"""Get seismic text header."""
334-
return self._text_header
331+
return self._metadata_group.attrs["text_header"]
335332

336333
@text_header.setter
337334
def text_header(self, value: list) -> None:
338335
"""Validate and set seismic text header."""
339-
if not isinstance(value, list):
336+
if not isinstance(value, list) or len(value) != 40:
340337
raise AttributeError("Text header must be a list of str with 40 elements")
341-
self._text_header = value
338+
self._metadata_group.attrs["text_header"] = value
339+
self._consolidate_metadata()
342340

343341
@property
344342
def binary_header(self) -> dict:
345343
"""Get seismic binary header metadata."""
346-
return self._binary_header
344+
return self._metadata_group.attrs["binary_header"]
347345

348346
@binary_header.setter
349347
def binary_header(self, value: dict) -> None:
350348
"""Validate and set seismic binary header metadata."""
351349
if not isinstance(value, dict):
352350
raise AttributeError("Binary header has to be a dictionary type collection")
353-
self._binary_header = value
351+
self._metadata_group.attrs["binary_header"] = value
352+
self._consolidate_metadata()
354353

355354
@property
356355
def chunks(self) -> tuple[int, ...]:
@@ -365,12 +364,16 @@ def chunks(self, value: tuple[int, ...]) -> None:
365364
@property
366365
def stats(self) -> dict:
367366
"""Get global statistics like min/max/rms/std."""
368-
return self._stats
367+
return {key: self.root.attrs[key] for key in self._stats_keys}
369368

370369
@stats.setter
def stats(self, value: dict) -> None:
    """Validate and persist global statistics (mean/std/rms/min/max).

    Args:
        value: Mapping that must contain at least every key listed in
            ``self._stats_keys``.

    Raises:
        AttributeError: If ``value`` is not a dict or is missing any of the
            required statistics keys.
    """
    if not isinstance(value, dict) or not self._stats_keys.issubset(value.keys()):
        # Sort the keys so the message is stable between runs; a raw set
        # would print in hash order.
        msg = f"To set stats, you must provide keys: {sorted(self._stats_keys)}"
        raise AttributeError(msg)
    # Statistics live on the root group's attributes; re-consolidate so the
    # optimized metadata reflects the new values.
    self.root.attrs.update(value)
    self._consolidate_metadata()
374377

375378
@property
376379
def _metadata_group(self) -> zarr.Group:
@@ -403,6 +406,7 @@ def __getitem__(self, item: int | tuple) -> npt.ArrayLike | da.Array | tuple:
403406
def __setitem__(self, key: int | tuple, value: npt.ArrayLike) -> None:
404407
"""Data setter."""
405408
self._traces[key] = value
409+
self._live_mask[key] = True
406410

407411
def coord_to_index(
408412
self,
@@ -643,10 +647,10 @@ def __init__(
643647
memory_cache_size: int = 0,
644648
disk_cache: bool = False,
645649
): # TODO: Disabled all caching by default, sometimes causes performance issues
646-
"""Initialize super class with `r+` permission."""
650+
"""Initialize accessor class with `w` permission."""
647651
super().__init__(
648652
mdio_path_or_buffer=mdio_path_or_buffer,
649-
mode="r+",
653+
mode="w",
650654
access_pattern=access_pattern,
651655
storage_options=storage_options,
652656
return_metadata=return_metadata,

src/mdio/converters/segy.py

Lines changed: 77 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
import logging
66
import os
77
from collections.abc import Sequence
8-
from datetime import datetime
9-
from datetime import timezone
108
from importlib import metadata
119
from typing import Any
1210

@@ -15,19 +13,30 @@
1513
from segy import SegyFile
1614
from segy.config import SegySettings
1715
from segy.schema import HeaderField
16+
from zarr import Blosc
1817

1918
from mdio.api.io_utils import process_url
2019
from mdio.converters.exceptions import EnvironmentFormatError
2120
from mdio.converters.exceptions import GridTraceCountError
2221
from mdio.converters.exceptions import GridTraceSparsityError
2322
from mdio.core import Grid
23+
from mdio.core.factory import MDIOCreateConfig
24+
from mdio.core.factory import MDIOVariableConfig
25+
from mdio.core.factory import create_empty
2426
from mdio.core.utils_write import write_attribute
2527
from mdio.segy import blocked_io
2628
from mdio.segy.compat import mdio_segy_spec
27-
from mdio.segy.helpers_segy import create_zarr_hierarchy
2829
from mdio.segy.utilities import get_grid_plan
2930

3031

32+
try:
33+
import zfpy # Base library
34+
from zarr import ZFPY # Codec
35+
except ImportError:
36+
ZFPY = None
37+
zfpy = None
38+
39+
3140
logger = logging.getLogger(__name__)
3241

3342
try:
@@ -103,6 +112,28 @@ def grid_density_qc(grid: Grid, num_traces: int) -> None:
103112
raise GridTraceSparsityError(grid.shape, num_traces, msg)
104113

105114

115+
def get_compressor(
116+
lossless: bool, compression_tolerance: float = -1
117+
) -> Blosc | ZFPY | None:
118+
"""Get the appropriate compressor for the seismic traces."""
119+
if lossless:
120+
compressor = Blosc("zstd")
121+
else:
122+
if zfpy is None or ZFPY is None:
123+
msg = (
124+
"Lossy compression requires the 'zfpy' library. It is "
125+
"not installed in your environment. To proceed please "
126+
"install 'zfpy' or install mdio with `--extras lossy`"
127+
)
128+
raise ImportError(msg)
129+
130+
compressor = ZFPY(
131+
mode=zfpy.mode_fixed_accuracy,
132+
tolerance=compression_tolerance,
133+
)
134+
return compressor
135+
136+
106137
def segy_to_mdio( # noqa: C901
107138
segy_path: str,
108139
mdio_path_or_buffer: str,
@@ -364,14 +395,6 @@ def segy_to_mdio( # noqa: C901
364395
if storage_options_output is None:
365396
storage_options_output = {}
366397

367-
store = process_url(
368-
url=mdio_path_or_buffer,
369-
mode="w",
370-
storage_options=storage_options_output,
371-
memory_cache_size=0, # Making sure disk caching is disabled,
372-
disk_cache=False, # Making sure disk caching is disabled
373-
)
374-
375398
# Open SEG-Y with MDIO's SegySpec. Endianness will be inferred.
376399
mdio_spec = mdio_segy_spec()
377400
segy_settings = SegySettings(storage_options=storage_options_input)
@@ -406,45 +429,6 @@ def segy_to_mdio( # noqa: C901
406429
logger.warning(f"Ingestion grid shape: {grid.shape}.")
407430
raise GridTraceCountError(np.sum(grid.live_mask), num_traces)
408431

409-
zarr_root = create_zarr_hierarchy(
410-
store=store,
411-
overwrite=overwrite,
412-
)
413-
414-
# Get UTC time, then add local timezone information offset.
415-
iso_datetime = datetime.now(timezone.utc).isoformat()
416-
417-
write_attribute(name="created", zarr_group=zarr_root, attribute=iso_datetime)
418-
write_attribute(name="api_version", zarr_group=zarr_root, attribute=API_VERSION)
419-
420-
dimensions_dict = [dim.to_dict() for dim in dimensions]
421-
write_attribute(name="dimension", zarr_group=zarr_root, attribute=dimensions_dict)
422-
423-
# Write trace count
424-
trace_count = np.count_nonzero(grid.live_mask)
425-
write_attribute(name="trace_count", zarr_group=zarr_root, attribute=trace_count)
426-
427-
# Note, live mask is not chunked since it's bool and small.
428-
zarr_root["metadata"].create_dataset(
429-
data=grid.live_mask,
430-
name="live_mask",
431-
shape=grid.shape[:-1],
432-
chunks=-1,
433-
dimension_separator="/",
434-
)
435-
436-
write_attribute(
437-
name="text_header",
438-
zarr_group=zarr_root["metadata"],
439-
attribute=text_header.split("\n"),
440-
)
441-
442-
write_attribute(
443-
name="binary_header",
444-
zarr_group=zarr_root["metadata"],
445-
attribute=binary_header.to_dict(),
446-
)
447-
448432
if chunksize is None:
449433
dim_count = len(index_names) + 1
450434
if dim_count == 2:
@@ -467,18 +451,59 @@ def segy_to_mdio( # noqa: C901
467451
suffix = [str(idx) for idx, value in enumerate(suffix) if value is not None]
468452
suffix = "".join(suffix)
469453

454+
# Pass by keyword: get_compressor's positional order is (lossless, compression_tolerance).
compressor = get_compressor(lossless=lossless, compression_tolerance=compression_tolerance)
455+
header_dtype = segy.spec.trace.header.dtype.newbyteorder("=")
456+
var_conf = MDIOVariableConfig(
457+
name=f"chunked_{suffix}",
458+
dtype="float32",
459+
chunks=chunksize,
460+
compressor=compressor,
461+
header_dtype=header_dtype,
462+
)
463+
config = MDIOCreateConfig(path=mdio_path_or_buffer, grid=grid, variables=[var_conf])
464+
465+
zarr_root = create_empty(
466+
config,
467+
overwrite=overwrite,
468+
storage_options=storage_options_output,
469+
consolidate_meta=False,
470+
)
471+
data_group, meta_group = zarr_root["data"], zarr_root["metadata"]
472+
data_array = data_group[f"chunked_{suffix}"]
473+
header_array = meta_group[f"chunked_{suffix}_trace_headers"]
474+
475+
# Write actual live mask and metadata to empty MDIO
476+
meta_group["live_mask"][:] = grid.live_mask
477+
write_attribute(
478+
name="trace_count",
479+
zarr_group=zarr_root,
480+
attribute=np.count_nonzero(grid.live_mask),
481+
)
482+
write_attribute(
483+
name="text_header",
484+
zarr_group=zarr_root["metadata"],
485+
attribute=text_header.split("\n"),
486+
)
487+
write_attribute(
488+
name="binary_header",
489+
zarr_group=zarr_root["metadata"],
490+
attribute=binary_header.to_dict(),
491+
)
492+
493+
# Write traces
470494
stats = blocked_io.to_zarr(
471495
segy_file=segy,
472496
grid=grid,
473-
data_root=zarr_root["data"],
474-
metadata_root=zarr_root["metadata"],
497+
data_array=data_array,
498+
header_array=header_array,
475499
name="_".join(["chunked", suffix]),
476500
dtype="float32",
477501
chunks=chunksize,
478502
lossless=lossless,
479503
compression_tolerance=compression_tolerance,
480504
)
481505

506+
# Write actual stats
482507
for key, value in stats.items():
483508
write_attribute(name=key, zarr_group=zarr_root, attribute=value)
484509

0 commit comments

Comments
 (0)