
Commit f67d509

Refactor SEG-Y workers to open files instead of passing SegyFile from main process for safer multiprocessing. (#575)
1 parent: c7461e1

3 files changed: +36 -18 lines
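The core issue: an open SegyFile holds OS-level state (an fsspec filesystem, buffers, and possibly network connections to cloud stores) that does not transfer safely to child processes, which is why the old code forced the 'spawn' start method. Passing only picklable constructor arguments and opening the file inside each worker sidesteps that entirely. Below is a minimal, runnable sketch of the same pattern using a plain local file; `scan_chunk`, `example.sgy`, and the byte ranges are hypothetical stand-ins, not MDIO code.

```python
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat


def scan_chunk(file_kw: dict, byte_range: tuple[int, int]) -> bytes:
    # Each worker opens its own handle from plain, picklable arguments;
    # no OS-level state (locks, buffers, sockets) crosses the process boundary.
    with open(**file_kw) as fh:
        fh.seek(byte_range[0])
        return fh.read(byte_range[1] - byte_range[0])


if __name__ == "__main__":
    file_kw = {"file": "example.sgy", "mode": "rb"}  # picklable constructor args
    ranges = [(0, 3600), (3600, 7200)]  # hypothetical byte ranges
    with ProcessPoolExecutor(max_workers=2) as executor:
        chunks = list(executor.map(scan_chunk, repeat(file_kw), ranges))
```

The same shape appears in all three files below: build a small dict of picklable arguments in the parent, `repeat()` it across tasks, and reconstruct the heavy object inside the worker.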

src/mdio/segy/_workers.py (22 additions, 5 deletions)

```diff
@@ -5,31 +5,47 @@
 import os
 from typing import TYPE_CHECKING
 from typing import Any
+from typing import TypedDict
 from typing import cast
 
 import numpy as np
+from segy import SegyFile
 
 if TYPE_CHECKING:
-    from segy import SegyFile
     from segy.arrays import HeaderArray
+    from segy.config import SegySettings
+    from segy.schema import SegySpec
     from zarr import Array
 
     from mdio.core import Grid
 
 
-def header_scan_worker(segy_file: SegyFile, trace_range: tuple[int, int]) -> HeaderArray:
+class SegyFileArguments(TypedDict):
+    """Arguments needed to open a SegyFile instance."""
+
+    url: str
+    spec: SegySpec | None
+    settings: SegySettings | None
+
+
+def header_scan_worker(
+    segy_kw: SegyFileArguments,
+    trace_range: tuple[int, int],
+) -> HeaderArray:
     """Header scan worker.
 
     If SegyFile is not open, it can either accept a path string or a handle that was opened in
     a different context manager.
 
     Args:
-        segy_file: SegyFile instance.
+        segy_kw: Arguments to open a SegyFile instance.
         trace_range: Tuple consisting of the trace ranges to read.
 
     Returns:
         HeaderArray parsed from SEG-Y library.
     """
+    segy_file = SegyFile(**segy_kw)
+
     slice_ = slice(*trace_range)
 
     cloud_native_mode = os.getenv("MDIO__IMPORT__CLOUD_NATIVE", default="False")
@@ -52,7 +68,7 @@ def header_scan_worker(segy_file: SegyFile, trace_range: tuple[int, int]) -> Hea
 
 
 def trace_worker(
-    segy_file: SegyFile,
+    segy_kw: SegyFileArguments,
     data_array: Array,
     metadata_array: Array,
     grid: Grid,
@@ -68,7 +84,7 @@ def trace_worker(
     slices across the sample dimension since SEG-Y data isn't chunked, eliminating concern.
 
     Args:
-        segy_file: SegyFile instance.
+        segy_kw: Arguments to open a SegyFile instance.
         data_array: Handle for zarr.Array we are writing traces to
         metadata_array: Handle for zarr.Array we are writing trace headers
         grid: mdio.Grid instance
@@ -78,6 +94,7 @@ def trace_worker(
         Partial statistics for chunk, or None
     """
     # Special case where there are no traces inside chunk.
+    segy_file = SegyFile(**segy_kw)
    live_subset = grid.live_mask[chunk_indices[:-1]]
 
    if np.count_nonzero(live_subset) == 0:
```
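As a usage sketch, the new worker signature would be driven like this. The URL is hypothetical, and passing `spec=None` / `settings=None` assumes the segy library falls back to its defaults when they are not provided:

```python
from mdio.segy._workers import header_scan_worker

# Hypothetical local file; None values assume the segy library's defaults.
segy_kw = {"url": "file:///tmp/example.sgy", "spec": None, "settings": None}
headers = header_scan_worker(segy_kw, trace_range=(0, 1024))
```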

src/mdio/segy/blocked_io.py (7 additions, 7 deletions)

```diff
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import multiprocessing as mp
 import os
 from concurrent.futures import ProcessPoolExecutor
 from itertools import repeat
@@ -48,22 +47,23 @@ def to_zarr(segy_file: SegyFile, grid: Grid, data_array: Array, header_array: Ar
     chunker = ChunkIterator(data_array, chunk_samples=False)
     num_chunks = len(chunker)
 
-    # For Unix async writes with s3fs/fsspec & multiprocessing, use 'spawn' instead of default
-    # 'fork' to avoid deadlocks on cloud stores. Slower but necessary. Default on Windows.
     num_cpus = int(os.getenv("MDIO__IMPORT__CPU_COUNT", default_cpus))
     num_workers = min(num_chunks, num_cpus)
-    context = mp.get_context("spawn")
-    executor = ProcessPoolExecutor(max_workers=num_workers, mp_context=context)
 
     # Chunksize here is for multiprocessing, not Zarr chunksize.
     pool_chunksize, extra = divmod(num_chunks, num_workers * 4)
     pool_chunksize += 1 if extra else pool_chunksize
 
+    segy_kw = {
+        "url": segy_file.fs.unstrip_protocol(segy_file.url),
+        "spec": segy_file.spec,
+        "settings": segy_file.settings,
+    }
     tqdm_kw = {"unit": "block", "dynamic_ncols": True}
-    with executor:
+    with ProcessPoolExecutor(max_workers=num_workers) as executor:
         lazy_work = executor.map(
             trace_worker,  # fn
-            repeat(segy_file),
+            repeat(segy_kw),
             repeat(data_array),
             repeat(header_array),
             repeat(grid),
```
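The `unstrip_protocol` call rebuilds a fully qualified URL from the filesystem object plus the protocol-stripped path that fsspec stores internally, so a child process can reopen the same object from scratch. A small sketch of the behavior, assuming fsspec is installed; the paths are hypothetical:

```python
import fsspec

fs = fsspec.filesystem("file")
fs.unstrip_protocol("/tmp/example.sgy")  # -> "file:///tmp/example.sgy"

fs = fsspec.filesystem("memory")
fs.unstrip_protocol("bucket/key.sgy")    # -> "memory://bucket/key.sgy"
```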

src/mdio/segy/parsers.py (7 additions, 6 deletions)

```diff
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import multiprocessing as mp
 import os
 from concurrent.futures import ProcessPoolExecutor
 from itertools import repeat
@@ -48,15 +47,17 @@ def parse_index_headers(
 
         trace_ranges.append((start, stop))
 
-    # For Unix async reads with s3fs/fsspec & multiprocessing, use 'spawn' instead of default
-    # 'fork' to avoid deadlocks on cloud stores. Slower but necessary. Default on Windows.
     num_cpus = int(os.getenv("MDIO__IMPORT__CPU_COUNT", default_cpus))
     num_workers = min(n_blocks, num_cpus)
-    context = mp.get_context("spawn")
 
+    segy_kw = {
+        "url": segy_file.fs.unstrip_protocol(segy_file.url),
+        "spec": segy_file.spec,
+        "settings": segy_file.settings,
+    }
     tqdm_kw = {"unit": "block", "dynamic_ncols": True}
-    with ProcessPoolExecutor(num_workers, mp_context=context) as executor:
-        lazy_work = executor.map(header_scan_worker, repeat(segy_file), trace_ranges)
+    with ProcessPoolExecutor(num_workers) as executor:
+        lazy_work = executor.map(header_scan_worker, repeat(segy_kw), trace_ranges)
 
     if progress_bar is True:
         lazy_work = tqdm(
```
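Both call sites share the `executor.map(fn, repeat(shared), per_task)` shape: `executor.map` yields results in submission order, so wrapping the lazy iterator in tqdm reports progress as each block completes. A self-contained sketch of that shape, with `count_traces` as a hypothetical stand-in for `header_scan_worker`:

```python
from concurrent.futures import ProcessPoolExecutor
from itertools import repeat

from tqdm.auto import tqdm


def count_traces(shared_kw: dict, trace_range: tuple[int, int]) -> int:
    # shared_kw is identical for every task; trace_range varies per block.
    return trace_range[1] - trace_range[0]


if __name__ == "__main__":
    trace_ranges = [(0, 1000), (1000, 2000), (2000, 2500)]
    with ProcessPoolExecutor(2) as executor:
        lazy_work = executor.map(count_traces, repeat({"url": "..."}), trace_ranges)
        totals = list(tqdm(lazy_work, total=len(trace_ranges), unit="block"))
    print(totals)  # [1000, 1000, 500], in submission order
```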
