Skip to content

Commit 077276a

Browse files
jrbourbeau, jhamman, and pre-commit-ci[bot]
authored
Add utility for opening remote files with fsspec (#9797)
Add utility for opening remote files with fsspec (#9797)

* Add utility for opening remote files with fsspec
* Apply Joe's suggestions from code review

Co-authored-by: Joe Hamman <jhamman1@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* Lint
* Add what's new entry
* Type hint
* Make mypy happy

---------

Co-authored-by: Joe Hamman <jhamman1@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
1 parent 339ed93 commit 077276a

File tree

4 files changed

+44
-1
lines changed

4 files changed

+44
-1
lines changed

doc/whats-new.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,9 @@ New Features
5757
- Optimize :py:meth:`DataArray.polyfit` and :py:meth:`Dataset.polyfit` with dask, when used with
5858
arrays with more than two dimensions.
5959
(:issue:`5629`). By `Deepak Cherian <https://github.com/dcherian>`_.
60+
- Support for directly opening remote files as string paths (for example, ``s3://bucket/data.nc``)
61+
with ``fsspec`` when using the ``h5netcdf`` engine (:issue:`9723`, :pull:`9797`).
62+
By `James Bourbeau <https://github.com/jrbourbeau>`_.
6063
- Re-implement the :py:mod:`ufuncs` module, which now dynamically dispatches to the
6164
underlying array's backend. Provides better support for certain wrapped array types
6265
like ``jax.numpy.ndarray``. (:issue:`7848`, :pull:`9776`).

xarray/backends/common.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,15 @@ def _normalize_path_list(
181181
return _normalize_path_list(paths)
182182

183183

184+
def _open_remote_file(file, mode, storage_options=None):
    """Open a remote path (e.g. ``s3://bucket/data.nc``) via fsspec.

    Resolves *file* to a filesystem with :func:`fsspec.get_fs_token_paths`
    and returns an open file-like object for the first resolved path.
    ``storage_options`` is forwarded to the filesystem constructor.
    """
    # Imported lazily so fsspec stays an optional dependency.
    import fsspec

    filesystem, _, resolved_paths = fsspec.get_fs_token_paths(
        file, mode=mode, storage_options=storage_options
    )
    return filesystem.open(resolved_paths[0], mode=mode)
184193
def _encode_variable_name(name):
185194
if name is None:
186195
name = NONE_VAR_NAME

xarray/backends/h5netcdf_.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
BackendEntrypoint,
1414
WritableCFDataStore,
1515
_normalize_path,
16+
_open_remote_file,
1617
datatree_from_dict_with_io_cleanup,
1718
find_root_and_group,
1819
)
@@ -149,9 +150,16 @@ def open(
149150
decode_vlen_strings=True,
150151
driver=None,
151152
driver_kwds=None,
153+
storage_options: dict[str, Any] | None = None,
152154
):
153155
import h5netcdf
154156

157+
if isinstance(filename, str) and is_remote_uri(filename) and driver is None:
158+
mode_ = "rb" if mode == "r" else mode
159+
filename = _open_remote_file(
160+
filename, mode=mode_, storage_options=storage_options
161+
)
162+
155163
if isinstance(filename, bytes):
156164
raise ValueError(
157165
"can't open netCDF4/HDF5 as bytes "
@@ -161,7 +169,7 @@ def open(
161169
magic_number = read_magic_number_from_file(filename)
162170
if not magic_number.startswith(b"\211HDF\r\n\032\n"):
163171
raise ValueError(
164-
f"{magic_number} is not the signature of a valid netCDF4 file"
172+
f"{magic_number!r} is not the signature of a valid netCDF4 file"
165173
)
166174

167175
if format not in [None, "NETCDF4"]:
@@ -425,6 +433,7 @@ def open_dataset(
425433
decode_vlen_strings=True,
426434
driver=None,
427435
driver_kwds=None,
436+
storage_options: dict[str, Any] | None = None,
428437
) -> Dataset:
429438
filename_or_obj = _normalize_path(filename_or_obj)
430439
store = H5NetCDFStore.open(
@@ -437,6 +446,7 @@ def open_dataset(
437446
decode_vlen_strings=decode_vlen_strings,
438447
driver=driver,
439448
driver_kwds=driver_kwds,
449+
storage_options=storage_options,
440450
)
441451

442452
store_entrypoint = StoreBackendEntrypoint()

xarray/tests/test_backends.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6489,3 +6489,24 @@ def test_zarr_safe_chunk_region(tmp_path):
64896489
chunk = ds.isel(region)
64906490
chunk = chunk.chunk()
64916491
chunk.chunk().to_zarr(store, region=region)
6492+
6493+
6494+
@requires_h5netcdf
@requires_fsspec
def test_h5netcdf_storage_options() -> None:
    """Open multiple ``file://`` URLs through fsspec with storage_options."""
    with create_tmp_files(2, allow_cleanup_failure=ON_WINDOWS) as (f1, f2):
        # Write two independent datasets to disk with the h5netcdf engine.
        first = create_test_data()
        first.to_netcdf(f1, engine="h5netcdf")

        second = create_test_data()
        second.to_netcdf(f2, engine="h5netcdf")

        # Re-open them as remote-style URLs; storage_options must reach fsspec.
        urls = [f"file://{f}" for f in [f1, f2]]
        combined = xr.open_mfdataset(
            urls,
            engine="h5netcdf",
            concat_dim="time",
            combine="nested",
            storage_options={"skip_instance_cache": False},
        )
        assert_identical(xr.concat([first, second], dim="time"), combined)

0 commit comments

Comments
 (0)