Commit 3679a5d

Allow setting (or skipping) new indexes in open_dataset (#8051)

Authored by benbovy, with dcherian, TomNicholas, and keewis.

Co-authored-by: Deepak Cherian <dcherian@users.noreply.github.com>
Co-authored-by: Tom Nicholas <tom@cworthy.org>
Co-authored-by: Justus Magin <keewis@posteo.de>
Co-authored-by: Justus Magin <keewis@users.noreply.github.com>
Co-authored-by: Deepak Cherian <deepak@cherian.net>

1 parent 37dbae1, commit 3679a5d

File tree: 6 files changed, +185 −10 lines

doc/whats-new.rst

Lines changed: 2 additions & 1 deletion

@@ -12,7 +12,8 @@ v2025.07.1 (unreleased)
 
 New Features
 ~~~~~~~~~~~~
-
+- Allow skipping the creation of default indexes when opening datasets (:pull:`8051`).
+  By `Benoit Bovy <https://github.com/benbovy>`_ and `Justus Magin <https://github.com/keewis>`_.
 
 Breaking changes
 ~~~~~~~~~~~~~~~~
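
In practice, the new keyword is used like this (a minimal sketch; the file name data.nc is hypothetical):

    import xarray as xr

    # Default behaviour: pandas indexes are created for dimension coordinates,
    # which loads their data into memory.
    ds = xr.open_dataset("data.nc")

    # Skip default index creation: coordinate data stays lazy, and
    # ds_noindex.xindexes is empty unless the backend itself created indexes.
    ds_noindex = xr.open_dataset("data.nc", create_default_indexes=False)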

xarray/backends/api.py

Lines changed: 62 additions & 7 deletions

@@ -36,6 +36,7 @@
 from xarray.backends.locks import _get_scheduler
 from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder
 from xarray.core import indexing
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataarray import DataArray
 from xarray.core.dataset import Dataset
 from xarray.core.datatree import DataTree
@@ -379,6 +380,15 @@ def _chunk_ds(
     return backend_ds._replace(variables)
 
 
+def _maybe_create_default_indexes(ds):
+    to_index = {
+        name: coord.variable
+        for name, coord in ds.coords.items()
+        if coord.dims == (name,) and name not in ds.xindexes
+    }
+    return ds.assign_coords(Coordinates(to_index))
+
+
 def _dataset_from_backend_dataset(
     backend_ds,
     filename_or_obj,
@@ -389,6 +399,7 @@ def _dataset_from_backend_dataset(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -397,11 +408,15 @@ def _dataset_from_backend_dataset(
         )
 
     _protect_dataset_variables_inplace(backend_ds, cache)
-    if chunks is None:
-        ds = backend_ds
+
+    if create_default_indexes:
+        ds = _maybe_create_default_indexes(backend_ds)
     else:
+        ds = backend_ds
+
+    if chunks is not None:
         ds = _chunk_ds(
-            backend_ds,
+            ds,
             filename_or_obj,
             engine,
             chunks,
@@ -434,6 +449,7 @@ def _datatree_from_backend_datatree(
     inline_array,
     chunked_array_type,
     from_array_kwargs,
+    create_default_indexes,
     **extra_tokens,
 ):
     if not isinstance(chunks, int | dict) and chunks not in {None, "auto"}:
@@ -442,9 +458,11 @@ def _datatree_from_backend_datatree(
         )
 
     _protect_datatree_variables_inplace(backend_tree, cache)
-    if chunks is None:
-        tree = backend_tree
+    if create_default_indexes:
+        tree = backend_tree.map_over_datasets(_maybe_create_default_indexes)
     else:
+        tree = backend_tree
+    if chunks is not None:
         tree = DataTree.from_dict(
             {
                 path: _chunk_ds(
@@ -459,11 +477,12 @@ def _datatree_from_backend_datatree(
                     node=path,
                     **extra_tokens,
                 )
-                for path, [node] in group_subtrees(backend_tree)
+                for path, [node] in group_subtrees(tree)
             },
-            name=backend_tree.name,
+            name=tree.name,
         )
 
+    if create_default_indexes or chunks is not None:
         for path, [node] in group_subtrees(backend_tree):
             tree[path].set_close(node._close)
 
@@ -497,6 +516,7 @@ def open_dataset(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -610,6 +630,13 @@
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -702,6 +729,7 @@
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -725,6 +753,7 @@ def open_dataarray(
     concat_characters: bool | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -833,6 +862,13 @@
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -890,6 +926,7 @@
         chunks=chunks,
         cache=cache,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         inline_array=inline_array,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
@@ -946,6 +983,7 @@ def open_datatree(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1055,6 +1093,13 @@
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1148,6 +1193,7 @@
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
@@ -1175,6 +1221,7 @@ def open_groups(
     concat_characters: bool | Mapping[str, bool] | None = None,
     decode_coords: Literal["coordinates", "all"] | bool | None = None,
     drop_variables: str | Iterable[str] | None = None,
+    create_default_indexes: bool = True,
     inline_array: bool = False,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
@@ -1286,6 +1333,13 @@
         A variable or list of variables to exclude from being parsed from the
         dataset. This may be useful to drop variables with problems or
         inconsistent values.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
     inline_array: bool, default: False
         How to include the array in the dask task graph.
         By default(``inline_array=False``) the array is included in a task by
@@ -1381,6 +1435,7 @@
         chunked_array_type,
         from_array_kwargs,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         **decoders,
         **kwargs,
     )
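
The _maybe_create_default_indexes helper relies on the fact that constructing a Coordinates object without an explicit indexes argument builds default pandas indexes for dimension coordinates. A self-contained sketch of that mechanism (the toy dataset is illustrative, not part of the commit):

    import xarray as xr
    from xarray import Coordinates, Variable

    # A dataset whose "x" dimension coordinate deliberately has no index,
    # mimicking what a backend now returns when index creation is skipped.
    coords = Coordinates({"x": Variable("x", [-1, 0, 1])}, indexes={})
    ds = xr.Dataset({"data": ("x", [10, 20, 30])}, coords=coords)
    assert len(ds.xindexes) == 0

    # Same selection logic as the helper above: dimension coordinates
    # that do not have an index yet.
    to_index = {
        name: coord.variable
        for name, coord in ds.coords.items()
        if coord.dims == (name,) and name not in ds.xindexes
    }
    # Coordinates(...) without indexes= creates default pandas indexes.
    indexed = ds.assign_coords(Coordinates(to_index))
    assert "x" in indexed.xindexes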

xarray/backends/store.py

Lines changed: 15 additions & 2 deletions

@@ -9,6 +9,7 @@
     AbstractDataStore,
     BackendEntrypoint,
 )
+from xarray.core.coordinates import Coordinates
 from xarray.core.dataset import Dataset
 
 if TYPE_CHECKING:
@@ -36,6 +37,7 @@ def open_dataset(
         concat_characters=True,
         decode_coords=True,
         drop_variables: str | Iterable[str] | None = None,
+        set_indexes: bool = True,
         use_cftime=None,
         decode_timedelta=None,
     ) -> Dataset:
@@ -56,8 +58,19 @@
             decode_timedelta=decode_timedelta,
         )
 
-        ds = Dataset(vars, attrs=attrs)
-        ds = ds.set_coords(coord_names.intersection(vars))
+        # split data and coordinate variables (promote dimension coordinates)
+        data_vars = {}
+        coord_vars = {}
+        for name, var in vars.items():
+            if name in coord_names or var.dims == (name,):
+                coord_vars[name] = var
+            else:
+                data_vars[name] = var
+
+        # explicit Coordinates object with no index passed
+        coords = Coordinates(coord_vars, indexes={})
+
+        ds = Dataset(data_vars, coords=coords, attrs=attrs)
         ds.set_close(filename_or_obj.close)
         ds.encoding = encoding
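
The key detail above is Coordinates(coord_vars, indexes={}): an explicitly empty indexes mapping promotes the variables to coordinates while suppressing the automatic index creation that the old Dataset + set_coords path triggered. A small standalone illustration of the same split (the variables here are invented):

    import numpy as np
    from xarray import Coordinates, Dataset, Variable

    variables = {
        "data": Variable(("x",), np.arange(3)),
        "x": Variable(("x",), [-1, 0, 1]),  # dimension coordinate
    }
    coord_names: set[str] = set()  # names flagged as coordinates by decoding

    # Split data and coordinate variables (promote dimension coordinates).
    data_vars, coord_vars = {}, {}
    for name, var in variables.items():
        if name in coord_names or var.dims == (name,):
            coord_vars[name] = var
        else:
            data_vars[name] = var

    # indexes={} means: make these coordinates, but build no index for them.
    ds = Dataset(data_vars, coords=Coordinates(coord_vars, indexes={}))
    assert list(ds.coords) == ["x"] and len(ds.xindexes) == 0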

xarray/backends/zarr.py

Lines changed: 9 additions & 0 deletions

@@ -1347,6 +1347,7 @@ def open_zarr(
     use_zarr_fill_value_as_mask=None,
     chunked_array_type: str | None = None,
     from_array_kwargs: dict[str, Any] | None = None,
+    create_default_indexes=True,
     **kwargs,
 ):
     """Load and decode a dataset from a Zarr store.
@@ -1457,6 +1458,13 @@
         chunked arrays, via whichever chunk manager is specified through the ``chunked_array_type`` kwarg.
         Defaults to ``{'manager': 'dask'}``, meaning additional kwargs will be passed eventually to
         :py:func:`dask.array.from_array`. Experimental API that should not be relied upon.
+    create_default_indexes : bool, default: True
+        If True, create pandas indexes for :term:`dimension coordinates <dimension coordinate>`,
+        which loads the coordinate data into memory. Set it to False if you want to avoid loading
+        data into memory.
+
+        Note that backends can still choose to create other indexes. If you want to control that,
+        please refer to the backend's documentation.
 
     Returns
     -------
@@ -1513,6 +1521,7 @@
         engine="zarr",
         chunks=chunks,
         drop_variables=drop_variables,
+        create_default_indexes=create_default_indexes,
         chunked_array_type=chunked_array_type,
         from_array_kwargs=from_array_kwargs,
         backend_kwargs=backend_kwargs,
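
open_zarr simply forwards the new keyword to open_dataset with engine="zarr". Indexes that were skipped at open time can still be created later, on demand (a sketch; the store path is hypothetical):

    import xarray as xr

    # Open lazily, without building a pandas index for "x".
    ds = xr.open_zarr("store.zarr", create_default_indexes=False)

    # Opt in to an index only for the coordinate you need label-based
    # selection on; set_xindex defaults to a pandas index.
    ds = ds.set_xindex("x")
    subset = ds.sel(x=slice(0, 10))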

xarray/tests/test_backends.py

Lines changed: 61 additions & 0 deletions

@@ -55,6 +55,7 @@
 from xarray.coding.variables import SerializationWarning
 from xarray.conventions import encode_dataset_coordinates
 from xarray.core import indexing
+from xarray.core.indexes import PandasIndex
 from xarray.core.options import set_options
 from xarray.core.types import PDDatetimeUnitOptions
 from xarray.core.utils import module_available
@@ -2066,6 +2067,26 @@ def test_encoding_enum__error_multiple_variable_with_changing_enum(self):
         with self.roundtrip(original):
             pass
 
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+
 
 @requires_netCDF4
 class TestNetCDF4Data(NetCDF4Base):
@@ -4063,6 +4084,26 @@ def test_pickle(self) -> None:
     def test_pickle_dataarray(self) -> None:
         pass
 
+    @pytest.mark.parametrize("create_default_indexes", [True, False])
+    def test_create_default_indexes(self, tmp_path, create_default_indexes) -> None:
+        store_path = tmp_path / "tmp.nc"
+        original_ds = xr.Dataset(
+            {"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]}
+        )
+        original_ds.to_netcdf(store_path, engine=self.engine, mode="w")
+
+        with open_dataset(
+            store_path,
+            engine=self.engine,
+            create_default_indexes=create_default_indexes,
+        ) as loaded_ds:
+            if create_default_indexes:
+                assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                    loaded_ds.xindexes["x"], PandasIndex
+                )
+            else:
+                assert len(loaded_ds.xindexes) == 0
+
 
 @requires_scipy
 class TestScipyFilePath(CFEncodedBase, NetCDF3Only):
@@ -6434,6 +6475,26 @@ def test_zarr_closing_internal_zip_store():
     assert_identical(original_da, loaded_da)
 
 
+@requires_zarr
+@pytest.mark.parametrize("create_default_indexes", [True, False])
+def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:
+    from xarray.core.indexes import PandasIndex
+
+    store_path = tmp_path / "tmp.zarr"
+    original_ds = xr.Dataset({"data": ("x", np.arange(3))}, coords={"x": [-1, 0, 1]})
+    original_ds.to_zarr(store_path, mode="w")
+
+    with open_dataset(
+        store_path, engine="zarr", create_default_indexes=create_default_indexes
+    ) as loaded_ds:
+        if create_default_indexes:
+            assert list(loaded_ds.xindexes) == ["x"] and isinstance(
+                loaded_ds.xindexes["x"], PandasIndex
+            )
+        else:
+            assert len(loaded_ds.xindexes) == 0
+
+
 @requires_zarr
 @pytest.mark.usefixtures("default_zarr_format")
 def test_raises_key_error_on_invalid_zarr_store(tmp_path):
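
The tests above cover the Dataset entry points; the datatree code path changed in api.py can be exercised the same way (a sketch, assuming a netCDF-capable backend such as netcdf4 or h5netcdf is installed; group names and file name are invented):

    import numpy as np
    import xarray as xr

    tree = xr.DataTree.from_dict(
        {
            "/": xr.Dataset({"a": ("x", np.arange(3))}, coords={"x": [0, 1, 2]}),
            "/child": xr.Dataset({"b": ("y", np.arange(2))}, coords={"y": [10, 20]}),
        }
    )
    tree.to_netcdf("tree.nc")

    # Every node comes back without indexes.
    dt = xr.open_datatree("tree.nc", create_default_indexes=False)
    assert all(len(node.dataset.xindexes) == 0 for node in dt.subtree)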
