Start splitting up dataset.py (#10039)

max-sixty · web-flow · commit 1873874d7e50 · 2025-02-09T21:12:16.000-08:00
* Start splitting up `dataset.py`

Currently, `dataset.py` is 10956 lines long. This makes doing any work with current LLMs basically impossible — with Claude's tokenizer, the file is 104K tokens, or >2.5x the size of the _per-minute_ rate limit for basic accounts. Most of xarray touches it in some way, so you generally want to give the LLM the file for context.

Even if you don't think "LLMs are the future, let's code with vibes!", the file is still really long and can be difficult to navigate (though, to be fair, it can be easy to just grep).

So I would propose:
- We start breaking it up, while also being cognizant that big changes can cause merge conflicts
- Start with the low-hanging fruit
  - For example, this PR moves code outside the class (but that's quite limited)
  - Then move some of the code from the big methods into functions in other files, like `curve_fit`
- Possibly (has tradeoffs; needs discussion) build some mixins so we can split up the class, if we want to have much smaller files
- We can also think about other files: `dataarray.py` is 7.5K lines. The tests are also huge (`test_dataset` is 7.5K lines), but unlike with the library code, we can copy chunks of tests out & in when developing.

(Note that I don't have any strong views on exactly what code should go in which file; I made a quick guess — very open to any suggestions; also easy to change later, particularly since this code doesn't change much so is less likely to cause conflicts)

* .
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -893,7 +893,7 @@ def _item_key_to_dict(self, key: Any) -> Mapping[Hashable, Any]:
         return dict(zip(self.dims, key, strict=True))
 
     def _getitem_coord(self, key: Any) -> Self:
-        from xarray.core.dataset import _get_virtual_variable
+        from xarray.core.dataset_utils import _get_virtual_variable
 
         try:
             var = self._coords[key]
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -24,11 +24,14 @@
 from operator import methodcaller
 from os import PathLike
 from types import EllipsisType
-from typing import IO, TYPE_CHECKING, Any, Generic, Literal, cast, overload
+from typing import IO, TYPE_CHECKING, Any, Literal, cast, overload
 
 import numpy as np
 from pandas.api.types import is_extension_array_dtype
 
+from xarray.core.dataset_utils import _get_virtual_variable, _LocIndexer
+from xarray.core.dataset_variables import DataVariables
+
 # remove once numpy 2.0 is the oldest supported version
 try:
     from numpy.exceptions import RankWarning
@@ -98,7 +101,6 @@
     T_ChunksFreq,
     T_DataArray,
     T_DataArrayOrSet,
-    T_Dataset,
     ZarrWriteModes,
 )
 from xarray.core.utils import (
@@ -196,43 +198,6 @@
 ]
 
 
-def _get_virtual_variable(
-    variables, key: Hashable, dim_sizes: Mapping | None = None
-) -> tuple[Hashable, Hashable, Variable]:
-    """Get a virtual variable (e.g., 'time.year') from a dict of xarray.Variable
-    objects (if possible)
-
-    """
-    from xarray.core.dataarray import DataArray
-
-    if dim_sizes is None:
-        dim_sizes = {}
-
-    if key in dim_sizes:
-        data = pd.Index(range(dim_sizes[key]), name=key)
-        variable = IndexVariable((key,), data)
-        return key, key, variable
-
-    if not isinstance(key, str):
-        raise KeyError(key)
-
-    split_key = key.split(".", 1)
-    if len(split_key) != 2:
-        raise KeyError(key)
-
-    ref_name, var_name = split_key
-    ref_var = variables[ref_name]
-
-    if _contains_datetime_like_objects(ref_var):
-        ref_var = DataArray(ref_var)
-        data = getattr(ref_var.dt, var_name).data
-    else:
-        data = getattr(ref_var, var_name).data
-    virtual_var = Variable(ref_var.dims, data)
-
-    return ref_name, var_name, virtual_var
-
-
 def _get_chunk(var: Variable, chunks, chunkmanager: ChunkManagerEntrypoint):
     """
     Return map from each dim to chunk sizes, accounting for backend's preferred chunks.
@@ -367,19 +332,6 @@ def _maybe_chunk(
         return var
 
 
-def as_dataset(obj: Any) -> Dataset:
-    """Cast the given object to a Dataset.
-
-    Handles Datasets, DataArrays and dictionaries of variables. A new Dataset
-    object is only created if the provided object is not already one.
-    """
-    if hasattr(obj, "to_dataset"):
-        obj = obj.to_dataset()
-    if not isinstance(obj, Dataset):
-        obj = Dataset(obj)
-    return obj
-
-
 def _get_func_args(func, param_names):
     """Use `inspect.signature` to try accessing `func` args. Otherwise, ensure
     they are provided by user.
@@ -468,84 +420,6 @@ def merge_data_and_coords(data_vars: DataVars, coords) -> _MergeResult:
     )
 
 
-class DataVariables(Mapping[Any, "DataArray"]):
-    __slots__ = ("_dataset",)
-
-    def __init__(self, dataset: Dataset):
-        self._dataset = dataset
-
-    def __iter__(self) -> Iterator[Hashable]:
-        return (
-            key
-            for key in self._dataset._variables
-            if key not in self._dataset._coord_names
-        )
-
-    def __len__(self) -> int:
-        length = len(self._dataset._variables) - len(self._dataset._coord_names)
-        assert length >= 0, "something is wrong with Dataset._coord_names"
-        return length
-
-    def __contains__(self, key: Hashable) -> bool:
-        return key in self._dataset._variables and key not in self._dataset._coord_names
-
-    def __getitem__(self, key: Hashable) -> DataArray:
-        if key not in self._dataset._coord_names:
-            return self._dataset[key]
-        raise KeyError(key)
-
-    def __repr__(self) -> str:
-        return formatting.data_vars_repr(self)
-
-    @property
-    def variables(self) -> Mapping[Hashable, Variable]:
-        all_variables = self._dataset.variables
-        return Frozen({k: all_variables[k] for k in self})
-
-    @property
-    def dtypes(self) -> Frozen[Hashable, np.dtype]:
-        """Mapping from data variable names to dtypes.
-
-        Cannot be modified directly, but is updated when adding new variables.
-
-        See Also
-        --------
-        Dataset.dtype
-        """
-        return self._dataset.dtypes
-
-    def _ipython_key_completions_(self):
-        """Provide method for the key-autocompletions in IPython."""
-        return [
-            key
-            for key in self._dataset._ipython_key_completions_()
-            if key not in self._dataset._coord_names
-        ]
-
-
-class _LocIndexer(Generic[T_Dataset]):
-    __slots__ = ("dataset",)
-
-    def __init__(self, dataset: T_Dataset):
-        self.dataset = dataset
-
-    def __getitem__(self, key: Mapping[Any, Any]) -> T_Dataset:
-        if not utils.is_dict_like(key):
-            raise TypeError("can only lookup dictionaries from Dataset.loc")
-        return self.dataset.sel(key)
-
-    def __setitem__(self, key, value) -> None:
-        if not utils.is_dict_like(key):
-            raise TypeError(
-                "can only set locations defined by dictionaries from Dataset.loc."
-                f" Got: {key}"
-            )
-
-        # set new values
-        dim_indexers = map_index_queries(self.dataset, key).dim_indexers
-        self.dataset[dim_indexers] = value
-
-
 class Dataset(
     DataWithCoords,
     DatasetAggregations,
diff --git a/xarray/core/dataset_utils.py b/xarray/core/dataset_utils.py
@@ -0,0 +1,91 @@
+from __future__ import annotations
+
+import typing
+from collections.abc import Hashable, Mapping
+from typing import Any, Generic
+
+import pandas as pd
+
+from xarray.core import utils
+from xarray.core.common import _contains_datetime_like_objects
+from xarray.core.indexing import map_index_queries
+from xarray.core.types import T_Dataset
+from xarray.core.variable import IndexVariable, Variable
+
+if typing.TYPE_CHECKING:
+    from xarray.core.dataset import Dataset
+
+
+class _LocIndexer(Generic[T_Dataset]):
+    __slots__ = ("dataset",)
+
+    def __init__(self, dataset: T_Dataset):
+        self.dataset = dataset
+
+    def __getitem__(self, key: Mapping[Any, Any]) -> T_Dataset:
+        if not utils.is_dict_like(key):
+            raise TypeError("can only lookup dictionaries from Dataset.loc")
+        return self.dataset.sel(key)
+
+    def __setitem__(self, key, value) -> None:
+        if not utils.is_dict_like(key):
+            raise TypeError(
+                "can only set locations defined by dictionaries from Dataset.loc."
+                f" Got: {key}"
+            )
+
+        # set new values
+        dim_indexers = map_index_queries(self.dataset, key).dim_indexers
+        self.dataset[dim_indexers] = value
+
+
+def as_dataset(obj: Any) -> Dataset:
+    """Cast the given object to a Dataset.
+
+    Handles Datasets, DataArrays and dictionaries of variables. A new Dataset
+    object is only created if the provided object is not already one.
+    """
+    from xarray.core.dataset import Dataset
+
+    if hasattr(obj, "to_dataset"):
+        obj = obj.to_dataset()
+    if not isinstance(obj, Dataset):
+        obj = Dataset(obj)
+    return obj
+
+
+def _get_virtual_variable(
+    variables, key: Hashable, dim_sizes: Mapping | None = None
+) -> tuple[Hashable, Hashable, Variable]:
+    """Get a virtual variable (e.g., 'time.year') from a dict of xarray.Variable
+    objects (if possible)
+
+    """
+    from xarray.core.dataarray import DataArray
+
+    if dim_sizes is None:
+        dim_sizes = {}
+
+    if key in dim_sizes:
+        data = pd.Index(range(dim_sizes[key]), name=key)
+        variable = IndexVariable((key,), data)
+        return key, key, variable
+
+    if not isinstance(key, str):
+        raise KeyError(key)
+
+    split_key = key.split(".", 1)
+    if len(split_key) != 2:
+        raise KeyError(key)
+
+    ref_name, var_name = split_key
+    ref_var = variables[ref_name]
+
+    if _contains_datetime_like_objects(ref_var):
+        ref_var = DataArray(ref_var)
+        data = getattr(ref_var.dt, var_name).data
+    else:
+        data = getattr(ref_var, var_name).data
+    virtual_var = Variable(ref_var.dims, data)
+
+    return ref_name, var_name, virtual_var
diff --git a/xarray/core/dataset_variables.py b/xarray/core/dataset_variables.py
@@ -0,0 +1,68 @@
+import typing
+from collections.abc import Hashable, Iterator, Mapping
+from typing import Any
+
+import numpy as np
+
+from xarray.core import formatting
+from xarray.core.utils import Frozen
+from xarray.core.variable import Variable
+
+if typing.TYPE_CHECKING:
+    from xarray.core.dataarray import DataArray
+    from xarray.core.dataset import Dataset
+
+
+class DataVariables(Mapping[Any, "DataArray"]):
+    __slots__ = ("_dataset",)
+
+    def __init__(self, dataset: "Dataset"):
+        self._dataset = dataset
+
+    def __iter__(self) -> Iterator[Hashable]:
+        return (
+            key
+            for key in self._dataset._variables
+            if key not in self._dataset._coord_names
+        )
+
+    def __len__(self) -> int:
+        length = len(self._dataset._variables) - len(self._dataset._coord_names)
+        assert length >= 0, "something is wrong with Dataset._coord_names"
+        return length
+
+    def __contains__(self, key: Hashable) -> bool:
+        return key in self._dataset._variables and key not in self._dataset._coord_names
+
+    def __getitem__(self, key: Hashable) -> "DataArray":
+        if key not in self._dataset._coord_names:
+            return self._dataset[key]
+        raise KeyError(key)
+
+    def __repr__(self) -> str:
+        return formatting.data_vars_repr(self)
+
+    @property
+    def variables(self) -> Mapping[Hashable, Variable]:
+        all_variables = self._dataset.variables
+        return Frozen({k: all_variables[k] for k in self})
+
+    @property
+    def dtypes(self) -> Frozen[Hashable, np.dtype]:
+        """Mapping from data variable names to dtypes.
+
+        Cannot be modified directly, but is updated when adding new variables.
+
+        See Also
+        --------
+        Dataset.dtype
+        """
+        return self._dataset.dtypes
+
+    def _ipython_key_completions_(self):
+        """Provide method for the key-autocompletions in IPython."""
+        return [
+            key
+            for key in self._dataset._ipython_key_completions_()
+            if key not in self._dataset._coord_names
+        ]
diff --git a/xarray/core/datatree.py b/xarray/core/datatree.py
@@ -21,7 +21,8 @@
 from xarray.core.common import TreeAttrAccessMixin, get_chunksizes
 from xarray.core.coordinates import Coordinates, DataTreeCoordinates
 from xarray.core.dataarray import DataArray
-from xarray.core.dataset import Dataset, DataVariables
+from xarray.core.dataset import Dataset
+from xarray.core.dataset_variables import DataVariables
 from xarray.core.datatree_mapping import (
     map_over_datasets,
 )