Enable numbagg for reductions (#8316)

dcherian · andersy005 · web-flow · commit ae41d82127ff · 2023-10-18T03:39:30.000-07:00
Co-authored-by: Anderson Banihirwe <13301940+andersy005@users.noreply.github.com>
diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py
@@ -5,6 +5,7 @@
 import numpy as np
 import pandas as pd
 from numpy.core.multiarray import normalize_axis_index  # type: ignore[attr-defined]
+from packaging.version import Version
 
 # remove once numpy 2.0 is the oldest supported version
 try:
@@ -18,11 +19,20 @@
 try:
     import bottleneck as bn
 
-    _USE_BOTTLENECK = True
+    _BOTTLENECK_AVAILABLE = True
 except ImportError:
     # use numpy methods instead
     bn = np
-    _USE_BOTTLENECK = False
+    _BOTTLENECK_AVAILABLE = False
+
+try:
+    import numbagg
+
+    _HAS_NUMBAGG = Version(numbagg.__version__) >= Version("0.5.0")
+except ImportError:
+    # use numpy methods instead
+    numbagg = np
+    _HAS_NUMBAGG = False
 
 
 def _select_along_axis(values, idx, axis):
@@ -161,13 +171,30 @@ def __setitem__(self, key, value):
         self._array[key] = np.moveaxis(value, vindex_positions, mixed_positions)
 
 
-def _create_bottleneck_method(name, npmodule=np):
+def _create_method(name, npmodule=np):
     def f(values, axis=None, **kwargs):
         dtype = kwargs.get("dtype", None)
         bn_func = getattr(bn, name, None)
+        nba_func = getattr(numbagg, name, None)
 
         if (
-            _USE_BOTTLENECK
+            _HAS_NUMBAGG
+            and OPTIONS["use_numbagg"]
+            and isinstance(values, np.ndarray)
+            and nba_func is not None
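+            # numbagg uses ddof=1 only, but numpy uses ddof=0 by default. NOTE(review): the guard below is also False for every non-var/std reduction -- confirm that is intended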
+            and (("var" in name or "std" in name) and kwargs.get("ddof", 0) == 1)
+            # TODO: bool?
+            and values.dtype.kind in "uifc"
+            # and values.dtype.isnative
+            and (dtype is None or np.dtype(dtype) == values.dtype)
+        ):
+            # numbagg does not handle dtype or ddof, so drop them before the call
+            kwargs.pop("dtype", None)
+            kwargs.pop("ddof", None)
+            result = nba_func(values, axis=axis, **kwargs)
+        elif (
+            _BOTTLENECK_AVAILABLE
             and OPTIONS["use_bottleneck"]
             and isinstance(values, np.ndarray)
             and bn_func is not None
@@ -233,14 +260,14 @@ def least_squares(lhs, rhs, rcond=None, skipna=False):
     return coeffs, residuals
 
 
-nanmin = _create_bottleneck_method("nanmin")
-nanmax = _create_bottleneck_method("nanmax")
-nanmean = _create_bottleneck_method("nanmean")
-nanmedian = _create_bottleneck_method("nanmedian")
-nanvar = _create_bottleneck_method("nanvar")
-nanstd = _create_bottleneck_method("nanstd")
-nanprod = _create_bottleneck_method("nanprod")
-nancumsum = _create_bottleneck_method("nancumsum")
-nancumprod = _create_bottleneck_method("nancumprod")
-nanargmin = _create_bottleneck_method("nanargmin")
-nanargmax = _create_bottleneck_method("nanargmax")
+nanmin = _create_method("nanmin")
+nanmax = _create_method("nanmax")
+nanmean = _create_method("nanmean")
+nanmedian = _create_method("nanmedian")
+nanvar = _create_method("nanvar")
+nanstd = _create_method("nanstd")
+nanprod = _create_method("nanprod")
+nancumsum = _create_method("nancumsum")
+nancumprod = _create_method("nancumprod")
+nanargmin = _create_method("nanargmin")
+nanargmax = _create_method("nanargmax")
diff --git a/xarray/core/options.py b/xarray/core/options.py
@@ -27,6 +27,7 @@
         "keep_attrs",
         "warn_for_unclosed_files",
         "use_bottleneck",
+        "use_numbagg",
         "use_flox",
     ]
 
@@ -50,6 +51,7 @@ class T_Options(TypedDict):
         warn_for_unclosed_files: bool
         use_bottleneck: bool
         use_flox: bool
+        use_numbagg: bool
 
 
 OPTIONS: T_Options = {
@@ -72,6 +74,7 @@ class T_Options(TypedDict):
     "warn_for_unclosed_files": False,
     "use_bottleneck": True,
     "use_flox": True,
+    "use_numbagg": True,
 }
 
 _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"])
@@ -98,6 +101,7 @@ def _positive_integer(value: int) -> bool:
     "file_cache_maxsize": _positive_integer,
     "keep_attrs": lambda choice: choice in [True, False, "default"],
     "use_bottleneck": lambda value: isinstance(value, bool),
+    "use_numbagg": lambda value: isinstance(value, bool),
     "use_flox": lambda value: isinstance(value, bool),
     "warn_for_unclosed_files": lambda value: isinstance(value, bool),
 }
@@ -230,6 +234,9 @@ class set_options:
     use_flox : bool, default: True
         Whether to use ``numpy_groupies`` and ``flox`` to
         accelerate groupby and resampling reductions.
+    use_numbagg : bool, default: True
+        Whether to use ``numbagg`` to accelerate reductions.
+        Takes precedence over ``use_bottleneck`` when both are True.
     warn_for_unclosed_files : bool, default: False
         Whether or not to issue a warning when unclosed files are
         deallocated. This is mostly useful for debugging.