Merge pull request scipy#22175 from rgommers/stats-nogil-threadsafety

tylerjereddy · web-flow · commit f40d2cd27d46 · 2024-12-23T15:17:08.000-07:00
MAINT: stats: fix thread-safety issues under free-threaded CPython
diff --git a/scipy/stats/_kde.py b/scipy/stats/_kde.py
@@ -18,6 +18,7 @@
 #-------------------------------------------------------------------------------
 
 # Standard library imports.
+import threading
 import warnings
 
 # SciPy imports.
@@ -36,6 +37,8 @@
 
 __all__ = ['gaussian_kde']
 
+MVN_LOCK = threading.Lock()
+
 
 class gaussian_kde:
     """Representation of a kernel-density estimate using Gaussian kernels.
@@ -384,9 +387,10 @@ def integrate_box(self, low_bounds, high_bounds, maxpts=None):
         else:
             extra_kwds = {}
 
-        value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
-                                            self.dataset, self.weights,
-                                            self.covariance, **extra_kwds)
+        with MVN_LOCK:
+            value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
+                                                self.dataset, self.weights,
+                                                self.covariance, **extra_kwds)
         if inform:
             msg = f'An integral in _mvn.mvnun requires more points than {self.d * 1000}'
             warnings.warn(msg, stacklevel=2)
diff --git a/scipy/stats/_mannwhitneyu.py b/scipy/stats/_mannwhitneyu.py
@@ -1,3 +1,4 @@
+import threading
 import numpy as np
 from collections import namedtuple
 from scipy import special
@@ -143,7 +144,10 @@ def build_u_freqs_array(self, maxu):
         return configurations / total
 
 
-_mwu_state = _MWU(0, 0)
+# Maintain state for faster repeat calls to `mannwhitneyu`.
+# An `_MWU` instance is created lazily, on first exact-method use in each
+# thread, and stored as the `s` attribute of this thread-local object.
+_mwu_state = threading.local()
 
 
 def _get_mwu_z(U, n1, n2, t, axis=0, continuity=True):
@@ -461,8 +465,10 @@ def mannwhitneyu(x, y, use_continuity=True, alternative="two-sided",
         method = _mwu_choose_method(n1, n2, np.any(t > 1))
 
     if method == "exact":
-        _mwu_state.set_shapes(n1, n2)
-        p = _mwu_state.sf(U.astype(int))
+        if not hasattr(_mwu_state, 's'):
+            _mwu_state.s = _MWU(0, 0)
+        _mwu_state.s.set_shapes(n1, n2)
+        p = _mwu_state.s.sf(U.astype(int))
     elif method == "asymptotic":
         z = _get_mwu_z(U, n1, n2, t, continuity=use_continuity)
         p = stats.norm.sf(z)
diff --git a/scipy/stats/_morestats.py b/scipy/stats/_morestats.py
@@ -1,5 +1,6 @@
 import math
 import warnings
+import threading
 from collections import namedtuple
 
 import numpy as np
@@ -962,7 +963,7 @@ def boxcox_llf(lmb, data):
     if xp.isdtype(dt, 'integral'):
         data = xp.asarray(data, dtype=xp.float64)
         dt = xp.float64
-    
+
     logdata = xp.log(data)
 
     # Compute the variance of the transformed data.
@@ -2628,7 +2629,9 @@ def sf(self, k, n, m):
 
 
 # Maintain state for faster repeat calls to ansari w/ method='exact'
-_abw_state = _ABW()
+# An `_ABW` instance is created lazily, on the first `ansari()` call in each
+# thread, and stored as the `a` attribute of this thread-local object.
+_abw_state = threading.local()
 
 
 @_axis_nan_policy_factory(AnsariResult, n_samples=2)
@@ -2739,6 +2742,10 @@ def ansari(x, y, alternative='two-sided'):
     if alternative not in {'two-sided', 'greater', 'less'}:
         raise ValueError("'alternative' must be 'two-sided',"
                          " 'greater', or 'less'.")
+
+    if not hasattr(_abw_state, 'a'):
+        _abw_state.a = _ABW()
+
     x, y = asarray(x), asarray(y)
     n = len(x)
     m = len(y)
@@ -2759,14 +2766,14 @@ def ansari(x, y, alternative='two-sided'):
         warnings.warn("Ties preclude use of exact statistic.", stacklevel=2)
     if exact:
         if alternative == 'two-sided':
-            pval = 2.0 * np.minimum(_abw_state.cdf(AB, n, m),
-                                    _abw_state.sf(AB, n, m))
+            pval = 2.0 * np.minimum(_abw_state.a.cdf(AB, n, m),
+                                    _abw_state.a.sf(AB, n, m))
         elif alternative == 'greater':
             # AB statistic is _smaller_ when ratio of scales is larger,
             # so this is the opposite of the usual calculation
-            pval = _abw_state.cdf(AB, n, m)
+            pval = _abw_state.a.cdf(AB, n, m)
         else:
-            pval = _abw_state.sf(AB, n, m)
+            pval = _abw_state.a.sf(AB, n, m)
         return AnsariResult(AB, min(1.0, pval))
 
     # otherwise compute normal approximation
@@ -4359,7 +4366,7 @@ def directional_stats(samples, *, axis=0, normalize=True):
     """
     xp = array_namespace(samples)
     samples = xp.asarray(samples)
-    
+
     if samples.ndim < 2:
         raise ValueError("samples must at least be two-dimensional. "
                          f"Instead samples has shape: {tuple(samples.shape)}")
diff --git a/scipy/stats/_multivariate.py b/scipy/stats/_multivariate.py
@@ -2,6 +2,7 @@
 # Author: Joris Vankerschaver 2013
 #
 import math
+import threading
 import numpy as np
 import scipy.linalg
 from scipy._lib import doccer
@@ -38,6 +39,7 @@
 _LOG_2PI = np.log(2 * np.pi)
 _LOG_2 = np.log(2)
 _LOG_PI = np.log(np.pi)
+MVN_LOCK = threading.Lock()
 
 
 _doc_random_state = """\
@@ -638,8 +640,9 @@ def _cdf(self, x, mean, cov, maxpts, abseps, releps, lower_limit):
 
         # mvnun expects 1-d arguments, so process points sequentially
         def func1d(limits):
-            return _mvn.mvnun(limits[:n], limits[n:], mean, cov,
-                              maxpts, abseps, releps)[0]
+            with MVN_LOCK:
+                return _mvn.mvnun(limits[:n], limits[n:], mean, cov,
+                                maxpts, abseps, releps)[0]
 
         out = np.apply_along_axis(func1d, -1, limits) * signs
         return _squeeze_output(out)
diff --git a/scipy/stats/tests/test_hypotests.py b/scipy/stats/tests/test_hypotests.py
@@ -14,7 +14,7 @@
                                     _cdf_cvm, cramervonmises_2samp,
                                     _pval_cvm_2samp_exact, barnard_exact,
                                     boschloo_exact)
-from scipy.stats._mannwhitneyu import mannwhitneyu, _mwu_state
+from scipy.stats._mannwhitneyu import mannwhitneyu, _mwu_state, _MWU
 from .common_tests import check_named_results
 from scipy._lib._testutils import _TestPythranFunc
 from scipy.stats._axis_nan_policy import SmallSampleWarning, too_small_1d_not_omit
@@ -367,28 +367,30 @@ def test_tie_correct(self):
 
     def test_exact_distribution(self):
         # I considered parametrize. I decided against it.
+        setattr(_mwu_state, 's', _MWU(0, 0))
+
         p_tables = {3: self.pn3, 4: self.pn4, 5: self.pm5, 6: self.pm6}
         for n, table in p_tables.items():
             for m, p in table.items():
                 # check p-value against table
                 u = np.arange(0, len(p))
-                _mwu_state.set_shapes(m, n)
-                assert_allclose(_mwu_state.cdf(k=u), p, atol=1e-3)
+                _mwu_state.s.set_shapes(m, n)
+                assert_allclose(_mwu_state.s.cdf(k=u), p, atol=1e-3)
 
                 # check identity CDF + SF - PMF = 1
                 # ( In this implementation, SF(U) includes PMF(U) )
                 u2 = np.arange(0, m*n+1)
-                assert_allclose(_mwu_state.cdf(k=u2)
-                                + _mwu_state.sf(k=u2)
-                                - _mwu_state.pmf(k=u2), 1)
+                assert_allclose(_mwu_state.s.cdf(k=u2)
+                                + _mwu_state.s.sf(k=u2)
+                                - _mwu_state.s.pmf(k=u2), 1)
 
                 # check symmetry about mean of U, i.e. pmf(U) = pmf(m*n-U)
-                pmf = _mwu_state.pmf(k=u2)
+                pmf = _mwu_state.s.pmf(k=u2)
                 assert_allclose(pmf, pmf[::-1])
 
                 # check symmetry w.r.t. interchange of m, n
-                _mwu_state.set_shapes(n, m)
-                pmf2 = _mwu_state.pmf(k=u2)
+                _mwu_state.s.set_shapes(n, m)
+                pmf2 = _mwu_state.s.pmf(k=u2)
                 assert_allclose(pmf, pmf2)
 
     def test_asymptotic_behavior(self):
@@ -628,22 +630,25 @@ def test_gh19692_smaller_table(self):
         m, n = 5, 11
         x = rng.random(size=m)
         y = rng.random(size=n)
-        _mwu_state.reset()  # reset cache
+
+        setattr(_mwu_state, 's', _MWU(0, 0))
+        _mwu_state.s.reset()  # reset cache
+
         res = stats.mannwhitneyu(x, y, method='exact')
-        shape = _mwu_state.configurations.shape
+        shape = _mwu_state.s.configurations.shape
         assert shape[-1] == min(res.statistic, m*n - res.statistic) + 1
         stats.mannwhitneyu(y, x, method='exact')
-        assert shape == _mwu_state.configurations.shape  # same when sizes are reversed
+        assert shape == _mwu_state.s.configurations.shape  # same with reversed sizes
 
         # Also, we weren't exploiting the symmetry of the null distribution
         # to its full potential. Ensure that the null distribution is not
         # evaluated explicitly for `k > m*n/2`.
-        _mwu_state.reset()  # reset cache
+        _mwu_state.s.reset()  # reset cache
         stats.mannwhitneyu(x, 0*y, method='exact', alternative='greater')
-        shape = _mwu_state.configurations.shape
+        shape = _mwu_state.s.configurations.shape
         assert shape[-1] == 1  # k is smallest possible
         stats.mannwhitneyu(0*x, y, method='exact', alternative='greater')
-        assert shape == _mwu_state.configurations.shape
+        assert shape == _mwu_state.s.configurations.shape
 
     @pytest.mark.parametrize('alternative', ['less', 'greater', 'two-sided'])
     def test_permutation_method(self, alternative):
diff --git a/scipy/stats/tests/test_morestats.py b/scipy/stats/tests/test_morestats.py
@@ -671,7 +671,7 @@ def test_alternative_exact(self):
         assert pval_g < 0.05  # level of significance.
         # also check if the p-values sum up to 1 plus the probability
         # mass under the calculated statistic.
-        prob = _abw_state.pmf(statistic, len(x1), len(x2))
+        prob = _abw_state.a.pmf(statistic, len(x1), len(x2))
         assert_allclose(pval_g + pval_l, 1 + prob, atol=1e-12)
         # also check if one of the one-sided p-value equals half the
         # two-sided p-value and the other one-sided p-value is its