Commit d9c9bd4

Commit message: Merging main

Signed-off-by: Adam Li <adam2392@gmail.com>
Parents: 600187a, 0b0b90b

126 files changed: +1865 / -1024 lines


azure-pipelines.yml

Lines changed: 5 additions & 0 deletions

@@ -40,6 +40,11 @@ jobs:
     - bash: |
         ./build_tools/linting.sh
       displayName: Run linters
+    - bash: |
+        pip install ninja meson scipy
+        python build_tools/check-meson-openmp-dependencies.py
+      displayName: Run Meson OpenMP checks
+
 
 - template: build_tools/azure/posix.yml
   parameters:
build_tools/check-meson-openmp-dependencies.py

Lines changed: 172 additions & 0 deletions

@@ -0,0 +1,172 @@
"""
Check that OpenMP dependencies are correctly defined in meson.build files.

This is based on trying to make sure that the following two things match:
- the Cython files using OpenMP (based on a git grep regex)
- the Cython extension modules that are built with OpenMP compiler flags (based
  on meson introspect json output)
"""

import json
import re
import subprocess
from pathlib import Path


def has_source_openmp_flags(target_source):
    return any("openmp" in arg for arg in target_source["parameters"])


def has_openmp_flags(target):
    """Return whether target sources use OpenMP flags.

    Make sure that both the compiler and the linker sources use OpenMP.
    Look at the `get_meson_info` docstring to see what `target` looks like.
    """
    target_sources = target["target_sources"]

    target_use_openmp_flags = any(
        has_source_openmp_flags(target_source) for target_source in target_sources
    )

    if not target_use_openmp_flags:
        return False

    # When the target uses OpenMP we expect a compiler + linker source and we
    # want to make sure that both the compiler and the linker use OpenMP
    assert len(target_sources) == 2
    compiler_source, linker_source = target_sources
    assert "compiler" in compiler_source
    assert "linker" in linker_source

    compiler_use_openmp_flags = any(
        "openmp" in arg for arg in compiler_source["parameters"]
    )
    linker_use_openmp_flags = any(
        "openmp" in arg for arg in linker_source["parameters"]
    )

    assert compiler_use_openmp_flags == linker_use_openmp_flags
    return compiler_use_openmp_flags


def get_canonical_name_meson(target, build_path):
    """Return a name based on the generated shared library.

    The goal is to return a name that can be easily matched with the output
    from `get_git_grep_info`.

    Look at the `get_meson_info` docstring to see what `target` looks like.
    """
    # Expect a list with one element containing the name of the shared library
    assert len(target["filename"]) == 1
    shared_library_path = Path(target["filename"][0])
    shared_library_relative_path = shared_library_path.relative_to(
        build_path.absolute()
    )
    # Needed on Windows to match git grep output
    rel_path = shared_library_relative_path.as_posix()
    # OS-specific naming of the shared library: .cpython- on POSIX and
    # something like .cp312- on Windows
    pattern = r"\.(cpython|cp\d+)-.+"
    return re.sub(pattern, "", str(rel_path))


def get_canonical_name_git_grep(filename):
    """Return a name based on the filename.

    The goal is to return a name that can easily be matched with the output
    from `get_meson_info`.
    """
    return re.sub(r"\.pyx(\.tp)?", "", filename)


def get_meson_info():
    """Return names of extensions that use OpenMP based on meson introspect output.

    The meson introspect json info is a list of targets where a target is a dict
    that looks like this (parts not used in this script are not shown for
    simplicity):
    {
        'name': '_k_means_elkan.cpython-312-x86_64-linux-gnu',
        'filename': [
            '<meson_build_dir>/sklearn/cluster/_k_means_elkan.cpython-312-x86_64-linux-gnu.so'
        ],
        'target_sources': [
            {
                'compiler': ['ccache', 'cc'],
                'parameters': [
                    '-Wall',
                    '-std=c11',
                    '-fopenmp',
                    ...
                ],
                ...
            },
            {
                'linker': ['cc'],
                'parameters': [
                    '-shared',
                    '-fPIC',
                    '-fopenmp',
                    ...
                ]
            }
        ]
    }
    """
    build_path = Path("build/introspect")
    subprocess.check_call(["meson", "setup", build_path, "--reconfigure"])

    json_out = subprocess.check_output(
        ["meson", "introspect", build_path, "--targets"], text=True
    )
    target_list = json.loads(json_out)
    meson_targets = [target for target in target_list if has_openmp_flags(target)]

    return [get_canonical_name_meson(each, build_path) for each in meson_targets]


def get_git_grep_info():
    """Return names of extensions that use OpenMP based on a git grep regex."""
    git_grep_filenames = subprocess.check_output(
        ["git", "grep", "-lP", "cython.*parallel|_openmp_helpers"], text=True
    ).splitlines()
    git_grep_filenames = [f for f in git_grep_filenames if ".pyx" in f]

    return [get_canonical_name_git_grep(each) for each in git_grep_filenames]


def main():
    from_meson = set(get_meson_info())
    from_git_grep = set(get_git_grep_info())

    only_in_git_grep = from_git_grep - from_meson
    only_in_meson = from_meson - from_git_grep

    msg = ""
    if only_in_git_grep:
        only_in_git_grep_msg = "\n".join(
            [f"  {each}" for each in sorted(only_in_git_grep)]
        )
        msg += (
            "Some Cython files use OpenMP,"
            " but their meson.build is missing the openmp_dep dependency:\n"
            f"{only_in_git_grep_msg}\n\n"
        )

    if only_in_meson:
        only_in_meson_msg = "\n".join([f"  {each}" for each in sorted(only_in_meson)])
        msg += (
            "Some Cython files do not use OpenMP,"
            " you should remove openmp_dep from their meson.build:\n"
            f"{only_in_meson_msg}\n\n"
        )

    if from_meson != from_git_grep:
        raise ValueError(
            f"Some issues have been found in Meson OpenMP dependencies:\n\n{msg}"
        )


if __name__ == "__main__":
    main()
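To make the matching concrete, here is a minimal sketch (not part of the commit) of how the two canonicalization helpers are expected to agree: both regexes reduce a built extension and its Cython source to the same module path. The file names are illustrative and assume a Linux CPython 3.12 build.

import re

# Illustrative inputs; see the get_meson_info docstring for the real shape
meson_filename = "sklearn/cluster/_k_means_elkan.cpython-312-x86_64-linux-gnu.so"
git_grep_filename = "sklearn/cluster/_k_means_elkan.pyx"

# Same substitutions as get_canonical_name_meson / get_canonical_name_git_grep
from_meson = re.sub(r"\.(cpython|cp\d+)-.+", "", meson_filename)
from_git_grep = re.sub(r"\.pyx(\.tp)?", "", git_grep_filename)

assert from_meson == from_git_grep == "sklearn/cluster/_k_means_elkan"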

doc/api_reference.py

Lines changed: 1 addition & 0 deletions

@@ -1183,6 +1183,7 @@ def _get_submodule(module_name, submodule_name):
             "validation.check_symmetric",
             "validation.column_or_1d",
             "validation.has_fit_parameter",
+            "validation.validate_data",
         ],
     },
     {

doc/developers/develop.rst

Lines changed: 0 additions & 9 deletions

@@ -562,15 +562,6 @@ for your estimator's tags. For example::
 You can create a new subclass of :class:`~sklearn.utils.Tags` if you wish
 to add new tags to the existing set.
 
-In addition to the tags, estimators also need to declare any non-optional
-parameters to ``__init__`` in the ``_required_parameters`` class attribute,
-which is a list or tuple. If ``_required_parameters`` is only
-``["estimator"]`` or ``["base_estimator"]``, then the estimator will be
-instantiated with an instance of ``LogisticRegression`` (or
-``RidgeRegression`` if the estimator is a regressor) in the tests. The choice
-of these two models is somewhat idiosyncratic but both should provide robust
-closed-form solutions.
-
 .. _developer_api_set_output:
 
 Developer API for `set_output`
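The remaining text points developers at the tags API instead. As a minimal sketch of what overriding tags looks like (assuming the `__sklearn_tags__` protocol referenced in the hunk context; treating `non_deterministic` as one of the standard tag fields is an assumption here):

from sklearn.base import BaseEstimator


class MyEstimator(BaseEstimator):
    def __sklearn_tags__(self):
        # Start from the parent tags and tweak only what differs
        tags = super().__sklearn_tags__()
        tags.non_deterministic = True  # assumed standard tag field
        return tags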

doc/whats_new/v1.6.rst

Lines changed: 14 additions & 0 deletions

@@ -25,6 +25,12 @@ Version 1.6.0
 Changes impacting many modules
 ------------------------------
 
+- |API| :func:`utils.validation.validate_data` is introduced and replaces the
+  previously private `base.BaseEstimator._validate_data` method. It is intended
+  for third-party estimator developers, who should use it in most cases instead
+  of :func:`utils.validation.check_array` and :func:`utils.validation.check_X_y`.
+  :pr:`29696` by `Adrin Jalali`_.
+
 - |Enhancement| `__sklearn_tags__` was introduced for setting tags in estimators.
   More details in :ref:`estimator_tags`.
   :pr:`22606` by `Thomas Fan`_ and :pr:`29677` by `Adrin Jalali`_.

@@ -247,6 +253,10 @@ Changelog
 :mod:`sklearn.linear_model`
 ...........................
 
+- |Fix| :class:`linear_model.LogisticRegressionCV` corrects sample weight handling
+  for the calculation of test scores.
+  :pr:`29419` by :user:`Shruti Nath <snath-xoc>`.
+
 - |API| Deprecates `copy_X` in :class:`linear_model.TheilSenRegressor` as the parameter
   has no effect. `copy_X` will be removed in 1.8.
   :pr:`29105` by :user:`Adam Li <adam2392>`.

@@ -271,6 +281,10 @@ Changelog
   :pr:`29210` by :user:`Marc Torrellas Socastro <marctorsoc>` and
   :user:`Stefanie Senger <StefanieSenger>`.
 
+- |Efficiency| :func:`sklearn.metrics.classification_report` is now faster by caching
+  classification labels.
+  :pr:`29738` by `Adrin Jalali`_.
+
 - |API| scoring="neg_max_error" should be used instead of
   scoring="max_error", which is now deprecated.
   :pr:`29462` by :user:`Farid "Freddie" Taba <artificialfintelligence>`.
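For context on the `validate_data` entry above, a minimal sketch of the intended third-party usage might look like this (the estimator is hypothetical; the signature assumes the scikit-learn 1.6 API, where the estimator instance is passed as the first argument):

import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import validate_data


class MyClassifier(ClassifierMixin, BaseEstimator):
    def fit(self, X, y):
        # Public replacement for the previously private self._validate_data
        X, y = validate_data(self, X, y)
        self.classes_ = np.unique(y)
        return self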

examples/gaussian_process/plot_gpr_noisy.py

Lines changed: 59 additions & 23 deletions

@@ -33,7 +33,7 @@ def target_generator(X, add_noise=False):
 # %%
 # Let's have a look at the target generator where we will not add any noise to
 # observe the signal that we would like to predict.
-X = np.linspace(0, 5, num=30).reshape(-1, 1)
+X = np.linspace(0, 5, num=80).reshape(-1, 1)
 y = target_generator(X, add_noise=False)
 
 # %%

@@ -88,7 +88,7 @@ def target_generator(X, add_noise=False):
 from sklearn.gaussian_process.kernels import RBF, WhiteKernel
 
 kernel = 1.0 * RBF(length_scale=1e1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
-    noise_level=1, noise_level_bounds=(1e-5, 1e1)
+    noise_level=1, noise_level_bounds=(1e-10, 1e1)
 )
 gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
 gpr.fit(X_train, y_train)

@@ -97,7 +97,7 @@ def target_generator(X, add_noise=False):
 # %%
 plt.plot(X, y, label="Expected signal")
 plt.scatter(x=X_train[:, 0], y=y_train, color="black", alpha=0.4, label="Observations")
-plt.errorbar(X, y_mean, y_std)
+plt.errorbar(X, y_mean, y_std, label="Posterior mean ± std")
 plt.legend()
 plt.xlabel("X")
 plt.ylabel("y")

@@ -109,15 +109,18 @@ def target_generator(X, add_noise=False):
     fontsize=8,
 )
 # %%
-# We see that the optimum kernel found still have a high noise level and
-# an even larger length scale. Furthermore, we observe that the
-# model does not provide faithful predictions.
+# We see that the optimum kernel found still has a high noise level and an even
+# larger length scale. The length scale reaches the maximum bound that we
+# allowed for this parameter, and we got a warning as a result.
 #
-# Now, we will initialize the
-# :class:`~sklearn.gaussian_process.kernels.RBF` with a
-# larger `length_scale` and the
-# :class:`~sklearn.gaussian_process.kernels.WhiteKernel`
-# with a smaller noise level lower bound.
+# More importantly, we observe that the model does not provide useful
+# predictions: the mean prediction seems to be constant, as it does not follow
+# the expected noise-free signal.
+#
+# Now, we will initialize the :class:`~sklearn.gaussian_process.kernels.RBF`
+# with a larger `length_scale` initial value and the
+# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` with a smaller initial
+# noise level, while keeping the parameter bounds unchanged.
 kernel = 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
     noise_level=1e-2, noise_level_bounds=(1e-10, 1e1)
 )

@@ -153,21 +156,19 @@ def target_generator(X, add_noise=False):
 # for different hyperparameters to get a sense of the local minima.
 from matplotlib.colors import LogNorm
 
-length_scale = np.logspace(-2, 4, num=50)
-noise_level = np.logspace(-2, 1, num=50)
+length_scale = np.logspace(-2, 4, num=80)
+noise_level = np.logspace(-2, 1, num=80)
 length_scale_grid, noise_level_grid = np.meshgrid(length_scale, noise_level)
 
 log_marginal_likelihood = [
     gpr.log_marginal_likelihood(theta=np.log([0.36, scale, noise]))
     for scale, noise in zip(length_scale_grid.ravel(), noise_level_grid.ravel())
 ]
-log_marginal_likelihood = np.reshape(
-    log_marginal_likelihood, newshape=noise_level_grid.shape
-)
+log_marginal_likelihood = np.reshape(log_marginal_likelihood, noise_level_grid.shape)
 
 # %%
 vmin, vmax = (-log_marginal_likelihood).min(), 50
-level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), num=50), decimals=1)
+level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), num=20), decimals=1)
 plt.contour(
     length_scale_grid,
     noise_level_grid,

@@ -184,8 +185,43 @@ def target_generator(X, add_noise=False):
 plt.show()
 
 # %%
-# We see that there are two local minima that correspond to the combination
-# of hyperparameters previously found. Depending on the initial values for the
-# hyperparameters, the gradient-based optimization might converge whether or
-# not to the best model. It is thus important to repeat the optimization
-# several times for different initializations.
+#
+# We see that there are two local minima that correspond to the combination of
+# hyperparameters previously found. Depending on the initial values for the
+# hyperparameters, the gradient-based optimization might or might not
+# converge to the best model. It is thus important to repeat the optimization
+# several times for different initializations. This can be done by setting the
+# `n_restarts_optimizer` parameter of the
+# :class:`~sklearn.gaussian_process.GaussianProcessRegressor` class.
+#
+# Let's try again to fit our model with the bad initial values but this time
+# with 10 random restarts.
+
+kernel = 1.0 * RBF(length_scale=1e1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
+    noise_level=1, noise_level_bounds=(1e-10, 1e1)
+)
+gpr = GaussianProcessRegressor(
+    kernel=kernel, alpha=0.0, n_restarts_optimizer=10, random_state=0
+)
+gpr.fit(X_train, y_train)
+y_mean, y_std = gpr.predict(X, return_std=True)
+
+# %%
+plt.plot(X, y, label="Expected signal")
+plt.scatter(x=X_train[:, 0], y=y_train, color="black", alpha=0.4, label="Observations")
+plt.errorbar(X, y_mean, y_std, label="Posterior mean ± std")
+plt.legend()
+plt.xlabel("X")
+plt.ylabel("y")
+_ = plt.title(
+    (
+        f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: "
+        f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}"
+    ),
+    fontsize=8,
+)
+
+# %%
+#
+# As we hoped, random restarts allow the optimization to find the best set
+# of hyperparameters despite the bad initial values.
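To see the restart mechanism in isolation, here is a self-contained sketch under stated assumptions: the synthetic data stands in for the example's `target_generator` (assumed to be roughly `0.5 + sin(3 * X)` plus Gaussian noise), and the kernel reuses the deliberately bad initial values from above.

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

rng = np.random.default_rng(0)
X_train = rng.uniform(0, 5, size=(20, 1))
y_train = 0.5 + np.sin(3 * X_train.ravel()) + rng.normal(scale=0.3, size=20)

# Deliberately bad initial hyperparameters, as in the example above
kernel = 1.0 * RBF(length_scale=1e1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
    noise_level=1, noise_level_bounds=(1e-10, 1e1)
)
gpr = GaussianProcessRegressor(
    kernel=kernel, alpha=0.0, n_restarts_optimizer=10, random_state=0
).fit(X_train, y_train)

# theta stores log-transformed hyperparameters; exponentiate for natural scale
print(gpr.kernel_)
print(np.exp(gpr.kernel_.theta))
print(gpr.log_marginal_likelihood(gpr.kernel_.theta))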
