
Commit 35f8159

ogrisel and glemaitre authored

DOC Improve the plot_gpr_noisy.py example (scikit-learn#29788)
Co-authored-by: Guillaume Lemaitre <guillaume@probabl.ai>
1 parent 390ce98 commit 35f8159

File tree

1 file changed: +59 -23 lines changed


examples/gaussian_process/plot_gpr_noisy.py

Lines changed: 59 additions & 23 deletions
@@ -33,7 +33,7 @@ def target_generator(X, add_noise=False):
 # %%
 # Let's have a look to the target generator where we will not add any noise to
 # observe the signal that we would like to predict.
-X = np.linspace(0, 5, num=30).reshape(-1, 1)
+X = np.linspace(0, 5, num=80).reshape(-1, 1)
 y = target_generator(X, add_noise=False)
 
 # %%
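Note: the hunks in this diff rely on the `target_generator` helper defined earlier in the example, which is not touched by this commit and therefore not shown here. As a rough, hypothetical stand-in for running the snippets on this page locally (not necessarily the exact function from the example), something like the following would do:

import numpy as np


def target_generator(X, add_noise=False):
    # Hypothetical stand-in: a smooth sinusoidal signal with optional
    # homoscedastic Gaussian noise; the real example may differ in details.
    target = 0.5 + np.sin(3 * X)
    if add_noise:
        rng = np.random.RandomState(1)
        target = target + rng.normal(loc=0.0, scale=0.3, size=target.shape)
    return target.squeeze()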
@@ -88,7 +88,7 @@ def target_generator(X, add_noise=False):
 from sklearn.gaussian_process.kernels import RBF, WhiteKernel
 
 kernel = 1.0 * RBF(length_scale=1e1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
-    noise_level=1, noise_level_bounds=(1e-5, 1e1)
+    noise_level=1, noise_level_bounds=(1e-10, 1e1)
 )
 gpr = GaussianProcessRegressor(kernel=kernel, alpha=0.0)
 gpr.fit(X_train, y_train)
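With these starting values the optimizer settles in a poor solution, as the prose added further down explains. A minimal sketch to inspect what was actually found, assuming the `gpr`, `X_train` and `y_train` objects from the hunk above (illustrative only, not part of the committed example):

import numpy as np

# Optimized kernel and the log-marginal-likelihood it reaches.
print("Optimized kernel:", gpr.kernel_)
print("Log-marginal-likelihood:", gpr.log_marginal_likelihood(gpr.kernel_.theta))

# Check which hyperparameters ended up on one of their bounds; this is what
# triggers the convergence warning mentioned in the updated prose below.
for hyperparameter, value in zip(gpr.kernel_.hyperparameters, np.exp(gpr.kernel_.theta)):
    low, high = hyperparameter.bounds[0]
    print(f"{hyperparameter.name}: {value:.3g} (bounds: [{low:.3g}, {high:.3g}])")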
@@ -97,7 +97,7 @@ def target_generator(X, add_noise=False):
 # %%
 plt.plot(X, y, label="Expected signal")
 plt.scatter(x=X_train[:, 0], y=y_train, color="black", alpha=0.4, label="Observations")
-plt.errorbar(X, y_mean, y_std)
+plt.errorbar(X, y_mean, y_std, label="Posterior mean ± std")
 plt.legend()
 plt.xlabel("X")
 plt.ylabel("y")
@@ -109,15 +109,18 @@ def target_generator(X, add_noise=False):
     fontsize=8,
 )
 # %%
-# We see that the optimum kernel found still have a high noise level and
-# an even larger length scale. Furthermore, we observe that the
-# model does not provide faithful predictions.
+# We see that the optimum kernel found still has a high noise level and an even
+# larger length scale. The length scale reaches the maximum bound that we
+# allowed for this parameter and we got a warning as a result.
 #
-# Now, we will initialize the
-# :class:`~sklearn.gaussian_process.kernels.RBF` with a
-# larger `length_scale` and the
-# :class:`~sklearn.gaussian_process.kernels.WhiteKernel`
-# with a smaller noise level lower bound.
+# More importantly, we observe that the model does not provide useful
+# predictions: the mean prediction seems to be constant; it does not follow the
+# expected noise-free signal.
+#
+# Now, we will initialize the :class:`~sklearn.gaussian_process.kernels.RBF`
+# with a larger `length_scale` initial value and the
+# :class:`~sklearn.gaussian_process.kernels.WhiteKernel` with a smaller initial
+# noise level while keeping the parameter bounds unchanged.
 kernel = 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
     noise_level=1e-2, noise_level_bounds=(1e-10, 1e1)
 )
@@ -128,7 +131,7 @@ def target_generator(X, add_noise=False):
 # %%
 plt.plot(X, y, label="Expected signal")
 plt.scatter(x=X_train[:, 0], y=y_train, color="black", alpha=0.4, label="Observations")
-plt.errorbar(X, y_mean, y_std)
+plt.errorbar(X, y_mean, y_std, label="Posterior mean ± std")
 plt.legend()
 plt.xlabel("X")
 plt.ylabel("y")
@@ -153,21 +156,19 @@ def target_generator(X, add_noise=False):
 # for different hyperparameters to get a sense of the local minima.
 from matplotlib.colors import LogNorm
 
-length_scale = np.logspace(-2, 4, num=50)
-noise_level = np.logspace(-2, 1, num=50)
+length_scale = np.logspace(-2, 4, num=80)
+noise_level = np.logspace(-2, 1, num=80)
 length_scale_grid, noise_level_grid = np.meshgrid(length_scale, noise_level)
 
 log_marginal_likelihood = [
     gpr.log_marginal_likelihood(theta=np.log([0.36, scale, noise]))
     for scale, noise in zip(length_scale_grid.ravel(), noise_level_grid.ravel())
 ]
-log_marginal_likelihood = np.reshape(
-    log_marginal_likelihood, newshape=noise_level_grid.shape
-)
+log_marginal_likelihood = np.reshape(log_marginal_likelihood, noise_level_grid.shape)
 
 # %%
 vmin, vmax = (-log_marginal_likelihood).min(), 50
-level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), num=50), decimals=1)
+level = np.around(np.logspace(np.log10(vmin), np.log10(vmax), num=20), decimals=1)
 plt.contour(
     length_scale_grid,
     noise_level_grid,
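As a side note, the two local minima that the prose below refers to can also be located numerically on this grid. A small sketch, assuming the `length_scale_grid`, `noise_level_grid` and `log_marginal_likelihood` arrays built above and that SciPy is installed (illustrative only, not part of the committed example):

import numpy as np
from scipy.ndimage import minimum_filter

neg_lml = -np.asarray(log_marginal_likelihood)
# A grid point is a (coarse) local minimum if it equals the minimum of its
# 3x3 neighborhood; boundary points are handled with nearest-edge padding.
is_local_min = neg_lml == minimum_filter(neg_lml, size=3, mode="nearest")
for i, j in zip(*np.nonzero(is_local_min)):
    print(
        f"length_scale={length_scale_grid[i, j]:.3g}, "
        f"noise_level={noise_level_grid[i, j]:.3g}, "
        f"negative LML={neg_lml[i, j]:.2f}"
    )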
@@ -184,8 +185,43 @@ def target_generator(X, add_noise=False):
 plt.show()
 
 # %%
-# We see that there are two local minima that correspond to the combination
-# of hyperparameters previously found. Depending on the initial values for the
-# hyperparameters, the gradient-based optimization might converge whether or
-# not to the best model. It is thus important to repeat the optimization
-# several times for different initializations.
+#
+# We see that there are two local minima that correspond to the combination of
+# hyperparameters previously found. Depending on the initial values for the
+# hyperparameters, the gradient-based optimization might or might not
+# converge to the best model. It is thus important to repeat the optimization
+# several times for different initializations. This can be done by setting the
+# `n_restarts_optimizer` parameter of the
+# :class:`~sklearn.gaussian_process.GaussianProcessRegressor` class.
+#
+# Let's try again to fit our model with the bad initial values but this time
+# with 10 random restarts.
+
+kernel = 1.0 * RBF(length_scale=1e1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(
+    noise_level=1, noise_level_bounds=(1e-10, 1e1)
+)
+gpr = GaussianProcessRegressor(
+    kernel=kernel, alpha=0.0, n_restarts_optimizer=10, random_state=0
+)
+gpr.fit(X_train, y_train)
+y_mean, y_std = gpr.predict(X, return_std=True)
+
+# %%
+plt.plot(X, y, label="Expected signal")
+plt.scatter(x=X_train[:, 0], y=y_train, color="black", alpha=0.4, label="Observations")
+plt.errorbar(X, y_mean, y_std, label="Posterior mean ± std")
+plt.legend()
+plt.xlabel("X")
+plt.ylabel("y")
+_ = plt.title(
+    (
+        f"Initial: {kernel}\nOptimum: {gpr.kernel_}\nLog-Marginal-Likelihood: "
+        f"{gpr.log_marginal_likelihood(gpr.kernel_.theta)}"
+    ),
+    fontsize=8,
+)
+
+# %%
+#
+# As we hoped, random restarts allow the optimization to find the best set
+# of hyperparameters despite the bad initial values.
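To make the benefit of the restarts concrete, here is a quick sketch (again assuming the imports, `X_train` and `y_train` from the example; not part of the committed file) that fits the model from the same bad initial values with and without restarts and compares the log-marginal-likelihood that is reached:

for n_restarts in (0, 10):
    kernel = 1.0 * RBF(
        length_scale=1e1, length_scale_bounds=(1e-2, 1e3)
    ) + WhiteKernel(noise_level=1, noise_level_bounds=(1e-10, 1e1))
    gpr = GaussianProcessRegressor(
        kernel=kernel, alpha=0.0, n_restarts_optimizer=n_restarts, random_state=0
    )
    gpr.fit(X_train, y_train)
    lml = gpr.log_marginal_likelihood(gpr.kernel_.theta)
    print(f"n_restarts_optimizer={n_restarts}: LML={lml:.2f}, kernel={gpr.kernel_}")

The run with restarts should reach a noticeably higher log-marginal-likelihood, matching the conclusion in the updated prose above.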
