
Commit 1abc9ad

MAINT replace obj and grad in sigmoid calibration with private loss module (scikit-learn#27185)
1 parent: 8ed0270

File tree: 4 files changed, +30 −21 lines

bench_calibration.py

Whitespace-only changes.

doc/whats_new/v1.4.rst

Lines changed: 4 additions & 0 deletions
@@ -85,6 +85,10 @@ Changelog
   produce large prediction scores. Before it was numerically unstable.
   :pr:`26913` by :user:`Omar Salman <OmarManzoor>`.
 
+- |Enhancement| The internal objective and gradient of the `sigmoid` method
+  of :class:`calibration.CalibratedClassifierCV` have been replaced by the
+  private loss module. :pr:`27185` by :user:`Omar Salman <OmarManzoor>`.
+
 :mod:`sklearn.cluster`
 ............................
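
The entry above records an internal refactoring only; the public calibration API is unchanged. As a brief, hedged illustration (not part of this commit; the dataset and base estimator are arbitrary choices), Platt scaling is still requested through the existing method="sigmoid" option:

# Illustrative usage only (not from the commit): the sigmoid method of
# CalibratedClassifierCV still performs Platt scaling after this refactoring.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=500, random_state=0)

# method="sigmoid" fits Platt's sigmoid calibrator on the cross-validated
# predictions of the base estimator; only the internal objective/gradient
# used for that fit changed in this commit.
clf = CalibratedClassifierCV(GaussianNB(), method="sigmoid", cv=3)
clf.fit(X, y)
print(clf.predict_proba(X[:3]))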

sklearn/calibration.py

Lines changed: 25 additions & 21 deletions
@@ -14,11 +14,12 @@
 from numbers import Integral, Real
 
 import numpy as np
-from scipy.optimize import fmin_bfgs
-from scipy.special import expit, xlogy
+from scipy.optimize import minimize
+from scipy.special import expit
 
 from sklearn.utils import Bunch
 
+from ._loss import HalfBinomialLoss
 from .base import (
     BaseEstimator,
     ClassifierMixin,
@@ -885,29 +886,32 @@ def _sigmoid_calibration(
     T = np.zeros_like(y, dtype=np.float64)
     T[y > 0] = (prior1 + 1.0) / (prior1 + 2.0)
     T[y <= 0] = 1.0 / (prior0 + 2.0)
-    T1 = 1.0 - T
 
-    def objective(AB):
-        # From Platt (beginning of Section 2.2)
-        P = expit(-(AB[0] * F + AB[1]))
-        loss = -(xlogy(T, P) + xlogy(T1, 1.0 - P))
-        if sample_weight is not None:
-            return (sample_weight * loss).sum()
-        else:
-            return loss.sum()
+    bin_loss = HalfBinomialLoss()
 
-    def grad(AB):
-        # gradient of the objective function
-        P = expit(-(AB[0] * F + AB[1]))
-        TEP_minus_T1P = T - P
-        if sample_weight is not None:
-            TEP_minus_T1P *= sample_weight
-        dA = np.dot(TEP_minus_T1P, F)
-        dB = np.sum(TEP_minus_T1P)
-        return np.array([dA, dB])
+    def loss_grad(AB):
+        l, g = bin_loss.loss_gradient(
+            y_true=T,
+            raw_prediction=-(AB[0] * F + AB[1]),
+            sample_weight=sample_weight,
+        )
+        loss = l.sum()
+        grad = np.array([-g @ F, -g.sum()])
+        return loss, grad
 
     AB0 = np.array([0.0, log((prior0 + 1.0) / (prior1 + 1.0))])
-    AB_ = fmin_bfgs(objective, AB0, fprime=grad, disp=False)
+
+    opt_result = minimize(
+        loss_grad,
+        AB0,
+        method="L-BFGS-B",
+        jac=True,
+        options={
+            "gtol": 1e-6,
+            "ftol": 64 * np.finfo(float).eps,
+        },
+    )
+    AB_ = opt_result.x
 
     # The tuned multiplicative parameter is converted back to the original
     # input feature scale. The offset parameter does not need rescaling since
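
For context on the rewrite above: the removed xlogy-based objective is exactly the (half) binomial deviance evaluated at the raw prediction -(AB[0] * F + AB[1]), and the chain rule through that raw prediction explains the minus signs in grad = np.array([-g @ F, -g.sum()]). The following standalone sketch is not part of the commit; it assumes only the private HalfBinomialLoss.loss_gradient call already shown in the diff (importable here as sklearn._loss.HalfBinomialLoss) and checks the equivalence numerically on random data:

# Standalone sketch: verify that the new HalfBinomialLoss-based loss/gradient
# matches the removed xlogy-based objective and hand-written gradient.
import numpy as np
from scipy.special import expit, xlogy

from sklearn._loss import HalfBinomialLoss  # private module, as imported in calibration.py

rng = np.random.default_rng(0)
F = rng.normal(size=200)       # decision values of the base classifier
T = rng.uniform(size=200)      # Platt targets, in (0, 1)
AB = np.array([-1.5, 0.3])     # some (A, B) at which to compare

# Old formulation (removed in this commit).
P = expit(-(AB[0] * F + AB[1]))
loss_per_sample = -(xlogy(T, P) + xlogy(1.0 - T, 1.0 - P))
old_loss = loss_per_sample.sum()
old_grad = np.array([np.dot(T - P, F), np.sum(T - P)])

# New formulation (added in this commit).
bin_loss = HalfBinomialLoss()
l, g = bin_loss.loss_gradient(
    y_true=T,
    raw_prediction=-(AB[0] * F + AB[1]),
    sample_weight=None,
)
new_loss = l.sum()
new_grad = np.array([-g @ F, -g.sum()])

assert np.allclose(old_loss, new_loss)
assert np.allclose(old_grad, new_grad)

The switch from fmin_bfgs to scipy.optimize.minimize with jac=True lets a single callable supply both the loss and its gradient, with explicit gtol/ftol stopping tolerances as shown in the diff.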

sklearn/tests/test_docstring_parameters.py

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@
     "sklearn.pipeline.make_union",
     "sklearn.utils.extmath.safe_sparse_dot",
     "sklearn.utils._joblib",
+    "HalfBinomialLoss",
 ]
 
 # Methods where y param should be ignored if y=None by default
