Skip to content

Commit ee65daa

Browse files
authored
Merge pull request #3055 from stan-dev/hess-sparse
Allow Hessian functors to return Hessian as compressed sparse matrix
2 parents b2b2ad8 + 78c0533 commit ee65daa

File tree

5 files changed

+143
-36
lines changed

5 files changed

+143
-36
lines changed

stan/math/fwd/functor/hessian.hpp

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#define STAN_MATH_FWD_FUNCTOR_HESSIAN_HPP
33

44
#include <stan/math/fwd/core.hpp>
5+
#include <stan/math/fwd/fun/value_of.hpp>
56
#include <stan/math/prim/fun/Eigen.hpp>
67

78
namespace stan {
@@ -14,6 +15,9 @@ namespace math {
1415
* mixed definition, which is faster for Hessians, is that this
1516
* version is itself differentiable.
1617
*
18+
* Instead of returning the full symmetric Hessian, we return the
19+
* lower-triangular only as a column-major compressed sparse matrix.
20+
*
1721
* <p>The functor must implement
1822
*
1923
* <code>
@@ -35,23 +39,27 @@ namespace math {
3539
* @param[in] x Argument to function
3640
* @param[out] fx Function applied to argument
3741
* @param[out] grad gradient of function at argument
38-
* @param[out] H Hessian of function at argument
42+
* @param[out] H Hessian of function at argument, as a lower-triangular
43+
* compressed sparse matrix
3944
*/
4045
template <typename T, typename F>
4146
void hessian(const F& f, const Eigen::Matrix<T, Eigen::Dynamic, 1>& x, T& fx,
4247
Eigen::Matrix<T, Eigen::Dynamic, 1>& grad,
43-
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& H) {
44-
H.resize(x.size(), x.size());
45-
grad.resize(x.size());
46-
// size 0 separate because nothing to loop over in main body
47-
if (x.size() == 0) {
48-
fx = f(x);
48+
Eigen::SparseMatrix<T>& H) {
49+
int d = x.size();
50+
if (d == 0) {
51+
fx = value_of(f(x));
4952
return;
5053
}
51-
Eigen::Matrix<fvar<fvar<T> >, Eigen::Dynamic, 1> x_fvar(x.size());
52-
for (int i = 0; i < x.size(); ++i) {
53-
for (int j = i; j < x.size(); ++j) {
54-
for (int k = 0; k < x.size(); ++k) {
54+
55+
H.resize(d, d);
56+
H.reserve(Eigen::VectorXi::LinSpaced(d, 1, d).reverse());
57+
grad.resize(d);
58+
59+
Eigen::Matrix<fvar<fvar<T> >, Eigen::Dynamic, 1> x_fvar(d);
60+
for (int i = 0; i < d; ++i) {
61+
for (int j = i; j < d; ++j) {
62+
for (int k = 0; k < d; ++k) {
5563
x_fvar(k) = fvar<fvar<T> >(fvar<T>(x(k), j == k), fvar<T>(i == k, 0));
5664
}
5765
fvar<fvar<T> > fx_fvar = f(x_fvar);
@@ -61,10 +69,38 @@ void hessian(const F& f, const Eigen::Matrix<T, Eigen::Dynamic, 1>& x, T& fx,
6169
if (i == j) {
6270
grad(i) = fx_fvar.d_.val_;
6371
}
64-
H(i, j) = fx_fvar.d_.d_;
65-
H(j, i) = H(i, j);
72+
H.insert(j, i) = fx_fvar.d_.d_;
6673
}
6774
}
75+
H.makeCompressed();
76+
}
77+
78+
/**
79+
* Calculate the value, the gradient, and the Hessian,
80+
* of the specified function at the specified argument in
81+
* O(N^3) time and O(N^2) space. The advantage over the
82+
* mixed definition, which is faster for Hessians, is that this
83+
* version is itself differentiable.
84+
*
85+
* Overload for returning the Hessian as a symmetric dense matrix.
86+
*
87+
* @tparam T type of elements in the vector and matrix
88+
* @tparam F type of function
89+
* @param[in] f Function
90+
* @param[in] x Argument to function
91+
* @param[out] fx Function applied to argument
92+
* @param[out] grad gradient of function at argument
93+
* @param[out] H Hessian of function at argument, as a symmetric matrix
94+
*/
95+
template <typename T, typename F>
96+
void hessian(const F& f, const Eigen::Matrix<T, Eigen::Dynamic, 1>& x, T& fx,
97+
Eigen::Matrix<T, Eigen::Dynamic, 1>& grad,
98+
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>& H) {
99+
Eigen::SparseMatrix<T> hess_sparse;
100+
hessian(f, x, fx, grad, hess_sparse);
101+
102+
H = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>(hess_sparse)
103+
.template selfadjointView<Eigen::Lower>();
68104
}
69105

70106
} // namespace math

stan/math/mix/functor/hessian.hpp

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#ifndef STAN_MATH_MIX_FUNCTOR_HESSIAN_HPP
22
#define STAN_MATH_MIX_FUNCTOR_HESSIAN_HPP
33

4-
#include <stan/math/fwd/core.hpp>
54
#include <stan/math/prim/fun/Eigen.hpp>
5+
#include <stan/math/fwd/core.hpp>
6+
#include <stan/math/fwd/fun/value_of_rec.hpp>
67
#include <stan/math/rev/core.hpp>
8+
#include <stan/math/rev/fun/value_of_rec.hpp>
79
#include <stdexcept>
810

911
namespace stan {
@@ -14,6 +16,9 @@ namespace math {
1416
* of the specified function at the specified argument in
1517
* O(N^2) time and O(N^2) space.
1618
*
19+
* Instead of returning the full symmetric Hessian, we return the
20+
* lower-triangular only as a column-major compressed sparse matrix.
21+
*
1722
* <p>The functor must implement
1823
*
1924
* <code>
@@ -36,20 +41,22 @@ namespace math {
3641
* @param[in] x Argument to function
3742
* @param[out] fx Function applied to argument
3843
* @param[out] grad gradient of function at argument
39-
* @param[out] H Hessian of function at argument
44+
* @param[out] H Hessian of function at argument, as a lower-triangular
45+
* compressed sparse matrix
4046
*/
4147
template <typename F>
42-
void hessian(const F& f, const Eigen::Matrix<double, Eigen::Dynamic, 1>& x,
43-
double& fx, Eigen::Matrix<double, Eigen::Dynamic, 1>& grad,
44-
Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>& H) {
45-
H.resize(x.size(), x.size());
46-
grad.resize(x.size());
47-
48-
// need to compute fx even with size = 0
49-
if (x.size() == 0) {
50-
fx = f(x);
48+
void hessian(const F& f, const Eigen::VectorXd& x, double& fx,
49+
Eigen::VectorXd& grad, Eigen::SparseMatrix<double>& H) {
50+
int d = x.size();
51+
if (d == 0) {
52+
fx = value_of_rec(f(x));
5153
return;
5254
}
55+
56+
grad.resize(d);
57+
H.resize(d, d);
58+
H.reserve(Eigen::VectorXi::LinSpaced(d, 1, d).reverse());
59+
5360
for (int i = 0; i < x.size(); ++i) {
5461
// Run nested autodiff in this scope
5562
nested_rev_autodiff nested;
@@ -64,10 +71,34 @@ void hessian(const F& f, const Eigen::Matrix<double, Eigen::Dynamic, 1>& x,
6471
fx = fx_fvar.val_.val();
6572
}
6673
stan::math::grad(fx_fvar.d_.vi_);
67-
for (int j = 0; j < x.size(); ++j) {
68-
H(i, j) = x_fvar(j).val_.adj();
74+
for (int j = i; j < x.size(); ++j) {
75+
H.insert(j, i) = x_fvar(j).val_.adj();
6976
}
7077
}
78+
H.makeCompressed();
79+
}
80+
81+
/**
82+
* Calculate the value, the gradient, and the Hessian,
83+
* of the specified function at the specified argument in
84+
* O(N^2) time and O(N^2) space.
85+
*
86+
* Overload for returning the Hessian as a symmetric dense matrix.
87+
*
88+
* @tparam F Type of function
89+
* @param[in] f Function
90+
* @param[in] x Argument to function
91+
* @param[out] fx Function applied to argument
92+
* @param[out] grad gradient of function at argument
93+
* @param[out] H Hessian of function at argument, as a symmetric matrix
94+
*/
95+
template <typename F>
96+
void hessian(const F& f, const Eigen::VectorXd& x, double& fx,
97+
Eigen::VectorXd& grad, Eigen::MatrixXd& H) {
98+
Eigen::SparseMatrix<double> hess_sparse;
99+
hessian(f, x, fx, grad, hess_sparse);
100+
101+
H = Eigen::MatrixXd(hess_sparse).selfadjointView<Eigen::Lower>();
71102
}
72103

73104
} // namespace math

stan/math/mix/functor/hessian_times_vector.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,9 @@ void hessian_times_vector(const F& f,
4242
Eigen::Matrix<T, Eigen::Dynamic, 1>& Hv) {
4343
using Eigen::Matrix;
4444
Matrix<T, Eigen::Dynamic, 1> grad;
45-
Matrix<T, Eigen::Dynamic, Eigen::Dynamic> H;
45+
Eigen::SparseMatrix<T> H;
4646
hessian(f, x, fx, grad, H);
47-
Hv = H * v;
47+
Hv = H.template selfadjointView<Eigen::Lower>() * v;
4848
}
4949

5050
} // namespace math

stan/math/rev/functor/finite_diff_hessian_auto.hpp

Lines changed: 47 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
#include <stan/math/rev/meta.hpp>
55
#include <stan/math/rev/core.hpp>
6+
#include <stan/math/rev/fun/value_of.hpp>
67
#include <stan/math/prim/fun/Eigen.hpp>
78
#include <stan/math/rev/functor.hpp>
89
#include <stan/math/prim/fun/finite_diff_stepsize.hpp>
@@ -17,10 +18,15 @@ namespace internal {
1718
* automatically setting the stepsize between the function evaluations
1819
* along a dimension.
1920
*
21+
* Instead of returning the full symmetric Hessian, we return the
22+
* lower-triangular only as a column-major compressed sparse matrix.
23+
*
2024
* <p>The functor must implement
2125
*
2226
* <code>
23-
* double operator()(const Eigen::VectorXd&)
27+
* var
28+
* operator()(const
29+
* Eigen::Matrix<var, Eigen::Dynamic, 1>&)
2430
* </code>
2531
*
2632
* <p>For details of the algorithm, see
@@ -37,18 +43,24 @@ namespace internal {
3743
* @param[in] x Argument to function
3844
* @param[out] fx Function applied to argument
3945
* @param[out] grad_fx Gradient of function at argument
40-
* @param[out] hess_fx Hessian of function at argument
46+
* @param[out] hess_fx Hessian of function at argument, as a lower-triangular
47+
* compressed sparse matrix
4148
*/
4249
template <typename F>
4350
void finite_diff_hessian_auto(const F& f, const Eigen::VectorXd& x, double& fx,
4451
Eigen::VectorXd& grad_fx,
45-
Eigen::MatrixXd& hess_fx) {
52+
Eigen::SparseMatrix<double>& hess_fx) {
4653
int d = x.size();
54+
if (d == 0) {
55+
fx = value_of(f(x));
56+
return;
57+
}
58+
59+
gradient(f, x, fx, grad_fx);
4760

4861
Eigen::VectorXd x_temp(x);
4962
hess_fx.resize(d, d);
50-
51-
gradient(f, x, fx, grad_fx);
63+
hess_fx.reserve(Eigen::VectorXi::LinSpaced(d, 1, d).reverse());
5264

5365
std::vector<Eigen::VectorXd> g_plus(d);
5466
std::vector<Eigen::VectorXd> g_minus(d);
@@ -74,12 +86,39 @@ void finite_diff_hessian_auto(const F& f, const Eigen::VectorXd& x, double& fx,
7486
// approximate the hessian as a finite difference of gradients
7587
for (int i = 0; i < d; ++i) {
7688
for (int j = i; j < d; ++j) {
77-
hess_fx(j, i) = (g_plus[j](i) - g_minus[j](i)) / (4 * epsilons[j])
78-
+ (g_plus[i](j) - g_minus[i](j)) / (4 * epsilons[i]);
79-
hess_fx(i, j) = hess_fx(j, i);
89+
hess_fx.insert(j, i)
90+
= (g_plus[j](i) - g_minus[j](i)) / (4 * epsilons[j])
91+
+ (g_plus[i](j) - g_minus[i](j)) / (4 * epsilons[i]);
8092
}
8193
}
94+
hess_fx.makeCompressed();
95+
}
96+
97+
/**
98+
* Calculate the value and the Hessian of the specified function at
99+
* the specified argument using first-order finite difference of gradients,
100+
* automatically setting the stepsize between the function evaluations
101+
* along a dimension.
102+
*
103+
* Overload for returning the Hessian as a symmetric dense matrix.
104+
*
105+
* @tparam F Type of function
106+
* @param[in] f Function
107+
* @param[in] x Argument to function
108+
* @param[out] fx Function applied to argument
109+
* @param[out] grad_fx Gradient of function at argument
110+
* @param[out] hess_fx Hessian of function at argument, as a symmetric matrix
111+
*/
112+
template <typename F>
113+
void finite_diff_hessian_auto(const F& f, const Eigen::VectorXd& x, double& fx,
114+
Eigen::VectorXd& grad_fx,
115+
Eigen::MatrixXd& hess_fx) {
116+
Eigen::SparseMatrix<double> hess_sparse;
117+
finite_diff_hessian_auto(f, x, fx, grad_fx, hess_sparse);
118+
119+
hess_fx = Eigen::MatrixXd(hess_sparse).selfadjointView<Eigen::Lower>();
82120
}
121+
83122
} // namespace internal
84123
} // namespace math
85124
} // namespace stan

test/unit/math/rev/functor/finite_diff_hessian_auto_test.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ struct exp_full {
5656
struct one_arg {
5757
template <typename T>
5858
inline T operator()(const Matrix<T, Dynamic, 1>& x) const {
59+
using stan::math::pow;
5960
return pow(x(0), 3);
6061
}
6162
};

0 commit comments

Comments
 (0)