Commit 7b803bd

sdaulton authored and facebook-github-bot committed
add normalization to warping function (#2692)
Summary:
Pull Request resolved: #2692
X-link: facebook/Ax#3259

Warping only works on the unit cube, so this change ensures that inputs are first normalized. It does not use `ChainedInputTransform`, to avoid nesting one `ChainedInputTransform` inside another.

This is needed to support linear+warping models in MBM when we aren't using `UnitX`. We'd want to 1) ensure the data is in the unit cube before applying `Warp`, and 2) then center the warped data at 0 (using `Normalize`). One way to do this would be to apply `Normalize(center=0.5)`, `Warp`, `Normalize(center=0.0)`, but we can't currently specify different options for two different transforms of the same class. So this instead takes an approach suggested by saitcakmak: include normalization in the `Warp` transform itself, since we always want inputs to be in the unit cube before warping.

Reviewed By: saitcakmak, Balandat

Differential Revision: D68356342

fbshipit-source-id: dbd6682d2aacf779c6c813e1a7b779c04f290af0
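As a rough illustration of the intended behavior (a minimal sketch, not part of this commit; it assumes the post-commit `Warp(d, indices, ..., bounds=...)` signature shown in the diff below, and the tensor values are made up):

```python
import torch
from botorch.models.transforms.input import Normalize, Warp

# Inputs live in [1, 2]^3 rather than in the unit cube.
bounds = torch.tensor([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]])
X = 1.0 + torch.rand(8, 3)

# With this change, Warp first maps the indexed inputs to [0, 1] via an
# internal Normalize(d=d, indices=indices, bounds=bounds) and only then
# applies the Kumaraswamy CDF.
warp = Warp(d=3, indices=[0, 2], bounds=bounds)
X_warped = warp(X)

# This mirrors chaining Normalize -> (unit-cube) Warp by hand, without
# nesting a ChainedInputTransform inside another ChainedInputTransform.
unit_bounds = torch.tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])
normalize = Normalize(d=3, indices=[0, 2], bounds=bounds)
warp_unit = Warp(d=3, indices=[0, 2], bounds=unit_bounds)
print(torch.allclose(X_warped, warp_unit(normalize(X))))  # expected: True
```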
1 parent 6b75672 commit 7b803bd

File tree (4 files changed: +98 −38)

botorch/models/transforms/input.py
test/models/transforms/test_input.py
test/optim/utils/test_acquisition_utils.py
tutorials/bo_with_warped_gp.ipynb

botorch/models/transforms/input.py

Lines changed: 40 additions & 5 deletions
@@ -1071,6 +1071,7 @@ class Warp(ReversibleInputTransform, GPyTorchModule):
 
     def __init__(
         self,
+        d: int,
         indices: list[int],
         transform_on_train: bool = True,
         transform_on_eval: bool = True,
@@ -1080,6 +1081,7 @@ def __init__(
         concentration1_prior: Prior | None = None,
         concentration0_prior: Prior | None = None,
         batch_shape: torch.Size | None = None,
+        bounds: Tensor | None = None,
     ) -> None:
         r"""Initialize transform.
 
@@ -1102,6 +1104,7 @@ def __init__(
                 parameters for each batch of inputs. This should match the input batch
                 shape of the model (i.e., `train_X.shape[:-2]`).
                 NOTE: This is only supported for single-output models.
+            bounds: A `2 x d`-dim tensor of lower and upper bounds for the inputs.
         """
         super().__init__()
         self.register_buffer("indices", torch.tensor(indices, dtype=torch.long))
@@ -1112,6 +1115,9 @@ def __init__(
         self.batch_shape = batch_shape or torch.Size([])
         self._X_min = eps
         self._X_range = 1 - 2 * eps
+        self._normalize = Normalize(
+            d=d, indices=indices, bounds=bounds, batch_shape=self.batch_shape
+        )
         if len(self.batch_shape) > 0:
             # Note: this follows the gpytorch shape convention for lengthscales
             # There is ongoing discussion about the extra `1`.
@@ -1156,7 +1162,7 @@ def _set_concentration(self, i: int, value: float | Tensor) -> None:
         self.initialize(**{f"concentration{i}": value})
 
     @subset_transform
-    def _transform(self, X: Tensor) -> Tensor:
+    def _warp_transform(self, X: Tensor) -> Tensor:
         r"""Warp the inputs through the Kumaraswamy CDF.
 
         Args:
@@ -1165,10 +1171,9 @@ def _transform(self, X: Tensor) -> Tensor:
                 it is broadcastable with self.batch_shape if self.batch_shape is set.
 
         Returns:
-            A `input_batch_shape x (batch_shape) x n x d`-dim tensor of transformed
-            inputs.
+            A `input_batch_shape x (batch_shape) x n x d`-dim tensor
+            of transformed inputs.
         """
-        # normalize to [eps, 1-eps], IDEA: could use Normalize and ChainedTransform.
         return self._k.cdf(
             torch.clamp(
                 X * self._X_range + self._X_min,
@@ -1177,7 +1182,23 @@ def _transform(self, X: Tensor) -> Tensor:
             )
         )
 
-    @subset_transform
+    def _transform(self, X: Tensor) -> Tensor:
+        r"""Warp the inputs through the Kumaraswamy CDF.
+
+        Args:
+            X: A `input_batch_shape x (batch_shape) x n x d`-dim tensor of inputs.
+                batch_shape here can either be self.batch_shape or 1's such that
+                it is broadcastable with self.batch_shape if self.batch_shape is set.
+
+        Returns:
+            A `input_batch_shape x (batch_shape) x n x d`-dim tensor of transformed
+                inputs.
+        """
+        # Normalize to unit cube
+        X = self._normalize(X=X)
+        # normalize to [eps, 1-eps], IDEA: could use Normalize and ChainedTransform.
+        return self._warp_transform(X=X)
+
     def _untransform(self, X: Tensor) -> Tensor:
         r"""Warp the inputs through the Kumaraswamy inverse CDF.
 
@@ -1194,6 +1215,20 @@ def _untransform(self, X: Tensor) -> Tensor:
                 "The right most batch dims of X must match self.batch_shape: "
                 f"({self.batch_shape})."
             )
+        untransformed_X = self._warp_untransform(X=X)
+        return self._normalize.untransform(X=untransformed_X)
+
+    @subset_transform
+    def _warp_untransform(self, X: Tensor) -> Tensor:
+        r"""Warp the inputs through the Kumaraswamy inverse CDF.
+
+        Args:
+            X: A `input_batch_shape x batch_shape x n x d`-dim tensor of inputs.
+
+        Returns:
+            A `input_batch_shape x batch_shape x n x d`-dim tensor of transformed
+                inputs.
+        """
         # unnormalize from [eps, 1-eps] to [0,1]
         return ((self._k.icdf(X) - self._X_min) / self._X_range).clamp(0.0, 1.0)

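A quick round-trip sketch of the new `_transform`/`_untransform` pairing (illustrative only, not part of the diff; it uses the post-commit constructor above and made-up data):

```python
import torch
from botorch.models.transforms.input import Warp

bounds = torch.tensor([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]])
warp = Warp(d=3, indices=[0, 2], bounds=bounds)

X = 1.0 + torch.rand(5, 3)
X_tf = warp.transform(X)         # normalize to [0, 1], then Kumaraswamy CDF
X_back = warp.untransform(X_tf)  # inverse CDF, then unnormalize back to [1, 2]
print(torch.allclose(X, X_back, atol=1e-3))  # expected: True (up to eps clamping)
```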
test/models/transforms/test_input.py

Lines changed: 34 additions & 9 deletions
@@ -41,8 +41,11 @@
 from torch.nn.functional import one_hot
 
 
-def get_test_warp(indices, **kwargs):
-    warp_tf = Warp(indices=indices, **kwargs)
+def get_test_warp(d, indices, bounds=None, **kwargs):
+    if bounds is None:
+        bounds = torch.zeros(2, d)
+        bounds[1] = 1
+    warp_tf = Warp(d=d, indices=indices, bounds=bounds, **kwargs)
     c0 = torch.tensor([1.0, 2.0])[: len(indices)]
     c1 = torch.tensor([2.0, 3.0])[: len(indices)]
     batch_shape = kwargs.get("batch_shape", torch.Size([]))
@@ -1031,12 +1034,17 @@ def test_warp_transform(self) -> None:
         ):
             tkwargs = {"device": self.device, "dtype": dtype}
             eps = 1e-6 if dtype == torch.double else 1e-5
+            if dtype == torch.float32:
+                # defaults are 1e-5, 1e-8
+                tols = {"rtol": 2e-5, "atol": 8e-8}
+            else:
+                tols = {}
 
             # basic init
             indices = [0, 2]
-            warp_tf = get_test_warp(indices, batch_shape=warp_batch_shape, eps=eps).to(
-                **tkwargs
-            )
+            warp_tf = get_test_warp(
+                d=3, indices=indices, batch_shape=warp_batch_shape, eps=eps
+            ).to(**tkwargs)
             self.assertTrue(warp_tf.training)
 
             k = Kumaraswamy(warp_tf.concentration1, warp_tf.concentration0)
@@ -1049,7 +1057,7 @@ def test_warp_transform(self) -> None:
             X = X.unsqueeze(-3) if len(warp_batch_shape) > 0 else X
             with torch.no_grad():
                 warp_tf = get_test_warp(
-                    indices=indices, batch_shape=warp_batch_shape, eps=eps
+                    d=3, indices=indices, batch_shape=warp_batch_shape, eps=eps
                 ).to(**tkwargs)
                 X_tf = warp_tf(X)
             expected_X_tf = expand_and_copy_tensor(
@@ -1077,7 +1085,8 @@ def test_warp_transform(self) -> None:
 
             # test no transform on eval
             warp_tf = get_test_warp(
-                indices,
+                d=3,
+                indices=indices,
                 transform_on_eval=False,
                 batch_shape=warp_batch_shape,
                 eps=eps,
@@ -1090,6 +1099,7 @@ def test_warp_transform(self) -> None:
 
             # test no transform on train
             warp_tf = get_test_warp(
+                d=3,
                 indices=indices,
                 transform_on_train=False,
                 batch_shape=warp_batch_shape,
@@ -1103,6 +1113,7 @@ def test_warp_transform(self) -> None:
 
             # test equals
             warp_tf2 = get_test_warp(
+                d=3,
                 indices=indices,
                 transform_on_train=False,
                 batch_shape=warp_batch_shape,
@@ -1111,11 +1122,12 @@ def test_warp_transform(self) -> None:
             self.assertTrue(warp_tf.equals(warp_tf2))
             # test different transform_on_train
             warp_tf2 = get_test_warp(
-                indices=indices, batch_shape=warp_batch_shape, eps=eps
+                d=3, indices=indices, batch_shape=warp_batch_shape, eps=eps
             )
             self.assertFalse(warp_tf.equals(warp_tf2))
             # test different indices
             warp_tf2 = get_test_warp(
+                d=3,
                 indices=[0, 1],
                 transform_on_train=False,
                 batch_shape=warp_batch_shape,
@@ -1137,6 +1149,7 @@ def test_warp_transform(self) -> None:
             prior0 = LogNormalPrior(0.0, 0.75).to(**tkwargs)
             prior1 = LogNormalPrior(0.0, 0.5).to(**tkwargs)
             warp_tf = get_test_warp(
+                d=3,
                 indices=[0, 1],
                 concentration0_prior=prior0,
                 concentration1_prior=prior1,
@@ -1148,11 +1161,23 @@ def test_warp_transform(self) -> None:
                 self.assertIsInstance(p, LogNormalPrior)
                 self.assertEqual(p.base_dist.scale, 0.75 if i == 0 else 0.5)
 
+            # test non-unit cube bounds
+            warp_tf = get_test_warp(
+                d=3,
+                indices=[0, 2],
+                eps=eps,
+                batch_shape=warp_batch_shape,
+                bounds=torch.tensor([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]], **tkwargs),
+            ).to(**tkwargs)
+            X[..., indices] += 1
+            X_tf = warp_tf(X)
+            self.assertAllClose(expected_X_tf, X_tf, **tols)
+
             # test gradients
             X = 1 + 5 * torch.rand(*batch_shape, 4, 3, **tkwargs)
             X = X.unsqueeze(-3) if len(warp_batch_shape) > 0 else X
             warp_tf = get_test_warp(
-                indices=indices, batch_shape=warp_batch_shape, eps=eps
+                d=3, indices=indices, batch_shape=warp_batch_shape, eps=eps
             ).to(**tkwargs)
             X_tf = warp_tf(X)
             X_tf.sum().backward()

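In isolation, the new non-unit-cube test case boils down to the following check (a sketch using the same values as the test; the `get_test_warp` helper, batch shapes, and float32 tolerances are omitted):

```python
import torch
from botorch.models.transforms.input import Warp

indices = [0, 2]
unit_bounds = torch.zeros(2, 3)
unit_bounds[1] = 1
X = torch.rand(4, 3)

# Reference: warping data that already lies in the unit cube.
expected = Warp(d=3, indices=indices, bounds=unit_bounds)(X)

# Shift the warped dimensions into [1, 2] and pass matching bounds; the
# internal normalization should recover the same warped values.
X_shifted = X.clone()
X_shifted[..., indices] += 1
bounds = torch.tensor([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]])
actual = Warp(d=3, indices=indices, bounds=bounds)(X_shifted)
print(torch.allclose(expected, actual, rtol=2e-5, atol=8e-8))  # expected: True
```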
test/optim/utils/test_acquisition_utils.py

Lines changed: 3 additions & 1 deletion
@@ -240,7 +240,9 @@ def test_get_X_baseline(self):
             # to the train_inputs when the model is in eval mode, we
             # extract the untransformed train_inputs
             model = SingleTaskGP(
-                X_train, Y_train[:, :1], input_transform=Warp(indices=[0, 1])
+                X_train,
+                Y_train[:, :1],
+                input_transform=Warp(d=X_train.shape[-1], indices=[0, 1]),
             )
             model.eval()
             self.assertFalse(torch.equal(model.train_inputs[0], X_train))

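For reference, the same construction pattern outside the test harness (a sketch with made-up training data; only the `Warp(d=..., indices=...)` call reflects the change in this commit):

```python
import torch
from botorch.models import SingleTaskGP
from botorch.models.transforms.input import Warp

X_train = torch.rand(10, 2, dtype=torch.double)
Y_train = torch.rand(10, 1, dtype=torch.double)

# `d` is now required; without explicit `bounds`, the internal Normalize
# uses its defaults for the warped dimensions (see the input.py diff above).
model = SingleTaskGP(
    X_train,
    Y_train,
    input_transform=Warp(d=X_train.shape[-1], indices=[0, 1]),
)
model.eval()
```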
tutorials/bo_with_warped_gp.ipynb

Lines changed: 21 additions & 23 deletions
Large diffs are not rendered by default.
