Replies: 28 comments
-
You basically have two options (possibly others, but these are the two I can think of right now):
Hope this helps.
-
@Balandat Thanks for both suggestions! I think suggestion 1 is particularly simple and is exactly what I need. However, I am stuck on the finer details of implementing it. What I see is that if I just subset the first output of the mean and the first
Error produced:
-
If you override the forward method, then you won't be able to fit this model properly with the GPyTorch internals. What I was getting at is a simple BoTorch model variant that leaves the actual GPyTorch inference untouched and simply extracts the appropriate outcome from the multi-output posterior. (I guess the term "wrapper" in this context may be debatable, but it works...)
Then, say you have a trained
The subsetting of the output is quite cumbersome right now, because of how GPyTorch represents multi-task MVNs. I have an upstream PR that should simplify this and provide some convenience methods; I'm hoping to work on this some more soon: cornellius-gp/gpytorch#1083
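To make the idea concrete, here is a minimal sketch of that kind of wrapper in plain numpy (purely illustrative -- the class name, method, and the interleaved output layout are assumptions here, not the actual BoTorch `Model` API): the joint multi-output posterior mean and covariance are subset down to a single outcome.

```python
import numpy as np

class SingleOutcomeWrapper:
    """Illustrative wrapper: extract one outcome from a joint
    multi-output posterior whose outputs are interleaved per point.
    (Hypothetical class, not the BoTorch API.)"""

    def __init__(self, mean, cov, num_outputs):
        self.mean = np.asarray(mean)  # shape (n * num_outputs,)
        self.cov = np.asarray(cov)    # shape (n * num_outputs, n * num_outputs)
        self.t = num_outputs

    def outcome(self, i):
        # Take every t-th entry starting at i: the block for output i.
        idx = np.arange(i, len(self.mean), self.t)
        return self.mean[idx], self.cov[np.ix_(idx, idx)]

# Usage: 2 points, 2 outputs, interleaved as [y0(x1), y1(x1), y0(x2), y1(x2)]
mean = np.array([1.0, 10.0, 2.0, 20.0])
cov = np.diag([0.1, 1.0, 0.2, 2.0])
m0, c0 = SingleOutcomeWrapper(mean, cov, 2).outcome(0)
print(m0)           # [1. 2.]
print(np.diag(c0))  # [0.1 0.2]
```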
-
Ah, I see -- thanks! Just FYI, with a wrapper like your
-
Hmm, not sure what causes this. Note that for full generality you want to pass
With this, first running your code (up to failure), then running my code, and then doing the following works for me:
-
Thanks @Balandat. Just to clarify, I used the
for which I get the same error as above. (Edit) I think the error is due to MES() only being able to take inputs of size
-
Sorry, I must have pasted the wrong snippet; what I meant to say is that this works for me:
What version of BoTorch are you using? (Side note: I am running into some singularity issues with posterior sampling in this simple 1d example, but those are unrelated to the model wrapper.)
-
I am using v
I am aware of the singularity issue with the Cholesky decomposition that you're talking about. I went ahead and added a nugget of
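For reference, the nugget/jitter trick being discussed can be sketched as follows (illustrative numpy, not GPyTorch's actual `psd_safe_cholesky`): a small multiple of the identity is added to the diagonal, increasing until the Cholesky factorization succeeds.

```python
import numpy as np

def jittered_cholesky(K, jitter=1e-8, max_tries=6):
    """Try a Cholesky factorization, adding increasing diagonal
    jitter until it succeeds (illustrative sketch)."""
    try:
        return np.linalg.cholesky(K)
    except np.linalg.LinAlgError:
        pass
    for i in range(max_tries):
        eps = jitter * (10 ** i)
        try:
            return np.linalg.cholesky(K + eps * np.eye(K.shape[0]))
        except np.linalg.LinAlgError:
            continue
    raise np.linalg.LinAlgError("matrix not PD even with jitter")

# A singular (rank-1) covariance matrix: plain Cholesky fails, jitter succeeds.
K = np.ones((3, 3))
L = jittered_cholesky(K)
print(np.allclose(L @ L.T, K, atol=1e-3))  # True
```

The trade-off is that the jitter slightly biases the posterior, so it is usually kept as small as possible.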
-
I am not sure if we fixed MES since then, but the code does run as above with the wrapper model and
Re singularity: I can't really point to any issues directly, but exactly what kind of model were you trying to fit in GPyTorch? Depending on the amount of data, the hyperparameters may end up with degenerate values if they are unconstrained / do not have a prior. I would need a repro of your model to understand exactly what's going on (if this is just about the model, then let's maybe move that discussion to the GPyTorch GitHub).
-
Okay, thanks, I will update BoTorch and give it a shot. Like you said, I will open a new issue on the GPyTorch GitHub about the singularity issue.
-
@r-ashwin has this been resolved?
-
Yes, it is. Thank you very much!
-
@Balandat quick question on the
The reason I ask is because, in my case, for the same training data, with the approach above, instantiating
With or without gradients, the auto-covariance of the training data should be the same, unless the kernel hyperparameters are different. I understand that the kernel hyperparameters will not necessarily be the same, but when I actually checked their values, they were not small enough to cause any singularity issue in the covariance matrix. I would appreciate your thoughts on this. A full reproducible example is attached. Thanks!
-
@Balandat Just following up on the previous question, in case you had a chance to take a look at it. I noticed that the error appears for some test functions and not for others. I guess my question comes down to this: how can I subset the covariance matrix corresponding to the observations only, out of the joint (obs. + grad) covariance matrix? Thanks very much!
-
We are, essentially. The
I'm having some trouble running the attached nb - some vars are not defined. Also, the nb doesn't include the full stack trace, so it's hard to figure out what's going on without actually running into this myself - mind updating the nb? My hunch is that MES is trying to instantiate a PyTorch
-
Sorry about that - I have now updated the notebook. I am using v0.3.3, which is indeed using the safe_cholesky. As you said, MES is indeed evaluating the posterior covariance matrix on the candidate set (see part of the stack trace below). However,
This raises a couple of questions: (1) because the size of the candidate set can potentially cause problems, is MES recommended only for low-dimensional problems? (2) Is there a recommended size for the
Also, I tried training a SingleTaskGP on Branin with up to 5000 training points without singularity issues. Since you say the covar matrix
-
So the current implementation of MES uses samples at a discrete set of points to draw max posterior samples. This is indeed limited in scalability as the dimension of the problem grows. An alternative is to use Random Fourier Features (RFFs) or decoupled sampling as described e.g. in https://arxiv.org/abs/2011.04026, but that adds additional complexity, since now we have to optimize each approximate posterior draw for each sample (each of which is a highly nonconvex high-dimensional optimization problem in its own right). We have some of this in the works and plan to add functionality for using this in MES in the future.
I don't have a lot of specific data on this, unfortunately. Generally, the denser the set of discrete points, the better the quality of the max posterior samples will be. So usually one cranks this up until it either gets too slow or causes numerical issues.
Great point, this looks like it's a bug...
Well, in your example with the
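For context on the RFF idea mentioned above, here is a minimal sketch (plain numpy, not the BoTorch implementation) of approximating a unit-lengthscale RBF kernel with random Fourier features; posterior draws then become finite-dimensional functions that are cheap to optimize.

```python
import numpy as np

rng = np.random.default_rng(0)
d, D = 2, 4096                   # input dim, number of random features
W = rng.standard_normal((D, d))  # spectral samples for a unit-lengthscale RBF
b = rng.uniform(0, 2 * np.pi, D)

def phi(X):
    # Random Fourier feature map: k(x, y) ~= phi(x) @ phi(y)
    return np.sqrt(2.0 / D) * np.cos(X @ W.T + b)

x = np.array([[0.3, -0.1]])
y = np.array([[0.5, 0.4]])
exact = np.exp(-np.sum((x - y) ** 2) / 2.0)  # RBF kernel value
approx = float(phi(x) @ phi(y).T)
print(abs(exact - approx))  # small; error shrinks as O(1 / sqrt(D))
```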
-
Thanks.
Hmm... in any case, it is the ill-conditioning of this
-
No, it’s the full posterior covariance matrix
-
The first approach above (the one that subselects the covariance matrix) does not look correct to me, as far as I understand (where you pick every
Edit: the error seems to be due to the mismatch between the 3 outputs in the model and the scalarized output, but I have no idea where the fix for this would be.
-
I don't think I understand. There is no indexing going on in the prior covariance matrix, and the indexing into the posterior covariance happens on the computed one (after all inverses/solves have already been done).
Thanks for the repro. Looking at the MES code, I see there is some shape funkiness going on that I don't fully understand; I will have to take a closer look. It also appears that the code is doing some repeated work that we shouldn't be doing. Let me see if I can clean this up.
-
Thanks @Balandat !
In the above lines in your wrapper class (assuming we have
-
This should choose every third column/row, starting with the first one. So this should correspond to the covariance across the observations.
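To illustrate the indexing being discussed (assuming a joint covariance that interleaves, per training point, the observation followed by two gradient entries -- that layout is an assumption here):

```python
import numpy as np

t = 3  # outputs per point: [obs, dy/dx1, dy/dx2]
n = 4  # training points
# Label each joint entry by (point, output) so the subset can be checked.
labels = np.array([(i, j) for i in range(n) for j in range(t)])
# Dummy joint covariance matrix, just to demonstrate the shapes.
K = np.add.outer(np.arange(n * t), np.arange(n * t)).astype(float)

K_obs = K[::3, ::3]  # every third row/column, starting at the first
print(K_obs.shape)        # (4, 4) -- one row/column per training point
print(labels[::3][:, 1])  # [0 0 0 0] -- every selected entry is an observation
```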
-
That's the part I disagree with, but let me see if I can set up a case to verify that.
-
If you're worried about the structure of the covariance (whether it's cross-point covariances stacked for each output, or cross-output covariances stacked for each data point), this shouldn't matter here, since you only evaluate at a single point.
-
Okay, I think having the scalarized version might still be useful, since it is more generic. So whenever you get a chance to review my repro, please let me know. Thanks very much! Much appreciated!
-
@Balandat Any luck with the repro? I guess I am a bit stuck because trying to fix the dimension mismatch leads to further errors and it is not clear how everything propagates. 😇
-
One thing I see in your nb is that you're passing a
I would suggest taking a look at https://github.com/pytorch/botorch/blob/master/botorch/acquisition/objective.py#L29. You should be able to instantiate this in the constructor and then just push the posteriors returned by all
Sorry, I'm pretty caught up in other stuff right now, so I won't be able to test this myself until next week or so.
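On the scalarization point: mathematically, a linear scalarization of a multi-output Gaussian posterior stays Gaussian, with mean wᵀμ and variance wᵀΣw. A quick numpy sketch with hypothetical numbers (illustrative only -- BoTorch's objective classes apply this to posterior objects for you):

```python
import numpy as np

# Joint posterior over 3 outputs at a single test point (made-up numbers).
mu = np.array([1.0, 2.0, 3.0])
Sigma = np.array([[1.0, 0.2, 0.1],
                  [0.2, 2.0, 0.3],
                  [0.1, 0.3, 3.0]])
w = np.array([0.5, 0.3, 0.2])  # scalarization weights

mean_s = w @ mu        # scalarized mean: w^T mu
var_s = w @ Sigma @ w  # scalarized variance: w^T Sigma w
print(round(mean_s, 3))  # 1.7
print(round(var_s, 3))   # 0.666
```

Because the result is a one-dimensional Gaussian, single-output acquisition functions can then be applied to it directly.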
-
If you are submitting a bug report or feature request, please use the respective
issue template.
Issue description
I am trying to use the MultiTaskGP model from GPyTorch with BoTorch's qMaxValueEntropy. I get the UnsupportedError because the objective kwarg is not supported. See the error below:
```
---------------------------------------------------------------------------
```
System Info
Please provide information about your setup, including:
- BoTorch version: 0.2.5
- GPyTorch version: 1.1.1
- PyTorch version: 1.5.0+cpu
- OS: windows