Merge pull request #5 from ritheshkumar95/tristan/vq-function

ritheshkumar95 · web-flow · commit 279c5e9e6fab · 2018-05-22T14:28:36.000-04:00
Vector Quantization as a Pytorch Function
diff --git a/.gitignore b/.gitignore
@@ -45,6 +45,7 @@ nosetests.xml
 coverage.xml
 *.cover
 .hypothesis/
+.pytest_cache/
 
 # Translations
 *.mo
diff --git a/functions.py b/functions.py
@@ -0,0 +1,67 @@
+import torch
+from torch.autograd import Function
+
+class VectorQuantization(Function):
+    """Nearest-codebook-entry lookup; returns indices, not differentiable."""
+    @staticmethod
+    def forward(ctx, inputs, codebook):
+        # inputs: (..., D), codebook: (K, D); result has shape inputs.size()[:-1].
+        with torch.no_grad():
+            dim = codebook.size(1)
+            flat = inputs.view(-1, dim)
+            # ||x - e||^2 = ||e||^2 + ||x||^2 - 2 <x, e>, for every (x, e) pair
+            sq_norms = (torch.sum(codebook ** 2, dim=1)
+                + torch.sum(flat ** 2, dim=1, keepdim=True))
+            distances = torch.addmm(sq_norms, flat, codebook.t(),
+                alpha=-2.0, beta=1.0)
+
+            nearest = torch.min(distances, dim=1)[1]
+            indices = nearest.view(*inputs.size()[:-1])
+            ctx.mark_non_differentiable(indices)
+            return indices
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        # Integer indices carry no gradient, so fail loudly instead.
+        message = ('Trying to call `.grad()` on graph containing '
+            '`VectorQuantization`. The function `VectorQuantization` '
+            'is not differentiable. Use `VectorQuantizationStraightThrough` '
+            'if you want a straight-through estimator of the gradient.')
+        raise RuntimeError(message)
+
+class VectorQuantizationStraightThrough(Function):
+    """Quantize `inputs` to codebook rows, with straight-through gradients."""
+    @staticmethod
+    def forward(ctx, inputs, codebook):
+        # One codebook row index per input vector, flattened to 1-D.
+        flat_indices = vq(inputs, codebook).view(-1)
+        ctx.save_for_backward(flat_indices, codebook)
+        ctx.mark_non_differentiable(flat_indices)
+
+        # Gather the selected rows and restore the shape of `inputs`.
+        selected = torch.index_select(codebook, dim=0, index=flat_indices)
+        codes = selected.view_as(inputs)
+
+        return (codes, flat_indices)
+
+    @staticmethod
+    def backward(ctx, grad_output, grad_indices):
+        # `grad_indices` is unused: the indices are non-differentiable.
+        want_inputs, want_codebook = ctx.needs_input_grad[:2]
+
+        # Straight-through estimator: pass the gradient on unchanged.
+        grad_inputs = grad_output.clone() if want_inputs else None
+
+        grad_codebook = None
+        if want_codebook:
+            # Sum the incoming gradient rows into the rows that were selected.
+            flat_indices, codebook = ctx.saved_tensors
+            flat_grad = grad_output.contiguous().view(-1, codebook.size(1))
+            grad_codebook = torch.zeros_like(codebook)
+            grad_codebook.index_add_(0, flat_indices, flat_grad)
+
+        return (grad_inputs, grad_codebook)
+
+vq = VectorQuantization.apply  # returns indices only (non-differentiable)
+vq_st = VectorQuantizationStraightThrough.apply  # returns (codes, flat indices)
+__all__ = ['vq', 'vq_st']  # must be name strings, else `import *` raises TypeError
diff --git a/miniimagenet_pixelcnn_prior.py b/miniimagenet_pixelcnn_prior.py
@@ -14,7 +14,7 @@ def train(data_loader, model, prior, optimizer, args, writer):
     for images, labels in data_loader:
         with torch.no_grad():
             images = images.to(args.device)
-            latents, _ = model.encode(images)
+            latents = model.encode(images)
             latents = latents.detach()
 
         labels = labels.to(args.device)
@@ -39,7 +39,7 @@ def test(data_loader, model, prior, args, writer):
             images = images.to(args.device)
             labels = labels.to(args.device)
 
-            latents, _ = model.encode(images)
+            latents = model.encode(images)
             latents = latents.detach()
             logits = prior(latents, labels)
             logits = logits.permute(0, 2, 3, 1).contiguous()
diff --git a/miniimagenet_vqvae.py b/miniimagenet_vqvae.py
@@ -15,22 +15,16 @@ def train(data_loader, model, optimizer, args, writer):
 
         optimizer.zero_grad()
         x_tilde, z_e_x, z_q_x = model(images)
-        z_q_x.retain_grad()
 
+        # Reconstruction loss
         loss_recons = F.mse_loss(x_tilde, images)
-        loss_recons.backward(retain_graph=True)
-
-        # Straight-through estimator
-        z_e_x.backward(z_q_x.grad, retain_graph=True)
-
         # Vector quantization objective
-        model.codebook.embedding.zero_grad()
         loss_vq = F.mse_loss(z_q_x, z_e_x.detach())
-        loss_vq.backward(retain_graph=True)
-
         # Commitment objective
-        loss_commit = args.beta * F.mse_loss(z_e_x, z_q_x.detach())
-        loss_commit.backward()
+        loss_commit = F.mse_loss(z_e_x, z_q_x.detach())
+
+        loss = loss_recons + loss_vq + args.beta * loss_commit
+        loss.backward()
 
         # Logs
         writer.add_scalar('loss/train/reconstruction', loss_recons.item(), args.steps)
diff --git a/modules.py b/modules.py
@@ -4,6 +4,7 @@
 from torch.distributions.normal import Normal
 from torch.distributions import kl_divergence
 
+from functions import vq, vq_st
 
 def to_scalar(arr):
     if type(arr) == list:
@@ -73,17 +74,21 @@ def __init__(self, K, D):
         self.embedding.weight.data.uniform_(-1./K, 1./K)
 
     def forward(self, z_e_x):
-        # z_e_x - (B, D, H, W)
-        # emb   - (K, D)
+        z_e_x_ = z_e_x.permute(0, 2, 3, 1).contiguous()
+        latents = vq(z_e_x_, self.embedding.weight)
+        return latents
 
-        emb = self.embedding.weight
-        dists = torch.pow(
-            z_e_x.unsqueeze(1) - emb[None, :, :, None, None],
-            2
-        ).sum(2)
+    def straight_through(self, z_e_x):
+        z_e_x_ = z_e_x.permute(0, 2, 3, 1).contiguous()
+        z_q_x_, indices = vq_st(z_e_x_, self.embedding.weight)
+        z_q_x = z_q_x_.permute(0, 3, 1, 2).contiguous()
 
-        latents = dists.min(1)[1]
-        return latents
+        z_q_x_bar_flatten = torch.index_select(self.embedding.weight,
+            dim=0, index=indices)
+        z_q_x_bar_ = z_q_x_bar_flatten.view_as(z_e_x_)
+        z_q_x_bar = z_q_x_bar_.permute(0, 3, 1, 2).contiguous()
+
+        return z_q_x, z_q_x_bar
 
 
 class ResBlock(nn.Module):
@@ -132,16 +137,17 @@ def __init__(self, input_dim, dim, K=512):
     def encode(self, x):
         z_e_x = self.encoder(x)
         latents = self.codebook(z_e_x)
-        return latents, z_e_x
+        return latents
 
     def decode(self, latents):
         z_q_x = self.codebook.embedding(latents).permute(0, 3, 1, 2)  # (B, D, H, W)
         x_tilde = self.decoder(z_q_x)
-        return x_tilde, z_q_x
+        return x_tilde
 
     def forward(self, x):
-        latents, z_e_x = self.encode(x)
-        x_tilde, z_q_x = self.decode(latents)
+        z_e_x = self.encoder(x)
+        z_q_x_st, z_q_x = self.codebook.straight_through(z_e_x)
+        x_tilde = self.decoder(z_q_x_st)
         return x_tilde, z_e_x, z_q_x
 
 
diff --git a/pixelcnn_prior.py b/pixelcnn_prior.py
@@ -118,7 +118,7 @@ def generate_samples():
     label = label.to(device=DEVICE, dtype=torch.int64)
 
     latents = model.generate(label, shape=LATENT_SHAPE, batch_size=100)
-    x_tilde, _ = autoencoder.decode(latents)
+    x_tilde = autoencoder.decode(latents)
     images = (x_tilde.cpu().data + 1) / 2
 
     save_image(
diff --git a/test_functions.py b/test_functions.py
@@ -0,0 +1,72 @@
+import pytest
+
+import numpy as np
+import torch
+
+from functions import vq, vq_st
+
+def test_vq_shape():
+    """`vq` yields one int64, non-differentiable index per input vector."""
+    vectors = torch.rand((2, 3, 5, 7), dtype=torch.float32, requires_grad=True)
+    entries = torch.rand((11, 7), dtype=torch.float32, requires_grad=True)
+    result = vq(vectors, entries)
+    assert result.dtype == torch.int64
+    assert not result.requires_grad
+    assert result.size() == (2, 3, 5)
+
+def test_vq():
+    """`vq` agrees with a brute-force nearest-neighbour search."""
+    vectors = torch.rand((2, 3, 5, 7), dtype=torch.float32, requires_grad=True)
+    entries = torch.rand((11, 7), dtype=torch.float32, requires_grad=True)
+    result = vq(vectors, entries)
+
+    # Reference: Euclidean distance from every vector to every codebook row.
+    pairwise = torch.norm(vectors.unsqueeze(3) - entries, p=2, dim=4)
+    expected = torch.min(pairwise, dim=3)[1]
+
+    assert np.allclose(result.numpy(), expected.numpy())
+
+def test_vq_st_shape():
+    """`vq_st` returns float codes shaped like the input plus flat indices."""
+    vectors = torch.rand((2, 3, 5, 7), dtype=torch.float32, requires_grad=True)
+    entries = torch.rand((11, 7), dtype=torch.float32, requires_grad=True)
+    quantized, flat_indices = vq_st(vectors, entries)
+
+    assert quantized.dtype == torch.float32
+    assert quantized.requires_grad
+    assert quantized.size() == (2, 3, 5, 7)
+    assert flat_indices.dtype == torch.int64
+    assert not flat_indices.requires_grad
+    assert flat_indices.size() == (2 * 3 * 5,)
+
+def test_vq_st_gradient1():
+    """The gradient wrt. the inputs is the output gradient, unchanged."""
+    vectors = torch.rand((2, 3, 5, 7), dtype=torch.float32, requires_grad=True)
+    entries = torch.rand((11, 7), dtype=torch.float32, requires_grad=True)
+    quantized = vq_st(vectors, entries)[0]
+
+    upstream = torch.rand((2, 3, 5, 7))
+    received = torch.autograd.grad(quantized, vectors,
+        grad_outputs=[upstream])[0]
+
+    assert received.size() == (2, 3, 5, 7)
+    assert np.allclose(upstream.numpy(), received.numpy())
+
+def test_vq_st_gradient2():
+    """The codebook gradient matches the one given by `torch.embedding`."""
+    vectors = torch.rand((2, 3, 5, 7), dtype=torch.float32, requires_grad=True)
+    entries = torch.rand((11, 7), dtype=torch.float32, requires_grad=True)
+    quantized = vq_st(vectors, entries)[0]
+
+    # Reference lookup of the same codebook rows through `torch.embedding`.
+    reference = torch.embedding(entries, vq(vectors, entries), padding_idx=-1,
+        scale_grad_by_freq=False, sparse=False)
+
+    upstream = torch.rand((2, 3, 5, 7), dtype=torch.float32)
+    actual = torch.autograd.grad(quantized, entries,
+        grad_outputs=[upstream])[0]
+    expected = torch.autograd.grad(reference, entries,
+        grad_outputs=[upstream])[0]
+
+    assert actual.size() == (11, 7)
+    assert np.allclose(actual.numpy(), expected.numpy())