
Commit 94453af
refactoring code
1 parent: df45f31

6 files changed: 245 additions, 38 deletions

modules.py (69 additions, 19 deletions)

@@ -1,6 +1,8 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.distributions.normal import Normal
+from torch.distributions import kl_divergence
 
 
 def to_scalar(arr):
@@ -19,7 +21,7 @@ def weights_init(m):
 
 class ResBlock(nn.Module):
     def __init__(self, dim):
-        super(ResBlock, self).__init__()
+        super().__init__()
         self.block = nn.Sequential(
             nn.ReLU(True),
             nn.Conv2d(dim, dim, 3, 1, 1),
@@ -31,9 +33,66 @@ def forward(self, x):
         return x + self.block(x)
 
 
-class AutoEncoder(nn.Module):
+class VQEmbedding(nn.Module):
+    def __init__(self, K, D):
+        super().__init__()
+        self.embedding = nn.Embedding(K, D)
+        self.embedding.weight.data.uniform_(-1./K, 1./K)
+
+    def forward(self, z_e_x):
+        # z_e_x - (B, D, H, W)
+        # emb - (K, D)
+
+        emb = self.embedding.weight
+        dists = torch.pow(
+            z_e_x.unsqueeze(1) - emb[None, :, :, None, None],
+            2
+        ).sum(2)
+
+        latents = dists.min(1)[1]
+        return latents
+
+
+class VAE(nn.Module):
+    def __init__(self, input_dim, dim, z_dim):
+        super().__init__()
+        self.encoder = nn.Sequential(
+            nn.Conv2d(input_dim, dim, 4, 2, 1),
+            nn.ReLU(True),
+            nn.Conv2d(dim, dim, 4, 2, 1),
+            nn.ReLU(True),
+            nn.Conv2d(dim, dim, 5, 1, 0),
+            nn.ReLU(True),
+            nn.Conv2d(dim, z_dim * 2, 3, 1, 0),
+        )
+
+        self.decoder = nn.Sequential(
+            nn.ConvTranspose2d(z_dim, dim, 3, 1, 0),
+            nn.ReLU(True),
+            nn.ConvTranspose2d(dim, dim, 5, 1, 0),
+            nn.ReLU(True),
+            nn.ConvTranspose2d(dim, dim, 4, 2, 1),
+            nn.ReLU(True),
+            nn.ConvTranspose2d(dim, input_dim, 4, 2, 1),
+            nn.Tanh()
+        )
+
+        self.apply(weights_init)
+
+    def forward(self, x):
+        mu, logvar = self.encoder(x).chunk(2, dim=1)
+
+        q_z_x = Normal(mu, logvar.mul(.5).exp())
+        p_z = Normal(torch.zeros_like(mu), torch.ones_like(logvar))
+        kl_div = kl_divergence(q_z_x, p_z).sum(1).mean()
+
+        x_tilde = self.decoder(q_z_x.rsample())
+        return x_tilde, kl_div
+
+
+class VectorQuantizedAE(nn.Module):
     def __init__(self, input_dim, dim, K=512):
-        super(AutoEncoder, self).__init__()
+        super().__init__()
         self.encoder = nn.Sequential(
             nn.Conv2d(input_dim, dim, 4, 2, 1),
             nn.ReLU(True),
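
Note on the new VQEmbedding: the nearest-codebook lookup is a single broadcasted squared-distance computation followed by an argmin over the K codes. A minimal shape check, as a sketch only (the batch size, D=256, and the 8x8 grid below are illustrative assumptions, not values fixed by this commit):

import torch
from modules import VQEmbedding

codebook = VQEmbedding(K=512, D=256)
z_e_x = torch.randn(8, 256, 8, 8)   # (B, D, H, W) encoder output; shapes assumed

# z_e_x.unsqueeze(1)          -> (B, 1, D, H, W)
# emb[None, :, :, None, None] -> (1, K, D, 1, 1)
# squared diffs summed over D -> (B, K, H, W); argmin over K picks a code index
latents = codebook(z_e_x)
assert latents.shape == (8, 8, 8)   # one int64 codebook index per spatial position
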
@@ -42,9 +101,7 @@ def __init__(self, input_dim, dim, K=512):
             ResBlock(dim),
         )
 
-        self.embedding = nn.Embedding(K, dim)
-        # self.embedding.weight.data.copy_(1./K * torch.randn(K, 256))
-        self.embedding.weight.data.uniform_(-1./K, 1./K)
+        self.codebook = VQEmbedding(K, dim)
 
         self.decoder = nn.Sequential(
             ResBlock(dim),
@@ -60,20 +117,11 @@ def __init__(self, input_dim, dim, K=512):
 
     def encode(self, x):
         z_e_x = self.encoder(x)
-
-        z_e_x_transp = z_e_x.permute(0, 2, 3, 1) # (B, H, W, C)
-        emb = self.embedding.weight.transpose(0, 1) # (C, K)
-        dists = torch.pow(
-            z_e_x_transp.unsqueeze(4) - emb[None, None, None],
-            2
-        ).sum(-2)
-        latents = dists.min(-1)[1]
+        latents = self.codebook(z_e_x)
         return latents, z_e_x
 
     def decode(self, latents):
-        shp = latents.size() + (-1, )
-        z_q_x = self.embedding(latents.view(latents.size(0), -1)) # (B * H * W, C)
-        z_q_x = z_q_x.view(*shp).permute(0, 3, 1, 2) # (B, C, H, W)
+        z_q_x = self.codebook.embedding(latents).permute(0, 3, 1, 2) # (B, D, H, W)
         x_tilde = self.decoder(z_q_x)
         return x_tilde, z_q_x
 
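
Note on the refactor: encode() and decode() now route through self.codebook, so a round trip reads cleanly. A hedged sketch (the 32x32 input matches the "(8, 8) -> 32x32 images" comment in pixelcnn.py, and dim=256 mirrors VAE_DIM, but both are assumptions here, not code from the commit):

import torch
from modules import VectorQuantizedAE

model = VectorQuantizedAE(input_dim=3, dim=256, K=512)
x = torch.randn(8, 3, 32, 32)

latents, z_e_x = model.encode(x)        # discrete indices plus continuous encoder output
x_tilde, z_q_x = model.decode(latents)  # embed the indices, then decode
assert x_tilde.shape == x.shape
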

@@ -191,8 +239,10 @@ def forward(self, x, label):
 
     def generate(self, label, shape=(8, 8), batch_size=64):
         param = next(self.parameters())
-        x = torch.zeros((batch_size, *shape),
-                        dtype=torch.int64, device=param.device)
+        x = torch.zeros(
+            (batch_size, *shape),
+            dtype=torch.int64, device=param.device
+        )
 
         for i in range(shape[0]):
             for j in range(shape[1]):

pixelcnn.py (4 additions, 4 deletions)

@@ -16,11 +16,11 @@
 
 LATENT_SHAPE = (8, 8) # (8, 8) -> 32x32 images, (7, 7) -> 28x28 images
 INPUT_DIM = 3 # 3 (RGB) | 1 (Grayscale)
-DIM = 128
+DIM = 64
 VAE_DIM = 256
 N_LAYERS = 15
 K = 512
-LR = 3e-4
+LR = 1e-3
 
 DEVICE = torch.device('cuda') # torch.device('cpu')
 
@@ -45,13 +45,13 @@
 
 autoencoder = AutoEncoder(INPUT_DIM, VAE_DIM, K).to(DEVICE)
 autoencoder.load_state_dict(
-    torch.load('models/{}_autoencoder.pt'.format(DATASET))
+    torch.load('models/{}_vqvae.pt'.format(DATASET))
 )
 autoencoder.eval()
 
 model = GatedPixelCNN(K, DIM, N_LAYERS).to(DEVICE)
 criterion = nn.CrossEntropyLoss().to(DEVICE)
-opt = torch.optim.Adam(model.parameters(), lr=LR)
+opt = torch.optim.Adam(model.parameters(), lr=LR, amsgrad=True)
 
 
 def train():
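
Note: pixelcnn.py's train() body is not part of this diff. A sketch of how the frozen VQ-VAE and the GatedPixelCNN prior would fit together, written against the names defined above; the per-position logits shape (B, K, 8, 8) and the train_loader are assumptions, not code from this commit:

for x, label in train_loader:
    x, label = x.to(DEVICE), label.to(DEVICE)

    with torch.no_grad():
        latents, _ = autoencoder.encode(x)  # (B, 8, 8) int64 codebook indices

    logits = model(latents, label)          # assumed logits over the K codes: (B, K, 8, 8)
    loss = criterion(logits, latents)       # CrossEntropyLoss accepts (B, K, H, W) vs (B, H, W)

    opt.zero_grad()
    loss.backward()
    opt.step()
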

samples/reconstructions_CIFAR10.png (binary, 137 KB)

samples/reconstructions_MNIST.png (binary, 11.1 KB)

vae.py (145 additions, 0 deletions)

@@ -0,0 +1,145 @@
+import numpy as np
+import time
+
+import torch
+import torch.nn.functional as F
+from torch.distributions.normal import Normal
+
+from torchvision import datasets, transforms
+from torchvision.utils import save_image
+
+from modules import VAE
+
+
+BATCH_SIZE = 32
+N_EPOCHS = 100
+PRINT_INTERVAL = 500
+DATASET = 'CIFAR10' # CIFAR10 | MNIST | FashionMNIST
+NUM_WORKERS = 4
+
+INPUT_DIM = 3
+DIM = 256
+Z_DIM = 128
+LR = 3e-4
+
+
+preproc_transform = transforms.Compose([
+    transforms.ToTensor(),
+    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+])
+train_loader = torch.utils.data.DataLoader(
+    eval('datasets.'+DATASET)(
+        '../data/{}/'.format(DATASET), train=True, download=True,
+        transform=preproc_transform,
+    ), batch_size=BATCH_SIZE, shuffle=False,
+    num_workers=NUM_WORKERS, pin_memory=True
+)
+test_loader = torch.utils.data.DataLoader(
+    eval('datasets.'+DATASET)(
+        '../data/{}/'.format(DATASET), train=False,
+        transform=preproc_transform
+    ), batch_size=BATCH_SIZE, shuffle=False,
+    num_workers=NUM_WORKERS, pin_memory=True
+)
+
+model = VAE(INPUT_DIM, DIM, Z_DIM).cuda()
+print(model)
+opt = torch.optim.Adam(model.parameters(), lr=LR, amsgrad=True)
+
+
+def train():
+    train_loss = []
+    model.train()
+    for batch_idx, (x, _) in enumerate(train_loader):
+        start_time = time.time()
+        x = x.cuda()
+
+        x_tilde, kl_d = model(x)
+        loss_recons = F.mse_loss(x_tilde, x, size_average=False) / x.size(0)
+        loss = loss_recons + kl_d
+
+        nll = -Normal(x_tilde, torch.ones_like(x_tilde)).log_prob(x)
+        log_px = nll.mean().item() - np.log(128) + kl_d.item()
+        log_px /= np.log(2)
+
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+
+        train_loss.append([log_px, loss.item()])
+
+        if (batch_idx + 1) % PRINT_INTERVAL == 0:
+            print('\tIter [{}/{} ({:.0f}%)]\tLoss: {} Time: {:5.3f} ms/batch'.format(
+                batch_idx * len(x), len(train_loader.dataset),
+                PRINT_INTERVAL * batch_idx / len(train_loader),
+                np.asarray(train_loss)[-PRINT_INTERVAL:].mean(0),
+                1000 * (time.time() - start_time)
+            ))
+
+
+def test():
+    start_time = time.time()
+    val_loss = []
+    model.eval()
+    with torch.no_grad():
+        for batch_idx, (x, _) in enumerate(test_loader):
+            x = x.cuda()
+            x_tilde, kl_d = model(x)
+            loss_recons = F.mse_loss(x_tilde, x, size_average=False) / x.size(0)
+            loss = loss_recons + kl_d
+            val_loss.append(loss.item())
+
+    print('\nValidation Completed!\tLoss: {:5.4f} Time: {:5.3f} s'.format(
+        np.asarray(val_loss).mean(0),
+        time.time() - start_time
+    ))
+    return np.asarray(val_loss).mean(0)
+
+
+def generate_reconstructions():
+    model.eval()
+    x, _ = test_loader.__iter__().next()
+    x = x[:32].cuda()
+    x_tilde, kl_div = model(x)
+
+    x_cat = torch.cat([x, x_tilde], 0)
+    images = (x_cat.cpu().data + 1) / 2
+
+    save_image(
+        images,
+        'samples/vae_reconstructions_{}.png'.format(DATASET),
+        nrow=8
+    )
+
+
+def generate_samples():
+    model.eval()
+    z_e_x = torch.randn(64, Z_DIM, 1, 1).cuda()
+    x_tilde = model.decoder(z_e_x)
+
+    images = (x_tilde.cpu().data + 1) / 2
+
+    save_image(
+        images,
+        'samples/vae_samples_{}.png'.format(DATASET),
+        nrow=8
+    )
+
+
+BEST_LOSS = 99999
+LAST_SAVED = -1
+for epoch in range(1, N_EPOCHS):
+    print("Epoch {}:".format(epoch))
+    train()
+    cur_loss = test()
+
+    if cur_loss <= BEST_LOSS:
+        BEST_LOSS = cur_loss
+        LAST_SAVED = epoch
+        print("Saving model!")
+        torch.save(model.state_dict(), 'models/{}_vae.pt'.format(DATASET))
+    else:
+        print("Not saving model! Last saved: {}".format(LAST_SAVED))
+
+    generate_reconstructions()
+    generate_samples()
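
Note: the new VAE leans on torch.distributions for the KL term rather than spelling it out. A quick standalone check (not part of this commit) that kl_divergence on diagonal Gaussians matches the usual closed form 0.5 * (mu^2 + sigma^2 - log sigma^2 - 1):

import torch
from torch.distributions import Normal, kl_divergence

mu, logvar = torch.randn(4, 128, 1, 1), torch.randn(4, 128, 1, 1)
q = Normal(mu, logvar.mul(0.5).exp())
p = Normal(torch.zeros_like(mu), torch.ones_like(mu))

kl_lib = kl_divergence(q, p)                             # library result
kl_closed = 0.5 * (mu.pow(2) + logvar.exp() - logvar - 1)  # hand-derived closed form
assert torch.allclose(kl_lib, kl_closed, atol=1e-5)
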
