 import torch
 import torch.nn as nn
+import torch.nn.functional as F


 def to_scalar(arr):
     if type(arr) == list:
-        return [x.cpu().data.tolist()[0] for x in arr]
+        return [x.item() for x in arr]
     else:
-        return arr.cpu().data.tolist()[0]
+        return arr.item()


-def euclidean_distance(z_e_x, emb):
-    dists = torch.pow(
-        z_e_x.unsqueeze(1) - emb[None, :, :, None, None],
-        2
-    ).sum(2)
-    return dists
+def weights_init(m):
+    classname = m.__class__.__name__
+    if classname.find('Conv') != -1:
+        nn.init.xavier_uniform_(m.weight.data)
+        m.bias.data.fill_(0)
+
+
+class ResBlock(nn.Module):
+    def __init__(self, dim):
+        super(ResBlock, self).__init__()
+        self.block = nn.Sequential(
+            nn.ReLU(True),
+            nn.Conv2d(dim, dim, 3, 1, 1),
+            nn.ReLU(True),
+            nn.Conv2d(dim, dim, 1),
+        )
+
+    def forward(self, x):
+        return x + self.block(x)


 class AutoEncoder(nn.Module):
-    def __init__(self):
+    def __init__(self, input_dim, dim, K=512):
         super(AutoEncoder, self).__init__()
         self.encoder = nn.Sequential(
-            nn.Conv2d(1, 16, 4, 2, 1),
-            nn.BatchNorm2d(16),
-            nn.ReLU(True),
-            nn.Conv2d(16, 32, 4, 2, 1),
-            nn.BatchNorm2d(32),
+            nn.Conv2d(input_dim, dim, 4, 2, 1),
             nn.ReLU(True),
-            nn.Conv2d(32, 64, 1, 1, 0),
-            nn.BatchNorm2d(64),
+            nn.Conv2d(dim, dim, 4, 2, 1),
+            ResBlock(dim),
+            ResBlock(dim),
         )

-        self.embedding = nn.Embedding(512, 64)
+        self.embedding = nn.Embedding(K, dim)
+        # self.embedding.weight.data.copy_(1./K * torch.randn(K, 256))
+        self.embedding.weight.data.uniform_(-1./K, 1./K)

         self.decoder = nn.Sequential(
-            nn.Conv2d(64, 32, 1, 1, 0),
-            nn.BatchNorm2d(32),
+            ResBlock(dim),
+            ResBlock(dim),
             nn.ReLU(True),
-            nn.ConvTranspose2d(32, 16, 4, 2, 1),
-            nn.BatchNorm2d(16),
+            nn.ConvTranspose2d(dim, dim, 4, 2, 1),
             nn.ReLU(True),
-            nn.ConvTranspose2d(16, 1, 4, 2, 1),
-            nn.Sigmoid()
+            nn.ConvTranspose2d(dim, input_dim, 4, 2, 1),
+            nn.Tanh()
         )

-    def forward(self, x):
+        self.apply(weights_init)
+
+    def encode(self, x):
         z_e_x = self.encoder(x)
-        B, C, H, W = z_e_x.size()

-        dists = euclidean_distance(z_e_x, self.embedding.weight)
-        latents = dists.min(1)[1]
+        z_e_x_transp = z_e_x.permute(0, 2, 3, 1)  # (B, H, W, C)
+        emb = self.embedding.weight.transpose(0, 1)  # (C, K)
+        dists = torch.pow(
+            z_e_x_transp.unsqueeze(4) - emb[None, None, None],
+            2
+        ).sum(-2)
+        latents = dists.min(-1)[1]
+        return latents, z_e_x

+    def decode(self, latents):
         shp = latents.size() + (-1, )
-        z_q_x = self.embedding(latents.view(-1)).view(*shp)
-        z_q_x = z_q_x.permute(0, 3, 1, 2)
-
+        z_q_x = self.embedding(latents.view(latents.size(0), -1))  # (B, H * W, C)
+        z_q_x = z_q_x.view(*shp).permute(0, 3, 1, 2)  # (B, C, H, W)
         x_tilde = self.decoder(z_q_x)
+        return x_tilde, z_q_x
+
+    def forward(self, x):
+        latents, z_e_x = self.encode(x)
+        x_tilde, z_q_x = self.decode(latents)
         return x_tilde, z_e_x, z_q_x
+
+
+class GatedActivation(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        x, y = x.chunk(2, dim=1)
+        return F.tanh(x) * F.sigmoid(y)
+
+
+class GatedMaskedConv2d(nn.Module):
+    def __init__(self, mask_type, dim, kernel, residual=True, n_classes=10):
+        super().__init__()
+        assert kernel % 2 == 1, "Kernel size must be odd"
+        self.mask_type = mask_type
+        self.residual = residual
+
+        self.class_cond_embedding = nn.Embedding(
+            n_classes, 2 * dim
+        )
+
+        kernel_shp = (kernel // 2 + 1, kernel)  # (ceil(n/2), n)
+        padding_shp = (kernel // 2, kernel // 2)
+        self.vert_stack = nn.Conv2d(
+            dim, dim * 2,
+            kernel_shp, 1, padding_shp
+        )
+
+        self.vert_to_horiz = nn.Conv2d(2 * dim, 2 * dim, 1)
+
+        kernel_shp = (1, kernel // 2 + 1)
+        padding_shp = (0, kernel // 2)
+        self.horiz_stack = nn.Conv2d(
+            dim, dim * 2,
+            kernel_shp, 1, padding_shp
+        )
+
+        self.horiz_resid = nn.Conv2d(dim, dim, 1)
+
+        self.gate = GatedActivation()
+
+    def make_causal(self):
+        self.vert_stack.weight.data[:, :, -1].zero_()  # Mask final row
+        self.horiz_stack.weight.data[:, :, :, -1].zero_()  # Mask final column
+
+    def forward(self, x_v, x_h, h):
+        if self.mask_type == 'A':
+            self.make_causal()
+
+        h = self.class_cond_embedding(h)
+        h_vert = self.vert_stack(x_v)
+        h_vert = h_vert[:, :, :x_v.size(-1), :]
+        out_v = self.gate(h_vert + h[:, :, None, None])
+
+        h_horiz = self.horiz_stack(x_h)
+        h_horiz = h_horiz[:, :, :, :x_h.size(-2)]
+        v2h = self.vert_to_horiz(h_vert)
+
+        out = self.gate(v2h + h_horiz + h[:, :, None, None])
+        if self.residual:
+            out_h = self.horiz_resid(out) + x_h
+        else:
+            out_h = self.horiz_resid(out)
+
+        return out_v, out_h
+
+
+class GatedPixelCNN(nn.Module):
+    def __init__(self, input_dim=256, dim=64, n_layers=15):
+        super().__init__()
+        self.dim = dim
+
+        # Create embedding layer to embed input
+        self.embedding = nn.Embedding(input_dim, dim)
+
+        # Building the PixelCNN layer by layer
+        self.layers = nn.ModuleList()
+
+        # Initial block with Mask-A convolution
+        # Rest with Mask-B convolutions
+        for i in range(n_layers):
+            mask_type = 'A' if i == 0 else 'B'
+            kernel = 7 if i == 0 else 3
+            residual = False if i == 0 else True
+
+            self.layers.append(
+                GatedMaskedConv2d(mask_type, dim, kernel, residual)
+            )
+
+        # Add the output layer
+        self.output_conv = nn.Sequential(
+            nn.Conv2d(dim, dim, 1),
+            nn.ReLU(True),
+            nn.Conv2d(dim, input_dim, 1)
+        )
+
+    def forward(self, x, label):
+        shp = x.size() + (-1, )
+        x = self.embedding(x.view(-1)).view(shp)  # (B, H, W, C)
+        x = x.permute(0, 3, 1, 2)  # (B, C, H, W)
+
+        x_v, x_h = (x, x)
+        for i, layer in enumerate(self.layers):
+            x_v, x_h = layer(x_v, x_h, label)
+
+        return self.output_conv(x_h)
+
+    def generate(self, label, shape=(8, 8), batch_size=64):
+        x = torch.zeros(batch_size, *shape).long().cuda()
+
+        for i in range(shape[0]):
+            for j in range(shape[1]):
+                logits = self.forward(x, label)
+                probs = F.softmax(logits[:, :, i, j], -1)
+                x.data[:, i, j].copy_(
+                    probs.multinomial(1).squeeze().data
+                )
+        return x
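
The diff only touches the model definitions; the training objective is not part of this commit. For orientation, here is a minimal, hypothetical sketch of how the three tensors returned by AutoEncoder.forward are commonly combined into a VQ-VAE-style objective. The MSE reconstruction term, the 0.25 commitment weight, and inputs scaled to [-1, 1] to match the Tanh decoder are assumptions, and the straight-through gradient from the reconstruction loss back into the encoder would still need to be handled by the training loop.

# Hypothetical usage sketch, not part of this commit.
import torch
import torch.nn.functional as F

model = AutoEncoder(input_dim=3, dim=256, K=512)
x = torch.randn(8, 3, 32, 32)                    # dummy batch of 32x32 RGB images in [-1, 1]

x_tilde, z_e_x, z_q_x = model(x)                 # reconstruction, encoder output, quantized codes

loss_recons = F.mse_loss(x_tilde, x)             # reconstruction term (trains decoder and codebook)
loss_vq = F.mse_loss(z_q_x, z_e_x.detach())      # pulls codebook vectors toward encoder outputs
loss_commit = F.mse_loss(z_e_x, z_q_x.detach())  # commits encoder outputs to their nearest codes
loss = loss_recons + loss_vq + 0.25 * loss_commit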
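
Continuing the same hypothetical sketch, GatedPixelCNN acts as a prior over the grid of discrete latent indices produced by AutoEncoder.encode, so its input_dim should equal the codebook size K. The batch size, image size, and class labels below are illustrative assumptions, and a GPU is assumed because generate() calls .cuda() internally.

# Hypothetical sketch, not part of this commit: fit the prior on the discrete
# latents, then sample new latents and decode them back into images.
vqvae = AutoEncoder(input_dim=3, dim=256, K=512).cuda()
prior = GatedPixelCNN(input_dim=512, dim=64, n_layers=15).cuda()  # input_dim == K

x = torch.randn(16, 3, 32, 32).cuda()            # dummy images
label = torch.randint(0, 10, (16,)).cuda()       # class labels in [0, n_classes)

with torch.no_grad():
    latents, _ = vqvae.encode(x)                 # (16, 8, 8) long tensor of code indices

logits = prior(latents, label)                   # (16, 512, 8, 8)
loss_prior = F.cross_entropy(logits, latents)    # autoregressive cross-entropy over codes

samples = prior.generate(label, shape=(8, 8), batch_size=16)  # (16, 8, 8) sampled codes
x_gen, _ = vqvae.decode(samples)                              # (16, 3, 32, 32) decoded images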