NATTEN example (#16)

Birch-san · web-flow · commit 5e0d1b8053a1 · 2024-08-13T22:01:56.000-07:00
diff --git a/attn_gym/masks/natten.py b/attn_gym/masks/natten.py
@@ -0,0 +1,87 @@
+"""Generates a NATTEN mask"""
+
+import torch
+from torch import IntTensor, BoolTensor
+from torch.nn.attention.flex_attention import _mask_mod_signature
+from typing import Tuple
+
+
+def generate_natten(
+    canvas_w: int,
+    canvas_h: int,
+    kernel_w: int,
+    kernel_h: int,
+) -> _mask_mod_signature:
+    """Generates a NATTEN attention mask with a given kernel size.
+    Sequence indices are treated as row-major: idx = y * canvas_w + x.
+    Args:
+        canvas_w: The width of the canvas.
+        canvas_h: The height of the canvas.
+        kernel_w: The width of the kernel (covers 2 * (kernel_w // 2) + 1 columns).
+        kernel_h: The height of the kernel (covers 2 * (kernel_h // 2) + 1 rows).
+    Returns:
+        A mask_mod that is True where kv_idx lies inside q_idx's kernel window.
+    """
+
+    def get_x_y(idx: IntTensor) -> Tuple[IntTensor, IntTensor]:
+        return idx % canvas_w, idx // canvas_w  # (x, y) = (column, row)
+
+    def natten_mask_mod(
+        b: IntTensor, h: IntTensor, q_idx: IntTensor, kv_idx: IntTensor
+    ) -> BoolTensor:
+        q_x, q_y = get_x_y(q_idx)
+        kv_x, kv_y = get_x_y(kv_idx)
+        # kernel nominally attempts to center itself on the query, but kernel center
+        # is clamped to a fixed distance (kernel half-length) from the canvas edge
+        kernel_center_x = q_x.clamp(kernel_w // 2, (canvas_w - 1) - kernel_w // 2)
+        kernel_center_y = q_y.clamp(kernel_h // 2, (canvas_h - 1) - kernel_h // 2)
+        hori_mask = (kernel_center_x - kv_x).abs() <= kernel_w // 2
+        vert_mask = (kernel_center_y - kv_y).abs() <= kernel_h // 2
+        return hori_mask & vert_mask
+
+    natten_mask_mod.__name__ = f"natten_c{canvas_w}x{canvas_h}_k{kernel_w}x{kernel_h}"
+    return natten_mask_mod
+
+
+def main(device: str = "cpu"):
+    """Visualize the attention scores of the NATTEN mask mod on a small canvas.
+
+    Note: a more complete implementation of NATTEN would include support for kernel
+    dilation. The NATTEN unfused kernel also has features like the ability to
+    cross-attend to register tokens; this is possible to express in Flex Attention
+    but not attempted here. See https://github.com/SHI-Labs/NATTEN for more details.
+
+    Args:
+        device (str): Device to use for computation. Defaults to "cpu".
+    """
+    from attn_gym import visualize_attention_scores
+
+    batch, heads, canvas_height, canvas_width, head_dim = 1, 1, 6, 6, 8
+    kernel_size = 3
+    # query and key are separate all-ones tensors laid out on the 2D canvas
+    shape = (batch, heads, canvas_height, canvas_width, head_dim)
+    query = torch.ones(shape, device=device)
+    key = torch.ones(shape, device=device)
+
+    natten_mask = generate_natten(
+        canvas_w=canvas_width,
+        canvas_h=canvas_height,
+        kernel_w=kernel_size,
+        kernel_h=kernel_size,
+    )
+    visualize_attention_scores(
+        # TODO: update visualize_attention_scores to support 2D sequences
+        query.flatten(start_dim=2, end_dim=3),
+        key.flatten(start_dim=2, end_dim=3),
+        mask_mod=natten_mask,
+        device=device,
+        name=natten_mask.__name__,
+    )
+
+
+if __name__ == "__main__":
+    try:
+        from jsonargparse import CLI
+    except ImportError as err:
+        raise ImportError("Be sure to run: pip install -e .'[viz]'") from err
+    CLI(main)  # jsonargparse builds the command-line interface from main's signature
diff --git a/examples/flex_attn.ipynb b/examples/flex_attn.ipynb
@@ -560,15 +560,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### NATTEN MASKING\n",
+    "### Stand-Alone Self-Attention Masking\n",
     "\n",
     "In this case, imagine that we have a 2D image of size (H x W) flattened into a\n",
     "sequence of tokens. We only want to attend to tokens within 8 `pixels`, but\n",
     "from a 2D perspective.\n",
     "\n",
     "We can implement this mask_mod by first translating the 1D position into 2D coordinates. Then, we can simply check if the distance of both coordinates is within the window.\n",
     "\n",
-    "For more details check the paper's github repository [NATTEN](https://github.com/SHI-Labs/NATTEN) "
+    "For more details check the paper, [Stand-Alone Self-Attention in Vision Models](https://arxiv.org/abs/1906.05909)"
    ]
   },
   {
@@ -591,14 +591,72 @@
     "    return idx // W, idx % W\n",
     "\n",
     "\n",
-    "def natten_mask(b, h, q_idx, kv_idx):\n",
+    "def sasa_mask(b, h, q_idx, kv_idx):\n",
     "    q_x, q_y = get_x_y(q_idx)\n",
     "    kv_x, kv_y = get_x_y(kv_idx)\n",
     "    horizontal_mask = (q_x - kv_x).abs() <= WINDOW\n",
     "    vertical_mask = (q_y - kv_y).abs() <= WINDOW\n",
     "    return horizontal_mask & vertical_mask\n",
     "\n",
     "\n",
+    "test_mask(mask_mod=sasa_mask)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### NATTEN Masking\n",
+    "\n",
+    "Consider a 2D image of size (H x W) flattened into a sequence of tokens.\n",
+    "Queries attend to keys in a fixed kernel area (K_H x K_W), centered where possible\n",
+    "on the query, whilst staying within the canvas and always including the query.\n",
+    "\n",
+    "This is similar to SASA, except with extra handling to keep the kernel inside the canvas,\n",
+    "ensuring that all queries attend to a fixed number of keys.  \n",
+    "Keys compare their position to the kernel center, not the query. The kernel center attempts\n",
+    "to follow the query position, but is clamped to stay a fixed distance (its half-length) away\n",
+    "from the canvas edge.\n",
+    "\n",
+    "See the [NATTEN repository](https://github.com/SHI-Labs/NATTEN) for more information.  \n",
+    "_Note: a more complete implementation of NATTEN would include support for kernel dilation._  \n",
+    "_The NATTEN unfused kernel also has features like the ability to cross-attend to register tokens._\n",
+    "_This capability is possible to express in Flex Attention but not attempted here._"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# canvas size (H x W) and kernel size (K_H x K_W)\n",
+    "H = 128\n",
+    "W = 128\n",
+    "K_H = 7\n",
+    "K_W = 7\n",
+    "\n",
+    "\n",
+    "def get_x_y(idx):\n",
+    "    # NOTE: assuming row-major flattening, this yields (row, column), i.e. (y, x)\n",
+    "    return idx // W, idx % W\n",
+    "\n",
+    "\n",
+    "def natten_mask(b, h, q_idx, kv_idx):\n",
+    "    # rows are bounded by H and columns by W, so unpack the helper's result\n",
+    "    # as (y, x); this keeps the mask correct when H != W or K_H != K_W\n",
+    "    q_y, q_x = get_x_y(q_idx)\n",
+    "    kv_y, kv_x = get_x_y(kv_idx)\n",
+    "    # kernel nominally attempts to center itself on the query, but kernel center\n",
+    "    # is clamped to a fixed distance (kernel half-length) from the canvas edge\n",
+    "    kernel_x = q_x.clamp(K_W // 2, (W - 1) - K_W // 2)\n",
+    "    kernel_y = q_y.clamp(K_H // 2, (H - 1) - K_H // 2)\n",
+    "    # a key is visible when it lies within the kernel half-length on both axes\n",
+    "    hori_mask = (kernel_x - kv_x).abs() <= K_W // 2\n",
+    "    vert_mask = (kernel_y - kv_y).abs() <= K_H // 2\n",
+    "    return hori_mask & vert_mask\n",
+    "\n",
+    "\n",
     "test_mask(mask_mod=natten_mask)"
    ]
   },