Skip to content

Commit 290c1e7

Browse files
authored
fix alibi bias (#86)
1 parent bbf437e commit 290c1e7

File tree

3 files changed

+29
-5
lines changed

3 files changed

+29
-5
lines changed

attn_gym/mods/alibi.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,13 @@ def generate_alibi_bias(H: int) -> _score_mod_signature:
1616

1717
def alibi_mod(score, b, h, q_idx, kv_idx):
1818
scale = torch.exp2(-((h + 1) * 8.0 / H))
19-
bias = (q_idx - kv_idx) * scale
19+
bias = (kv_idx - q_idx) * scale
2020
return score + bias
2121

2222
return alibi_mod
2323

2424

25-
def main(device: str = "cpu"):
25+
def main(device: str = "cpu", causal: bool = True):
2626
"""Visualize the attention scores alibi bias score mod.
2727
2828
Args:
@@ -40,8 +40,16 @@ def make_tensor():
4040

4141
alibi_score_mod = generate_alibi_bias(H)
4242

43+
def causal_mask(b, h, q_idx, kv_idx):
44+
return q_idx >= kv_idx
45+
4346
visualize_attention_scores(
44-
query, key, score_mod=alibi_score_mod, device=device, name="alibi_score_mod"
47+
query,
48+
key,
49+
score_mod=alibi_score_mod,
50+
mask_mod=causal_mask if causal else None,
51+
device=device,
52+
name=f"alibi_score_mod_{'causal' if causal else 'non-causal'}",
4553
)
4654

4755

attn_gym/utils.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,27 @@ def visualize_attention_scores(
115115
batch_idx=batch_idx,
116116
head_idx=head_idx,
117117
)
118+
# If both score_mod and mask_mod are provided, apply both
119+
if score_mod is not None and mask_mod is not None:
120+
mask_viz = create_score_mod(
121+
query,
122+
key,
123+
score_mod=None,
124+
mask_mod=mask_mod,
125+
scale=scale,
126+
device=device,
127+
batch_idx=batch_idx,
128+
head_idx=head_idx,
129+
)
130+
# Apply mask by setting masked positions to -inf
131+
scores_viz = torch.where(mask_viz == 0, float("-inf"), scores_viz)
118132

119133
suffix_title = f"Batch {batch_idx}, Head {head_idx}" if batch_idx != 0 or head_idx != 0 else ""
120134

121135
fig, ax = plt.subplots(figsize=(12, 10))
122136
color = "viridis" if score_mod is not None else "cividis"
137+
if score_mod is not None and mask_mod is not None:
138+
color = "plasma"
123139
im = ax.imshow(scores_viz.cpu().detach()[0, 0, :, :], aspect="auto", cmap=color)
124140
fig.colorbar(im)
125141

examples/flex_attn.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -700,13 +700,13 @@
700700
"\n",
701701
"\n",
702702
"def alibi_and_causal_closure(score, b, h, q_idx, kv_idx):\n",
703-
" bias = alibi_bias[h] * (q_idx - kv_idx)\n",
703+
" bias = alibi_bias[h] * (kv_idx - q_idx)\n",
704704
" return score + bias\n",
705705
"\n",
706706
"\n",
707707
"def alibi_and_causal_functional(score, b, h, q_idx, kv_idx):\n",
708708
" scale = torch.exp2(-((h + 1) * 8.0 / H))\n",
709-
" bias = (q_idx - kv_idx) * scale\n",
709+
" bias = (kv_idx - q_idx) * scale\n",
710710
" return score + bias\n",
711711
"\n",
712712
"\n",

0 commit comments

Comments (0)