Commit 10dbfcf

voutcn authored and jax authors committed
Fix incorrect sequence length in batch megacore mode and enable megacore tests which were incorrectly disabled before.
Also configure sequence lengths in the unit test to cover edge cases (zero length, divisible/non-divisible by block size).

PiperOrigin-RevId: 623657472
1 parent: 4d9efff

3 files changed: +18 additions, -9 deletions

jax/experimental/pallas/ops/tpu/paged_attention/paged_attention_kernel.py

Lines changed: 9 additions & 2 deletions

@@ -281,7 +281,14 @@ def body(i, _):
     return ()
 
   bk = pages_per_compute_block * k_pages_hbm_ref.shape[-2]
-  lax.fori_loop(0, lax.div(lengths_ref[b] + bk - 1, bk), body, ())
+
+  if megacore_mode == "batch":
+    num_cores = pl.num_programs(0)
+    length = lengths_ref[b * num_cores + core_index]
+  else:
+    length = lengths_ref[b]
+
+  lax.fori_loop(0, lax.div(length + bk - 1, bk), body, ())
 
 
 @functools.partial(
@@ -304,7 +311,7 @@ def paged_attention(
     pages_per_compute_block: int,
     megacore_mode: Optional[str] = None,
     inline_seq_dim: bool = True,
-) -> tuple[jax.Array, tuple[jax.Array, jax.Array]]:
+) -> jax.Array:
   """Paged grouped query attention.
 
   Args:
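
The substance of the fix is in the first hunk: in "batch" megacore mode the batch is split across TensorCores, with grid dimension 0 enumerating the cores (hence pl.num_programs(0)), so the per-sequence length must be read at the flat batch index b * num_cores + core_index; indexing with b alone made every core read only the leading entries of lengths_ref. The second hunk corrects the declared return type to the single jax.Array the function returns. Below is a minimal NumPy sketch of the index mapping, not the Pallas kernel itself; num_cores = 2 and the lengths array are illustrative stand-ins:

import numpy as np

# Illustrative stand-ins: 6 sequences split across 2 TensorCores in
# "batch" megacore mode (same lengths as the updated unit test).
lengths = np.asarray([0, 3, 256, 513, 1023, 2048])
num_cores = 2
batch_per_core = len(lengths) // num_cores

for core_index in range(num_cores):
  for b in range(batch_per_core):
    # Buggy read: lengths[b] gives every core the lengths of the first
    # batch_per_core sequences only.
    # Fixed read: each (b, core_index) pair hits a unique flat index.
    length = lengths[b * num_cores + core_index]
    print(f"core={core_index} b={b} -> flat={b * num_cores + core_index}, length={length}")

# Sanity check: the fixed mapping covers every sequence exactly once.
flat = sorted(
    b * num_cores + core_index
    for core_index in range(num_cores)
    for b in range(batch_per_core)
)
assert flat == list(range(len(lengths)))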

tests/pallas/BUILD

Lines changed: 2 additions & 0 deletions

@@ -217,6 +217,8 @@ jax_test(
     shard_count = 2,
     tags = [
         "noasan",  # Times out.
+        "nomsan",  # Times out.
+        "notsan",  # Times out.
     ],
     deps = [
         "//jax:pallas_tpu_ops",

tests/pallas/paged_attention_kernel_test.py

Lines changed: 7 additions & 7 deletions

@@ -83,7 +83,7 @@ def _grouped_query_attention_reference(q, k, v, lengths):
 
 
 def _megacore_enabled():
-  return jax.devices()[0].device_kind == "TPU V4" and jtu.is_device_tpu(
+  return jax.devices()[0].device_kind == "TPU v4" or jtu.is_device_tpu(
       version=5, variant="p"
   )
 
@@ -114,15 +114,12 @@ def test_paged_attention(
     if not jtu.is_device_tpu_at_least(4):
       self.skipTest("Only supports TPU generation 4 or above")
     if megacore_mode and not _megacore_enabled():
-      self.skipTest("Megacore is only available on TPU v4 and TPU v5p")
+      self.skipTest("Megacore is only available on TPU v4 or TPU v5p")
     if num_kv_heads % 2 != 0 and megacore_mode == "kv_head":
      self.skipTest("Skip kv_head megacore mode when num_kv_heads is odd")
-    batch_size = 4
     max_kv_len = 2048
     block_size = 512
-    seq_lens = np.asarray(
-        [max_kv_len // batch_size * (i + 1) for i in range(batch_size)]
-    )
+    seq_lens = np.asarray([0, 3, 256, 513, 1023, 2048])
     q, k_pages, v_pages, page_indices = _generate_qkv(
         seq_lens,
         page_size,
@@ -151,7 +148,10 @@ def test_paged_attention(
     else:
       atol, rtol = 1e-1, 1e-1
     np.testing.assert_allclose(
-        o.astype(jnp.float32), o_ref.astype(jnp.float32), atol=atol, rtol=rtol
+        o[np.where(seq_lens > 0)].astype(jnp.float32),
+        o_ref[np.where(seq_lens > 0)].astype(jnp.float32),
+        atol=atol,
+        rtol=rtol,
     )
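
Two details worth noting in the test changes: _megacore_enabled now matches the lowercase device kind "TPU v4" and correctly ORs the v4 and v5p checks, and the new seq_lens exercise the edge cases named in the commit message (zero length, lengths divisible and not divisible by the block size). Because the kernel's trip count is the ceiling division lax.div(length + bk - 1, bk), a zero-length sequence runs zero compute blocks and its output row is never written, which is why the assertion now keeps only rows with seq_lens > 0. A small sketch of the arithmetic, with bk = 512 as an assumed illustrative block size matching the test's block_size:

import numpy as np

seq_lens = np.asarray([0, 3, 256, 513, 1023, 2048])
bk = 512  # assumed compute-block size in tokens, for illustration

# Ceiling division, mirroring lax.div(length + bk - 1, bk) in the kernel.
num_blocks = (seq_lens + bk - 1) // bk
print(num_blocks)  # [0 1 1 2 2 4]

# The zero-length sequence runs zero iterations, so its output row holds
# uninitialized data; compare only rows with a positive length, as the
# updated assert_allclose call does.
valid = np.where(seq_lens > 0)
print(seq_lens[valid])  # [   3  256  513 1023 2048]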
