
Commit a58d72c

apaszke authored and Google-ML-Automation committed
[Pallas:MGPU] Align TMEM allocations to 16 bytes
This does not seem to be well documented, but many tcgen05 instructions appear to assume that the TMEM addresses they receive are aligned to 16-byte boundaries.

PiperOrigin-RevId: 781488939
1 parent 70cdf17 · commit a58d72c
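For context on the magic number below: TMEM is addressed in 32-bit (4-byte) columns, so a 16-byte boundary corresponds to 4 columns, which is where the TMEM_COL_ALIGNMENT = 4 introduced in core.py comes from. A minimal sketch of the round-up arithmetic, assuming an align_to helper equivalent to the one the diff calls:

    # Sketch of the rounding this commit relies on. TMEM cells are 32 bits
    # wide, so 4 columns span exactly 16 bytes. `align_to` mirrors the helper
    # the diff calls in core.py; its exact definition there is assumed.
    def align_to(x: int, alignment: int) -> int:
      """Round x up to the next multiple of alignment."""
      return ((x + alignment - 1) // alignment) * alignment

    TMEM_COL_ALIGNMENT = 4  # columns: 4 columns * 4 bytes/column = 16 bytes

    assert align_to(1, TMEM_COL_ALIGNMENT) == 4    # a 1-column ref pads to 4
    assert align_to(4, TMEM_COL_ALIGNMENT) == 4    # already aligned
    assert align_to(130, TMEM_COL_ALIGNMENT) == 132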

File tree

3 files changed, +38 -6 lines

jax/_src/pallas/mosaic_gpu/core.py (4 additions, 1 deletion)

@@ -57,6 +57,7 @@
 # sensitive to alignment and while this is quite conservative, it gets the job
 # done. We should make this more refined in the future.
 SMEM_ALIGNMENT = 1024
+TMEM_COL_ALIGNMENT = 4


 def is_trivial_index(idx, shape) -> bool:
@@ -307,7 +308,8 @@ def _ref_group_tmem_col_size(refs: _GPUMemoryRefTree) -> int:
   """
   ncols = 0
   for ref in jax.tree.leaves(refs):
-    ncols += ref.layout.cols_in_shape(ref.shape, dtypes.bit_width(ref.dtype))
+    ref_ncols = ref.layout.cols_in_shape(ref.shape, dtypes.bit_width(ref.dtype))
+    ncols += align_to(ref_ncols, TMEM_COL_ALIGNMENT)
   return ncols


@@ -365,6 +367,7 @@ def flatten_ref_union(ref_union: AbstractRefUnion) -> tuple[_Ref, ...]:
   for ref_group in ref_union.refs:
     col_offset = 0
     for ref in jax.tree.leaves(ref_group):
+      col_offset = align_to(col_offset, TMEM_COL_ALIGNMENT)
       if not isinstance(ref, pallas_core.TransformedRef):
         ref = pallas_core.TransformedRef(ref, transforms=())
       ncols = ref.layout.cols_in_shape(ref.shape, dtypes.bit_width(ref.dtype))
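The effect of the two core.py changes: _ref_group_tmem_col_size now pads each leaf ref up to the column alignment when sizing a ref group, and flatten_ref_union applies the same padding to the running column offset, so every member of a TMEM union starts on a 4-column (16-byte) boundary. A hypothetical sketch of the sizing logic, with column counts invented for the example:

    # Hypothetical illustration of the padded sizing (column counts made up;
    # the real code derives them from layout.cols_in_shape).
    def group_col_size(leaf_ncols: list[int], alignment: int = 4) -> int:
      ncols = 0
      for ref_ncols in leaf_ncols:
        # Round each leaf up, exactly as the new align_to call does.
        ncols += ((ref_ncols + alignment - 1) // alignment) * alignment
      return ncols

    # Refs of 1, 5, and 128 columns now reserve 4 + 8 + 128 = 140 columns,
    # where the unpadded sum would have been 134.
    assert group_col_size([1, 5, 128]) == 140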

jax/_src/pallas/mosaic_gpu/lowering.py (1 addition, 5 deletions)

@@ -305,9 +305,6 @@ def _run_scoped_resource_estimator(
     if aval.memory_space == gpu_core.TMEM:
       if len(aval.shape) != 2:
         raise ValueError(f"TMEM allocations must be 2D. Got {aval.shape}")
-      if aval.shape[0] not in (64, 128):
-        raise ValueError(
-            f"TMEM shape[0] must be 64 or 128. Got {aval.shape[0]}.")
       # Estimate columns used.
       if isinstance(aval, gpu_core.AbstractRefUnion):
         assert aval.shape[0] == 128
@@ -316,8 +313,6 @@ def _run_scoped_resource_estimator(
         cols_used = aval.layout.cols_in_shape(
             aval.shape, dtypes.bit_width(aval.dtype)
         )
-        # TODO(apaszke): Remove this. We only need to align the outermost allocation.
-        cols_used = tcgen05._alloc_ncols(cols_used, exact=False)
       if aval.collective:
         rs += Resources(tmem_collective_scratch_cols=cols_used)
       else:
@@ -463,6 +458,7 @@ def alloc_tmem(
     cols_used = layout.cols_in_shape(
         struct.shape, dtypes.bit_width(struct.dtype)
     )
+    cols_used = gpu_core.align_to(cols_used, gpu_core.TMEM_COL_ALIGNMENT)
     if collective:
       self.tmem_collective_used_cols += cols_used
     yield tmem_ref
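On the lowering.py side, the per-allocation rounding that tcgen05._alloc_ncols used to do in the resource estimator is removed (along with the shape[0] restriction), and cols_used is instead aligned once at allocation time, so the running TMEM column counter always lands on an aligned offset. A simplified sketch of that bookkeeping, omitting ref materialization and collective handling:

    # Simplified bookkeeping: because cols_used is rounded up before bumping
    # the counter, every allocation starts on a 4-column (16-byte) boundary.
    class TmemColumnCounter:
      def __init__(self):
        self.used_cols = 0

      def alloc(self, cols_used: int, alignment: int = 4) -> int:
        offset = self.used_cols  # aligned by construction
        self.used_cols += ((cols_used + alignment - 1) // alignment) * alignment
        return offset

    counter = TmemColumnCounter()
    assert counter.alloc(1) == 0    # e.g. a (128, 1) float32 scratch
    assert counter.alloc(128) == 4  # next ref starts at column 4 = byte 16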

tests/pallas/mosaic_gpu_test.py (33 additions, 0 deletions)

@@ -2929,6 +2929,39 @@ def kernel(a_smem, b_smem, out_ref, acc_tmem, scratch_smem, barrier_ref,
     expected = x @ y
     np.testing.assert_allclose(result, expected, rtol=1e-3)
 
+  def test_matmul_alignment(self):
+    self.skip_if_wg_semantics()
+    m = k = n = 128
+    dtype = jnp.float16
+    transforms = (plgpu.TilingTransform((8, 64)), plgpu.SwizzleTransform(128))
+
+    def kernel(a_smem, b_smem, out_ref, _, acc_tmem, barrier_ref):
+      plgpu.tcgen05_mma(acc_tmem, a_smem, b_smem, barrier_ref, accumulate=False)
+      plgpu.barrier_wait(barrier_ref)
+      # We don't await the load because acc_tmem is never modified again.
+      out_ref[...] = plgpu.async_load_tmem(acc_tmem).astype(dtype)
+
+    spec = plgpu.BlockSpec(transforms=transforms, memory_space=plgpu.SMEM)
+    f = self.pallas_call(
+        kernel,
+        in_specs=(spec, spec),
+        out_specs=spec,
+        out_shape=jax.ShapeDtypeStruct((m, n), dtype),
+        # Add a one-column scratch to test that we align the accumulator.
+        scratch_shapes=(
+            plgpu.TMEM((128, 1), jnp.float32),
+            plgpu.TMEM((m, n), jnp.float32),
+            plgpu.Barrier(orders_tensor_core=True),
+        ),
+    )
+    lhs_shape = (m, k)
+    rhs_shape = (k, n)
+    x = jax.random.uniform(jax.random.key(0), shape=lhs_shape, dtype=dtype)
+    y = jax.random.uniform(jax.random.key(1), shape=rhs_shape, dtype=dtype)
+    result = f(x, y)
+    expected = x @ y
+    np.testing.assert_allclose(result, expected, rtol=1e-3)
+
   @parameterized.parameters(
       (128, jnp.float16)
   )
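The test is constructed so that misalignment would actually occur: the (128, 1) float32 scratch occupies a single TMEM column, so without the new padding the (m, n) accumulator allocated after it would start at column 1, a 4-byte TMEM offset, and tcgen05_mma would receive a misaligned address. A rough model of the column footprint, assuming the straightforward 2D case where layout.cols_in_shape reduces to width times element width over the 32-bit cell width:

    # Rough model of the column footprint of a plain 2D TMEM ref; the real
    # layout.cols_in_shape handles more layouts than this.
    def cols_in_shape(shape: tuple[int, int], bit_width: int) -> int:
      return shape[1] * bit_width // 32

    assert cols_in_shape((128, 1), 32) == 1      # the scratch: 1 column
    assert cols_in_shape((128, 128), 32) == 128  # the accumulator
    # Unpadded, the accumulator would begin at column 1 (byte offset 4);
    # with TMEM_COL_ALIGNMENT = 4 it begins at column 4 (byte offset 16).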
