
Commit 2b7b4d6

apaszke authored and Google-ML-Automation committed
[Pallas:MGPU] Align TMEM allocations to 16 bytes
This does not seem to be documented very well, but many tcgen05 instructions appear to assume that the TMEM addresses they receive are aligned to 16-byte boundaries.

PiperOrigin-RevId: 781488939
1 parent cc5bbb5 commit 2b7b4d6
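To make the 16-byte requirement concrete, here is a minimal sketch of the column-rounding arithmetic the change introduces (illustrative only, not the library code): TMEM is addressed in 32-bit columns, so rounding every allocation up to a multiple of TMEM_COL_ALIGNMENT = 4 columns keeps each TMEM base address on a 16-byte boundary.

# Sketch of the alignment arithmetic, assuming each TMEM column is a 32-bit
# cell per lane, so 4 columns correspond to 16 bytes.
TMEM_COL_ALIGNMENT = 4  # columns

def align_to(x: int, alignment: int) -> int:
  # Round x up to the next multiple of `alignment`.
  return (x + alignment - 1) // alignment * alignment

assert align_to(5, TMEM_COL_ALIGNMENT) == 8            # a 5-column ref reserves 8 columns
assert align_to(8, TMEM_COL_ALIGNMENT) == 8            # already aligned, left unchanged
assert align_to(9, TMEM_COL_ALIGNMENT) * 4 % 16 == 0   # byte offsets stay 16-byte aligned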

5 files changed: +148 additions, -104 deletions

jax/_src/pallas/mosaic_gpu/core.py

Lines changed: 85 additions & 64 deletions
@@ -57,6 +57,7 @@
 # sensitive to alignment and while this is quite conservative, it gets the job
 # done. We should make this more refined in the future.
 SMEM_ALIGNMENT = 1024
+TMEM_COL_ALIGNMENT = 4


 def is_trivial_index(idx, shape) -> bool:
@@ -146,11 +147,36 @@ def __call__(
       *,
       transforms: Sequence[MemoryRefTransform] = (),
       packed: bool | None = None,
-      collective: bool | None = None
+      collective: bool | None = None,
+      layout: TMEMLayout | None = None,
   ) -> pallas_core.MemoryRef:
-    # A convenience function for constructing MemoryRef types.
+    if self == MemorySpace.TMEM:
+      if transforms:
+        raise ValueError("transforms are not supported for TMEM")
+      if collective is None:
+        collective = False
+      if layout is None:
+        if packed is None:
+          if dtypes.bit_width(dtype) != 32:
+            raise ValueError(
+                "dtypes narrower than 32-bit require either the packed argument"
+                " or an explicit TMEM layout"
+            )
+          packed = False
+        layout = infer_tmem_layout(
+            shape, dtype, packed=packed, collective=collective
+        )
+      else:
+        if packed is not None:
+          raise ValueError("packed cannot be specified if layout is specified.")
+        # We allow tcgen05.TMEMLayout to be passed in from our internal APIs.
+        if not isinstance(layout, tcgen05.TMEMLayout):
+          layout = layout.to_mgpu()
+    else:
+      if packed is not None or collective is not None or layout is not None:
+        raise ValueError("packed, collective and layout arguments are only supported for TMEM.")
     return GPUMemoryRef(shape, dtype, memory_space=self, transforms=transforms,
-                        packed=packed, collective=collective)
+                        layout=layout, collective=collective)


 class SemaphoreType(enum.Enum):
@@ -223,38 +249,26 @@ def cmap_body():
 class GPUMemoryRef(pallas_core.MemoryRef):
   transforms: Sequence[MemoryRefTransform] = ()

-  # Whether to allow TMEM packing for sub 32-bit dtypes.
-  packed: bool | None = dataclasses.field(default=None, kw_only=True)
+  layout: tcgen05.TMEMLayout | None = dataclasses.field(default=None, kw_only=True)
   collective: bool | None = dataclasses.field(default=None, kw_only=True)

   def __post_init__(self):
-    if self.memory_space == MemorySpace.TMEM:
-      if dtypes.bit_width(self.dtype) < 32 and self.packed is None:
-        raise ValueError(
-            "Packed option must be specified for sub-32 bit dtypes.")
-    else:
-      if self.packed is not None:
-        raise ValueError("Packed option is only supported for TMEM.")
-      if self.collective is not None:
-        raise ValueError("Collective option is only supported for TMEM.")
+    is_tmem = self.memory_space == MemorySpace.TMEM
+    assert (self.layout is not None) == is_tmem
+    assert (self.collective is not None) == is_tmem
+    assert not (self.transforms and is_tmem)

   def get_ref_aval(self) -> _Ref:
     aval = jax_core.ShapedArray(self.shape, self.dtype)
     for t in self.transforms:
       aval = t(aval)
     if self.memory_space == MemorySpace.TMEM:
-      collective = self.collective if self.collective is not None else False
-      packed = self.packed if self.packed is not None else False
-      ref = pallas_core.TransformedRef(
-          AbstractTMEMRef(aval,
-                          memory_space=self.memory_space,
-                          packed=packed,
-                          collective=collective), ()
+      aval = AbstractTMEMRef(
+          aval, self.memory_space, self.layout, self.collective
       )
     else:
-      ref = pallas_core.TransformedRef(
-          state.AbstractRef(aval, memory_space=self.memory_space), ()
-      )
+      aval = state.AbstractRef(aval, memory_space=self.memory_space)
+    ref = pallas_core.TransformedRef(aval, ())
     for t in reversed(self.transforms):
       ref = t.undo(ref)
     if not ref.transforms:
@@ -295,32 +309,23 @@ def _ref_group_tmem_col_size(refs: _GPUMemoryRefTree) -> int:
   """
   ncols = 0
   for ref in jax.tree.leaves(refs):
-    ncols += infer_tmem_cols_layout(ref.shape, ref.dtype,
-                                    collective=ref.collective,
-                                    packed=ref.packed)[0]
+    ref_ncols = ref.layout.cols_in_shape(ref.shape, dtypes.bit_width(ref.dtype))
+    ncols += align_to(ref_ncols, TMEM_COL_ALIGNMENT)
   return ncols


-def infer_tmem_cols_layout(
+def infer_tmem_layout(
     shape: tuple[int, ...],
     dtype: jnp.dtype,
     *,
     packed: bool,
-    collective: bool,
-    layout: tcgen05.TMEMLayout | None = None) -> tuple[int, tcgen05.TMEMLayout]:
+    collective: bool) -> tcgen05.TMEMLayout:
   """Infers the number of columns used and layout for allocating TMEM Refs."""
   if packed:
     packing = 32 // dtypes.bit_width(dtype)
   else:
     packing = 1
-  if layout is None:
-    layout = tcgen05._infer_tmem_layout(shape,  # type: ignore[arg-type]
-                                        collective=collective,
-                                        packing=packing)
-  with ir.Context():
-    ir_dtype = mgpu_utils.dtype_to_ir_type(dtype)
-    cols_used = layout.cols_in_shape(shape, ir_dtype)  # type: ignore[arg-type]
-  return cols_used, layout
+  return tcgen05._infer_tmem_layout(shape, collective=collective, packing=packing)


 def flatten_ref_union(ref_union: AbstractRefUnion) -> tuple[_Ref, ...]:
@@ -363,13 +368,12 @@ def flatten_ref_union(ref_union: AbstractRefUnion) -> tuple[_Ref, ...]:
     for ref_group in ref_union.refs:
       col_offset = 0
       for ref in jax.tree.leaves(ref_group):
+        col_offset = align_to(col_offset, TMEM_COL_ALIGNMENT)
         if not isinstance(ref, pallas_core.TransformedRef):
           ref = pallas_core.TransformedRef(ref, transforms=())
-        ncols, _ = infer_tmem_cols_layout(
-            ref.shape, ref.dtype,  # type: ignore[arg-type]
-            packed=ref.packed, collective=ref.collective)
+        ncols = ref.layout.cols_in_shape(ref.shape, dtypes.bit_width(ref.dtype))
         transform = ExtractAliasedRef.from_transformed_ref(
-            ref, col_offset, packed=ref.packed, collective=ref.collective)
+            ref, col_offset, layout=ref.layout)
         flat_refs.append(
             pallas_core.TransformedRef(
                 ref_union, transforms=(transform, *ref.transforms)
@@ -409,19 +413,23 @@ def update(self, inner_aval=None, memory_space=None):
     ref = super().update(inner_aval, memory_space)
     return AbstractRefUnion(ref.inner_aval, self.refs, self.memory_space)

+  @functools.cached_property
+  def layout(self) -> tcgen05.TMEMLayout:
+    if self.memory_space != TMEM:
+      raise ValueError("layout attribute is only defined for TMEM refs")
+    return tcgen05.tmem_default_layout(packing=1)
+
   @functools.cached_property
   def collective(self) -> bool:
     if self.memory_space != TMEM:
-      raise ValueError("Collective is only supported for TMEM.")
+      raise ValueError("collective attribute is only defined for TMEM refs")
     ref_leaves = jax.tree.leaves(self.refs)
     first_ref = ref_leaves[0]
-    # Check if all Refs have the same collective attribute.
-    if not all(ref.collective == first_ref.collective for ref in ref_leaves):
-      raise ValueError(f"All Refs must be either collective/not collective."
-                       f" Got: {[ref.collective for ref in ref_leaves]}")
+    assert all(ref.collective == first_ref.collective for ref in ref_leaves)
     return first_ref.collective


+
 @dataclasses.dataclass(init=False, frozen=True)
 class RefUnion(GPUMemoryRef):
   """A sequence of trees of refs that are allowed to reuse the same memory.
@@ -450,11 +458,18 @@ def __init__(self, *refs: _GPUMemoryRefTree):
     elif all(ref.memory_space == TMEM for ref in ref_leaves):
       object.__setattr__(self, "refs", refs)
       max_cols = max(map(_ref_group_tmem_col_size, self.refs))
+      is_collective = ref_leaves[0].collective
+      if any(r.collective != is_collective for r in ref_leaves):
+        raise ValueError(
+            "Some aliased TMEM references are collective and some are not."
+        )
       super().__init__(
           shape=(128, max_cols,),
           dtype=jnp.int32,
           memory_space=TMEM,
           transforms=(),
+          layout=tcgen05.tmem_default_layout(packing=1),
+          collective=all(ref.collective for ref in ref_leaves),
       )
     else:
       raise NotImplementedError(
@@ -752,20 +767,16 @@ class ExtractAliasedRef(state_types.Transform):
   shape: tuple[int, ...]
   offset: int
   # TMEM-specific params
-  packed: bool | None
-  collective: bool | None
+  layout: tcgen05.TMEMLayout | None

   @classmethod
   def from_transformed_ref(
-      cls, ref: pallas_core.TransformedRef, byte_offset: int,
-      packed: bool | None = None,
-      collective: bool | None = None,
+      cls,
+      ref: pallas_core.TransformedRef,
+      byte_offset: int,
+      layout: tcgen05.TMEMLayout | None = None,
   ):
-    return cls(
-        dtypes.dtype(ref.dtype), ref.ref.shape, byte_offset,
-        packed=packed,
-        collective=collective,
-    )
+    return cls(dtypes.dtype(ref.dtype), ref.ref.shape, byte_offset, layout)

   def transform_shape(self, shape):
     if shape is None:
@@ -777,8 +788,7 @@ def transform_dtype(self, dtype):
     return self.dtype

   def tree_flatten(self):
-    return (), (self.dtype, self.shape, self.offset,
-                self.packed, self.collective)
+    return (), (self.dtype, self.shape, self.offset, self.layout)

   @classmethod
   def tree_unflatten(cls, metadata, arrays):
@@ -1040,20 +1050,20 @@ def _getitem(self, tracer, idx):


 class AbstractTMEMRef(state.AbstractRef):
-  __slots__ = ["inner_aval", "memory_space", "packed", "collective"]
+  __slots__ = ["inner_aval", "memory_space", "layout", "collective"]

-  def __init__(self, inner_aval, memory_space, packed, collective):
+  def __init__(self, inner_aval, memory_space, layout, collective):
     super().__init__(inner_aval, memory_space)
-    self.packed = packed
+    self.layout = layout
     self.collective = collective

   def __repr__(self) -> str:
-    return f'TMEM({self.inner_aval.str_short()},packed={self.packed})'
+    return f'TMEM({self.inner_aval.str_short()}, layout={self.layout}, collective={self.collective})'

   def update(self, inner_aval=None, memory_space=None):
     ref = super().update(inner_aval, memory_space)
     return AbstractTMEMRef(
-        ref.inner_aval, ref.memory_space, self.packed, self.collective
+        ref.inner_aval, ref.memory_space, self.layout, self.collective
     )

@@ -1246,6 +1256,7 @@ def to_mgpu(self) -> mgpu.FragmentedLayout:
       raise ValueError("Only TiledLayout supports reductions.")
     return layout.reduce(self.axes)

+
 class Layout(SomeLayout, enum.Enum):
   #: [m, n] matrix, where m % 64 == 0 == n % 8.
   WGMMA = enum.auto()
@@ -1297,3 +1308,13 @@ def check_no_args():
 Layout.TCGEN05_ROW = Layout.TCGEN05.reduce(1)
 Layout.TCGEN05_COL = Layout.TCGEN05.reduce(0)
 Layout.TCGEN05_TMEM_NATIVE_ROW = Layout.TCGEN05_TMEM_NATIVE.reduce(1)
+
+
+class TMEMLayout(enum.Enum):
+  """Layout for TMEM references."""
+  SCALES_LAYOUT = enum.auto()
+
+  def to_mgpu(self) -> mgpu.FragmentedLayout:
+    match self:
+      case TMEMLayout.SCALES_LAYOUT:
+        return tcgen05.scales_layout()
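For context, a hypothetical usage sketch of the reworked TMEM constructor validated above (assuming the memory space is exposed publicly as plgpu.TMEM; the import path is an assumption, not part of this commit):

import jax.numpy as jnp
import jax.experimental.pallas.mosaic_gpu as plgpu  # assumed public alias for MemorySpace.TMEM

# 32-bit dtype: packed/layout may be omitted; a default TMEM layout is inferred.
acc = plgpu.TMEM((128, 128), jnp.float32)

# Sub-32-bit dtype: either packed=... or an explicit layout= is now required.
half = plgpu.TMEM((128, 128), jnp.float16, packed=True)

# Passing both packed= and layout= raises
# "packed cannot be specified if layout is specified."

When these refs are later allocated or aliased in a RefUnion, their column footprints are rounded up to TMEM_COL_ALIGNMENT columns, which is what provides the 16-byte alignment named in the commit title.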

jax/_src/pallas/mosaic_gpu/lowering.py

Lines changed: 19 additions & 31 deletions
@@ -305,17 +305,14 @@ def _run_scoped_resource_estimator(
     if aval.memory_space == gpu_core.TMEM:
       if len(aval.shape) != 2:
         raise ValueError(f"TMEM allocations must be 2D. Got {aval.shape}")
-      if aval.shape[0] not in (64, 128):
-        raise ValueError(
-            f"TMEM shape[0] must be 64 or 128. Got {aval.shape[0]}.")
       # Estimate columns used.
       if isinstance(aval, gpu_core.AbstractRefUnion):
         assert aval.shape[0] == 128
         cols_used = aval.shape[1]
       else:
-        cols_used, _ = gpu_core.infer_tmem_cols_layout(
-            aval.shape, aval.dtype, packed=aval.packed, collective=aval.collective)
-        cols_used = tcgen05._alloc_ncols(cols_used, exact=False)
+        cols_used = aval.layout.cols_in_shape(
+            aval.shape, dtypes.bit_width(aval.dtype)
+        )
       if aval.collective:
         rs += Resources(tmem_collective_scratch_cols=cols_used)
       else:
@@ -443,10 +440,7 @@ def alloc_tmem(
       *,
       layout: tcgen05.TMEMLayout | None = None,
       collective: bool = False,
-      packed: bool = False,
   ) -> Iterator[ir.Value]:
-    cols_used, layout = gpu_core.infer_tmem_cols_layout(
-        struct.shape, struct.dtype, packed=packed, collective=collective, layout=layout)
     if collective:
       off = arith_dialect.addi(
           self.tmem_collective_base_ptr,
@@ -461,6 +455,10 @@
         shape=struct.shape,
         dtype=mgpu_utils.dtype_to_ir_type(struct.dtype),
         layout=layout)
+    cols_used = layout.cols_in_shape(
+        struct.shape, dtypes.bit_width(struct.dtype)
+    )
+    cols_used = gpu_core.align_to(cols_used, gpu_core.TMEM_COL_ALIGNMENT)
     if collective:
       self.tmem_collective_used_cols += cols_used
     yield tmem_ref
@@ -745,7 +743,9 @@ def ref_for_aval(aval: ShapedAbstractValue):
   if isinstance(aval, gpu_core.WGMMAAbstractAccumulatorRef):
     return gpu_core.WGMMAAccumulatorRef(aval.shape, aval.dtype)
   elif isinstance(aval, gpu_core.AbstractTMEMRef):
-    return gpu_core.TMEM(aval.shape, aval.dtype, packed=aval.packed)
+    return gpu_core.TMEM(
+        aval.shape, aval.dtype, layout=aval.layout, collective=aval.collective
+    )
   elif isinstance(aval, state_types.AbstractRef):
     return pallas_core.MemoryRef(aval.shape, aval.dtype, aval.memory_space)
   else:
@@ -1309,35 +1309,27 @@ def _extract_aliased_ref(
   match transforms:
     case (
         gpu_core.ExtractAliasedRef(
-            dtype, transformed_shape, offset, packed, collective
+            dtype, transformed_shape, offset, layout
         ),
         *other_transforms,
     ):
       mlir_dtype = mgpu_utils.dtype_to_ir_type(dtype)
       if isinstance(ref, tcgen05.TMEMRef):
-        assert packed is not None
-        assert collective is not None
+        assert layout is not None
         if ref.shape[0] != transformed_shape[0]:
           raise ValueError(
               "TMEM aliasing only supported for Refs with the same first"
               f" dimension, got {ref.shape[0]} != {transformed_shape[0]}."
           )
         address = arith_dialect.addi(ref.address, _i32_constant(offset))
-        _, tmem_layout = gpu_core.infer_tmem_cols_layout(
-            transformed_shape, dtype, packed=packed, collective=collective
-        )
         ref = tcgen05.TMEMRef(
-            address=address,
-            shape=transformed_shape,
-            dtype=mgpu_utils.dtype_to_ir_type(dtype),
-            layout=tmem_layout,
-        )
+            address=address,
+            shape=transformed_shape,
+            dtype=mgpu_utils.dtype_to_ir_type(dtype),
+            layout=layout)
       else:
-        assert packed is None
-        assert collective is None
-        ref_bits = math.prod(transformed_shape) * mgpu_utils.bitwidth(
-            mlir_dtype
-        )
+        assert layout is None
+        ref_bits = math.prod(transformed_shape) * mgpu_utils.bitwidth(mlir_dtype)
         if ref_bits % 8:
           raise NotImplementedError("Only byte-aligned bitcasts are supported.")
         assert offset % gpu_core.SMEM_ALIGNMENT == 0
@@ -2546,14 +2538,10 @@ def _run_scoped_lowering_rule(
         input_refs.append(input_ref)
         should_discharge.append(False)
       elif aval.memory_space == gpu_core.TMEM:
-        if isinstance(aval, gpu_core.AbstractRefUnion):
-          packed = False
-        else:
-          packed = aval.packed
         input_ref = alloc_stack.enter_context(
             ctx.module_ctx.alloc_tmem(
                 jax.ShapeDtypeStruct(shape=aval.shape, dtype=aval.dtype),
-                packed=packed,
+                layout=aval.layout,
                 collective=aval.collective,
             )
         )
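Tying the two files together, a hypothetical kernel-side sketch of scoped TMEM allocation that goes through the updated alloc_tmem path (the surrounding pallas_call wiring is omitted; treat the exact API spelling as an assumption rather than part of this commit):

import jax.numpy as jnp
import jax.experimental.pallas as pl
import jax.experimental.pallas.mosaic_gpu as plgpu  # assumed public alias

def kernel(x_ref, o_ref):
  def scoped(acc_tmem):
    # acc_tmem is a (128, 128) float32 TMEM ref; after this commit its base
    # address is rounded up to a 16-byte (4-column) boundary before any
    # tcgen05 instruction uses it.
    del acc_tmem
  pl.run_scoped(scoped, plgpu.TMEM((128, 128), jnp.float32))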

0 commit comments
