
Commit 0a62836

[mlir][gpu][transforms] Add support for mapping to lanes (#146912)
This revision adds a new attribute for mapping `scf.forall` to linear lane ids.

Example:

```
// %arg2 and %arg3 map to lanes [0, 6) and are turned into expressions
// involving threadIdx.x/y by the map_nested_forall_to_threads
// transformation. This results in an if (linear_thread_id < 6) conditional.
scf.forall (%arg2, %arg3) in (2, 3) {
  ...
} {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
```

Co-authored-by: Oleksandr "Alex" Zinenko <git@ozinenko.com>
1 parent 46e3ec0 commit 0a62836
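For orientation, here is a rough sketch (not part of the commit) of the guarded IR that the lane mapping produces for the example above, under the test's configuration of `block_dims = [73, 5, 1]` and a warp size of 32; value names are illustrative, and the exact affine maps are those verified by the CHECK lines in the test at the bottom of this page:

```mlir
%tidx = gpu.thread_id x
%tidy = gpu.thread_id y
// Linearize the thread ids, then wrap into a warp: (tidx + 73 * tidy) mod 32.
%laneid = affine.apply affine_map<()[s0, s1] -> ((s0 + s1 * 73) mod 32)>()[%tidx, %tidy]
// Only lanes [0, 6) are active; 6 = 2 * 3, the product of the forall sizes.
%c6 = arith.constant 6 : index
%cond = arith.cmpi ult, %laneid, %c6 : index
scf.if %cond {
  // ... loop body, indexed by the delinearized lane ids ...
}
```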

6 files changed: +194 −1 lines changed

mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td

Lines changed: 24 additions & 0 deletions
```diff
@@ -228,6 +228,30 @@ def GPUThreadMappingAttr
   }];
 }
 
+def GPULaneMappingAttr
+    : GPU_Attr<"GPULaneMapping", "lane", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
+  let parameters = (ins
+    EnumParameter<MappingIdEnum>:$lane
+  );
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    An attribute that allows defining lane parallelism for GPU devices.
+
+    It can be consumed by lowering to generate GPU code.
+
+    #### 3D mapping mode
+
+    Unsupported.
+
+    #### Linear mapping mode
+
+    The linear lane id is obtained by linearizing the index of the lane.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D subset of the (linearized) lanes.
+  }];
+}
+
 def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
   DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
   let parameters = (ins
```
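For reference, a minimal usage sketch of the new attribute's assembly form (`` `<` params `>` `` as declared above), mirroring the mapping used in the test at the end of this commit:

```mlir
// The two linear lane dimensions index the 2x3 iteration space.
scf.forall (%i, %j) in (2, 3) {
  // ...
} {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
```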

mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h

Lines changed: 9 additions & 0 deletions
```diff
@@ -117,6 +117,15 @@ struct GpuThreadIdBuilder : public GpuIdBuilder {
   GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping = false);
 };
 
+/// Builder for lane ids.
+/// The `idBuilder` method returns nD values used for indexing rewrites as well
+/// as 1D sizes for predicate generation.
+/// Only the linear (`useLinearMapping`) case is supported.
+struct GpuLaneIdBuilder : public GpuIdBuilder {
+  GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize, bool unused);
+  int64_t warpSize = 32;
+};
+
 /// Determine if the size of the kernel configuration is supported by the
 /// GPU architecture being used.
 /// TODO this is currently hardwired to CUDA, parameterize and generalize.
```

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 14 additions & 0 deletions
```diff
@@ -106,6 +106,20 @@ int64_t GPUThreadMappingAttr::getRelativeIndex() const {
                             : getMappingId();
 }
 
+int64_t GPULaneMappingAttr::getMappingId() const {
+  return static_cast<int64_t>(getLane());
+}
+
+bool GPULaneMappingAttr::isLinearMapping() const {
+  return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
+}
+
+int64_t GPULaneMappingAttr::getRelativeIndex() const {
+  return isLinearMapping()
+             ? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
+             : getMappingId();
+}
+
 int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getAddressSpace());
 }
```
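A short worked example of the three interface methods, assuming the conventional `MappingId` enum layout in which the linear ids follow the 3D ones (`DimX, DimY, DimZ, LinearDim0, LinearDim1, ...`):

```mlir
// For #gpu.lane<linear_dim_1>:
//   getMappingId()     == LinearDim0 + 1
//   isLinearMapping()  == true          (id >= LinearDim0)
//   getRelativeIndex() == 1             (id - LinearDim0)
// For a non-linear id, getRelativeIndex() returns getMappingId() unchanged.
```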

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

Lines changed: 16 additions & 1 deletion
```diff
@@ -313,11 +313,14 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                                           llvm::IsaPred<GPUWarpMappingAttr>);
   bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                        llvm::IsaPred<GPUThreadMappingAttr>);
+  bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
+                                     llvm::IsaPred<GPULaneMappingAttr>);
   int64_t countMappingTypes = 0;
   countMappingTypes += hasBlockMapping ? 1 : 0;
   countMappingTypes += hasWarpgroupMapping ? 1 : 0;
   countMappingTypes += hasWarpMapping ? 1 : 0;
   countMappingTypes += hasThreadMapping ? 1 : 0;
+  countMappingTypes += hasLaneMapping ? 1 : 0;
   if (countMappingTypes > 1) {
     return definiteFailureHelper(
         transformOp, forallOp,
@@ -330,7 +333,8 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
         "scf.forall op requires a mapping attribute of kind 'block'");
   }
   if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
-      !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
+      !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
+      !hasWarpgroupMapping) {
     return definiteFailureHelper(transformOp, forallOp,
                                  "scf.forall op requires a mapping attribute "
                                  "of kind 'thread' or 'warp'");
@@ -473,10 +477,17 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
   SmallVector<int64_t> originalBasis(availableMappingSizes);
   bool originalBasisWasProvided = !originalBasis.empty();
   if (!originalBasisWasProvided) {
+    LDBG("----originalBasis was not provided, deriving it and there will be no "
+         "predication");
     originalBasis = forallMappingSizes;
     while (originalBasis.size() < 3)
       originalBasis.push_back(1);
+  } else {
+    LDBG("----originalBasis was provided, using it, there will be predication");
   }
+  LLVM_DEBUG(
+      llvm::interleaveComma(originalBasis, DBGS() << "------originalBasis: ");
+      llvm::dbgs() << "\n");
 
   IdBuilderResult builderResult =
       gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
@@ -490,6 +501,7 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
            forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
     auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
     Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
+    LDBG("----map: " << iv << " to " << peIdOp);
     bvm.map(iv, peIdOp);
   }
 
@@ -790,6 +802,9 @@ getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
       .Case([&](GPUThreadMappingAttr) {
         return GpuThreadIdBuilder(ctx, useLinearMapping);
       })
+      .Case([&](GPULaneMappingAttr) {
+        return GpuLaneIdBuilder(ctx, warpSize, useLinearMapping);
+      })
       .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
         llvm_unreachable("unknown mapping attribute");
       });
```
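Because `hasLaneMapping` now participates in `countMappingTypes`, lane mappings cannot be mixed with other mapping kinds on the same `scf.forall`. A hypothetical input (a sketch, not from the commit) that this check rejects:

```mlir
// Invalid: two different mapping types on one forall; countMappingTypes == 2,
// so checkMappingAttributeTypes returns a definite failure.
scf.forall (%i, %j) in (2, 3) {
  // ...
} {mapping = [#gpu.lane<linear_dim_0>, #gpu.thread<y>]}
```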

mlir/lib/Dialect/GPU/TransformOps/Utils.cpp

Lines changed: 67 additions & 0 deletions
```diff
@@ -156,6 +156,63 @@ static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
   return res;
 }
 
+/// Create a lane id builder that takes the `originalBasis` and decomposes
+/// it in the basis of `forallMappingSizes`. The linear id builder returns an
+/// n-D vector of ids for indexing and a 1-D size + id for predicate
+/// generation.
+static GpuIdBuilderFnType laneIdBuilderFn(int64_t periodicity) {
+  auto res = [periodicity](RewriterBase &rewriter, Location loc,
+                           ArrayRef<int64_t> forallMappingSizes,
+                           ArrayRef<int64_t> originalBasis) {
+    SmallVector<OpFoldResult> originalBasisOfr =
+        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
+    OpFoldResult linearId =
+        buildLinearId<ThreadIdOp>(rewriter, loc, originalBasisOfr);
+    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
+    linearId = affine::makeComposedFoldedAffineApply(
+        rewriter, loc, d0 % periodicity, {linearId});
+
+    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
+    // "row-major" order.
+    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
+    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+    SmallVector<Value> ids;
+    // Reverse back to be in [0 .. n] order.
+    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
+      ids.push_back(
+          affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
+    }
+
+    // clang-format off
+    LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+                                     DBGS() << "--delinearization basis: ");
+               llvm::dbgs() << "\n";
+               llvm::interleaveComma(strides,
+                                     DBGS() << "--delinearization strides: ");
+               llvm::dbgs() << "\n";
+               llvm::interleaveComma(delinearizingExprs,
+                                     DBGS() << "--delinearization exprs: ");
+               llvm::dbgs() << "\n";
+               llvm::interleaveComma(ids, DBGS() << "--ids: ");
+               llvm::dbgs() << "\n";);
+    // clang-format on
+
+    // Return n-D ids for indexing and 1-D size + id for predicate generation.
+    return IdBuilderResult{
+        /*mappingIdOps=*/ids,
+        /*availableMappingSizes=*/
+        SmallVector<int64_t>{computeProduct(originalBasis)},
+        // `forallMappingSizes` iterate in the scaled basis, they need to be
+        // scaled back into the original basis to provide tight
+        // activeMappingSizes quantities for predication.
+        /*activeMappingSizes=*/
+        SmallVector<int64_t>{computeProduct(forallMappingSizes)},
+        /*activeIdOps=*/SmallVector<Value>{linearId.get<Value>()}};
+  };
+
+  return res;
+}
+
 namespace mlir {
 namespace transform {
 namespace gpu {
@@ -221,6 +278,16 @@ GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
               : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
 }
 
+GpuLaneIdBuilder::GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize,
+                                   bool unused)
+    : GpuIdBuilder(ctx, /*useLinearMapping=*/true,
+                   [](MLIRContext *ctx, MappingId id) {
+                     return GPULaneMappingAttr::get(ctx, id);
+                   }),
+      warpSize(warpSize) {
+  idBuilder = laneIdBuilderFn(/*periodicity=*/warpSize);
+}
+
 DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                            std::optional<int64_t> gridDimX,
                                            std::optional<int64_t> gridDimY,
```
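To make `laneIdBuilderFn` concrete, here is the arithmetic it performs for the test configuration below (`block_dims = [73, 5, 1]`, `warpSize = 32`, forall sizes `(2, 3)`); each line matches one of the CHECK'd affine maps in the test:

```mlir
// linear thread id    = tidx + 73 * tidy           // buildLinearId<ThreadIdOp>
// lane id             = (tidx + 73 * tidy) mod 32  // d0 mod periodicity
// reverseBasisSizes   = (3, 2) -> strides = (2, 1)
// ids (reversed back) = (laneid mod 2, laneid floordiv 2)
// predicate: laneid < 6, since computeProduct((2, 3)) == 6 active lanes
```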

mlir/test/Dialect/GPU/transform-gpu.mlir

Lines changed: 64 additions & 0 deletions
```diff
@@ -691,3 +691,67 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 128)>
+#map1 = affine_map<(d0) -> (d0 * 32)>
+
+// CHECK-DAG: #[[$MAPB:.*]] = affine_map<()[s0] -> (s0 * 128)>
+// CHECK-DAG: #[[$MAPLANE:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 73) mod 32)>
+// CHECK-DAG: #[[$MAPI:.*]] = affine_map<()[s0, s1] -> (s0 * 32 + s1 * 2336 - ((s0 + s1 * 73) floordiv 2) * 64)>
+// CHECK-DAG: #[[$MAPJ:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 73) mod 32) floordiv 2) * 32)>
+
+// CHECK-LABEL: func.func @simple_fill(
+func.func @simple_fill(%arg0: memref<128x256xf32>) -> memref<128x256xf32> {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<16x32xf32>
+  // CHECK: %[[C6:.*]] = arith.constant 6 : index
+  // CHECK: gpu.launch
+  scf.forall (%arg1) in (1) {
+    // CHECK: %[[BIDX:.*]] = gpu.block_id x
+    // CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]]
+    %0 = affine.apply #map(%arg1)
+    %subview = memref.subview %arg0[%0, 0] [128, 256] [1, 1]
+      : memref<128x256xf32> to memref<128x256xf32, strided<[256, 1], offset: ?>>
+
+    // %arg2 and %arg3 map to lanes [0, 6) and are turned into expressions
+    // involving threadIdx.x/y by the map_nested_forall_to_threads
+    // transformation. This results in an if (linear_thread_id < 6) conditional.
+    scf.forall (%arg2, %arg3) in (2, 3) {
+      // CHECK: %[[TIDX:.*]] = gpu.thread_id x
+      // CHECK: %[[TIDY:.*]] = gpu.thread_id y
+      // CHECK: %[[LID:.*]] = affine.apply #[[$MAPLANE]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: %[[COND:.*]] = arith.cmpi ult, %[[LID]], %[[C6]]
+      // CHECK: scf.if %[[COND]]
+      // CHECK: %[[I:.*]] = affine.apply #[[$MAPI]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: %[[J:.*]] = affine.apply #[[$MAPJ]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: memref.subview %{{.*}}[%[[I]], %[[J]]]
+      %1 = affine.apply #map1(%arg2)
+      %2 = affine.apply #map1(%arg3)
+      %subview_0 = memref.subview %subview[%1, %2] [16, 32] [1, 1]
+        : memref<128x256xf32, strided<[256, 1], offset: ?>> to memref<16x32xf32, strided<[256, 1], offset: ?>>
+      vector.transfer_write %cst, %subview_0[%c0, %c0] {in_bounds = [true, true]}
+        : vector<16x32xf32>, memref<16x32xf32, strided<[256, 1], offset: ?>>
+
+    // This mapping could be obtained e.g. if a previous transformation mapped
+    // this loop to lanes. It can also be written by hand as valid IR.
+    } {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
+  } {mapping = [#gpu.block<x>]}
+  return %arg0 : memref<128x256xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+    %gpu_launch = transform.gpu.map_forall_to_blocks %func generate_gpu_launch
+      : (!transform.any_op) -> !transform.any_op
+
+    // This transformation maps scf.forall ivs to a particular mapping of thread
+    // ids (laneid, threadid, warpid or warpgroupid).
+    transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [73, 5, 1]
+      : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
```
