
Commit 0a62836

[mlir][gpu][transforms] Add support for mapping to lanes (#146912)
This revision adds a new attribute for mapping `scf.forall` to linear lane ids.

Example:

```
// %arg2 and %arg3 map to lanes [0, 6) and are turned into expressions
// involving threadIdx.x/y by the map_nested_forall_to_threads
// transformation. This results in an if (linear_thread_id < 6) conditional.
scf.forall (%arg2, %arg3) in (2, 3) {
  ...
} {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
```

Co-authored-by: Oleksandr "Alex" Zinenko <git@ozinenko.com>
1 parent 46e3ec0 commit 0a62836
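For orientation, here is a rough sketch (not part of the commit) of the guarded IR that the lane mapping produces for the example above, under the test's configuration of `block_dims = [73, 5, 1]` and a warp size of 32; value names are illustrative, and the exact affine maps are those verified by the CHECK lines in the test at the bottom of this page:

```mlir
%tidx = gpu.thread_id x
%tidy = gpu.thread_id y
// Linearize the thread ids, then wrap into a warp: (tidx + 73 * tidy) mod 32.
%laneid = affine.apply affine_map<()[s0, s1] -> ((s0 + s1 * 73) mod 32)>()[%tidx, %tidy]
// Only lanes [0, 6) are active; 6 = 2 * 3, the product of the forall sizes.
%c6 = arith.constant 6 : index
%cond = arith.cmpi ult, %laneid, %c6 : index
scf.if %cond {
  // ... loop body, indexed by the delinearized lane ids ...
}
```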

6 files changed: +194 −1 lines changed

mlir/include/mlir/Dialect/GPU/IR/GPUDeviceMappingAttr.td

Lines changed: 24 additions & 0 deletions
```diff
@@ -228,6 +228,30 @@ def GPUThreadMappingAttr
   }];
 }
 
+def GPULaneMappingAttr
+    : GPU_Attr<"GPULaneMapping", "lane", [
+      DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ]> {
+  let parameters = (ins
+    EnumParameter<MappingIdEnum>:$lane
+  );
+  let assemblyFormat = "`<` params `>`";
+  let description = [{
+    An attribute that allows defining lane parallelism for GPU devices.
+
+    It can be consumed by lowering to generate GPU code.
+
+    #### 3D mapping mode
+
+    Unsupported.
+
+    #### Linear mapping mode
+
+    The linear lane id is obtained by linearizing the index of the lane.
+    If required, predication occurs on the linear id. This allows specifying
+    predication on a 1D subset of the (linearized) lanes.
+  }];
+}
+
 def GPUMemorySpaceMappingAttr : GPU_Attr<"GPUMemorySpaceMapping", "memory_space", [
   DeclareAttrInterfaceMethods<DeviceMappingAttrInterface> ] > {
   let parameters = (ins
```
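For reference, a minimal usage sketch of the new attribute's assembly form (`` `<` params `>` `` as declared above), mirroring the mapping used in the test at the end of this commit:

```mlir
// The two linear lane dimensions index the 2x3 iteration space.
scf.forall (%i, %j) in (2, 3) {
  // ...
} {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
```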

mlir/include/mlir/Dialect/GPU/TransformOps/Utils.h

Lines changed: 9 additions & 0 deletions
```diff
@@ -117,6 +117,15 @@ struct GpuThreadIdBuilder : public GpuIdBuilder {
   GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping = false);
 };
 
+/// Builder for lane ids.
+/// The `idBuilder` method returns nD values used for indexing rewrites as well
+/// as 1D sizes for predicate generation.
+/// Only the linear (`useLinearMapping`) case is supported.
+struct GpuLaneIdBuilder : public GpuIdBuilder {
+  GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize, bool unused);
+  int64_t warpSize = 32;
+};
+
 /// Determine if the size of the kernel configuration is supported by the
 /// GPU architecture being used.
 /// TODO this is currently hardwired to CUDA, parameterize and generalize.
```

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Lines changed: 14 additions & 0 deletions
```diff
@@ -106,6 +106,20 @@ int64_t GPUThreadMappingAttr::getRelativeIndex() const {
                             : getMappingId();
 }
 
+int64_t GPULaneMappingAttr::getMappingId() const {
+  return static_cast<int64_t>(getLane());
+}
+
+bool GPULaneMappingAttr::isLinearMapping() const {
+  return getMappingId() >= static_cast<int64_t>(MappingId::LinearDim0);
+}
+
+int64_t GPULaneMappingAttr::getRelativeIndex() const {
+  return isLinearMapping()
+             ? getMappingId() - static_cast<int64_t>(MappingId::LinearDim0)
+             : getMappingId();
+}
+
 int64_t GPUMemorySpaceMappingAttr::getMappingId() const {
   return static_cast<int64_t>(getAddressSpace());
 }
```
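A short worked example of the three interface methods, assuming the conventional `MappingId` enum layout in which the linear ids follow the 3D ones (`DimX, DimY, DimZ, LinearDim0, LinearDim1, ...`):

```mlir
// For #gpu.lane<linear_dim_1>:
//   getMappingId()     == LinearDim0 + 1
//   isLinearMapping()  == true          (id >= LinearDim0)
//   getRelativeIndex() == 1             (id - LinearDim0)
// For a non-linear id, getRelativeIndex() returns getMappingId() unchanged.
```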

mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp

Lines changed: 16 additions & 1 deletion
```diff
@@ -313,11 +313,14 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
                                           llvm::IsaPred<GPUWarpMappingAttr>);
   bool hasThreadMapping = llvm::any_of(forallOp.getMapping().value(),
                                        llvm::IsaPred<GPUThreadMappingAttr>);
+  bool hasLaneMapping = llvm::any_of(forallOp.getMapping().value(),
+                                     llvm::IsaPred<GPULaneMappingAttr>);
   int64_t countMappingTypes = 0;
   countMappingTypes += hasBlockMapping ? 1 : 0;
   countMappingTypes += hasWarpgroupMapping ? 1 : 0;
   countMappingTypes += hasWarpMapping ? 1 : 0;
   countMappingTypes += hasThreadMapping ? 1 : 0;
+  countMappingTypes += hasLaneMapping ? 1 : 0;
   if (countMappingTypes > 1) {
     return definiteFailureHelper(
         transformOp, forallOp,
@@ -330,7 +333,8 @@ checkMappingAttributeTypes(std::optional<TransformOpInterface> transformOp,
         "scf.forall op requires a mapping attribute of kind 'block'");
   }
   if (std::is_same<MappingKindType, ThreadMappingKind>::value &&
-      !hasThreadMapping && !hasWarpMapping && !hasWarpgroupMapping) {
+      !hasLaneMapping && !hasThreadMapping && !hasWarpMapping &&
+      !hasWarpgroupMapping) {
     return definiteFailureHelper(transformOp, forallOp,
                                  "scf.forall op requires a mapping attribute "
                                  "of kind 'thread' or 'warp'");
@@ -473,10 +477,17 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
   SmallVector<int64_t> originalBasis(availableMappingSizes);
   bool originalBasisWasProvided = !originalBasis.empty();
   if (!originalBasisWasProvided) {
+    LDBG("----originalBasis was not provided, deriving it and there will be no "
+         "predication");
     originalBasis = forallMappingSizes;
     while (originalBasis.size() < 3)
       originalBasis.push_back(1);
+  } else {
+    LDBG("----originalBasis was provided, using it, there will be predication");
   }
+  LLVM_DEBUG(
+      llvm::interleaveComma(originalBasis, DBGS() << "------originalBasis: ");
+      llvm::dbgs() << "\n");
 
   IdBuilderResult builderResult =
       gpuIdBuilder.idBuilder(rewriter, loc, forallMappingSizes, originalBasis);
@@ -490,6 +501,7 @@ static DiagnosedSilenceableFailure rewriteOneForallCommonImpl(
            forallMappingAttrs.getArrayRef().take_front(forallOp.getRank()))) {
     auto mappingAttr = cast<DeviceMappingAttrInterface>(dim);
     Value peIdOp = mappingIdOps[mappingAttr.getRelativeIndex()];
+    LDBG("----map: " << iv << " to " << peIdOp);
     bvm.map(iv, peIdOp);
   }
 
@@ -790,6 +802,9 @@ getThreadIdBuilder(std::optional<TransformOpInterface> transformOp,
       .Case([&](GPUThreadMappingAttr) {
         return GpuThreadIdBuilder(ctx, useLinearMapping);
       })
+      .Case([&](GPULaneMappingAttr) {
+        return GpuLaneIdBuilder(ctx, warpSize, useLinearMapping);
+      })
       .Default([&](DeviceMappingAttrInterface) -> GpuIdBuilder {
         llvm_unreachable("unknown mapping attribute");
       });
```
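Because `hasLaneMapping` now participates in `countMappingTypes`, lane mappings cannot be mixed with other mapping kinds on the same `scf.forall`. A hypothetical input (a sketch, not from the commit) that this check rejects:

```mlir
// Invalid: two different mapping types on one forall; countMappingTypes == 2,
// so checkMappingAttributeTypes returns a definite failure.
scf.forall (%i, %j) in (2, 3) {
  // ...
} {mapping = [#gpu.lane<linear_dim_0>, #gpu.thread<y>]}
```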

mlir/lib/Dialect/GPU/TransformOps/Utils.cpp

Lines changed: 67 additions & 0 deletions
```diff
@@ -156,6 +156,63 @@ static GpuIdBuilderFnType common3DIdBuilderFn(int64_t multiplicity = 1) {
   return res;
 }
 
+/// Create a lane id builder that takes the `originalBasis` and decomposes
+/// it in the basis of `forallMappingSizes`. The linear id builder returns an
+/// n-D vector of ids for indexing and a 1-D size + id for predicate
+/// generation.
+static GpuIdBuilderFnType laneIdBuilderFn(int64_t periodicity) {
+  auto res = [periodicity](RewriterBase &rewriter, Location loc,
+                           ArrayRef<int64_t> forallMappingSizes,
+                           ArrayRef<int64_t> originalBasis) {
+    SmallVector<OpFoldResult> originalBasisOfr =
+        getAsIndexOpFoldResult(rewriter.getContext(), originalBasis);
+    OpFoldResult linearId =
+        buildLinearId<ThreadIdOp>(rewriter, loc, originalBasisOfr);
+    AffineExpr d0 = getAffineDimExpr(0, rewriter.getContext());
+    linearId = affine::makeComposedFoldedAffineApply(
+        rewriter, loc, d0 % periodicity, {linearId});
+
+    // Sizes in [0 .. n] -> [n .. 0] order to properly compute strides in
+    // "row-major" order.
+    SmallVector<int64_t> reverseBasisSizes(llvm::reverse(forallMappingSizes));
+    SmallVector<int64_t> strides = computeStrides(reverseBasisSizes);
+    SmallVector<AffineExpr> delinearizingExprs = delinearize(d0, strides);
+    SmallVector<Value> ids;
+    // Reverse back to be in [0 .. n] order.
+    for (AffineExpr e : llvm::reverse(delinearizingExprs)) {
+      ids.push_back(
+          affine::makeComposedAffineApply(rewriter, loc, e, {linearId}));
+    }
+
+    // clang-format off
+    LLVM_DEBUG(llvm::interleaveComma(reverseBasisSizes,
+                                     DBGS() << "--delinearization basis: ");
+               llvm::dbgs() << "\n";
+               llvm::interleaveComma(strides,
+                                     DBGS() << "--delinearization strides: ");
+               llvm::dbgs() << "\n";
+               llvm::interleaveComma(delinearizingExprs,
+                                     DBGS() << "--delinearization exprs: ");
+               llvm::dbgs() << "\n";
+               llvm::interleaveComma(ids, DBGS() << "--ids: ");
+               llvm::dbgs() << "\n";);
+    // clang-format on
+
+    // Return n-D ids for indexing and 1-D size + id for predicate generation.
+    return IdBuilderResult{
+        /*mappingIdOps=*/ids,
+        /*availableMappingSizes=*/
+        SmallVector<int64_t>{computeProduct(originalBasis)},
+        // `forallMappingSizes` iterate in the scaled basis, they need to be
+        // scaled back into the original basis to provide tight
+        // activeMappingSizes quantities for predication.
+        /*activeMappingSizes=*/
+        SmallVector<int64_t>{computeProduct(forallMappingSizes)},
+        /*activeIdOps=*/SmallVector<Value>{linearId.get<Value>()}};
+  };
+
+  return res;
+}
+
 namespace mlir {
 namespace transform {
 namespace gpu {
@@ -221,6 +278,16 @@ GpuThreadIdBuilder::GpuThreadIdBuilder(MLIRContext *ctx, bool useLinearMapping)
               : common3DIdBuilderFn<ThreadIdOp>(/*multiplicity=*/1);
 }
 
+GpuLaneIdBuilder::GpuLaneIdBuilder(MLIRContext *ctx, int64_t warpSize,
+                                   bool unused)
+    : GpuIdBuilder(ctx, /*useLinearMapping=*/true,
+                   [](MLIRContext *ctx, MappingId id) {
+                     return GPULaneMappingAttr::get(ctx, id);
+                   }),
+      warpSize(warpSize) {
+  idBuilder = laneIdBuilderFn(/*periodicity=*/warpSize);
+}
+
 DiagnosedSilenceableFailure checkGpuLimits(TransformOpInterface transformOp,
                                            std::optional<int64_t> gridDimX,
                                            std::optional<int64_t> gridDimY,
```
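To make `laneIdBuilderFn` concrete, here is the arithmetic it performs for the test configuration below (`block_dims = [73, 5, 1]`, `warpSize = 32`, forall sizes `(2, 3)`); each line matches one of the CHECK'd affine maps in the test:

```mlir
// linear thread id    = tidx + 73 * tidy           // buildLinearId<ThreadIdOp>
// lane id             = (tidx + 73 * tidy) mod 32  // d0 mod periodicity
// reverseBasisSizes   = (3, 2) -> strides = (2, 1)
// ids (reversed back) = (laneid mod 2, laneid floordiv 2)
// predicate: laneid < 6, since computeProduct((2, 3)) == 6 active lanes
```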

mlir/test/Dialect/GPU/transform-gpu.mlir

Lines changed: 64 additions & 0 deletions
```diff
@@ -691,3 +691,67 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+#map = affine_map<(d0) -> (d0 * 128)>
+#map1 = affine_map<(d0) -> (d0 * 32)>
+
+// CHECK-DAG: #[[$MAPB:.*]] = affine_map<()[s0] -> (s0 * 128)>
+// CHECK-DAG: #[[$MAPLANE:.*]] = affine_map<()[s0, s1] -> ((s0 + s1 * 73) mod 32)>
+// CHECK-DAG: #[[$MAPI:.*]] = affine_map<()[s0, s1] -> (s0 * 32 + s1 * 2336 - ((s0 + s1 * 73) floordiv 2) * 64)>
+// CHECK-DAG: #[[$MAPJ:.*]] = affine_map<()[s0, s1] -> ((((s0 + s1 * 73) mod 32) floordiv 2) * 32)>
+
+// CHECK-LABEL: func.func @simple_fill(
+func.func @simple_fill(%arg0: memref<128x256xf32>) -> memref<128x256xf32> {
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant dense<0.000000e+00> : vector<16x32xf32>
+  // CHECK: %[[C6:.*]] = arith.constant 6 : index
+  // CHECK: gpu.launch
+  scf.forall (%arg1) in (1) {
+    // CHECK: %[[BIDX:.*]] = gpu.block_id x
+    // CHECK: %[[BLX:.*]] = affine.apply #[[$MAPB]]()[%[[BIDX]]]
+    %0 = affine.apply #map(%arg1)
+    %subview = memref.subview %arg0[%0, 0] [128, 256] [1, 1]
+      : memref<128x256xf32> to memref<128x256xf32, strided<[256, 1], offset: ?>>
+
+    // %arg2 and %arg3 map to lanes [0, 6) and are turned into expressions
+    // involving threadIdx.x/y by the map_nested_forall_to_threads
+    // transformation. This results in an if (linear_thread_id < 6) conditional.
+    scf.forall (%arg2, %arg3) in (2, 3) {
+      // CHECK: %[[TIDX:.*]] = gpu.thread_id x
+      // CHECK: %[[TIDY:.*]] = gpu.thread_id y
+      // CHECK: %[[LID:.*]] = affine.apply #[[$MAPLANE]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: %[[COND:.*]] = arith.cmpi ult, %[[LID]], %[[C6]]
+      // CHECK: scf.if %[[COND]]
+      // CHECK: %[[I:.*]] = affine.apply #[[$MAPI]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: %[[J:.*]] = affine.apply #[[$MAPJ]]()[%[[TIDX]], %[[TIDY]]]
+      // CHECK: memref.subview %{{.*}}[%[[I]], %[[J]]]
+      %1 = affine.apply #map1(%arg2)
+      %2 = affine.apply #map1(%arg3)
+      %subview_0 = memref.subview %subview[%1, %2] [16, 32] [1, 1]
+        : memref<128x256xf32, strided<[256, 1], offset: ?>> to memref<16x32xf32, strided<[256, 1], offset: ?>>
+      vector.transfer_write %cst, %subview_0[%c0, %c0] {in_bounds = [true, true]}
+        : vector<16x32xf32>, memref<16x32xf32, strided<[256, 1], offset: ?>>
+
+    // This mapping could be obtained e.g. if a previous transformation mapped
+    // this loop to lanes. It can also be written by hand as valid IR.
+    } {mapping = [#gpu.lane<linear_dim_0>, #gpu.lane<linear_dim_1>]}
+  } {mapping = [#gpu.block<x>]}
+  return %arg0 : memref<128x256xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%module_op: !transform.any_op {transform.readonly}) {
+    %func = transform.structured.match ops{["func.func"]} in %module_op
+      : (!transform.any_op) -> !transform.any_op
+    %gpu_launch = transform.gpu.map_forall_to_blocks %func generate_gpu_launch
+      : (!transform.any_op) -> !transform.any_op
+
+    // This transformation maps scf.forall ivs to a particular mapping of thread
+    // ids (laneid, threadid, warpid or warpgroupid).
+    transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [73, 5, 1]
+      : (!transform.any_op) -> !transform.any_op
+    transform.yield
+  }
+}
```
