[mlir][xegpu] Add initial skeleton implementation for lowering ConvertLayoutOp #146176

Open · wants to merge 7 commits into base: main

21 changes: 11 additions & 10 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -918,21 +918,22 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
let summary = "Convert the layout of the input operand";
let description = [{
`convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
lowered to WI level because that is the end result of all distributions.
`convert_layout` redistributes data across subgroups and/or work-items from the `input_layout` to
the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
the IR is lowered to WI level because that is the end result of all distributions.
}];
let arguments = (ins XeGPU_Vector2DType: $source,
XeGPU_LayoutAttr: $srcMap,
XeGPU_LayoutAttr: $resMap
);
let results = (outs XeGPU_Vector2DType: $result);
let arguments = (ins XeGPU_VectorType: $source,
XeGPU_LayoutAttr: $input_layout,
XeGPU_LayoutAttr: $target_layout);
let results = (outs XeGPU_VectorType: $result);
let assemblyFormat = [{
$source attr-dict `:` type($source)
$source prop-dict attr-dict `:` type($source)
}];

let hasFolder = 1;
let hasVerifier = 1;
let hasCanonicalizer = 1;
}

#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
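
For reference, a minimal usage sketch of the renamed operands and the new prop-dict assembly (it mirrors the tests updated later in this diff; the value name and layout fields are illustrative only):

  %r = xegpu.convert_layout %src <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
                                   target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>

With the prop-dict form, both layouts are printed through the op's properties dictionary rather than the discardable attribute dictionary.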
4 changes: 2 additions & 2 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -21,8 +21,8 @@ def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>;
def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
def XeGPU_ValueType: AnyTypeOf<[XeGPU_VectorType, XeGPU_ScalarType]>;

// common base class for types in XeGPU dialect
class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
61 changes: 42 additions & 19 deletions mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -609,32 +609,55 @@ LogicalResult DpasOp::verify() {
// XeGPU_ConvertLayoutOp
//===----------------------------------------------------------------------===//
LogicalResult ConvertLayoutOp::verify() {
auto srcMap = getSrcMapAttr();
auto resMap = getResMapAttr();
if (!srcMap)
return emitOpError("expected srcMap.");
if (!resMap)
return emitOpError("expected resMap.");

if (srcMap == resMap)
return emitOpError("expected different srcMap and resMap.");

// both srcMap and resMap should be WgLayout or SgLayout at the same time.
if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
(!srcMap.isSgLayout() || !resMap.isSgLayout()))
return emitOpError(
"expected srcMap and resMap be WgLayout or SgLayout at the same time.");
auto srcLayout = getInputLayout();
auto resLayout = getTargetLayout();
if (!srcLayout)
return emitOpError("expected input layout.");
if (!resLayout)
return emitOpError("expected target layout.");

// both input and target layouts should be WgLayout or SgLayout at the same
// time.
if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
(!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
return emitOpError("expected input layout and target layout be WgLayout or "
"SgLayout at the same time.");

auto shape = getSource().getType().getShape();
if (!XeGPUDialect::isEvenlyDistributable(shape, srcMap))
return emitOpError("invalid srcMap, data cannot be evenly distributed.");
if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
return emitOpError(
"invalid input layout, data cannot be evenly distributed.");

if (!XeGPUDialect::isEvenlyDistributable(shape, resMap))
return emitOpError("invalid resMap, data cannot be evenly distributed.");
if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
return emitOpError(
"invalid target layout, data cannot be evenly distributed.");

return mlir::success();
}

OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
if (getInputLayout() == getTargetLayout())
return getSource();
return {};
}

struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
PatternRewriter &rewriter) const override {
if (op.getInputLayout() == op.getTargetLayout()) {
rewriter.replaceOp(op, op.getSource());
return success();
}
return failure();
}
};

void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
MLIRContext *context) {
patterns.add<FoldConvertLayoutOp>(context);
}

} // namespace xegpu
} // namespace mlir
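
A minimal sketch of what the new folder and canonicalization pattern do (illustrative layouts): when input_layout and target_layout are identical, the op folds to its source. Identical layouts were previously a verifier error (see the test removed from invalid.mlir below); with this change the op simply disappears.

  // before
  %r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
                                 target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
  // after folding: uses of %r are replaced by %v and the op is erased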

15 changes: 15 additions & 0 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -78,6 +78,20 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
}
}

struct ConvertLayoutOpPattern
: public OpRewritePattern<xegpu::ConvertLayoutOp> {
using OpRewritePattern::OpRewritePattern;
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
PatternRewriter &rewriter) const override {
xegpu::LayoutAttr input_layout = op.getInputLayoutAttr().dropInstData();
xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr().dropInstData();
auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
rewriter.replaceOp(op, newOp);
return success();
}
};
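
A hedged sketch of the intended effect in the blocking pass, assuming dropInstData() removes only the inst_data field (the layouts below are illustrative): the rewritten op keeps the lane-level layouts, and createOrFold erases it entirely if the two stripped layouts end up equal.

  // before blocking
  %r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [2, 1]>,
                                 target_layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
  // after ConvertLayoutOpPattern: inst_data is gone; had the remaining layouts
  // become identical, createOrFold would have folded the op away instead
  %r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
                                 target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>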

//===------------------------------------------------------------------------===//
// The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
// to partition operations that process large shapes into multiple operations on
@@ -335,6 +349,7 @@ void XeGPUBlockingPass::runOnOperation() {
});

RewritePatternSet patterns(ctx);
patterns.add<ConvertLayoutOpPattern>(ctx);

vector::UnrollVectorOptions vectorOptions;
vectorOptions.setNativeShapeFn(options.nativeShape);
53 changes: 49 additions & 4 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -106,12 +106,12 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;

// Calculate offset for each subgroup
SmallVector<OpFoldResult>
static SmallVector<OpFoldResult>
calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
const SmallVector<OpFoldResult> &originalOffsets,
const SmallVector<Value> &localOffset,
const SmallVector<int64_t> &distUnitBaseAddr,
const SmallVector<int64_t> &distUnitShape) const {
const SmallVector<int64_t> &distUnitShape) {
assert(localOffset.size() == distUnitBaseAddr.size() &&
"localOffset and distUnitBaseAddr must have the same rank");

@@ -390,6 +390,46 @@ struct WgToSgElementwiseOp : public ConversionPattern {
}
};

struct WgToSgConvertLayoutOp
: public OpConversionPattern<xegpu::ConvertLayoutOp> {
using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
LogicalResult
matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
xegpu::LayoutAttr input = op.getInputLayout();
xegpu::LayoutAttr target = op.getTargetLayout();

if (!input || !target || !input.isWgLayout() || !target.isWgLayout())
return rewriter.notifyMatchFailure(
op, "Input and target layouts must have subgroup layout");

DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
DenseI32ArrayAttr inputSgData = input.getSgData();
DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
DenseI32ArrayAttr targetSgData = target.getSgData();

// TODO: currently we only support the optimal case, where input and
// output have the same sg_layout and sg_data, so SLM is not involved.
if (inputSgLayout != targetSgLayout || inputSgData != targetSgData)
return failure();

input = input.dropSgLayoutAndData();
target = target.dropSgLayoutAndData();

SmallVector<Value> newOps(adaptor.getSource());

if (input && target) {
for (auto [i, src] : llvm::enumerate(adaptor.getSource())) {
auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
op.getLoc(), src.getType(), src, input, target);
newOps[i] = newOp;
}
}
rewriter.replaceOpWithMultiple(op, {newOps});
return success();
}
};
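
A sketch of the one supported case (matching sg_layout/sg_data), summarizing the round-robin test added at the end of this diff: each distributed per-subgroup value gets its own convert_layout carrying only the layouts left after dropSgLayoutAndData().

  // per-subgroup, after wg-to-sg distribution (sg-level fields dropped)
  %r_sg = xegpu.convert_layout %v_sg <{input_layout = #xegpu.layout<inst_data = [16, 16]>,
                                       target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>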

// Handles UnrealizedConversionCastOp generated during
// SCFStructuralTypeConversions (step 1). This op may appear as either a
// target or source materialization for Vector values, e.g.:
@@ -473,8 +513,8 @@ namespace xegpu {
void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>(
patterns.getContext());
UnrealizedConversionCastOpPattern, WgToSgElementwiseOp,
WgToSgConvertLayoutOp>(patterns.getContext());
}
} // namespace xegpu
} // namespace mlir
@@ -581,6 +621,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
return isLegal(layout);
});

target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
[=](xegpu::ConvertLayoutOp op) -> bool {
return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
});

target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
[=](Operation *op) -> std::optional<bool> {
// Only handle elementwise mappable ops
4 changes: 4 additions & 0 deletions mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,6 +124,10 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
Operation *defOp = result.getDefiningOp();
assert(defOp && "result must have a defining op");

// For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
return convertOp.getTargetLayoutAttr();

// for LoadNdOp, the layout is stored in the tensor descriptor
if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
return getLayoutAttr(loadNd.getTensorDesc());
14 changes: 3 additions & 11 deletions mlir/test/Dialect/XeGPU/invalid.mlir
@@ -511,19 +511,11 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
return
}

// -----
func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
// expected-error@+1 {{expected different srcMap and resMap}}
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
gpu.return
}

// -----
func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
// expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
// expected-error@+1 {{expected input layout and target layout to be WgLayout or SgLayout at the same time}}
%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}

8 changes: 4 additions & 4 deletions mlir/test/Dialect/XeGPU/layout.mlir
@@ -35,14 +35,14 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
}

gpu.func @convert_layout(%a: vector<32x64xf16>) {
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
resMap = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [2, 1]>,
target_layout = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}

gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
%2 = xegpu.convert_layout %a {srcMap = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
resMap = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>} : vector<32x64xf16>
%2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
target_layout = #xegpu.layout<sg_layout = [4, 2], sg_data = [8, 32], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>
gpu.return
}

10 changes: 10 additions & 0 deletions mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -198,4 +198,14 @@ gpu.module @test_round_robin_assignment {
gpu.return
}

gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
%0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>>
//CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout<inst_data = [16, 16]>> -> vector<16x16xf32>
//CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout<inst_data = [16, 16]>, target_layout = #xegpu.layout<inst_data = [8, 16]>}> : vector<16x16xf32>
%1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>> -> vector<32x64xf32>
%2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [16, 16]>,
target_layout = #xegpu.layout<sg_layout = [2, 2], sg_data = [16, 16], inst_data = [8, 16]>}> : vector<32x64xf32>
gpu.return
}

}