diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index daab65ec893b8..97887cef684df 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -918,21 +918,22 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
 def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> {
   let summary = "Convert the layout of the input operand";
   let description = [{
-    `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying
-    the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such
-    as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is
-    lowered to WI level because that is the end result of all distributions.
+    `convert_layout` redistributes data across subgroups and/or work-items from the `input_layout` to
+    the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming
+    scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once
+    the IR is lowered to WI level because that is the end result of all distributions.
   }];
-  let arguments = (ins XeGPU_Vector2DType: $source,
-                       XeGPU_LayoutAttr: $srcMap,
-                       XeGPU_LayoutAttr: $resMap
-  );
-  let results = (outs XeGPU_Vector2DType: $result);
+  let arguments = (ins XeGPU_VectorType: $source,
+                       XeGPU_LayoutAttr: $input_layout,
+                       XeGPU_LayoutAttr: $target_layout);
+  let results = (outs XeGPU_VectorType: $result);
   let assemblyFormat = [{
-    $source attr-dict `:` type($source)
+    $source prop-dict attr-dict `:` type($source)
   }];
+  let hasFolder = 1;
   let hasVerifier = 1;
+  let hasCanonicalizer = 1;
 }
 
 #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
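For orientation (not part of the patch): a rough sketch of the new assembly. Only the `<{input_layout = ..., target_layout = ...}>` prop-dict form and the "identical layouts fold away" behavior come from this change; the layout values below are invented for illustration.

```mlir
// Kept: input and target workgroup-level layouts differ.
%r = xegpu.convert_layout %v <{input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 16]>,
                               target_layout = #xegpu.layout<sg_layout = [2, 8], sg_data = [16, 8]>}> : vector<32x64xf16>

// Folded: with hasFolder/hasCanonicalizer, a conversion whose input and target
// layouts are identical is replaced by its source operand (the old verifier
// error for this case is removed below).
%s = xegpu.convert_layout %v <{input_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 16]>,
                               target_layout = #xegpu.layout<sg_layout = [4, 4], sg_data = [8, 16]>}> : vector<32x64xf16>
```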
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
index 84314875c2ae5..af40b3754bd8a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td
@@ -21,8 +21,8 @@ def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>;
 def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>;
 def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>;
 def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>;
-def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>;
-def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>;
+def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>;
+def XeGPU_ValueType: AnyTypeOf<[XeGPU_VectorType, XeGPU_ScalarType]>;
 
 // common base class for types in XeGPU dialect
 class XeGPUTypeDef<string name, string typeMnemonic, list<Trait> traits = [],
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index 2793c7a35bc97..97415cc74f928 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -609,32 +609,55 @@ LogicalResult DpasOp::verify() {
 // XeGPU_ConvertLayoutOp
 //===----------------------------------------------------------------------===//
 LogicalResult ConvertLayoutOp::verify() {
-  auto srcMap = getSrcMapAttr();
-  auto resMap = getResMapAttr();
-  if (!srcMap)
-    return emitOpError("expected srcMap.");
-  if (!resMap)
-    return emitOpError("expected resMap.");
-
-  if (srcMap == resMap)
-    return emitOpError("expected different srcMap and resMap.");
-
-  // both srcMap and resMap should be WgLayout or SgLayout at the same time.
-  if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) &&
-      (!srcMap.isSgLayout() || !resMap.isSgLayout()))
-    return emitOpError(
-        "expected srcMap and resMap be WgLayout or SgLayout at the same time.");
+  auto srcLayout = getInputLayout();
+  auto resLayout = getTargetLayout();
+  if (!srcLayout)
+    return emitOpError("expected input layout.");
+  if (!resLayout)
+    return emitOpError("expected target layout.");
+
+  // both input and target layouts should be WgLayout or SgLayout at the same
+  // time.
+  if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) &&
+      (!srcLayout.isSgLayout() || !resLayout.isSgLayout()))
+    return emitOpError("expected input layout and target layout be WgLayout or "
+                       "SgLayout at the same time.");
 
   auto shape = getSource().getType().getShape();
-  if (!XeGPUDialect::isEvenlyDistributable(shape, srcMap))
-    return emitOpError("invalid srcMap, data cannot be evenly distributed.");
+  if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout))
+    return emitOpError(
+        "invalid input layout, data cannot be evenly distributed.");
 
-  if (!XeGPUDialect::isEvenlyDistributable(shape, resMap))
-    return emitOpError("invalid resMap, data cannot be evenly distributed.");
+  if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout))
+    return emitOpError(
+        "invalid target layout, data cannot be evenly distributed.");
 
   return mlir::success();
 }
 
+OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) {
+  if (getInputLayout() == getTargetLayout())
+    return getSource();
+  return {};
+}
+
+struct FoldConvertLayoutOp : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+  using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+                                PatternRewriter &rewriter) const override {
+    if (op.getInputLayout() == op.getTargetLayout()) {
+      rewriter.replaceOp(op, op.getSource());
+      return success();
+    }
+    return failure();
+  }
+};
+
+void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
+                                                  MLIRContext *context) {
+  patterns.add<FoldConvertLayoutOp>(context);
+}
+
 } // namespace xegpu
 } // namespace mlir
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
index 3950e8f70d1ca..06e0c6105df58 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp
@@ -78,6 +78,20 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) {
   }
 }
 
+struct ConvertLayoutOpPattern
+    : public OpRewritePattern<xegpu::ConvertLayoutOp> {
+  using OpRewritePattern<xegpu::ConvertLayoutOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op,
+                                PatternRewriter &rewriter) const override {
+    xegpu::LayoutAttr input_layout = op.getInputLayoutAttr().dropInstData();
+    xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr().dropInstData();
+    auto newOp = rewriter.createOrFold<xegpu::ConvertLayoutOp>(
+        op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout);
+    rewriter.replaceOp(op, newOp);
+    return success();
+  }
+};
+
 //===------------------------------------------------------------------------===//
 // The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops
 // to partition operations that process large shapes into multiple operations on
@@ -335,6 +349,7 @@ void XeGPUBlockingPass::runOnOperation() {
   });
 
   RewritePatternSet patterns(ctx);
+  patterns.add<ConvertLayoutOpPattern>(ctx);
 
   vector::UnrollVectorOptions vectorOptions;
   vectorOptions.setNativeShapeFn(options.nativeShape);
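Sketch of the net effect of the blocking pattern above on a made-up example: only the dropInstData plus createOrFold behavior comes from the patch, and the concrete layout values are invented.

```mlir
// Before blocking: the two sg-level layouts differ only in inst_data.
%r = xegpu.convert_layout %v
       <{input_layout  = #xegpu.layout<inst_data = [8, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
         target_layout = #xegpu.layout<inst_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>}> : vector<32x64xf16>

// ConvertLayoutOpPattern drops inst_data from both sides; the layouts become
// identical, so createOrFold (via the new folder) removes the op entirely and
// uses of %r simply see %v.
```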
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index e3563d10bc6f1..89dcddec752a1 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -106,12 +106,12 @@ struct WgToSgCreateNdOp : public OpConversionPattern<xegpu::CreateNdDescOp> {
   using OpConversionPattern<xegpu::CreateNdDescOp>::OpConversionPattern;
 
   // Calculate offset for each subgroup
-  SmallVector<OpFoldResult>
+  static SmallVector<OpFoldResult>
   calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc,
                          const SmallVector<OpFoldResult> &originalOffsets,
                          const SmallVector<Value> &localOffset,
                          const SmallVector<int64_t> &distUnitBaseAddr,
-                         const SmallVector<int64_t> &distUnitShape) const {
+                         const SmallVector<int64_t> &distUnitShape) {
 
     assert(localOffset.size() == distUnitBaseAddr.size() &&
            "localOffset and distUnitBaseAddr must have the same rank");
@@ -390,6 +390,46 @@ struct WgToSgElementwiseOp : public ConversionPattern {
   }
 };
 
+struct WgToSgConvertLayoutOp
+    : public OpConversionPattern<xegpu::ConvertLayoutOp> {
+  using OpConversionPattern<xegpu::ConvertLayoutOp>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    xegpu::LayoutAttr input = op.getInputLayout();
+    xegpu::LayoutAttr target = op.getTargetLayout();
+
+    if (!input || !target || !input.isWgLayout() || !target.isWgLayout())
+      return rewriter.notifyMatchFailure(
+          op, "Input and target layouts must have subgroup layout");
+
+    DenseI32ArrayAttr inputSgLayout = input.getSgLayout();
+    DenseI32ArrayAttr inputSgData = input.getSgData();
+    DenseI32ArrayAttr targetSgLayout = target.getSgLayout();
+    DenseI32ArrayAttr targetSgData = target.getSgData();
+
+    // TODO: currently we only support the optimal case, where input and
+    // output have the same sg_layout and sg_data, so SLM is not involved.
+    if (inputSgLayout != targetSgLayout || inputSgData != targetSgData)
+      return failure();
+
+    input = input.dropSgLayoutAndData();
+    target = target.dropSgLayoutAndData();
+
+    SmallVector<Value> newOps(adaptor.getSource());
+
+    if (input && target) {
+      for (auto [i, src] : llvm::enumerate(adaptor.getSource())) {
+        auto newOp = rewriter.create<xegpu::ConvertLayoutOp>(
+            op.getLoc(), src.getType(), src, input, target);
+        newOps[i] = newOp;
+      }
+    }
+    rewriter.replaceOpWithMultiple(op, {newOps});
+    return success();
+  }
+};
+
 // Handles UnrealizedConversionCastOp generated during
 // SCFStructuralTypeConversions (step 1). This op may appear as either a
 // target or source materialization for Vector values, e.g.:
@@ -473,8 +513,8 @@ namespace xegpu {
 void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
   patterns.add<WgToSgCreateNdOp, WgToSgLoadNdOp, WgToSgStoreNdOp,
                WgToSgUpdateNdOffsetOp, WgToSgDpasOp, WgToSgPrefetchNdOp,
-               UnrealizedConversionCastOpPattern, WgToSgElementwiseOp>(
-      patterns.getContext());
+               UnrealizedConversionCastOpPattern, WgToSgElementwiseOp,
+               WgToSgConvertLayoutOp>(patterns.getContext());
 }
 } // namespace xegpu
 } // namespace mlir
@@ -581,6 +621,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
         return isLegal(layout);
       });
 
+  target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
+      [=](xegpu::ConvertLayoutOp op) -> bool {
+        return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
+      });
+
   target.addDynamicallyLegalDialect<math::MathDialect, arith::ArithDialect>(
       [=](Operation *op) -> std::optional<bool> {
         // Only handle elementwise mappable ops
diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
index 6b85a66a8bd36..d5ae3c20e222e 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
+++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp
@@ -124,6 +124,10 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) {
     Operation *defOp = result.getDefiningOp();
     assert(defOp && "result must have a defining op");
 
+    // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr
+    if (auto convertOp = dyn_cast<xegpu::ConvertLayoutOp>(defOp))
+      return convertOp.getTargetLayoutAttr();
+
     // for LoadNdOp, the layout is stored in the tensor descriptor
     if (auto loadNd = dyn_cast<xegpu::LoadNdOp>(defOp))
       return getLayoutAttr(loadNd.getTensorDesc());
diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir
index a2778cd94d963..65e1d22449bdd 100644
--- a/mlir/test/Dialect/XeGPU/invalid.mlir
+++ b/mlir/test/Dialect/XeGPU/invalid.mlir
@@ -511,19 +511,11 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto
   return
 }
 
-// -----
-func.func @convert_layout_same_map(%a: vector<32x64xf16>) {
-  // expected-error@+1 {{expected different srcMap and resMap}}
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                resMap = #xegpu.layout} : vector<32x64xf16>
-  gpu.return
-}
-
 // -----
 func.func @convert_layout_unmatch(%a: vector<32x64xf16>) {
-  // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}}
-  %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                resMap = #xegpu.layout} : vector<32x64xf16>
+  // expected-error@+1 {{expected input layout and target layout be WgLayout or SgLayout at the same time}}
+  %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout,
+                                 target_layout = #xegpu.layout}> : vector<32x64xf16>
   gpu.return
 }
diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir
index 7f3ebec225cdf..ef51dfbbfd574 100644
--- a/mlir/test/Dialect/XeGPU/layout.mlir
+++ b/mlir/test/Dialect/XeGPU/layout.mlir
@@ -35,14 +35,14 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) {
   }
 
   gpu.func @convert_layout(%a: vector<32x64xf16>) {
-    %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                  resMap = #xegpu.layout} : vector<32x64xf16>
+    %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout,
+                                   target_layout = #xegpu.layout}> : vector<32x64xf16>
     gpu.return
   }
 
   gpu.func @convert_layout_wg(%a: vector<32x64xf16>) {
-    %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout,
-                                  resMap = #xegpu.layout} : vector<32x64xf16>
+    %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout,
+                                   target_layout = #xegpu.layout}> : vector<32x64xf16>
     gpu.return
   }
 
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
index c6124f90e0f48..6c688f4db6dec 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir
@@ -198,4 +198,14 @@ gpu.module @test_round_robin_assignment {
     gpu.return
   }
 
+  gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) {
+    %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout>
+    //CHECK-2: xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32>
+    //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf32>
+    %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32>
+    %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout,
+                                   target_layout = #xegpu.layout}> : vector<32x64xf32>
+    gpu.return
+  }
+
 }
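Rough before/after sketch of what the new WgToSgConvertLayoutOp pattern produces in the supported (optimal) case, mirroring the test above. The concrete layouts and the fragment value %v0 are invented for illustration; only the requirement that input and target share sg_layout/sg_data, the dropping of those fields, and the per-fragment rewrite come from the patch.

```mlir
// Workgroup-level IR: input and target layouts share sg_layout/sg_data and
// differ only in the per-work-item fields.
%r = xegpu.convert_layout %v
       <{input_layout  = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [1, 16], lane_data = [1, 1]>,
         target_layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [16, 16], lane_layout = [16, 1], lane_data = [1, 1]>}> : vector<32x64xf16>

// After wg-to-sg distribution: each subgroup-local vector<16x16xf16> fragment
// (%v0 stands for one such fragment) gets its own convert_layout with
// sg_layout/sg_data dropped.
%r0 = xegpu.convert_layout %v0
        <{input_layout  = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>,
          target_layout = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 1]>}> : vector<16x16xf16>
```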