From 2e0f4dbcb5c3635904e6200cbe763b683e4e3f21 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 24 Jun 2025 19:57:09 +0000 Subject: [PATCH 1/9] update convert layout definition --- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 21 +++---- .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 4 +- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 60 +++++++++++++------ mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 4 ++ mlir/test/Dialect/XeGPU/invalid.mlir | 14 +---- mlir/test/Dialect/XeGPU/layout.mlir | 8 +-- 6 files changed, 67 insertions(+), 44 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index daab65ec893b8..97887cef684df 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -918,21 +918,22 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { def XeGPU_ConvertLayoutOp: XeGPU_Op<"convert_layout", [Pure, AllTypesMatch<["source", "result"]>]> { let summary = "Convert the layout of the input operand"; let description = [{ - `convert_layout` adjusts the data distribution across subgroups and/or work-items by modifying - the `LayoutAttr`. Both `srcMap` and `resMap` must correspond to the same programming scope, such - as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once the IR is - lowered to WI level because that is the end result of all distributions. + `convert_layout` redistribute data across subgroups and/or work-items from the `input_layout` to + the `target_layout`. Both `input_layout` and `target_layout` must correspond to the same programming + scope, such as workgroup-level (wg) or subgroup-level (sg) code. This operation is not valid once + the IR is lowered to WI level because that is the end result of all distributions. 
}]; - let arguments = (ins XeGPU_Vector2DType: $source, - XeGPU_LayoutAttr: $srcMap, - XeGPU_LayoutAttr: $resMap - ); - let results = (outs XeGPU_Vector2DType: $result); + let arguments = (ins XeGPU_VectorType: $source, + XeGPU_LayoutAttr: $input_layout, + XeGPU_LayoutAttr: $target_layout); + let results = (outs XeGPU_VectorType: $result); let assemblyFormat = [{ - $source attr-dict `:` type($source) + $source prop-dict attr-dict `:` type($source) }]; + let hasFolder = 1; let hasVerifier = 1; + let hasCanonicalizer = 1; } #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 84314875c2ae5..af40b3754bd8a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -21,8 +21,8 @@ def XeGPU_DpasOprType: VectorOfRankAndType<[1, 2, 3], [XeGPU_ScalarType]>; def XeGPU_DpasResType: VectorOfRankAndType<[1, 2], [XeGPU_ScalarType]>; def XeGPU_OffsetType: VectorOfRankAndType<[1], [Index]>; def XeGPU_MaskType: AnyTypeOf<[VectorOfRankAndType<[1], [I1]>, I1]>; -def XeGPU_ValueType: AnyTypeOf<[VectorOfRankAndType<[1,2,3,4], [XeGPU_ScalarType]>, XeGPU_ScalarType]>; -def XeGPU_Vector2DType: VectorOfRankAndType<[2], [XeGPU_ScalarType]>; +def XeGPU_VectorType: VectorOfRankAndType<[1,2,3,4,5,6], [XeGPU_ScalarType]>; +def XeGPU_ValueType: AnyTypeOf<[XeGPU_VectorType, XeGPU_ScalarType]>; // common base class for types in XeGPU dialect class XeGPUTypeDef traits = [], diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 2793c7a35bc97..10ce019d5a812 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -609,32 +609,58 @@ LogicalResult DpasOp::verify() { // XeGPU_ConvertLayoutOp //===----------------------------------------------------------------------===// LogicalResult ConvertLayoutOp::verify() { - auto srcMap = getSrcMapAttr(); - 
auto resMap = getResMapAttr(); - if (!srcMap) - return emitOpError("expected srcMap."); - if (!resMap) - return emitOpError("expected resMap."); - - if (srcMap == resMap) - return emitOpError("expected different srcMap and resMap."); + auto srcLayout = getInputLayoutAttr(); + auto resLayout = getTargetLayoutAttr(); + if (!srcLayout) + return emitOpError("expected input layout."); + if (!resLayout) + return emitOpError("expected target layout."); // both srcMap and resMap should be WgLayout or SgLayout at the same time. - if ((!srcMap.isWgLayout() || !resMap.isWgLayout()) && - (!srcMap.isSgLayout() || !resMap.isSgLayout())) - return emitOpError( - "expected srcMap and resMap be WgLayout or SgLayout at the same time."); + if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) && + (!srcLayout.isSgLayout() || !resLayout.isSgLayout())) + return emitOpError("expected input layout and target layout be WgLayout or " + "SgLayout at the same time."); auto shape = getSource().getType().getShape(); - if (!XeGPUDialect::isEvenlyDistributable(shape, srcMap)) - return emitOpError("invalid srcMap, data cannot be evenly distributed."); + if (!XeGPUDialect::isEvenlyDistributable(shape, srcLayout)) + return emitOpError( + "invalid input layout, data cannot be evenly distributed."); - if (!XeGPUDialect::isEvenlyDistributable(shape, resMap)) - return emitOpError("invalid resMap, data cannot be evenly distributed."); + if (!XeGPUDialect::isEvenlyDistributable(shape, resLayout)) + return emitOpError( + "invalid target layout, data cannot be evenly distributed."); return mlir::success(); } +OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) { + llvm::dbgs() << "\nSource from adaptor: " << adaptor.getSource() << "\n"; + auto srcLayout = getInputLayoutAttr(); + auto resLayout = getTargetLayoutAttr(); + if (srcLayout == resLayout) + return adaptor.getSource(); + return {}; +} + +struct FoldConvertLayoutOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + 
LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, + PatternRewriter &rewriter) const override { + auto inputLayout = op.getInputLayoutAttr(); + auto targetLayout = op.getTargetLayoutAttr(); + if (inputLayout != targetLayout) + return failure(); + rewriter.replaceOp(op, op.getSource()); + return success(); + } +}; + +void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns, + MLIRContext *context) { + patterns.add(context); +} + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6b85a66a8bd36..aa1755e25996a 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -124,6 +124,10 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); + // For ConvertLayoutOp, the layout is stored in the tensor descriptor + if (auto convertOp = dyn_cast(defOp)) + return convertOp.getTargetLayoutAttr(); + // for LoadNdOp, the layout is stored in the tensor descriptor if (auto loadNd = dyn_cast(defOp)) return getLayoutAttr(loadNd.getTensorDesc()); diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index a2778cd94d963..65e1d22449bdd 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -511,19 +511,11 @@ func.func @tensor_desc_scatter_invalid_chunk_size_2D(%src: ui64, %offsets: vecto return } -// ----- -func.func @convert_layout_same_map(%a: vector<32x64xf16>) { - // expected-error@+1 {{expected different srcMap and resMap}} - %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, - resMap = #xegpu.layout} : vector<32x64xf16> - gpu.return -} - // ----- func.func @convert_layout_unmatch(%a: vector<32x64xf16>) { - // expected-error@+1 {{expected srcMap and resMap be WgLayout or SgLayout at the same time}} - %2 = xegpu.convert_layout %a 
{srcMap = #xegpu.layout, - resMap = #xegpu.layout} : vector<32x64xf16> + // expected-error@+1 {{expected input layout and target layout be WgLayout or SgLayout at the same time}} + %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout, + target_layout = #xegpu.layout}> : vector<32x64xf16> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/layout.mlir b/mlir/test/Dialect/XeGPU/layout.mlir index 7f3ebec225cdf..ef51dfbbfd574 100644 --- a/mlir/test/Dialect/XeGPU/layout.mlir +++ b/mlir/test/Dialect/XeGPU/layout.mlir @@ -35,14 +35,14 @@ gpu.func @create_nd_tdesc_wg_1(%src: memref<24x32xf32>) { } gpu.func @convert_layout(%a: vector<32x64xf16>) { - %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, - resMap = #xegpu.layout} : vector<32x64xf16> + %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout, + target_layout = #xegpu.layout}> : vector<32x64xf16> gpu.return } gpu.func @convert_layout_wg(%a: vector<32x64xf16>) { - %2 = xegpu.convert_layout %a {srcMap = #xegpu.layout, - resMap = #xegpu.layout} : vector<32x64xf16> + %2 = xegpu.convert_layout %a <{input_layout = #xegpu.layout, + target_layout = #xegpu.layout}> : vector<32x64xf16> gpu.return } From 9e89e7279a56816b54f5eb5ce1fc9ed3fcde0576 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 24 Jun 2025 21:16:38 +0000 Subject: [PATCH 2/9] add convert layout blocking pattern --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 12 ++++-------- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 11 +++++++++++ 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 10ce019d5a812..54b1e360d11f1 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -609,8 +609,8 @@ LogicalResult DpasOp::verify() { // XeGPU_ConvertLayoutOp //===----------------------------------------------------------------------===// LogicalResult ConvertLayoutOp::verify() { - auto srcLayout = getInputLayoutAttr(); - 
auto resLayout = getTargetLayoutAttr(); + auto srcLayout = getInputLayout(); + auto resLayout = getTargetLayout(); if (!srcLayout) return emitOpError("expected input layout."); if (!resLayout) @@ -636,9 +636,7 @@ LogicalResult ConvertLayoutOp::verify() { OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) { llvm::dbgs() << "\nSource from adaptor: " << adaptor.getSource() << "\n"; - auto srcLayout = getInputLayoutAttr(); - auto resLayout = getTargetLayoutAttr(); - if (srcLayout == resLayout) + if (getInputLayout() == getTargetLayout()) return adaptor.getSource(); return {}; } @@ -647,9 +645,7 @@ struct FoldConvertLayoutOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, PatternRewriter &rewriter) const override { - auto inputLayout = op.getInputLayoutAttr(); - auto targetLayout = op.getTargetLayoutAttr(); - if (inputLayout != targetLayout) + if (op.getInputLayout() != op.getTargetLayout()) return failure(); rewriter.replaceOp(op, op.getSource()); return success(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 3950e8f70d1ca..bf6d0b3164e16 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -78,6 +78,17 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { } } +struct ConvertLayoutOpPattern: public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, PatternRewriter &rewriter) const override { + xegpu::LayoutAttr input_layout = op.getInputLayoutAttr().dropInstData(); + xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr().dropInstData(); + auto newOp = rewriter.createOrFold(op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout); + rewriter.replaceOp(op, newOp); + return success(); + } +}; + 
//===------------------------------------------------------------------------===// // The XeGPUBlockingPass leverages the unroll patterns for XeGPU and Vector ops // to partition operations that process large shapes into multiple operations on From 149aeeaa3148f98d378177ccb64c8941a41d8dd4 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 25 Jun 2025 15:00:36 +0000 Subject: [PATCH 3/9] add WgToSg pattern for convert layout --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 3 +- .../XeGPU/Transforms/XeGPUBlocking.cpp | 1 + .../Transforms/XeGPUWgToSgDistribute.cpp | 34 +++++++++++++++++-- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 54b1e360d11f1..00fe251f48757 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -635,9 +635,8 @@ LogicalResult ConvertLayoutOp::verify() { } OpFoldResult ConvertLayoutOp::fold(FoldAdaptor adaptor) { - llvm::dbgs() << "\nSource from adaptor: " << adaptor.getSource() << "\n"; if (getInputLayout() == getTargetLayout()) - return adaptor.getSource(); + return getSource(); return {}; } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index bf6d0b3164e16..3472bceca40ce 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -346,6 +346,7 @@ void XeGPUBlockingPass::runOnOperation() { }); RewritePatternSet patterns(ctx); + patterns.add(ctx); vector::UnrollVectorOptions vectorOptions; vectorOptions.setNativeShapeFn(options.nativeShape); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index e3563d10bc6f1..fa45169021581 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -390,6 +390,31 @@ 
struct WgToSgElementwiseOp : public ConversionPattern { } }; +struct WgToSgConvertLayoutOp + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(xegpu::ConvertLayoutOp op, OneToNOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + xegpu::LayoutAttr input = op.getInputLayout(); + xegpu::LayoutAttr target = op.getTargetLayout(); + if (input.getSgLayout() == target.getSgLayout() && + input.getSgData() == target.getSgData()) { + input = input.dropSgLayoutAndData(); + target = target.dropSgLayoutAndData(); + SmallVector newOps; + for (auto src : adaptor.getSource()) { + auto newOp = rewriter.create( + op.getLoc(), src.getType(), src, input, target); + newOps.push_back(newOp); + } + rewriter.replaceOpWithMultiple(op, newOps); + return success(); + } + return failure(); + } +}; + // Handles UnrealizedConversionCastOp generated during // SCFStructuralTypeConversions (step 1). This op may appear as either a // target or source materialization for Vector values, e.g.: @@ -473,8 +498,8 @@ namespace xegpu { void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) { patterns.add( - patterns.getContext()); + UnrealizedConversionCastOpPattern, WgToSgElementwiseOp, + WgToSgConvertLayoutOp>(patterns.getContext()); } } // namespace xegpu } // namespace mlir @@ -581,6 +606,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() { return isLegal(layout); }); + target.addDynamicallyLegalOp( + [=](xegpu::ConvertLayoutOp op) -> bool { + return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout()); + }); + target.addDynamicallyLegalDialect( [=](Operation *op) -> std::optional { // Only handle elementwise mappable ops From aee53c4cff7abc4665598c8ee9689456cc373889 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 27 Jun 2025 23:26:05 +0000 Subject: [PATCH 4/9] improve ConvertLayoutOpPattern --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 14 +- 
.../Transforms/XeGPUWgToSgDistribute.cpp | 206 ++++++++++++++---- 2 files changed, 168 insertions(+), 52 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 7ef61de190b4c..6249d0484c215 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -313,13 +313,13 @@ LogicalResult TensorDescType::verify( if (rank != 1 && rank != 2) return emitError() << "expected 1D or 2D tensor"; - auto blockAttr = mlir::dyn_cast_if_present(encoding); - if (blockAttr) { - MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace(); - if (rank == 2 && memorySpaceAttr && - memorySpaceAttr.getValue() == MemorySpace::SLM) - return emitError() << "SLM is not supported for 2D block tensor"; - } + // auto blockAttr = mlir::dyn_cast_if_present(encoding); + // if (blockAttr) { + // MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace(); + // if (rank == 2 && memorySpaceAttr && + // memorySpaceAttr.getValue() == MemorySpace::SLM) + // return emitError() << "SLM is not supported for 2D block tensor"; + // } // for gather and scatter ops, Low-precision types are packed in 32-bit units. 
unsigned bitWidth = elementType.getIntOrFloatBitWidth(); diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index fa45169021581..d542fb219a7c7 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -57,6 +57,39 @@ getSgShapeAndCount(ArrayRef shape, xegpu::LayoutAttr layout) { return std::make_pair(sgShape, count); } +// Calculate offset for each subgroup +static SmallVector +calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc, + const SmallVector &originalOffsets, + const SmallVector &localOffset, + const SmallVector &distUnitBaseAddr, + const SmallVector &distUnitShape) { + assert(localOffset.size() == distUnitBaseAddr.size() && + "localOffset and distUnitBaseAddr must have the same rank"); + + SmallVector globalOffsets(originalOffsets.begin(), + originalOffsets.end()); + size_t rank = localOffset.size(); + for (size_t i = 0; i < rank; ++i) { + size_t dimIdx = originalOffsets.size() - rank + i; + Value constOffset = + rewriter.create(loc, distUnitBaseAddr[i]); + Value offset = + rewriter.createOrFold(loc, localOffset[i], constOffset); + Value modValue = + rewriter.create(loc, distUnitShape[i]); + Value offsetMod = + rewriter.createOrFold(loc, offset, modValue); + Value origOffset = + getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]); + Value globalOffset = + rewriter.createOrFold(loc, origOffset, offsetMod); + globalOffsets[dimIdx] = globalOffset; + } + + return globalOffsets; +} + /// This pattern transforms the CreateNdDescOp to create a subgroup descriptor /// from a workgroup descriptor. It replaces the offsets and sizes with /// appropriate values for the subgroup. 
@@ -105,39 +138,6 @@ getSgShapeAndCount(ArrayRef shape, xegpu::LayoutAttr layout) { struct WgToSgCreateNdOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; - // Calculate offset for each subgroup - SmallVector - calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc, - const SmallVector &originalOffsets, - const SmallVector &localOffset, - const SmallVector &distUnitBaseAddr, - const SmallVector &distUnitShape) const { - assert(localOffset.size() == distUnitBaseAddr.size() && - "localOffset and distUnitBaseAddr must have the same rank"); - - SmallVector globalOffsets(originalOffsets.begin(), - originalOffsets.end()); - size_t rank = localOffset.size(); - for (size_t i = 0; i < rank; ++i) { - size_t dimIdx = originalOffsets.size() - rank + i; - Value constOffset = - rewriter.create(loc, distUnitBaseAddr[i]); - Value offset = - rewriter.createOrFold(loc, localOffset[i], constOffset); - Value modValue = - rewriter.create(loc, distUnitShape[i]); - Value offsetMod = - rewriter.createOrFold(loc, offset, modValue); - Value origOffset = getValueOrCreateConstantIndexOp( - rewriter, loc, originalOffsets[dimIdx]); - Value globalOffset = - rewriter.createOrFold(loc, origOffset, offsetMod); - globalOffsets[dimIdx] = globalOffset; - } - - return globalOffsets; - } - LogicalResult matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { @@ -390,6 +390,21 @@ struct WgToSgElementwiseOp : public ConversionPattern { } }; +// based on the size of the given vector type +static TypedValue +allocateSLMBuffer(ConversionPatternRewriter &rewriter, Location loc, + VectorType type) { + int64_t bits = type.getElementType().getIntOrFloatBitWidth(); + int64_t slmSizeInBytes = type.getNumElements() * bits / 8; + auto slmTy = MemRefType::get(slmSizeInBytes, rewriter.getI8Type(), {}, 3); + auto slm = rewriter.create(loc, slmTy); + auto viewTy = MemRefType::get(type.getShape(), 
type.getElementType(), {}, 3); + auto view = rewriter.create( + loc, viewTy, slm, rewriter.create(loc, 0), + ValueRange()); + return view; +} + struct WgToSgConvertLayoutOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -398,20 +413,121 @@ struct WgToSgConvertLayoutOp ConversionPatternRewriter &rewriter) const override { xegpu::LayoutAttr input = op.getInputLayout(); xegpu::LayoutAttr target = op.getTargetLayout(); - if (input.getSgLayout() == target.getSgLayout() && - input.getSgData() == target.getSgData()) { - input = input.dropSgLayoutAndData(); - target = target.dropSgLayoutAndData(); - SmallVector newOps; - for (auto src : adaptor.getSource()) { - auto newOp = rewriter.create( - op.getLoc(), src.getType(), src, input, target); - newOps.push_back(newOp); + + if (!input || !target || !input.isWgLayout() || !target.isWgLayout()) + return rewriter.notifyMatchFailure( + op, "Input and target layouts must have subgroup layout"); + + // initialize values with the source values + SmallVector values(adaptor.getSource()); + + Location loc = op.getLoc(); + MLIRContext *ctx = op.getContext(); + VectorType type = op.getResult().getType(); + ArrayRef shape = type.getShape(); + + DenseI32ArrayAttr inputSgLayout = input.getSgLayout(); + DenseI32ArrayAttr inputSgData = input.getSgData(); + DenseI32ArrayAttr targetSgLayout = target.getSgLayout(); + DenseI32ArrayAttr targetSgData = target.getSgData(); + + // we only need SLM support when input and target layouts are different + if (inputSgLayout != targetSgLayout || inputSgData != targetSgData) { + values.clear(); + rewriter.setInsertionPoint(op); + TypedValue slmBuffer = allocateSLMBuffer(rewriter, loc, type); + + auto linearSgId = rewriter.create( + loc, rewriter.getIndexType(), nullptr); + + { // store to slm buffer + SmallVector sgLayout = + llvm::to_vector_of(input.getSgLayout().asArrayRef()); + SmallVector sgShape = getSgShapeAndCount(shape, input).first; + auto delinearized = 
affine::delinearizeIndex( + rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout)); + if (failed(delinearized)) + return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId"); + SmallVector sgIds = *delinearized; + + SmallVector distUnitShape(sgLayout.size()); + SmallVector localOffset(sgLayout.size()); + for (size_t i = 0; i < sgLayout.size(); i++) { + distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]); + localOffset[i] = rewriter.createOrFold( + loc, sgIds[i], + rewriter.create(loc, sgShape[i])); + } + + auto tdescTy = xegpu::TensorDescType::get( + sgShape, type.getElementType(), 1, false, xegpu::MemorySpace::SLM, + input.dropSgLayoutAndData()); + + SmallVector zeros = getAsIndexOpFoldResult( + ctx, SmallVector(sgLayout.size(), 0)); + for (auto [data, baseOffsets] : + llvm::zip_equal(adaptor.getSource(), + StaticTileOffsetRange(shape, distUnitShape))) { + SmallVector offsets = calculateGlobalOffsets( + rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape); + auto tdesc = rewriter.create( + loc, tdescTy, slmBuffer, offsets); + rewriter.create(loc, data, tdesc, nullptr, nullptr, + nullptr); + } + } + + rewriter.create(loc); + + { // load from SLM + SmallVector sgLayout = + llvm::to_vector_of(target.getSgLayout().asArrayRef()); + SmallVector sgShape = getSgShapeAndCount(shape, target).first; + auto delinearized = affine::delinearizeIndex( + rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout)); + if (failed(delinearized)) + return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId"); + SmallVector sgIds = *delinearized; + + SmallVector distUnitShape(sgLayout.size()); + SmallVector localOffset(sgLayout.size()); + for (size_t i = 0; i < sgLayout.size(); i++) { + distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]); + localOffset[i] = rewriter.createOrFold( + loc, sgIds[i], + rewriter.create(loc, sgShape[i])); + } + + auto tdescTy = xegpu::TensorDescType::get( + sgShape, type.getElementType(), 1, 
false, xegpu::MemorySpace::SLM, + target.dropSgLayoutAndData()); + auto valueTy = VectorType::get(sgShape, type.getElementType()); + + SmallVector zeros = getAsIndexOpFoldResult( + ctx, SmallVector(sgLayout.size(), 0)); + for (auto baseOffsets : StaticTileOffsetRange(shape, distUnitShape)) { + SmallVector offsets = calculateGlobalOffsets( + rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape); + auto tdesc = rewriter.create( + loc, tdescTy, slmBuffer, offsets); + auto newOp = rewriter.create( + loc, TypeRange({valueTy}), ValueRange({tdesc})); + values.push_back(newOp); + } } - rewriter.replaceOpWithMultiple(op, newOps); - return success(); } - return failure(); + + input = input.dropSgLayoutAndData(); + target = target.dropSgLayoutAndData(); + + SmallVector newOps; + for (auto src : values) { + auto newOp = rewriter.create( + op.getLoc(), src.getType(), src, input, target); + newOps.push_back(newOp); + } + rewriter.replaceOpWithMultiple(op, newOps); + return success(); } }; From c416cec159b701fbd405b049be1330f6ee24afc7 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Mon, 30 Jun 2025 15:07:33 +0000 Subject: [PATCH 5/9] code format --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 3472bceca40ce..06e0c6105df58 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -78,12 +78,15 @@ resolveUnrealizedConversionCastOp(UnrealizedConversionCastOp castOp) { } } -struct ConvertLayoutOpPattern: public OpRewritePattern { +struct ConvertLayoutOpPattern + : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, PatternRewriter &rewriter) const override { + LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, + PatternRewriter &rewriter) 
const override { xegpu::LayoutAttr input_layout = op.getInputLayoutAttr().dropInstData(); xegpu::LayoutAttr target_layout = op.getTargetLayoutAttr().dropInstData(); - auto newOp = rewriter.createOrFold(op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout); + auto newOp = rewriter.createOrFold( + op.getLoc(), op.getType(), op.getSource(), input_layout, target_layout); rewriter.replaceOp(op, newOp); return success(); } From 65b5dbd5745c662f64212c7803af45fe9605f5ca Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 9 Jul 2025 22:11:37 +0000 Subject: [PATCH 6/9] refactor ConvertLayoutPattern for wg to sg. --- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 14 +- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 11 +- .../Transforms/XeGPUWgToSgDistribute.cpp | 193 +++++------------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 2 +- .../test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir | 10 + 5 files changed, 70 insertions(+), 160 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 6249d0484c215..7ef61de190b4c 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -313,13 +313,13 @@ LogicalResult TensorDescType::verify( if (rank != 1 && rank != 2) return emitError() << "expected 1D or 2D tensor"; - // auto blockAttr = mlir::dyn_cast_if_present(encoding); - // if (blockAttr) { - // MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace(); - // if (rank == 2 && memorySpaceAttr && - // memorySpaceAttr.getValue() == MemorySpace::SLM) - // return emitError() << "SLM is not supported for 2D block tensor"; - // } + auto blockAttr = mlir::dyn_cast_if_present(encoding); + if (blockAttr) { + MemorySpaceAttr memorySpaceAttr = blockAttr.getMemorySpace(); + if (rank == 2 && memorySpaceAttr && + memorySpaceAttr.getValue() == MemorySpace::SLM) + return emitError() << "SLM is not supported for 2D block tensor"; + } // for gather and scatter ops, Low-precision 
types are packed in 32-bit units. unsigned bitWidth = elementType.getIntOrFloatBitWidth(); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 00fe251f48757..03e3aed3fa8ce 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -616,7 +616,7 @@ LogicalResult ConvertLayoutOp::verify() { if (!resLayout) return emitOpError("expected target layout."); - // both srcMap and resMap should be WgLayout or SgLayout at the same time. + // both input and target layouts should be WgLayout or SgLayout at the same time. if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) && (!srcLayout.isSgLayout() || !resLayout.isSgLayout())) return emitOpError("expected input layout and target layout be WgLayout or " @@ -644,10 +644,11 @@ struct FoldConvertLayoutOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(xegpu::ConvertLayoutOp op, PatternRewriter &rewriter) const override { - if (op.getInputLayout() != op.getTargetLayout()) - return failure(); - rewriter.replaceOp(op, op.getSource()); - return success(); + if (op.getInputLayout() == op.getTargetLayout()) { + rewriter.replaceOp(op, op.getSource()); + return success(); + } + return failure(); } }; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index d542fb219a7c7..ed393309b3309 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -57,39 +57,6 @@ getSgShapeAndCount(ArrayRef shape, xegpu::LayoutAttr layout) { return std::make_pair(sgShape, count); } -// Calculate offset for each subgroup -static SmallVector -calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc, - const SmallVector &originalOffsets, - const SmallVector &localOffset, - const SmallVector &distUnitBaseAddr, - const SmallVector 
&distUnitShape) { - assert(localOffset.size() == distUnitBaseAddr.size() && - "localOffset and distUnitBaseAddr must have the same rank"); - - SmallVector globalOffsets(originalOffsets.begin(), - originalOffsets.end()); - size_t rank = localOffset.size(); - for (size_t i = 0; i < rank; ++i) { - size_t dimIdx = originalOffsets.size() - rank + i; - Value constOffset = - rewriter.create(loc, distUnitBaseAddr[i]); - Value offset = - rewriter.createOrFold(loc, localOffset[i], constOffset); - Value modValue = - rewriter.create(loc, distUnitShape[i]); - Value offsetMod = - rewriter.createOrFold(loc, offset, modValue); - Value origOffset = - getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]); - Value globalOffset = - rewriter.createOrFold(loc, origOffset, offsetMod); - globalOffsets[dimIdx] = globalOffset; - } - - return globalOffsets; -} - /// This pattern transforms the CreateNdDescOp to create a subgroup descriptor /// from a workgroup descriptor. It replaces the offsets and sizes with /// appropriate values for the subgroup. 
@@ -138,6 +105,39 @@ calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc, struct WgToSgCreateNdOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; + // Calculate offset for each subgroup + static SmallVector + calculateGlobalOffsets(ConversionPatternRewriter &rewriter, Location loc, + const SmallVector &originalOffsets, + const SmallVector &localOffset, + const SmallVector &distUnitBaseAddr, + const SmallVector &distUnitShape) { + assert(localOffset.size() == distUnitBaseAddr.size() && + "localOffset and distUnitBaseAddr must have the same rank"); + + SmallVector globalOffsets(originalOffsets.begin(), + originalOffsets.end()); + size_t rank = localOffset.size(); + for (size_t i = 0; i < rank; ++i) { + size_t dimIdx = originalOffsets.size() - rank + i; + Value constOffset = + rewriter.create(loc, distUnitBaseAddr[i]); + Value offset = + rewriter.createOrFold(loc, localOffset[i], constOffset); + Value modValue = + rewriter.create(loc, distUnitShape[i]); + Value offsetMod = + rewriter.createOrFold(loc, offset, modValue); + Value origOffset = + getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]); + Value globalOffset = + rewriter.createOrFold(loc, origOffset, offsetMod); + globalOffsets[dimIdx] = globalOffset; + } + + return globalOffsets; + } + LogicalResult matchAndRewrite(xegpu::CreateNdDescOp op, OneToNOpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { @@ -390,21 +390,6 @@ struct WgToSgElementwiseOp : public ConversionPattern { } }; -// based on the size of the given vector type -static TypedValue -allocateSLMBuffer(ConversionPatternRewriter &rewriter, Location loc, - VectorType type) { - int64_t bits = type.getElementType().getIntOrFloatBitWidth(); - int64_t slmSizeInBytes = type.getNumElements() * bits / 8; - auto slmTy = MemRefType::get(slmSizeInBytes, rewriter.getI8Type(), {}, 3); - auto slm = rewriter.create(loc, slmTy); - auto viewTy = MemRefType::get(type.getShape(), 
type.getElementType(), {}, 3); - auto view = rewriter.create( - loc, viewTy, slm, rewriter.create(loc, 0), - ValueRange()); - return view; -} - struct WgToSgConvertLayoutOp : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -418,115 +403,29 @@ struct WgToSgConvertLayoutOp return rewriter.notifyMatchFailure( op, "Input and target layouts must have subgroup layout"); - // initialize values with the source values - SmallVector values(adaptor.getSource()); - - Location loc = op.getLoc(); - MLIRContext *ctx = op.getContext(); - VectorType type = op.getResult().getType(); - ArrayRef shape = type.getShape(); - DenseI32ArrayAttr inputSgLayout = input.getSgLayout(); DenseI32ArrayAttr inputSgData = input.getSgData(); DenseI32ArrayAttr targetSgLayout = target.getSgLayout(); DenseI32ArrayAttr targetSgData = target.getSgData(); - // we only need SLM support when input and target layouts are different - if (inputSgLayout != targetSgLayout || inputSgData != targetSgData) { - values.clear(); - rewriter.setInsertionPoint(op); - TypedValue slmBuffer = allocateSLMBuffer(rewriter, loc, type); - - auto linearSgId = rewriter.create( - loc, rewriter.getIndexType(), nullptr); - - { // store to slm buffer - SmallVector sgLayout = - llvm::to_vector_of(input.getSgLayout().asArrayRef()); - SmallVector sgShape = getSgShapeAndCount(shape, input).first; - auto delinearized = affine::delinearizeIndex( - rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout)); - if (failed(delinearized)) - return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId"); - SmallVector sgIds = *delinearized; - - SmallVector distUnitShape(sgLayout.size()); - SmallVector localOffset(sgLayout.size()); - for (size_t i = 0; i < sgLayout.size(); i++) { - distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]); - localOffset[i] = rewriter.createOrFold( - loc, sgIds[i], - rewriter.create(loc, sgShape[i])); - } - - auto tdescTy = xegpu::TensorDescType::get( - sgShape, 
type.getElementType(), 1, false, xegpu::MemorySpace::SLM, - input.dropSgLayoutAndData()); - - SmallVector zeros = getAsIndexOpFoldResult( - ctx, SmallVector(sgLayout.size(), 0)); - for (auto [data, baseOffsets] : - llvm::zip_equal(adaptor.getSource(), - StaticTileOffsetRange(shape, distUnitShape))) { - SmallVector offsets = calculateGlobalOffsets( - rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape); - auto tdesc = rewriter.create( - loc, tdescTy, slmBuffer, offsets); - rewriter.create(loc, data, tdesc, nullptr, nullptr, - nullptr); - } - } - - rewriter.create(loc); - - { // load from SLM - SmallVector sgLayout = - llvm::to_vector_of(target.getSgLayout().asArrayRef()); - SmallVector sgShape = getSgShapeAndCount(shape, target).first; - auto delinearized = affine::delinearizeIndex( - rewriter, loc, linearSgId, getAsIndexOpFoldResult(ctx, sgLayout)); - if (failed(delinearized)) - return rewriter.notifyMatchFailure(op, "Failed to delinearize sgId"); - SmallVector sgIds = *delinearized; - - SmallVector distUnitShape(sgLayout.size()); - SmallVector localOffset(sgLayout.size()); - for (size_t i = 0; i < sgLayout.size(); i++) { - distUnitShape[i] = std::min(sgLayout[i] * sgShape[i], shape[i]); - localOffset[i] = rewriter.createOrFold( - loc, sgIds[i], - rewriter.create(loc, sgShape[i])); - } - - auto tdescTy = xegpu::TensorDescType::get( - sgShape, type.getElementType(), 1, false, xegpu::MemorySpace::SLM, - target.dropSgLayoutAndData()); - auto valueTy = VectorType::get(sgShape, type.getElementType()); - - SmallVector zeros = getAsIndexOpFoldResult( - ctx, SmallVector(sgLayout.size(), 0)); - for (auto baseOffsets : StaticTileOffsetRange(shape, distUnitShape)) { - SmallVector offsets = calculateGlobalOffsets( - rewriter, loc, zeros, localOffset, baseOffsets, distUnitShape); - auto tdesc = rewriter.create( - loc, tdescTy, slmBuffer, offsets); - auto newOp = rewriter.create( - loc, TypeRange({valueTy}), ValueRange({tdesc})); - values.push_back(newOp); - } - } - } 
+ // TODO: currently we only support the optimal case, where input and + // output have the same sg_layout and sg_data, so SLM is not involved. if (inputSgLayout != targetSgLayout || inputSgData != targetSgData) return failure(); input = input.dropSgLayoutAndData(); target = target.dropSgLayoutAndData(); - SmallVector newOps; - for (auto src : values) { - auto newOp = rewriter.create( - op.getLoc(), src.getType(), src, input, target); - newOps.push_back(newOp); + SmallVector newOps(adaptor.getSource()); + + if (input && target) { + for (auto [i, src] : llvm::enumerate(adaptor.getSource())) { + auto newOp = rewriter.create( + op.getLoc(), src.getType(), src, input, target); + newOps[i] = newOp; + } } - rewriter.replaceOpWithMultiple(op, newOps); + rewriter.replaceOpWithMultiple(op, {newOps}); return success(); } }; diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index aa1755e25996a..d5ae3c20e222e 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -124,7 +124,7 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { Operation *defOp = result.getDefiningOp(); assert(defOp && "result must have a defining op"); - // For ConvertLayoutOp, the layout is stored in the tensor descriptor + // For ConvertLayoutOp, the layout is stored in the targetLayoutAttr if (auto convertOp = dyn_cast(defOp)) return convertOp.getTargetLayoutAttr(); diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir index c6124f90e0f48..6c688f4db6dec 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-rr.mlir @@ -198,4 +198,14 @@ gpu.module @test_round_robin_assignment { gpu.return } + gpu.func @convert_layout_optimal(%arg0: memref<32x64xf32>) { + %0 = xegpu.create_nd_tdesc %arg0[0, 0] : memref<32x64xf32> -> !xegpu.tensor_desc<32x64xf32, #xegpu.layout> + //CHECK-2: 
xegpu.load_nd {{.*}} : !xegpu.tensor_desc<16x16xf32, #xegpu.layout> -> vector<16x16xf32> + //CHECK-2: xegpu.convert_layout {{.*}} <{input_layout = #xegpu.layout, target_layout = #xegpu.layout}> : vector<16x16xf32> + %1 = xegpu.load_nd %0 : !xegpu.tensor_desc<32x64xf32, #xegpu.layout> -> vector<32x64xf32> + %2 = xegpu.convert_layout %1 <{input_layout = #xegpu.layout, + target_layout = #xegpu.layout}> : vector<32x64xf32> + gpu.return + } + } From ec4e7ad1ec94efb0fab31daf412cddfd9e439753 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 9 Jul 2025 22:16:02 +0000 Subject: [PATCH 7/9] fix format issue --- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 3 ++- mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 03e3aed3fa8ce..97415cc74f928 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -616,7 +616,8 @@ LogicalResult ConvertLayoutOp::verify() { if (!resLayout) return emitOpError("expected target layout."); - // both input and target layouts should be WgLayout or SgLayout at the same time. + // both input and target layouts should be WgLayout or SgLayout at the same + // time. 
if ((!srcLayout.isWgLayout() || !resLayout.isWgLayout()) && (!srcLayout.isSgLayout() || !resLayout.isSgLayout())) return emitOpError("expected input layout and target layout be WgLayout or " diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp index ed393309b3309..89dcddec752a1 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp @@ -128,8 +128,8 @@ struct WgToSgCreateNdOp : public OpConversionPattern { rewriter.create(loc, distUnitShape[i]); Value offsetMod = rewriter.createOrFold(loc, offset, modValue); - Value origOffset = - getValueOrCreateConstantIndexOp(rewriter, loc, originalOffsets[dimIdx]); + Value origOffset = getValueOrCreateConstantIndexOp( + rewriter, loc, originalOffsets[dimIdx]); Value globalOffset = rewriter.createOrFold(loc, origOffset, offsetMod); globalOffsets[dimIdx] = globalOffset; From b9c02fcc90fa402a4917741a0b5620b8fd305e6a Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 11 Jul 2025 16:50:55 +0000 Subject: [PATCH 8/9] fix a bug --- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index d5ae3c20e222e..bdc80d78f202a 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -141,7 +141,8 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { auto parentOp = arg.getOwner()->getParentOp(); if (auto loop = dyn_cast(parentOp)) { OpOperand *tiedInit = loop.getTiedLoopInit(arg); - return getLayoutAttr(tiedInit->get()); + if (tiedInit) + return getLayoutAttr(tiedInit->get()); } } @@ -178,11 +179,15 @@ void xegpu::setLayoutAttrs(Operation *op, function_ref getLayoutImpl) { op->walk([&](Operation *nestOp) { for (OpOperand &opr : nestOp->getOpOperands()) { + llvm::dbgs() 
<< "set layout for: " << opr.get(); auto layout = getLayoutImpl(opr.get()); + llvm::dbgs() << " with: " << layout << "\n"; setLayoutAttr(opr, layout); } for (OpResult result : nestOp->getOpResults()) { + llvm::dbgs() << "set layout for: " << result; auto layout = getLayoutImpl(result); + llvm::dbgs() << " with: " << layout << "\n"; setLayoutAttr(result, layout); } }); From d8035af2f8f548452030a171c65d1926c9d59ae7 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Fri, 11 Jul 2025 16:53:07 +0000 Subject: [PATCH 9/9] clean up --- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index bdc80d78f202a..8922c07f24067 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -179,15 +179,11 @@ void xegpu::setLayoutAttrs(Operation *op, function_ref getLayoutImpl) { op->walk([&](Operation *nestOp) { for (OpOperand &opr : nestOp->getOpOperands()) { - llvm::dbgs() << "set layout for: " << opr.get(); auto layout = getLayoutImpl(opr.get()); - llvm::dbgs() << " with: " << layout << "\n"; setLayoutAttr(opr, layout); } for (OpResult result : nestOp->getOpResults()) { - llvm::dbgs() << "set layout for: " << result; auto layout = getLayoutImpl(result); - llvm::dbgs() << " with: " << layout << "\n"; setLayoutAttr(result, layout); } });