From 2dbb60534c2966f7616ddbd72d600f1cf9ba70aa Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Sun, 9 Mar 2025 17:35:14 +0800 Subject: [PATCH 1/3] update async wait op. --- .../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 5 ++--- .../NVGPU/Transforms/CreateAsyncGroups.cpp | 7 ++----- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 6 +++--- .../Dialect/NVGPU/optimize-shared-memory.mlir | 14 +++++++------- mlir/test/Dialect/NVGPU/roundtrip.mlir | 4 ++-- .../NVGPU/transform-create-async-groups.mlir | 6 +++--- .../NVGPU/transform-pipeline-shared.mlir | 19 +++++++------------ 7 files changed, 26 insertions(+), 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td index eb0fb90d271ed..8d8eddf18efc2 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td @@ -294,10 +294,9 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> { nvgpu.device_async_wait %0 ``` }]; - let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies, - OptionalAttr<I32Attr>:$numGroups); + let arguments = (ins OptionalAttr<I32Attr>:$numGroups); let assemblyFormat = [{ - $asyncDependencies attr-dict + attr-dict }]; } diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp index 10bc1993ffd96..ed37af096751f 100644 --- a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp +++ b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp @@ -265,11 +265,8 @@ void nvgpu::createAsyncGroups(RewriterBase &rewriter, Operation *op, } // Create the group and wait for it right after. - Value groupToken = rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>( - op->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()), - tokens); - rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), groupToken, - nullptr); + rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(op->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()), tokens); + rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), nullptr); // Clean up old stores.
for (Operation *writeOp : group) rewriter.eraseOp(writeOp); diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 6b59b5e4343b4..7cbf39cb97dc8 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -243,7 +243,7 @@ func.func @async_cp( // CHECK: nvvm.cp.async.commit.group %1 = nvgpu.device_async_create_group %0 // CHECK: nvvm.cp.async.wait.group 1 - nvgpu.device_async_wait %1 { numGroups = 1 : i32 } + nvgpu.device_async_wait { numGroups = 1 : i32 } // CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg %2 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3> @@ -301,7 +301,7 @@ func.func @async_cp_zfill_f32_align4( // CHECK: nvvm.cp.async.commit.group %1 = nvgpu.device_async_create_group %0 // CHECK: nvvm.cp.async.wait.group 1 - nvgpu.device_async_wait %1 { numGroups = 1 : i32 } + nvgpu.device_async_wait { numGroups = 1 : i32 } return } @@ -336,7 +336,7 @@ func.func @async_cp_zfill_f32_align1( // CHECK: nvvm.cp.async.commit.group %1 = nvgpu.device_async_create_group %0 // CHECK: nvvm.cp.async.wait.group 1 - nvgpu.device_async_wait %1 { numGroups = 1 : i32 } + nvgpu.device_async_wait { numGroups = 1 : i32 } return } diff --git a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir index 7477e18728677..144e422f6c2b3 100644 --- a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir +++ b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir @@ -20,7 +20,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>, %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8 : memref<128x128xf16> to memref<128x32xf16, 3> %1 = nvgpu.device_async_create_group %0 - nvgpu.device_async_wait %1 { numGroups = 1 : i32} + nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c6:%.+]] = arith.constant 6 : index // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] @@ -40,7 +40,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>, %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8 : memref<128x128xf16> to memref<32x128xf16, 3> %3 = nvgpu.device_async_create_group %0 - nvgpu.device_async_wait %1 { numGroups = 1 : i32} + nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c15:%.+]] = arith.constant 15 : index // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]] @@ -77,7 +77,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>, %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4 : memref<128x128xf32> to memref<64x16xf32, 3> %1 = nvgpu.device_async_create_group %0 - nvgpu.device_async_wait %1 { numGroups = 1 : i32} + nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c6:%.+]] = arith.constant 6 : index // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] @@ -133,7 +133,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>, %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4 : memref<128x128xf32> to memref<16x64xf32, 3> %3 = nvgpu.device_async_create_group %0 - nvgpu.device_async_wait %1 { numGroups = 1 : i32} + nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c15:%.+]] = arith.constant 15 : index // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c15]] @@ -178,7 +178,7 @@ func.func @small_column_size_f64(%arg0: memref<32x32xf64>, %0 
= nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2 : memref<32x32xf64> to memref<32x4xf64, 3> %1 = nvgpu.device_async_create_group %0 - nvgpu.device_async_wait %1 { numGroups = 1 : i32} + nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c6:%.+]] = arith.constant 4 : index // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]] @@ -204,7 +204,7 @@ func.func @too_small_column_size_f16(%arg0: memref<128x128xf16>, %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8 : memref<128x128xf16> to memref<128x8xf16, 3> %1 = nvgpu.device_async_create_group %0 - nvgpu.device_async_wait %1 { numGroups = 1 : i32} + nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragCol]]] %mat = nvgpu.ldmatrix %shm[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false} @@ -230,7 +230,7 @@ func.func @abort_if_subview(%arg0: memref<128x128xf16>, %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8 : memref<128x128xf16> to memref<128x32xf16, 3> %1 = nvgpu.device_async_create_group %0 - nvgpu.device_async_wait %1 { numGroups = 1 : i32} + nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: nvgpu.ldmatrix [[shmView]][[[fragRow]], [[fragCol]]] %mat = nvgpu.ldmatrix %shmView[%fragRow, %fragCol] {numTiles = 1 : i32, transpose = false} diff --git a/mlir/test/Dialect/NVGPU/roundtrip.mlir b/mlir/test/Dialect/NVGPU/roundtrip.mlir index ad516b4d2c200..71f8f52bcbc64 100644 --- a/mlir/test/Dialect/NVGPU/roundtrip.mlir +++ b/mlir/test/Dialect/NVGPU/roundtrip.mlir @@ -65,7 +65,7 @@ func.func @async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){ %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3> // CHECK: %{{.*}} = nvgpu.device_async_create_group %token = nvgpu.device_async_create_group %0 - // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 : i32} - nvgpu.device_async_wait %token {numGroups = 1 : i32} + // CHECK: nvgpu.device_async_wait {numGroups = 1 : i32} + nvgpu.device_async_wait {numGroups = 1 : i32} return } diff --git a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir index 8290001c45856..f325dac6f7303 100644 --- a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir +++ b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir @@ -18,7 +18,7 @@ builtin.module { %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32> vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>> // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]] - // CHECK: nvgpu.device_async_wait %[[G]] + // CHECK: nvgpu.device_async_wait return } @@ -52,7 +52,7 @@ builtin.module { %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32> vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>> // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]] - // CHECK: nvgpu.device_async_wait %[[G]] + // CHECK: nvgpu.device_async_wait return } @@ -84,7 +84,7 @@ builtin.module { %2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32> vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32> // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]] - // CHECK:
nvgpu.device_async_wait %[[G]] + // CHECK: nvgpu.device_async_wait return } diff --git a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir index e959949babd9e..e93a6a40391bb 100644 --- a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir +++ b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir @@ -94,13 +94,11 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) %c0f = arith.constant 0.0 : f32 // CHECK: %[[TOKEN0:.+]] = nvgpu.device_async_copy // CHECK: %[[TOKEN1:.+]] = nvgpu.device_async_copy - // CHECK: scf.for %[[I:.+]] = {{.*}} iter_args - // CHECK-SAME: %[[ITER_ARG0:.+]] = %[[TOKEN0]] - // CHECK-SAME: %[[ITER_ARG1:.+]] = %[[TOKEN1]] + // CHECK: scf.for %[[I:.+]] = {{.*}} scf.for %i = %c0 to %c98 step %c4 { // Condition for the predication "select" below. // CHECK: %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]] - // CHECK: nvgpu.device_async_wait %[[ITER_ARG0]] {numGroups = 1 + // CHECK: nvgpu.device_async_wait {numGroups = 1 // Original "select" with updated induction variable. // CHECK: %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]] // CHECK: %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]] @@ -122,9 +120,7 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>> - nvgpu.device_async_wait %token - - // CHECK: scf.yield %[[ITER_ARG1]], %[[ASYNC_TOKEN]] + nvgpu.device_async_wait } // There is no need to wait for the last copies as it was fully predicated // out and doesn't load the original data. @@ -156,12 +152,11 @@ func.func @async_depth_2_peeled(%global: memref<?xf32>) { // CHECK: nvgpu.device_async_copy // CHECK: nvgpu.device_async_copy // CHECK: scf.for - // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 + // CHECK: nvgpu.device_async_wait {numGroups = 1 // CHECK: arith.select // CHECK: nvgpu.device_async_copy - // CHECK: scf.yield - // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 1 - // CHECK: nvgpu.device_async_wait %{{.*}} {numGroups = 0 + // CHECK: nvgpu.device_async_wait {numGroups = 1 + // CHECK: nvgpu.device_async_wait {numGroups = 0 scf.for %i = %c0 to %c98 step %c4 { %c96 = arith.constant 96 : index %cond = arith.cmpi slt, %i, %c96 : index %c2 = arith.constant 2 : index %read_size = arith.select %cond, %c4, %c2 : index %token = nvgpu.device_async_copy %global[%i], %shared[%i], 4, %read_size : memref<?xf32> to memref<?xf32, #gpu.address_space<workgroup>> - nvgpu.device_async_wait %token + nvgpu.device_async_wait } return } From 2001dc0c4283baacadaaf5e93132a0f7b527c36f Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Sun, 9 Mar 2025 18:52:08 +0800 Subject: [PATCH 2/3] update create_group op and update doc.
--- .../include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 32 +++++++++++-------- .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 7 +--- .../NVGPU/Transforms/CreateAsyncGroups.cpp | 2 +- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 6 ++-- .../Dialect/NVGPU/optimize-shared-memory.mlir | 14 ++++---- mlir/test/Dialect/NVGPU/roundtrip.mlir | 4 +-- .../NVGPU/transform-create-async-groups.mlir | 6 ++-- 7 files changed, 36 insertions(+), 35 deletions(-) diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td index 8d8eddf18efc2..03a9485e26bc7 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td @@ -216,15 +216,13 @@ def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy", [ // copy 2. %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3> // group 1 contains copy 1 and copy 2. - %token1 = nvgpu.device_async_create_group %cp1, %cp2 + nvgpu.device_async_create_group %cp1, %cp2 // copy 3. %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3> // group 2 contains copy 3. - %token2 = nvgpu.device_async_create_group %cp3 - // after the wait copy 1 and copy 2 are complete. - nvgpu.device_async_wait %token1 - // after the wait copy 3 is complete. - nvgpu.device_async_wait %token2 + nvgpu.device_async_create_group %cp3 + // after the wait copy 1, copy 2 and copy 3 are complete. + nvgpu.device_async_wait ``` Example: @@ -255,9 +253,7 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> { The `nvgpu.device_async_create_group` op creates a group of memory accesses containing all the pending `device_async_copy` operations associated with argument tokens. Each token can only be part of one group. - - It returns a token that can be use to wait until the group fully completes. - + This is meant to be used with `nvgpu.device_async_wait` to synchronize copies as explained in those ops descriptions. @@ -266,10 +262,10 @@ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> { Example: ```mlir - %0 = nvgpu.device_async_create_group - ``` + %cp = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3> + nvgpu.device_async_create_group %cp + ``` }]; - let results = (outs NVGPU_DeviceAsyncToken:$asyncToken); let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens); let assemblyFormat = [{ $inputTokens attr-dict @@ -291,7 +287,17 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> { Example: ```mlir - nvgpu.device_async_wait %0 + // copy 1. + %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3> + nvgpu.device_async_create_group %cp1 + // copy 2. + %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3> + nvgpu.device_async_create_group %cp2 + // copy 3. + %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3> + nvgpu.device_async_create_group %cp3 + // after the wait copy 1 and copy 2 are complete.
+ nvgpu.device_async_wait {numGroups = 1 : i32} ``` }]; let arguments = (ins OptionalAttr<I32Attr>:$numGroups); diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index f53de416f2abd..3bf1fd04d1759 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -733,12 +733,7 @@ struct NVGPUAsyncCreateGroupLowering LogicalResult matchAndRewrite(nvgpu::DeviceAsyncCreateGroupOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - rewriter.create<NVVM::CpAsyncCommitGroupOp>(op.getLoc()); - // Drop the result token. - Value zero = rewriter.create<LLVM::ConstantOp>( - op->getLoc(), IntegerType::get(op.getContext(), 32), - rewriter.getI32IntegerAttr(0)); - rewriter.replaceOp(op, zero); + rewriter.replaceOpWithNewOp<NVVM::CpAsyncCommitGroupOp>(op); return success(); } }; diff --git a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp index ed37af096751f..08794b2b328fa 100644 --- a/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp +++ b/mlir/lib/Dialect/NVGPU/Transforms/CreateAsyncGroups.cpp @@ -265,7 +265,7 @@ void nvgpu::createAsyncGroups(RewriterBase &rewriter, Operation *op, } // Create the group and wait for it right after. - rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(op->getLoc(), nvgpu::DeviceAsyncTokenType::get(op->getContext()), tokens); + rewriter.create<nvgpu::DeviceAsyncCreateGroupOp>(op->getLoc(), tokens); rewriter.create<nvgpu::DeviceAsyncWaitOp>(op->getLoc(), nullptr); // Clean up old stores. for (Operation *writeOp : group) rewriter.eraseOp(writeOp); diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index 7cbf39cb97dc8..524eb3e1fa7b1 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -241,7 +241,7 @@ func.func @async_cp( // CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = ca %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 : memref<128x128xf32> to memref<3x16x128xf32, 3> // CHECK: nvvm.cp.async.commit.group - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 // CHECK: nvvm.cp.async.wait.group 1 nvgpu.device_async_wait { numGroups = 1 : i32 } @@ -299,7 +299,7 @@ func.func @async_cp_zfill_f32_align4( // CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = cg, %[[c5]] %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3> // CHECK: nvvm.cp.async.commit.group - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 // CHECK: nvvm.cp.async.wait.group 1 nvgpu.device_async_wait { numGroups = 1 : i32 } @@ -334,7 +334,7 @@ func.func @async_cp_zfill_f32_align1( // CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 4, cache = ca, %[[c5]] %0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements : memref<128x128xf32> to memref<3x16x128xf32, 3> // CHECK: nvvm.cp.async.commit.group - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 // CHECK: nvvm.cp.async.wait.group 1 nvgpu.device_async_wait { numGroups = 1 : i32 } diff --git a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir index 144e422f6c2b3..610afb56d3175 100644 --- a/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir +++ b/mlir/test/Dialect/NVGPU/optimize-shared-memory.mlir @@ -19,7 +19,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>, //
CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]] %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8 : memref<128x128xf16> to memref<128x32xf16, 3> - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c6:%.+]] = arith.constant 6 : index @@ -39,7 +39,7 @@ func.func @optimize_128x32xf16_32x128xf16(%arg0: memref<128x128xf16>, // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]] %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 8 : memref<128x128xf16> to memref<32x128xf16, 3> - %3 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c15:%.+]] = arith.constant 15 : index @@ -76,7 +76,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>, // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]] %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 4 : memref<128x128xf32> to memref<64x16xf32, 3> - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c6:%.+]] = arith.constant 6 : index @@ -132,7 +132,7 @@ func.func @optimize_64x16xf32_16x64xf32(%arg0: memref<128x128xf32>, // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shmB]][[[stRow]], [[stColPerm]]] %2 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shmB[%stRow, %stCol], 4 : memref<128x128xf32> to memref<16x64xf32, 3> - %3 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c15:%.+]] = arith.constant 15 : index @@ -177,7 +177,7 @@ func.func @small_column_size_f64(%arg0: memref<32x32xf64>, // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stColPerm]]] %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 2 : memref<32x32xf64> to memref<32x4xf64, 3> - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: [[c6:%.+]] = arith.constant 4 : index @@ -203,7 +203,7 @@ func.func @too_small_column_size_f16(%arg0: memref<128x128xf16>, // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]] %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8 : memref<128x128xf16> to memref<128x8xf16, 3> - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: nvgpu.ldmatrix [[shm]][[[fragRow]], [[fragCol]]] @@ -229,7 +229,7 @@ func.func @abort_if_subview(%arg0: memref<128x128xf16>, // CHECK: nvgpu.device_async_copy [[arg0]][[[ldRow]], [[ldCol]]], [[shm]][[[stRow]], [[stCol]]] %0 = nvgpu.device_async_copy %arg0[%ldRow, %ldCol], %shm[%stRow, %stCol], 8 : memref<128x128xf16> to memref<128x32xf16, 3> - %1 = nvgpu.device_async_create_group %0 + nvgpu.device_async_create_group %0 nvgpu.device_async_wait { numGroups = 1 : i32} // CHECK: nvgpu.ldmatrix [[shmView]][[[fragRow]], [[fragCol]]] diff --git a/mlir/test/Dialect/NVGPU/roundtrip.mlir b/mlir/test/Dialect/NVGPU/roundtrip.mlir index 71f8f52bcbc64..dbd9c368d9e47 100644 --- a/mlir/test/Dialect/NVGPU/roundtrip.mlir +++ b/mlir/test/Dialect/NVGPU/roundtrip.mlir @@ -63,8 +63,8 @@ func.func 
@async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){ %c0 = arith.constant 0 : index // CHECK: nvgpu.device_async_copy %{{.*}}[{{.*}}, {{.*}}], %{{.*}}[{{.*}}, {{.*}}, {{.*}}], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3> %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3> - // CHECK: %{{.*}} = nvgpu.device_async_create_group - %token = nvgpu.device_async_create_group %0 + // CHECK: nvgpu.device_async_create_group + nvgpu.device_async_create_group %0 // CHECK: nvgpu.device_async_wait {numGroups = 1 : i32} nvgpu.device_async_wait {numGroups = 1 : i32} return diff --git a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir index f325dac6f7303..aaaeb50854dc4 100644 --- a/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir +++ b/mlir/test/Dialect/NVGPU/transform-create-async-groups.mlir @@ -17,7 +17,7 @@ builtin.module { // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32> vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>> - // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]] + // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]] // CHECK: nvgpu.device_async_wait return } @@ -51,7 +51,7 @@ builtin.module { // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 : %2 = vector.transfer_read %a[%c0, %c4], %cst_0 {in_bounds = [true]} : memref<1024x1024xf32>, vector<1xf32> vector.transfer_write %2, %0[%c0, %c4, %c0] {in_bounds = [true]} : vector<1xf32>, memref<4x32x16xf32, #gpu.address_space<workgroup>> - // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]] + // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]] // CHECK: nvgpu.device_async_wait return } @@ -83,7 +83,7 @@ builtin.module { // CHECK: %[[CP1:.*]] = nvgpu.device_async_copy {{.*}}, {{.*}}, 1 : %2 = vector.load %a[%c0, %c4] : memref<1024x1024xf32>, vector<1xf32> vector.store %2, %0[%c0, %c4, %c0] : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<1xf32> - // CHECK: %[[G:.*]] = nvgpu.device_async_create_group %[[CP0]], %[[CP1]] + // CHECK: nvgpu.device_async_create_group %[[CP0]], %[[CP1]] // CHECK: nvgpu.device_async_wait return } From bc929f3bbc498ef12116ad1b5f19c5a723a4c4a7 Mon Sep 17 00:00:00 2001 From: linuxlonelyeagle <2020382038@qq.com> Date: Fri, 14 Mar 2025 10:44:35 +0800 Subject: [PATCH 3/3] update description and add prop-attr. --- mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td | 12 +++++++----- mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td | 1 - mlir/test/Dialect/NVGPU/roundtrip.mlir | 2 +- .../Dialect/NVGPU/transform-pipeline-shared.mlir | 8 ++++---- 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td index 03a9485e26bc7..581bd5005873d 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUOps.td @@ -275,14 +275,16 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> { let summary = "Wait for async gpu ops to complete."; let description = [{ - The `nvgpu.device_async_wait` op will block the execution thread until the group - associated with the source token is fully completed.
+ The `nvgpu.device_async_wait` op will block the execution thread until + only `$numGroups` or fewer of the most recent async copy groups are pending and + all the prior async copy groups committed by the executing threads are complete. The optional `$numGroups` attribute gives an upper bound of the number of groups uncompleted when the wait can unblock the thread. For example, if 16 async groups are pushed and `$numGroups` is set to 12, then the thread - will unblock when 12 groups or fewer are in flight (4 groups have - completed). + will unblock when 12 groups or fewer are in flight (4 groups have completed). + Its default value is 0, which means waiting for all previously committed groups + to complete. Example: @@ -302,7 +304,7 @@ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> { }]; let arguments = (ins OptionalAttr<I32Attr>:$numGroups); let assemblyFormat = [{ - attr-dict + prop-dict attr-dict }]; } diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td index 8836a1a9dfcd8..ee71a145734ae 100644 --- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td +++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPUTypes.td @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// - #ifndef MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD #define MLIR_DIALECT_NVGPU_IR_NVGPUTYPES_TD diff --git a/mlir/test/Dialect/NVGPU/roundtrip.mlir b/mlir/test/Dialect/NVGPU/roundtrip.mlir index dbd9c368d9e47..bb79c288fd064 100644 --- a/mlir/test/Dialect/NVGPU/roundtrip.mlir +++ b/mlir/test/Dialect/NVGPU/roundtrip.mlir @@ -65,7 +65,7 @@ func.func @async_cp(%dst : memref<2x7x5xf32, 3>, %src : memref<4x5xf32>){ %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 : memref<4x5xf32> to memref<2x7x5xf32, 3> // CHECK: nvgpu.device_async_create_group nvgpu.device_async_create_group %0 - // CHECK: nvgpu.device_async_wait {numGroups = 1 : i32} + // CHECK: nvgpu.device_async_wait <{numGroups = 1 : i32}> nvgpu.device_async_wait {numGroups = 1 : i32} return } diff --git a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir index e93a6a40391bb..f17475622a240 100644 --- a/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir +++ b/mlir/test/Dialect/NVGPU/transform-pipeline-shared.mlir @@ -98,7 +98,7 @@ func.func @async_depth_2_predicated(%global: memref<?xf32>, %alloc_size: index) scf.for %i = %c0 to %c98 step %c4 { // Condition for the predication "select" below. // CHECK: %[[CMP0:.+]] = arith.cmpi slt, %[[I]], %[[C90]] - // CHECK: nvgpu.device_async_wait {numGroups = 1 + // CHECK: nvgpu.device_async_wait <{numGroups = 1 : i32}> // Original "select" with updated induction variable. // CHECK: %[[I_PLUS_8:.+]] = arith.addi %[[I]], %[[C8]] // CHECK: %[[CMP1:.+]] = arith.cmpi slt, %[[I_PLUS_8]], %[[C96]] @@ -152,11 +152,11 @@ func.func @async_depth_2_peeled(%global: memref<?xf32>) { // CHECK: nvgpu.device_async_copy // CHECK: nvgpu.device_async_copy // CHECK: scf.for - // CHECK: nvgpu.device_async_wait {numGroups = 1 + // CHECK: nvgpu.device_async_wait <{numGroups = 1 : i32}> // CHECK: arith.select // CHECK: nvgpu.device_async_copy - // CHECK: nvgpu.device_async_wait {numGroups = 1 - // CHECK: nvgpu.device_async_wait {numGroups = 0 + // CHECK: nvgpu.device_async_wait <{numGroups = 1 : i32}> + // CHECK: nvgpu.device_async_wait <{numGroups = 0 : i32}> scf.for %i = %c0 to %c98 step %c4 { %c96 = arith.constant 96 : index %cond = arith.cmpi slt, %i, %c96 : index
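For reference, a minimal usage sketch of the two ops as they look after this series: `nvgpu.device_async_create_group` no longer returns a token, and `nvgpu.device_async_wait` takes only the optional `numGroups` property (defaulting to 0, i.e. wait for everything committed). The surrounding function and the `%src`/`%dst` buffers are illustrative assumptions, not part of the patch.

```mlir
func.func @async_copy_sketch(%src: memref<16xf32>, %dst: memref<16xf32, 3>) {
  %c0 = arith.constant 0 : index
  // Issue an asynchronous copy of 4 f32 elements into shared memory.
  %cp = nvgpu.device_async_copy %src[%c0], %dst[%c0], 4
      : memref<16xf32> to memref<16xf32, 3>
  // Commit the pending copy as one group; no token is produced anymore.
  nvgpu.device_async_create_group %cp
  // Block until no committed group is still pending (numGroups defaults to 0).
  nvgpu.device_async_wait
  return
}
```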