diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 5631342ecc13e..32717fecc5411 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -2157,16 +2157,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
             ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
             : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
 
-#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32)     \
-  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH))           \
-         : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
-
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch,       \
-                                            is_s32)                            \
-  (is_reduce                                                                   \
-       ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
-       : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch,          \
-                                               is_s32)))
+#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32)      \
+  (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH))          \
+         : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
 
 #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32)   \
   [&]() -> auto {                                                              \
@@ -2179,48 +2172,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
     return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, );              \
   }()
 
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch)             \
-  (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH            \
-         : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
-
-static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
-                                              bool IsCacheHint, bool IsIm2Col,
-                                              bool IsReduce = false) {
+static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
+                                                       bool IsShared32,
+                                                       bool IsCacheHint,
+                                                       bool IsIm2Col) {
   if (IsIm2Col) {
     switch (Dim) {
     case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
+                                                     IsShared32);
     case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
+                                                     IsShared32);
    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
+                                                     IsShared32);
     default:
       llvm_unreachable("Invalid Dimension in im2col mode for "
-                       "GetCpAsyncBulkTensorS2GOpcode.");
+                       "GetCpAsyncBulkTensorS2GReductionOpcode.");
     }
   } else {
    switch (Dim) {
    case 1:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
+                                                     IsShared32);
    case 2:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
+                                                     IsShared32);
    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
+                                                     IsShared32);
    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
+                                                     IsShared32);
    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
-                                                 IsCacheHint, IsShared32);
+      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
+                                                     IsShared32);
    default:
-      llvm_unreachable(
-          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
+      llvm_unreachable("Invalid Dimension in tile mode for "
+                       "GetCpAsyncBulkTensorS2GReductionOpcode.");
    }
  }
 }
@@ -2267,39 +2257,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
   }
 }
 
-static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
-                                                   bool IsIm2Col) {
-  if (IsIm2Col) {
-    switch (Dim) {
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
-    default:
-      llvm_unreachable("Invalid Dimension in im2col mode for "
-                       "GetCpAsyncBulkTensorPrefetchOpcode.");
-    }
-  } else {
-    switch (Dim) {
-    case 1:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
-    case 2:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
-    default:
-      llvm_unreachable("Invalid Dimension in tile mode for "
-                       "GetCpAsyncBulkTensorPrefetchOpcode.");
-    }
-  }
-}
-
 static size_t GetDimsFromIntrinsic(unsigned IID) {
   switch (IID) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
@@ -2364,52 +2321,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
-                                                         bool IsIm2Col) {
-  // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
-  // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
-  // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2} + {4 + dims}
-  size_t NumOps = N->getNumOperands();
-  size_t NumDims = NumOps - 6;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-  size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
-
-  SDLoc DL(N);
-  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
-  Ops.push_back(N->getOperand(0)); // Chain operand
-
-  bool IsShared32 =
-      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-  unsigned Opcode =
-      GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
-  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
-                                                              bool IsIm2Col) {
-  // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
-  // {src, dims{d0...dN}, im2col_offsets{dims-2}
-  //  cache_hint, cache_hint_flag}
-  // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2} + {3 + dims + im2col_offsets}
-  size_t NumOps = N->getNumOperands();
-  size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
-                            : (NumOps - 5);
-  // Offsets is always 'NumDims - 2' and only for im2col mode
-  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-  size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
-
-  SDLoc DL(N);
-  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
-  Ops.push_back(N->getOperand(0)); // Chain operand
-
-  unsigned Opcode =
-      GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
-  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
 void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                             unsigned RedOp,
                                                             bool IsIm2Col) {
@@ -2429,8 +2340,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
 
   bool IsShared32 =
       CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-  unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
-      NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
+  unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
+      NumDims, IsShared32, IsCacheHint, IsIm2Col);
   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
 }
 
@@ -2550,18 +2461,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   switch (IID) {
   default:
     return false;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
-    SelectCpAsyncBulkTensorS2GCommon(N);
-    return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
-    SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
-    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
@@ -2574,18 +2473,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
     SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
     return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
-    SelectCpAsyncBulkTensorPrefetchCommon(N);
-    return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
-    SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
-    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 0e4dec1adca67..016a2a349f9f5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -92,8 +92,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   void SelectV2I64toI128(SDNode *N);
   void SelectI128toV2I64(SDNode *N);
   void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
-  void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
-  void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
                                            bool IsIm2Col = false);
   void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);
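Note: with the hand-written S2G and prefetch selection deleted above, those intrinsics are matched directly by the TableGen patterns added in NVPTXIntrinsics.td below; only the reduction variants still go through the renamed C++ helper. A minimal IR sketch of the S2G path, using the intrinsic signature from the existing tests (the function name is illustrative only):

    declare void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3), ptr, i32, i64, i1)

    define void @s2g_tile_1d_example(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) {
      ; Flag i1 0 (no L2 cache hint) matches the base pattern [intr_dag].
      tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 0)
      ; Flag i1 1 matches the _CH pattern [intr_dag_with_ch], appending ", $ch" to the PTX.
      tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 1)
      ret void
    }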
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index cc1fd027d8515..f1e93df4c5b72 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -560,6 +560,30 @@ defm CP_ASYNC_BULK_PREFETCH_CH : CP_ASYNC_BULK_PREFETCH_INTR<has_ch = 1>;
 
 // TMA Async Bulk Tensor Copy Functions
 //-------------------------------------
+class TMA_DIMS_UTIL<int dim> {
+  // For example, when 'dim' is 3, this generates:
+  // an ins_dag: B32:$d0, B32:$d1, B32:$d2
+  // with base_str: $d0, $d1, $d2
+  dag ins_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
+  string base_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+}
+
+class TMA_IM2COL_UTIL<int dim, string mode> {
+  // For im2col_w/w_128 modes, number of offsets is always 2.
+  // For im2col mode, offsets is (dim - 2).
+  // For non-im2col modes (i.e. tile) there are no offsets.
+  int offsets = !cond(
+    !eq(mode, "im2col") : !sub(dim, 2),
+    !eq(mode, "im2col_w") : 2,
+    !eq(mode, "im2col_w_128") : 2,
+    true : 0); // for all other modes
+
+  dag ins_dag = !if(!gt(offsets, 0),
+                    !dag(ins, !listsplat(B16, offsets), !foreach(i, !range(offsets), "im2col" # i)),
+                    (ins));
+  string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", ");
+}
+
 // From Global to Shared memory (G2S)
 class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
   string prefix = "cp.async.bulk.tensor";
@@ -583,8 +607,8 @@ def CTAGroupFlags : Operand<i32> {
 }
 
 multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
-  defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
-  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
   defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
   defvar rc = !if(is_shared32, B32, B64);
 
@@ -628,39 +652,46 @@ foreach dim = [1, 2, 3, 4, 5] in {
   }
 }
 
-// From Shared to Global memory (S2G)
-class S2G_STRINGS<int dim, string mode, bit is_reduce, bit ch, bit is_shared32 = 0> {
-  string dir = "global.shared::cta";
-  string completion = "bulk_group";
-  string inst_name = !if(is_reduce, "cp.reduce", "cp")
-                   # ".async.bulk.tensor"
-                   # "." # dim # "d"
-                   # "." # dir
-                   # "." # mode
-                   # "." # completion
-                   # !if(ch, ".L2::cache_hint", "");
-  string intr_name = "CP_ASYNC_BULK_TENSOR_"
-                   # !if(is_reduce, "RED_", "S2G_")
-                   # dim # "D"
-                   # !if(is_shared32, "_SHARED32", "")
-                   # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
-}
-
-multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, bit shared32, string mode> {
-  defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
-  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+multiclass TMA_TENSOR_S2G_INTR<int dim, string mode,
+                               list<Predicate> pred = [hasPTX<80>, hasSM<90>]> {
+  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
   defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
-  defvar rc = !if(shared32, B32, B64);
+
+  defvar intr = !cast<Intrinsic>(
+                  "int_nvvm_cp_async_bulk_tensor_s2g_" # mode # "_" # dim # "d");
+  defvar intr_dag = !con((intr addr:$src, B64:$tmap),
+                         !setdagop(dims_dag, intr),
+                         (intr B64:$ch, 0));
+  defvar intr_dag_with_ch = !con((intr addr:$src, B64:$tmap),
+                                 !setdagop(dims_dag, intr),
+                                 (intr B64:$ch, -1));
+
+  // For im2col mode, the actual asm_str is "im2col_no_offs"
+  defvar mode_asm_str = !if(!eq(mode, "im2col"),
+                            "im2col_no_offs", mode);
+  defvar prefix = "cp.async.bulk.tensor"
+                # "." # dim # "d"
+                # ".global.shared::cta"
+                # "." # mode_asm_str
+                # ".bulk_group";
 
   def "" : NVPTXInst<(outs),
-           !con((ins rc:$src, B64:$tmap), dims_dag),
-           !strconcat(S2G_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";"), []>,
-           Requires<[hasPTX<80>, hasSM<90>]>;
+           !con((ins ADDR:$src, B64:$tmap), dims_dag, (ins B64:$ch)),
+           prefix # asm_str # ";",
+           [intr_dag]>,
+           Requires<pred>;
   def _CH : NVPTXInst<(outs),
-            !con((ins rc:$src, B64:$tmap), dims_dag, (ins B64:$ch)),
-            !strconcat(S2G_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;"), []>,
-            Requires<[hasPTX<80>, hasSM<90>]>;
+            !con((ins ADDR:$src, B64:$tmap), dims_dag, (ins B64:$ch)),
+            prefix # ".L2::cache_hint" # asm_str # ", $ch;",
+            [intr_dag_with_ch]>,
+            Requires<pred>;
+}
+foreach dim = 1...5 in {
+  foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+    defvar suffix = !toupper(mode) # "_" # dim # "D";
+    defm TMA_TENSOR_S2G_ # suffix : TMA_TENSOR_S2G_INTR<dim, mode>;
+  }
 }
 
 def TMAReductionFlags : Operand<i32> {
@@ -669,13 +700,16 @@
 
 // TMA Copy from Shared to Global memory with Reduction
 multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode> {
-  defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
-  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
+  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
   defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]";
   defvar rc = !if(shared32, B32, B64);
+  // For im2col mode, the actual asm_str is "im2col_no_offs"
+  defvar mode_asm_str = !if(!eq(mode, "im2col"),
+                            "im2col_no_offs", mode);
   defvar prefix = "cp.reduce.async.bulk.tensor"
                 # "." # dim # "d"
                 # ".global.shared::cta";
-  defvar suffix = "." # mode # ".bulk_group";
+  defvar suffix = "." # mode_asm_str # ".bulk_group";
 
   def "" : NVPTXInst<(outs),
            !con((ins rc:$src, B64:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)),
@@ -689,58 +723,63 @@ multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR<int dim, bit shared32, string mode>
 
 foreach dim = [1, 2, 3, 4, 5] in {
   foreach shared32 = [true, false] in {
-    foreach mode = !if(!ge(dim, 3), ["tile", "im2col_no_offs"], ["tile"]) in {
-      defm S2G_STRINGS<dim, mode, 0, 0, shared32>.intr_name :
-          CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, shared32, mode>;
-      defm S2G_STRINGS<dim, mode, 1, 0, shared32>.intr_name :
+    foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
+      defvar suffix = dim # "D"
+                    # !if(shared32, "_SHARED32", "")
+                    # "_" # !toupper(mode);
+      defm CP_ASYNC_BULK_TENSOR_RED_ # suffix :
           CP_ASYNC_BULK_TENSOR_REDUCE_INTR<dim, shared32, mode>;
     }
   }
 }
 
 // TMA Prefetch from Global memory to L2 cache
-class PREFETCH_STRINGS<int dim, string mode, bit ch> {
-  string prefix = "cp.async.bulk.prefetch.tensor";
-  string dir = "L2.global";
-  string inst_name = prefix
+multiclass TMA_TENSOR_PREFETCH_INTR<int dim, string mode,
+                                    list<Predicate> pred = [hasPTX<80>, hasSM<90>]> {
+  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
+  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
+  defvar asm_str_base = " [$tmap, {{" # dims_str # "}}]";
+
+  defvar im2col_dag = TMA_IM2COL_UTIL<dim, mode>.ins_dag;
+  defvar im2col_str = TMA_IM2COL_UTIL<dim, mode>.base_str;
+  defvar asm_str = !if(!empty(im2col_str),
+                       asm_str_base,
+                       asm_str_base # ", {{" # im2col_str # "}}");
+
+  defvar inst_name = "cp.async.bulk.prefetch.tensor"
                    # "." # dim # "d"
-                   # "." # dir
-                   # "." # mode
-                   # !if(ch, ".L2::cache_hint", "");
-  string intr_name = "CP_ASYNC_BULK_TENSOR_PREFETCH_"
-                   # dim # "D"
-                   # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
-}
-
-multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
-  defvar dims_dag = !dag(ins, !listsplat(B32, dim), !foreach(i, !range(dim), "d" # i));
-  defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", ");
-  defvar asm_str_default = " [$tmap, {{" # dims_str # "}}]";
-
-  defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
-  defvar im2col_dag = !if(!eq(mode, "im2col"),
-    !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
-    (ins));
-  defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
-  defvar im2col_asm_str = ", {{" # im2col_str # "}}";
-
-  defvar asm_str = !if(!eq(mode, "im2col"),
-                       !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
-
-  def "" : NVPTXInst<(outs),
-           !con((ins B64:$tmap), dims_dag, im2col_dag),
-           !strconcat(PREFETCH_STRINGS<dim, mode, 0>.inst_name, asm_str, ";"), []>,
-           Requires<[hasPTX<80>, hasSM<90>]>;
-  def _CH : NVPTXInst<(outs),
-            !con((ins B64:$tmap), dims_dag, im2col_dag, (ins B64:$ch)),
-            !strconcat(PREFETCH_STRINGS<dim, mode, 1>.inst_name, asm_str, ", $ch;"), []>,
-            Requires<[hasPTX<80>, hasSM<90>]>;
-}
-
-foreach dim = [1, 2, 3, 4, 5] in {
+                   # "." # "L2.global"
+                   # "." # mode;
+
+  defvar intr = !cast<Intrinsic>(
+                  "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d");
+
+  defvar ins_dag = !con((ins B64:$tmap),
+                        dims_dag,
+                        im2col_dag,
+                        (ins B64:$ch));
+  defvar intr_dag = !con((intr B64:$tmap),
+                         !setdagop(dims_dag, intr),
+                         !setdagop(im2col_dag, intr),
+                         (intr B64:$ch, 0));
+  defvar intr_dag_with_ch = !con((intr B64:$tmap),
+                                 !setdagop(dims_dag, intr),
+                                 !setdagop(im2col_dag, intr),
+                                 (intr B64:$ch, -1));
+
+  def "" : NVPTXInst<(outs), ins_dag,
+                     inst_name # asm_str # ";",
+                     [intr_dag]>,
+                     Requires<pred>;
+  def _CH : NVPTXInst<(outs), ins_dag,
+                      inst_name # ".L2::cache_hint" # asm_str # ", $ch;",
+                      [intr_dag_with_ch]>,
+                      Requires<pred>;
+}
+foreach dim = 1...5 in {
   foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
-    defm PREFETCH_STRINGS<dim, mode, 0>.intr_name :
-      CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>;
+    defvar suffix = !toupper(mode) # "_" # dim # "D";
+    defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode>;
   }
 }
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
index 09dbe91d07513..cf166f83fb241 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-prefetch.ll
@@ -24,8 +24,8 @@ define void @cp_async_bulk_tensor_prefetch_tile_1d(ptr %tmap, i32 %d0, i64 %ch)
 ; CHECK-PTX-NEXT:  // %bb.0:
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_1d_param_0];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_1d_param_1];
-; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}];
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_1d_param_2];
+; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.1d.L2.global.tile [%rd1, {%r1}];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.1d.L2.global.tile.L2::cache_hint [%rd1, {%r1}], %rd2;
 ; CHECK-PTX-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tmap, i32 %d0, i64 %ch, i1 0)
@@ -44,8 +44,8 @@ define void @cp_async_bulk_tensor_prefetch_tile_2d(i32 %flag, ptr %tmap, i32 %d0
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_prefetch_tile_2d_param_1];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_tile_2d_param_2];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_tile_2d_param_3];
-; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}];
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_tile_2d_param_4];
+; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.2d.L2.global.tile [%rd1, {%r1, %r2}];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.2d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2}], %rd2;
 ; CHECK-PTX-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0)
@@ -66,8 +66,8 @@ define void @cp_async_bulk_tensor_prefetch_3d(i32 %flag, ptr %tmap, i32 %d0, i32
 ; CHECK-PTX-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_prefetch_3d_param_2];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_3d_param_3];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_3d_param_4];
-; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}];
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_3d_param_6];
+; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.tile [%rd1, {%r1, %r2, %r3}];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3}], %rd2;
 ; CHECK-PTX-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_3d_param_5];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.3d.L2.global.im2col [%rd1, {%r1, %r2, %r3}], {%rs1};
@@ -95,8 +95,8 @@ define void @cp_async_bulk_tensor_prefetch_4d(i32 %flag, ptr %tmap, i32 %d0, i32
 ; CHECK-PTX-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_prefetch_4d_param_3];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_4d_param_4];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_4d_param_5];
-; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}];
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_4d_param_8];
+; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.4d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4}];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.4d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4}], %rd2;
 ; CHECK-PTX-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_4d_param_6];
 ; CHECK-PTX-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_4d_param_7];
@@ -126,8 +126,8 @@ define void @cp_async_bulk_tensor_prefetch_5d(i32 %flag, ptr %tmap, i32 %d0, i32
 ; CHECK-PTX-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_prefetch_5d_param_4];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_prefetch_5d_param_5];
 ; CHECK-PTX-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_prefetch_5d_param_6];
-; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}];
 ; CHECK-PTX-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_prefetch_5d_param_10];
+; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.5d.L2.global.tile [%rd1, {%r1, %r2, %r3, %r4, %r5}];
 ; CHECK-PTX-NEXT:    cp.async.bulk.prefetch.tensor.5d.L2.global.tile.L2::cache_hint [%rd1, {%r1, %r2, %r3, %r4, %r5}], %rd2;
 ; CHECK-PTX-NEXT:    ld.param.b16 %rs1, [cp_async_bulk_tensor_prefetch_5d_param_7];
 ; CHECK-PTX-NEXT:    ld.param.b16 %rs2, [cp_async_bulk_tensor_prefetch_5d_param_8];
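Note: the im2col prefetch cases above also exercise TMA_IM2COL_UTIL, which appends dim - 2 extra B16 offset operands (one for 3D). A sketch of the corresponding IR, with the signature inferred from the checked PTX (names are illustrative):

    declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr, i32, i32, i32, i16, i64, i1)

    define void @prefetch_im2col_3d_example(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %off0, i64 %ch) {
      ; One i16 im2col offset for 3D (dim - 2 == 1); flag i1 0 selects the variant without .L2::cache_hint.
      tail call void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %off0, i64 %ch, i1 0)
      ret void
    }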
diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
index 5998883f77ac1..3b5bd161896bc 100644
--- a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
+++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-s2g.ll
@@ -27,8 +27,8 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap,
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd2, {%r1}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    ret;
 ;
@@ -41,8 +41,8 @@ define void @cp_async_bulk_tensor_s2g_tile_1d(ptr addrspace(3) %src, ptr %tmap,
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_1d_param_0];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_1d_param_1];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_1d_param_2];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_1d_param_3];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group [%rd1, {%r2}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.1d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 0)
@@ -62,8 +62,8 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src,
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    ret;
 ;
@@ -77,8 +77,8 @@ define void @cp_async_bulk_tensor_s2g_tile_2d(i32 %flag, ptr addrspace(3) %src,
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd1, [cp_async_bulk_tensor_s2g_tile_2d_param_2];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_tile_2d_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_tile_2d_param_4];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_tile_2d_param_5];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.2d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    ret;
   tail call void @llvm.nvvm.cp.async.bulk.tensor.s2g.tile.2d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i32 %d1, i64 %ch, i1 0)
@@ -99,8 +99,8 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r1, [cp_async_bulk_tensor_s2g_3d_param_3];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_4];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_5];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3}], [%rd1], %rd3;
@@ -117,8 +117,8 @@ define void @cp_async_bulk_tensor_s2g_3d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_3d_param_3];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_3d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_3d_param_5];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_3d_param_6];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.3d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4}], [%r1], %rd2;
@@ -145,8 +145,8 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r2, [cp_async_bulk_tensor_s2g_4d_param_4];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_5];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_6];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4}], [%rd1], %rd3;
@@ -164,8 +164,8 @@ define void @cp_async_bulk_tensor_s2g_4d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_4d_param_4];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_4d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_4d_param_6];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_4d_param_7];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.4d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5}], [%r1], %rd2;
@@ -193,8 +193,8 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r3, [cp_async_bulk_tensor_s2g_5d_param_5];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_6];
 ; CHECK-PTX64-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_7];
-; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
 ; CHECK-PTX64-NEXT:    ld.param.b64 %rd3, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1];
 ; CHECK-PTX64-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd2, {%r1, %r2, %r3, %r4, %r5}], [%rd1], %rd3;
@@ -213,8 +213,8 @@ define void @cp_async_bulk_tensor_s2g_5d(i32 %flag, ptr addrspace(3) %src, ptr %
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r4, [cp_async_bulk_tensor_s2g_5d_param_5];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r5, [cp_async_bulk_tensor_s2g_5d_param_6];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b32 %r6, [cp_async_bulk_tensor_s2g_5d_param_7];
-; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    ld.param.b64 %rd2, [cp_async_bulk_tensor_s2g_5d_param_8];
+; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.tile.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1];
 ; CHECK-PTX-SHARED32-NEXT:    cp.async.bulk.tensor.5d.global.shared::cta.im2col_no_offs.bulk_group.L2::cache_hint [%rd1, {%r2, %r3, %r4, %r5, %r6}], [%r1], %rd2;
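Note: the reduction intrinsics are the one TMA S2G family still selected in C++ (SelectCpAsyncBulkTensorReduceCommon, now calling GetCpAsyncBulkTensorS2GReductionOpcode). A sketch of that path, assuming the signature mirrors the s2g form shown above:

    declare void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3), ptr, i32, i64, i1)

    define void @reduce_add_tile_1d_example(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch) {
      ; Expected to select the cp.reduce.async.bulk.tensor.1d.global.shared::cta.add.tile.bulk_group
      ; instruction, with the reduction op carried via the TMAReductionFlags operand.
      tail call void @llvm.nvvm.cp.async.bulk.tensor.reduce.add.tile.1d(ptr addrspace(3) %src, ptr %tmap, i32 %d0, i64 %ch, i1 0)
      ret void
    }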