Skip to content

Commit c3ec38d

Browse files
authored
[NVPTX][NFC] Move more TMA intrinsics lowering to tablegen (#147576)
This patch moves the lowering of the TMA Tensor prefetch and S2G-copy intrinsics to tablegen itself. This is in preparation for adding Blackwell-specific additions to these intrinsics. The TMA reduction intrinsics lowering is kept intact (C++), and hence the macro names are updated to reflect the current usage. The existing tests have full coverage and continue to pass as expected. Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
1 parent a84ae9c commit c3ec38d

File tree

5 files changed

+160
-236
lines changed

5 files changed

+160
-236
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 28 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -2147,16 +2147,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
21472147
? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
21482148
: NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
21492149

2150-
#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
2151-
(is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2152-
: (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2153-
2154-
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
2155-
is_s32) \
2156-
(is_reduce \
2157-
? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2158-
: (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2159-
is_s32)))
2150+
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
2151+
(is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
2152+
: (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
21602153

21612154
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
21622155
[&]() -> auto { \
@@ -2169,48 +2162,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
21692162
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
21702163
}()
21712164

2172-
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
2173-
(is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2174-
: NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
2175-
2176-
static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
2177-
bool IsCacheHint, bool IsIm2Col,
2178-
bool IsReduce = false) {
2165+
static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
2166+
bool IsShared32,
2167+
bool IsCacheHint,
2168+
bool IsIm2Col) {
21792169
if (IsIm2Col) {
21802170
switch (Dim) {
21812171
case 3:
2182-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
2183-
IsCacheHint, IsShared32);
2172+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
2173+
IsShared32);
21842174
case 4:
2185-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
2186-
IsCacheHint, IsShared32);
2175+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
2176+
IsShared32);
21872177
case 5:
2188-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
2189-
IsCacheHint, IsShared32);
2178+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
2179+
IsShared32);
21902180
default:
21912181
llvm_unreachable("Invalid Dimension in im2col mode for "
2192-
"GetCpAsyncBulkTensorS2GOpcode.");
2182+
"GetCpAsyncBulkTensorS2GReductionOpcode.");
21932183
}
21942184
} else {
21952185
switch (Dim) {
21962186
case 1:
2197-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
2198-
IsCacheHint, IsShared32);
2187+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
2188+
IsShared32);
21992189
case 2:
2200-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
2201-
IsCacheHint, IsShared32);
2190+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
2191+
IsShared32);
22022192
case 3:
2203-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
2204-
IsCacheHint, IsShared32);
2193+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
2194+
IsShared32);
22052195
case 4:
2206-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
2207-
IsCacheHint, IsShared32);
2196+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
2197+
IsShared32);
22082198
case 5:
2209-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
2210-
IsCacheHint, IsShared32);
2199+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
2200+
IsShared32);
22112201
default:
2212-
llvm_unreachable(
2213-
"Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
2202+
llvm_unreachable("Invalid Dimension in tile mode for "
2203+
"GetCpAsyncBulkTensorS2GReductionOpcode.");
22142204
}
22152205
}
22162206
}
@@ -2257,39 +2247,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
22572247
}
22582248
}
22592249

2260-
static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
2261-
bool IsIm2Col) {
2262-
if (IsIm2Col) {
2263-
switch (Dim) {
2264-
case 3:
2265-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
2266-
case 4:
2267-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
2268-
case 5:
2269-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
2270-
default:
2271-
llvm_unreachable("Invalid Dimension in im2col mode for "
2272-
"GetCpAsyncBulkTensorPrefetchOpcode.");
2273-
}
2274-
} else {
2275-
switch (Dim) {
2276-
case 1:
2277-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
2278-
case 2:
2279-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
2280-
case 3:
2281-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
2282-
case 4:
2283-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
2284-
case 5:
2285-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
2286-
default:
2287-
llvm_unreachable("Invalid Dimension in tile mode for "
2288-
"GetCpAsyncBulkTensorPrefetchOpcode.");
2289-
}
2290-
}
2291-
}
2292-
22932250
static size_t GetDimsFromIntrinsic(unsigned IID) {
22942251
switch (IID) {
22952252
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
@@ -2354,52 +2311,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
23542311
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
23552312
}
23562313

2357-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
2358-
bool IsIm2Col) {
2359-
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2360-
// src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2361-
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2362-
// = {2} + {4 + dims}
2363-
size_t NumOps = N->getNumOperands();
2364-
size_t NumDims = NumOps - 6;
2365-
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2366-
size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2367-
2368-
SDLoc DL(N);
2369-
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
2370-
Ops.push_back(N->getOperand(0)); // Chain operand
2371-
2372-
bool IsShared32 =
2373-
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2374-
unsigned Opcode =
2375-
GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
2376-
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2377-
}
2378-
2379-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
2380-
bool IsIm2Col) {
2381-
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2382-
// {src, dims{d0...dN}, im2col_offsets{dims-2}
2383-
// cache_hint, cache_hint_flag}
2384-
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2385-
// = {2} + {3 + dims + im2col_offsets}
2386-
size_t NumOps = N->getNumOperands();
2387-
size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
2388-
: (NumOps - 5);
2389-
// Offsets is always 'NumDims - 2' and only for im2col mode
2390-
size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2391-
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2392-
size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
2393-
2394-
SDLoc DL(N);
2395-
SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
2396-
Ops.push_back(N->getOperand(0)); // Chain operand
2397-
2398-
unsigned Opcode =
2399-
GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
2400-
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2401-
}
2402-
24032314
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
24042315
unsigned RedOp,
24052316
bool IsIm2Col) {
@@ -2419,8 +2330,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
24192330

24202331
bool IsShared32 =
24212332
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2422-
unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
2423-
NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
2333+
unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
2334+
NumDims, IsShared32, IsCacheHint, IsIm2Col);
24242335
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
24252336
}
24262337

@@ -2540,18 +2451,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
25402451
switch (IID) {
25412452
default:
25422453
return false;
2543-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
2544-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
2545-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
2546-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
2547-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
2548-
SelectCpAsyncBulkTensorS2GCommon(N);
2549-
return true;
2550-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
2551-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
2552-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
2553-
SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
2554-
return true;
25552454
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
25562455
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
25572456
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
@@ -2564,18 +2463,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
25642463
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
25652464
SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
25662465
return true;
2567-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
2568-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
2569-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
2570-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
2571-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
2572-
SelectCpAsyncBulkTensorPrefetchCommon(N);
2573-
return true;
2574-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2575-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2576-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2577-
SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
2578-
return true;
25792466
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
25802467
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
25812468
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9292
void SelectV2I64toI128(SDNode *N);
9393
void SelectI128toV2I64(SDNode *N);
9494
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
95-
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
96-
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
9795
void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
9896
bool IsIm2Col = false);
9997
void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);

0 commit comments

Comments
 (0)