Skip to content

Commit 0535ac2

Browse files
committed
[NVPTX][NFC] Move more TMA intrinsics lowering to tablegen
This patch moves the lowering of the TMA tensor prefetch and S2G-copy intrinsics to tablegen itself. This is in preparation for adding Blackwell-specific additions to these intrinsics.

Signed-off-by: Durgadoss R <durgadossr@nvidia.com>
1 parent eb2b63c commit 0535ac2

File tree

5 files changed

+143
-223
lines changed

5 files changed

+143
-223
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 28 additions & 141 deletions
Original file line numberDiff line numberDiff line change
@@ -2157,16 +2157,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
21572157
? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
21582158
: NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
21592159

2160-
#define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
2161-
(is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2162-
: (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2163-
2164-
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
2165-
is_s32) \
2166-
(is_reduce \
2167-
? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2168-
: (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2169-
is_s32)))
2160+
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
2161+
(is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
2162+
: (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
21702163

21712164
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
21722165
[&]() -> auto { \
@@ -2179,48 +2172,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
21792172
return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
21802173
}()
21812174

2182-
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
2183-
(is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2184-
: NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
2185-
2186-
static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
2187-
bool IsCacheHint, bool IsIm2Col,
2188-
bool IsReduce = false) {
2175+
static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
2176+
bool IsShared32,
2177+
bool IsCacheHint,
2178+
bool IsIm2Col) {
21892179
if (IsIm2Col) {
21902180
switch (Dim) {
21912181
case 3:
2192-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
2193-
IsCacheHint, IsShared32);
2182+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
2183+
IsShared32);
21942184
case 4:
2195-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
2196-
IsCacheHint, IsShared32);
2185+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
2186+
IsShared32);
21972187
case 5:
2198-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
2199-
IsCacheHint, IsShared32);
2188+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
2189+
IsShared32);
22002190
default:
22012191
llvm_unreachable("Invalid Dimension in im2col mode for "
2202-
"GetCpAsyncBulkTensorS2GOpcode.");
2192+
"GetCpAsyncBulkTensorS2GReductionOpcode.");
22032193
}
22042194
} else {
22052195
switch (Dim) {
22062196
case 1:
2207-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
2208-
IsCacheHint, IsShared32);
2197+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
2198+
IsShared32);
22092199
case 2:
2210-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
2211-
IsCacheHint, IsShared32);
2200+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
2201+
IsShared32);
22122202
case 3:
2213-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
2214-
IsCacheHint, IsShared32);
2203+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
2204+
IsShared32);
22152205
case 4:
2216-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
2217-
IsCacheHint, IsShared32);
2206+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
2207+
IsShared32);
22182208
case 5:
2219-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
2220-
IsCacheHint, IsShared32);
2209+
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
2210+
IsShared32);
22212211
default:
2222-
llvm_unreachable(
2223-
"Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
2212+
llvm_unreachable("Invalid Dimension in tile mode for "
2213+
"GetCpAsyncBulkTensorS2GReductionOpcode.");
22242214
}
22252215
}
22262216
}
@@ -2267,39 +2257,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
22672257
}
22682258
}
22692259

2270-
static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
2271-
bool IsIm2Col) {
2272-
if (IsIm2Col) {
2273-
switch (Dim) {
2274-
case 3:
2275-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
2276-
case 4:
2277-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
2278-
case 5:
2279-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
2280-
default:
2281-
llvm_unreachable("Invalid Dimension in im2col mode for "
2282-
"GetCpAsyncBulkTensorPrefetchOpcode.");
2283-
}
2284-
} else {
2285-
switch (Dim) {
2286-
case 1:
2287-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
2288-
case 2:
2289-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
2290-
case 3:
2291-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
2292-
case 4:
2293-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
2294-
case 5:
2295-
return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
2296-
default:
2297-
llvm_unreachable("Invalid Dimension in tile mode for "
2298-
"GetCpAsyncBulkTensorPrefetchOpcode.");
2299-
}
2300-
}
2301-
}
2302-
23032260
static size_t GetDimsFromIntrinsic(unsigned IID) {
23042261
switch (IID) {
23052262
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
@@ -2364,52 +2321,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
23642321
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
23652322
}
23662323

2367-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
2368-
bool IsIm2Col) {
2369-
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2370-
// src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2371-
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2372-
// = {2} + {4 + dims}
2373-
size_t NumOps = N->getNumOperands();
2374-
size_t NumDims = NumOps - 6;
2375-
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2376-
size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
2377-
2378-
SDLoc DL(N);
2379-
SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
2380-
Ops.push_back(N->getOperand(0)); // Chain operand
2381-
2382-
bool IsShared32 =
2383-
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2384-
unsigned Opcode =
2385-
GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
2386-
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2387-
}
2388-
2389-
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
2390-
bool IsIm2Col) {
2391-
// We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2392-
// {src, dims{d0...dN}, im2col_offsets{dims-2}
2393-
// cache_hint, cache_hint_flag}
2394-
// NumOperands = {Chain, IID} + {Actual intrinsic args}
2395-
// = {2} + {3 + dims + im2col_offsets}
2396-
size_t NumOps = N->getNumOperands();
2397-
size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
2398-
: (NumOps - 5);
2399-
// Offsets is always 'NumDims - 2' and only for im2col mode
2400-
size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
2401-
bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
2402-
size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
2403-
2404-
SDLoc DL(N);
2405-
SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
2406-
Ops.push_back(N->getOperand(0)); // Chain operand
2407-
2408-
unsigned Opcode =
2409-
GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
2410-
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
2411-
}
2412-
24132324
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
24142325
unsigned RedOp,
24152326
bool IsIm2Col) {
@@ -2429,8 +2340,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
24292340

24302341
bool IsShared32 =
24312342
CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
2432-
unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
2433-
NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
2343+
unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
2344+
NumDims, IsShared32, IsCacheHint, IsIm2Col);
24342345
ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
24352346
}
24362347

@@ -2550,18 +2461,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
25502461
switch (IID) {
25512462
default:
25522463
return false;
2553-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
2554-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
2555-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
2556-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
2557-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
2558-
SelectCpAsyncBulkTensorS2GCommon(N);
2559-
return true;
2560-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
2561-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
2562-
case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
2563-
SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
2564-
return true;
25652464
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
25662465
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
25672466
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
@@ -2574,18 +2473,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
25742473
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
25752474
SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
25762475
return true;
2577-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
2578-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
2579-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
2580-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
2581-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
2582-
SelectCpAsyncBulkTensorPrefetchCommon(N);
2583-
return true;
2584-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2585-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2586-
case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2587-
SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
2588-
return true;
25892476
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
25902477
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
25912478
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
9292
void SelectV2I64toI128(SDNode *N);
9393
void SelectI128toV2I64(SDNode *N);
9494
void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
95-
void SelectCpAsyncBulkTensorS2GCommon(SDNode *N, bool IsIm2Col = false);
96-
void SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N, bool IsIm2Col = false);
9795
void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
9896
bool IsIm2Col = false);
9997
void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);

0 commit comments

Comments (0)