@@ -2147,16 +2147,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
2147
2147
? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
2148
2148
: NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)
2149
2149
2150
- #define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL (op, dim, mode, is_ch, is_s32 ) \
2151
- (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
2152
- : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
2153
-
2154
- #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (dim, mode, is_reduce, is_ch, \
2155
- is_s32) \
2156
- (is_reduce \
2157
- ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
2158
- : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
2159
- is_s32)))
2150
+ #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (dim, mode, is_ch, is_s32 ) \
2151
+ (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
2152
+ : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
2160
2153
2161
2154
#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S (dim, mode, is_mc, is_ch, is_s32 ) \
2162
2155
[&]() -> auto { \
@@ -2169,48 +2162,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
2169
2162
return CP_ASYNC_BULK_TENSOR_OPCODE (G2S, dim, mode, is_s32, ); \
2170
2163
}()
2171
2164
2172
- #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (dim, mode, is_ch ) \
2173
- (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
2174
- : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
2175
-
2176
- static unsigned GetCpAsyncBulkTensorS2GOpcode (size_t Dim, bool IsShared32,
2177
- bool IsCacheHint, bool IsIm2Col,
2178
- bool IsReduce = false ) {
2165
+ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode (size_t Dim,
2166
+ bool IsShared32,
2167
+ bool IsCacheHint,
2168
+ bool IsIm2Col) {
2179
2169
if (IsIm2Col) {
2180
2170
switch (Dim) {
2181
2171
case 3 :
2182
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (3D, IM2COL, IsReduce ,
2183
- IsCacheHint, IsShared32);
2172
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (3D, IM2COL, IsCacheHint ,
2173
+ IsShared32);
2184
2174
case 4 :
2185
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (4D, IM2COL, IsReduce ,
2186
- IsCacheHint, IsShared32);
2175
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (4D, IM2COL, IsCacheHint ,
2176
+ IsShared32);
2187
2177
case 5 :
2188
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (5D, IM2COL, IsReduce ,
2189
- IsCacheHint, IsShared32);
2178
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (5D, IM2COL, IsCacheHint ,
2179
+ IsShared32);
2190
2180
default :
2191
2181
llvm_unreachable (" Invalid Dimension in im2col mode for "
2192
- " GetCpAsyncBulkTensorS2GOpcode ." );
2182
+ " GetCpAsyncBulkTensorS2GReductionOpcode ." );
2193
2183
}
2194
2184
} else {
2195
2185
switch (Dim) {
2196
2186
case 1 :
2197
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (1D, TILE, IsReduce ,
2198
- IsCacheHint, IsShared32);
2187
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (1D, TILE, IsCacheHint ,
2188
+ IsShared32);
2199
2189
case 2 :
2200
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (2D, TILE, IsReduce ,
2201
- IsCacheHint, IsShared32);
2190
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (2D, TILE, IsCacheHint ,
2191
+ IsShared32);
2202
2192
case 3 :
2203
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (3D, TILE, IsReduce ,
2204
- IsCacheHint, IsShared32);
2193
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (3D, TILE, IsCacheHint ,
2194
+ IsShared32);
2205
2195
case 4 :
2206
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (4D, TILE, IsReduce ,
2207
- IsCacheHint, IsShared32);
2196
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (4D, TILE, IsCacheHint ,
2197
+ IsShared32);
2208
2198
case 5 :
2209
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G (5D, TILE, IsReduce ,
2210
- IsCacheHint, IsShared32);
2199
+ return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED (5D, TILE, IsCacheHint ,
2200
+ IsShared32);
2211
2201
default :
2212
- llvm_unreachable (
2213
- " Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode ." );
2202
+ llvm_unreachable (" Invalid Dimension in tile mode for "
2203
+ " GetCpAsyncBulkTensorS2GReductionOpcode ." );
2214
2204
}
2215
2205
}
2216
2206
}
@@ -2257,39 +2247,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
2257
2247
}
2258
2248
}
2259
2249
2260
- static unsigned GetCpAsyncBulkTensorPrefetchOpcode (size_t Dim, bool IsCacheHint,
2261
- bool IsIm2Col) {
2262
- if (IsIm2Col) {
2263
- switch (Dim) {
2264
- case 3 :
2265
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (3D, IM2COL, IsCacheHint);
2266
- case 4 :
2267
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (4D, IM2COL, IsCacheHint);
2268
- case 5 :
2269
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (5D, IM2COL, IsCacheHint);
2270
- default :
2271
- llvm_unreachable (" Invalid Dimension in im2col mode for "
2272
- " GetCpAsyncBulkTensorPrefetchOpcode." );
2273
- }
2274
- } else {
2275
- switch (Dim) {
2276
- case 1 :
2277
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (1D, TILE, IsCacheHint);
2278
- case 2 :
2279
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (2D, TILE, IsCacheHint);
2280
- case 3 :
2281
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (3D, TILE, IsCacheHint);
2282
- case 4 :
2283
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (4D, TILE, IsCacheHint);
2284
- case 5 :
2285
- return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH (5D, TILE, IsCacheHint);
2286
- default :
2287
- llvm_unreachable (" Invalid Dimension in tile mode for "
2288
- " GetCpAsyncBulkTensorPrefetchOpcode." );
2289
- }
2290
- }
2291
- }
2292
-
2293
2250
static size_t GetDimsFromIntrinsic (unsigned IID) {
2294
2251
switch (IID) {
2295
2252
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
@@ -2354,52 +2311,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
2354
2311
ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
2355
2312
}
2356
2313
2357
- void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon (SDNode *N,
2358
- bool IsIm2Col) {
2359
- // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2360
- // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
2361
- // NumOperands = {Chain, IID} + {Actual intrinsic args}
2362
- // = {2} + {4 + dims}
2363
- size_t NumOps = N->getNumOperands ();
2364
- size_t NumDims = NumOps - 6 ;
2365
- bool IsCacheHint = N->getConstantOperandVal (NumOps - 1 ) == 1 ;
2366
- size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2 ); // src, dst, cache_hint
2367
-
2368
- SDLoc DL (N);
2369
- SmallVector<SDValue, 8 > Ops (N->ops ().slice (2 , NumArgs));
2370
- Ops.push_back (N->getOperand (0 )); // Chain operand
2371
-
2372
- bool IsShared32 =
2373
- CurDAG->getDataLayout ().getPointerSizeInBits (ADDRESS_SPACE_SHARED) == 32 ;
2374
- unsigned Opcode =
2375
- GetCpAsyncBulkTensorS2GOpcode (NumDims, IsShared32, IsCacheHint, IsIm2Col);
2376
- ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
2377
- }
2378
-
2379
- void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon (SDNode *N,
2380
- bool IsIm2Col) {
2381
- // We have {Chain, Intrinsic-ID} followed by the actual intrisic args:
2382
- // {src, dims{d0...dN}, im2col_offsets{dims-2}
2383
- // cache_hint, cache_hint_flag}
2384
- // NumOperands = {Chain, IID} + {Actual intrinsic args}
2385
- // = {2} + {3 + dims + im2col_offsets}
2386
- size_t NumOps = N->getNumOperands ();
2387
- size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic (N->getConstantOperandVal (1 ))
2388
- : (NumOps - 5 );
2389
- // Offsets is always 'NumDims - 2' and only for im2col mode
2390
- size_t NumOffsets = IsIm2Col ? (NumDims - 2 ) : 0 ;
2391
- bool IsCacheHint = N->getConstantOperandVal (NumOps - 1 ) == 1 ;
2392
- size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1 );
2393
-
2394
- SDLoc DL (N);
2395
- SmallVector<SDValue, 12 > Ops (N->ops ().slice (2 , NumArgs));
2396
- Ops.push_back (N->getOperand (0 )); // Chain operand
2397
-
2398
- unsigned Opcode =
2399
- GetCpAsyncBulkTensorPrefetchOpcode (NumDims, IsCacheHint, IsIm2Col);
2400
- ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
2401
- }
2402
-
2403
2314
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon (SDNode *N,
2404
2315
unsigned RedOp,
2405
2316
bool IsIm2Col) {
@@ -2419,8 +2330,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
2419
2330
2420
2331
bool IsShared32 =
2421
2332
CurDAG->getDataLayout ().getPointerSizeInBits (ADDRESS_SPACE_SHARED) == 32 ;
2422
- unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode (
2423
- NumDims, IsShared32, IsCacheHint, IsIm2Col, /* IsReduce= */ true );
2333
+ unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode (
2334
+ NumDims, IsShared32, IsCacheHint, IsIm2Col);
2424
2335
ReplaceNode (N, CurDAG->getMachineNode (Opcode, DL, N->getVTList (), Ops));
2425
2336
}
2426
2337
@@ -2540,18 +2451,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2540
2451
switch (IID) {
2541
2452
default :
2542
2453
return false ;
2543
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
2544
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
2545
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
2546
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
2547
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
2548
- SelectCpAsyncBulkTensorS2GCommon (N);
2549
- return true ;
2550
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
2551
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
2552
- case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
2553
- SelectCpAsyncBulkTensorS2GCommon (N, /* IsIm2Col=*/ true );
2554
- return true ;
2555
2454
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
2556
2455
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
2557
2456
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
@@ -2564,18 +2463,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
2564
2463
case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
2565
2464
SelectCpAsyncBulkTensorG2SCommon (N, /* IsIm2Col=*/ true );
2566
2465
return true ;
2567
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
2568
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
2569
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
2570
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
2571
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
2572
- SelectCpAsyncBulkTensorPrefetchCommon (N);
2573
- return true ;
2574
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
2575
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
2576
- case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
2577
- SelectCpAsyncBulkTensorPrefetchCommon (N, /* IsIm2Col=*/ true );
2578
- return true ;
2579
2466
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
2580
2467
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
2581
2468
case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:
0 commit comments