@@ -2157,16 +2157,9 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
        ? NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_SHARED32_##mode##suffix \
        : NVPTX::CP_ASYNC_BULK_TENSOR_##dir##_##dim##_##mode##suffix)

- #define CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(op, dim, mode, is_ch, is_s32) \
-   (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, _CH)) \
-          : (CP_ASYNC_BULK_TENSOR_OPCODE(op, dim, mode, is_s32, )))
-
- #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(dim, mode, is_reduce, is_ch, \
-                                             is_s32) \
-   (is_reduce \
-        ? (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(RED, dim, mode, is_ch, is_s32)) \
-        : (CP_ASYNC_BULK_TENSOR_OPCODE_S2G_IMPL(S2G, dim, mode, is_ch, \
-                                                is_s32)))
+ #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(dim, mode, is_ch, is_s32) \
+   (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
+          : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))

#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
  [&]() -> auto { \
@@ -2179,48 +2172,45 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
  }()
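For reference, the new reduction macro composes with the CP_ASYNC_BULK_TENSOR_OPCODE base macro above. Below is a mechanical preprocessor expansion of one instantiation, shown as a sketch; it assumes the base macro's first line (outside this hunk) branches on is_s32, as its two visible arms indicate. This is illustrative only and not part of the patch:

// GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint, IsShared32)
// expands, after token pasting, to:
//   (IsCacheHint ? (IsShared32 ? NVPTX::CP_ASYNC_BULK_TENSOR_RED_3D_SHARED32_TILE_CH
//                              : NVPTX::CP_ASYNC_BULK_TENSOR_RED_3D_TILE_CH)
//                : (IsShared32 ? NVPTX::CP_ASYNC_BULK_TENSOR_RED_3D_SHARED32_TILE
//                              : NVPTX::CP_ASYNC_BULK_TENSOR_RED_3D_TILE))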

- #define GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(dim, mode, is_ch) \
-   (is_ch ? NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode##_CH \
-          : NVPTX::CP_ASYNC_BULK_TENSOR_PREFETCH_##dim##_##mode)
-
- static unsigned GetCpAsyncBulkTensorS2GOpcode(size_t Dim, bool IsShared32,
-                                               bool IsCacheHint, bool IsIm2Col,
-                                               bool IsReduce = false) {
+ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
+                                                        bool IsShared32,
+                                                        bool IsCacheHint,
+                                                        bool IsIm2Col) {
  if (IsIm2Col) {
    switch (Dim) {
    case 3:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, IM2COL, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, IM2COL, IsCacheHint,
+                                                    IsShared32);
    case 4:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, IM2COL, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, IM2COL, IsCacheHint,
+                                                    IsShared32);
    case 5:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, IM2COL, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, IM2COL, IsCacheHint,
+                                                    IsShared32);
    default:
      llvm_unreachable("Invalid Dimension in im2col mode for "
-                      "GetCpAsyncBulkTensorS2GOpcode.");
+                      "GetCpAsyncBulkTensorS2GReductionOpcode.");
    }
  } else {
    switch (Dim) {
    case 1:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(1D, TILE, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(1D, TILE, IsCacheHint,
+                                                    IsShared32);
    case 2:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(2D, TILE, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(2D, TILE, IsCacheHint,
+                                                    IsShared32);
    case 3:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(3D, TILE, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(3D, TILE, IsCacheHint,
+                                                    IsShared32);
    case 4:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(4D, TILE, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(4D, TILE, IsCacheHint,
+                                                    IsShared32);
    case 5:
-     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G(5D, TILE, IsReduce,
-                                                IsCacheHint, IsShared32);
+     return GET_CP_ASYNC_BULK_TENSOR_OPCODE_S2G_RED(5D, TILE, IsCacheHint,
+                                                    IsShared32);
    default:
-     llvm_unreachable(
-         "Invalid Dimension in tile mode for GetCpAsyncBulkTensorS2GOpcode.");
+     llvm_unreachable("Invalid Dimension in tile mode for "
+                      "GetCpAsyncBulkTensorS2GReductionOpcode.");
    }
  }
}
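As a usage sketch of the renamed helper (hypothetical argument values, not taken from the patch), selecting a 2D tile reduction with a cache hint on a 64-bit shared address space resolves through the macro above as follows:

// Hypothetical call site, for illustration only:
//   unsigned Opc = GetCpAsyncBulkTensorS2GReductionOpcode(
//       /*Dim=*/2, /*IsShared32=*/false, /*IsCacheHint=*/true, /*IsIm2Col=*/false);
//   // Opc == NVPTX::CP_ASYNC_BULK_TENSOR_RED_2D_TILE_CH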
@@ -2267,39 +2257,6 @@ static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
  }
}

- static unsigned GetCpAsyncBulkTensorPrefetchOpcode(size_t Dim, bool IsCacheHint,
-                                                    bool IsIm2Col) {
-   if (IsIm2Col) {
-     switch (Dim) {
-     case 3:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, IM2COL, IsCacheHint);
-     case 4:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, IM2COL, IsCacheHint);
-     case 5:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, IM2COL, IsCacheHint);
-     default:
-       llvm_unreachable("Invalid Dimension in im2col mode for "
-                        "GetCpAsyncBulkTensorPrefetchOpcode.");
-     }
-   } else {
-     switch (Dim) {
-     case 1:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(1D, TILE, IsCacheHint);
-     case 2:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(2D, TILE, IsCacheHint);
-     case 3:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(3D, TILE, IsCacheHint);
-     case 4:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(4D, TILE, IsCacheHint);
-     case 5:
-       return GET_CP_ASYNC_BULK_TENSOR_OPCODE_PREFETCH(5D, TILE, IsCacheHint);
-     default:
-       llvm_unreachable("Invalid Dimension in tile mode for "
-                        "GetCpAsyncBulkTensorPrefetchOpcode.");
-     }
-   }
- }
-
static size_t GetDimsFromIntrinsic(unsigned IID) {
  switch (IID) {
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
@@ -2364,52 +2321,6 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}

- void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorS2GCommon(SDNode *N,
-                                                          bool IsIm2Col) {
-   // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
-   // src, dst, dims{d0...dN}, cache_hint, cache_hint_flag
-   // NumOperands = {Chain, IID} + {Actual intrinsic args}
-   //             = {2} + {4 + dims}
-   size_t NumOps = N->getNumOperands();
-   size_t NumDims = NumOps - 6;
-   bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-   size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // src, dst, cache_hint
-
-   SDLoc DL(N);
-   SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumArgs));
-   Ops.push_back(N->getOperand(0)); // Chain operand
-
-   bool IsShared32 =
-       CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-   unsigned Opcode =
-       GetCpAsyncBulkTensorS2GOpcode(NumDims, IsShared32, IsCacheHint, IsIm2Col);
-   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
- }
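A worked instance of the operand arithmetic described in the comment of the removed function above, as a sketch with hypothetical counts (a 3D tile s2g node carrying a cache hint; not part of the patch):

// Node operands: {Chain, IID, src, dst, d0, d1, d2, cache_hint, flag}
size_t NumOps = 9;                                // 2 + (4 + 3 dims)
size_t NumDims = NumOps - 6;                      // == 3
bool IsCacheHint = true;                          // cache_hint_flag operand == 1
size_t NumArgs = NumDims + (IsCacheHint ? 3 : 2); // == 6: src, dst, d0..d2, cache_hint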
-
- void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorPrefetchCommon(SDNode *N,
-                                                               bool IsIm2Col) {
-   // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
-   // {src, dims{d0...dN}, im2col_offsets{dims-2}
-   //  cache_hint, cache_hint_flag}
-   // NumOperands = {Chain, IID} + {Actual intrinsic args}
-   //             = {2} + {3 + dims + im2col_offsets}
-   size_t NumOps = N->getNumOperands();
-   size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
-                             : (NumOps - 5);
-   // The offsets count is always 'NumDims - 2', and applies only in im2col mode
-   size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
-   bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1;
-   size_t NumArgs = NumDims + NumOffsets + (IsCacheHint ? 2 : 1);
-
-   SDLoc DL(N);
-   SmallVector<SDValue, 12> Ops(N->ops().slice(2, NumArgs));
-   Ops.push_back(N->getOperand(0)); // Chain operand
-
-   unsigned Opcode =
-       GetCpAsyncBulkTensorPrefetchOpcode(NumDims, IsCacheHint, IsIm2Col);
-   ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
- }
-
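Likewise for the removed prefetch path, a sketch of its operand arithmetic with hypothetical counts (a 5D im2col prefetch with a cache hint; this assumes GetDimsFromIntrinsic maps the _5d intrinsic to 5, which the visible cases suggest but this hunk does not show in full):

// Node operands: {Chain, IID, src, d0..d4, off0..off2, cache_hint, flag}
size_t NumDims = 5;                               // via GetDimsFromIntrinsic(IID)
size_t NumOffsets = NumDims - 2;                  // == 3; im2col mode only
size_t NumOps = 2 + 1 + NumDims + NumOffsets + 2; // == 13
size_t NumArgs = NumDims + NumOffsets + 2;        // == 10: src, dims, offsets, cache_hint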
void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                            unsigned RedOp,
                                                            bool IsIm2Col) {
@@ -2429,8 +2340,8 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
  bool IsShared32 =
      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-   unsigned Opcode = GetCpAsyncBulkTensorS2GOpcode(
-       NumDims, IsShared32, IsCacheHint, IsIm2Col, /*IsReduce=*/true);
+   unsigned Opcode = GetCpAsyncBulkTensorS2GReductionOpcode(
+       NumDims, IsShared32, IsCacheHint, IsIm2Col);
  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
}
@@ -2550,18 +2461,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  switch (IID) {
  default:
    return false;
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_1d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_2d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_3d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_4d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_tile_5d:
-     SelectCpAsyncBulkTensorS2GCommon(N);
-     return true;
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_3d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_4d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_s2g_im2col_5d:
-     SelectCpAsyncBulkTensorS2GCommon(N, /*IsIm2Col=*/true);
-     return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
@@ -2574,18 +2473,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
    SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
    return true;
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_1d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_2d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_3d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_4d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_tile_5d:
-     SelectCpAsyncBulkTensorPrefetchCommon(N);
-     return true;
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
-   case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
-     SelectCpAsyncBulkTensorPrefetchCommon(N, /*IsIm2Col=*/true);
-     return true;
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
  case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d: