Commit bcb698b

[mlir][sparse][gpu] various cuSparse refinements

(1) keep all cuSparse ops on single stream without wait() in right order
(2) use more type precise memref types for COO
(3) use ToTensor on resulting memref (even though it folds away again)

Reviewed By: K-Wu

Differential Revision: https://reviews.llvm.org/D151404
1 parent edfd360 commit bcb698b
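
Refinement (1) amounts to chaining every asynchronous GPU op on the token of the previous one, so the device-to-host copy and the deallocations are issued in order on a single stream and only one blocking wait is needed at the very end. A minimal standalone CUDA host sketch of that idea (illustrative only; the buffer name, size, and use of the stream-ordered allocator are assumptions, not the code the pass generates):

// Sketch: all async work ordered on one CUDA stream, one final synchronize.
// Requires CUDA 11.2+ for the stream-ordered allocator (cudaMallocAsync).
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  const size_t n = 1024;
  std::vector<double> hostY(n, 0.0);
  double *devY = nullptr;
  cudaMallocAsync(reinterpret_cast<void **>(&devY), n * sizeof(double), stream);

  // ... SpMV/SpMM work writing devY would be enqueued on the same stream here ...

  // Stream order already guarantees the copy sees the finished results and
  // the free happens after the copy, so no intermediate wait is required.
  cudaMemcpyAsync(hostY.data(), devY, n * sizeof(double),
                  cudaMemcpyDeviceToHost, stream);
  cudaFreeAsync(devY, stream);

  cudaStreamSynchronize(stream); // single blocking wait at the very end
  cudaStreamDestroy(stream);
  std::printf("y[0] = %f\n", hostY[0]);
  return 0;
}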

4 files changed: +51 -55 lines


mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Lines changed: 13 additions & 18 deletions
@@ -378,23 +378,26 @@ static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,
                            bool isCOO, bool enableRT) {
   if (isCOO && !enableRT)
     return Value(); // nothing needed
-  return genToCoordinates(builder, loc, a, 1, /*cooStart=*/0);
+  return genToCoordinates(builder, loc, a, 1, /*cooStart=*/isCOO ? 0 : 2);
 }
 
 /// Generates the sparse matrix multiplication.
 static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,
-                           Type tokenTp, Value token, Value szY, Value szX,
+                           Type tokenTp, Value token, Value sz1, Value sz2,
                            Value nseA, Value rowA, Value colA, Value valA,
                            bool isCOO, bool enableRT) {
   if (isCOO) {
     // Library uses SoA COO, direct IR uses AoS COO.
-    if (enableRT)
+    if (enableRT) {
+      assert(colA);
       return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,
-                                               szY, szX, nseA, rowA, colA, valA);
+                                               sz1, sz2, nseA, rowA, colA, valA);
+    }
     llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
   }
-  return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, szY,
-                                          szX, nseA, rowA, colA, valA);
+  assert(colA);
+  return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,
+                                          sz2, nseA, rowA, colA, valA);
 }
 
 /// Match and rewrite SpMV kernel.
@@ -482,24 +485,20 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();
-  tokens.push_back(token);
-  genBlockingWait(rewriter, loc, tokens);
-  tokens.clear();
-  token = genFirstWait(rewriter, loc);
-  token = genCopyMemRef(rewriter, loc, memY, vecY, token);
   token = genDeallocMemRef(rewriter, loc, rowA, token);
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
   token = genDeallocMemRef(rewriter, loc, buffer, token);
   token = genDeallocMemRef(rewriter, loc, vecX, token);
+  token = genCopyMemRef(rewriter, loc, memY, vecY, token);
   token = genDeallocMemRef(rewriter, loc, vecY, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
 
   // Done.
-  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);
   return success();
 }
 
@@ -589,24 +588,20 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();
-  tokens.push_back(token);
-  genBlockingWait(rewriter, loc, tokens);
-  tokens.clear();
-  token = genFirstWait(rewriter, loc);
-  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
   token = genDeallocMemRef(rewriter, loc, rowA, token);
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
   token = genDeallocMemRef(rewriter, loc, buffer, token);
   token = genDeallocMemRef(rewriter, loc, matB, token);
+  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
   token = genDeallocMemRef(rewriter, loc, matC, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
 
   // Done.
-  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, matC);
   return success();
 }
 
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 6 additions & 3 deletions
@@ -248,9 +248,12 @@ static inline cusparseIndexType_t idxTp(int32_t width) {
 
 // Some macro magic to get float/double alpha and beta on host.
 #define ALPHABETA(w, alpha, beta) \
-  float(alpha##f) = 1.0, (beta##f) = 1.0; \
-  double(alpha##d) = 1.0, (beta##d) = 1.0; \
-  void *(alpha##p), *(beta##p); \
+  float(alpha##f) = 1.0f; \
+  float(beta##f) = 1.0f; \
+  double(alpha##d) = 1.0; \
+  double(beta##d) = 1.0; \
+  const void *(alpha##p) = nullptr; \
+  const void *(beta##p) = nullptr; \
   if ((w) == 32) { \
     (alpha##p) = reinterpret_cast<void *>(&(alpha##f)); \
     (beta##p) = reinterpret_cast<void *>(&(beta##f)); \
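
The refined macro gives alpha and beta separate float and double host copies, each explicitly initialized, plus null-initialized const void pointers that are later pointed at the matching copy once the element width is known. A standalone, simplified sketch of that selection pattern (hypothetical helper; the real ALPHABETA macro in CudaRuntimeWrappers.cpp also covers the 64-bit branch that the hunk above truncates):

// Simplified, self-contained illustration of the alpha/beta selection the
// ALPHABETA macro performs; names here are hypothetical, not the wrapper's API.
#include <cstdint>
#include <cstdio>

static void pickAlphaBeta(int32_t width) {
  float alphaf = 1.0f, betaf = 1.0f;  // host copies used when width == 32
  double alphad = 1.0, betad = 1.0;   // host copies used when width == 64
  const void *alphap = nullptr;
  const void *betap = nullptr;
  if (width == 32) {
    alphap = &alphaf;
    betap = &betaf;
  } else {
    alphap = &alphad;
    betap = &betad;
  }
  // In the real wrappers these pointers are handed to cuSPARSE (e.g. SpMV/SpMM).
  std::printf("width=%d alpha at %p, beta at %p\n", width, alphap, betap);
}

int main() {
  pickAlphaBeta(32);
  pickAlphaBeta(64);
  return 0;
}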

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir

Lines changed: 19 additions & 20 deletions
@@ -14,19 +14,19 @@
 // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
 // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>
-// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>
 // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
-// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex, strided<[?], offset: ?>>
-// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
 // CHECK: %[[VAL_12:.*]] = gpu.wait async
 // CHECK: %[[VAL_13:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex>
 // CHECK: %[[VAL_14:.*]], %[[VAL_15:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]]) : memref<?xindex>
 // CHECK: %[[VAL_16:.*]] = gpu.memcpy async {{\[}}%[[VAL_15]]] %[[VAL_14]], %[[VAL_9]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_17:.*]] = gpu.wait async
-// CHECK: %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>
+// CHECK: %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex>
 // CHECK: %[[VAL_19:.*]], %[[VAL_20:.*]] = gpu.alloc async {{\[}}%[[VAL_17]]] (%[[VAL_18]]) : memref<?xindex>
-// CHECK: %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>
+// CHECK: %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_22:.*]] = gpu.wait async
 // CHECK: %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?xf64>
 // CHECK: %[[VAL_24:.*]], %[[VAL_25:.*]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref<?xf64>
@@ -46,27 +46,26 @@
 // CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
 // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
-// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_8]], %[[VAL_7]], %[[VAL_31]] : memref<?x?xf64>
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_38]] : memref<?x?xf64>
+// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?x?xf64>
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?x?xf64>
 // CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
 // CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
 // CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
-// CHECK: gpu.wait {{\[}}%[[VAL_58]]]
-// CHECK: %[[VAL_59:.*]] = gpu.wait async
-// CHECK: %[[VAL_60:.*]] = gpu.memcpy async {{\[}}%[[VAL_59]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
-// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_14]] : memref<?xindex>
-// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_19]] : memref<?xindex>
-// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_24]] : memref<?xf64>
-// CHECK: %[[VAL_64:.*]] = gpu.dealloc async {{\[}}%[[VAL_63]]] %[[VAL_52]] : memref<?xi8>
-// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_31]] : memref<?x?xf64>
-// CHECK: %[[VAL_66:.*]] = gpu.dealloc async {{\[}}%[[VAL_65]]] %[[VAL_38]] : memref<?x?xf64>
-// CHECK: gpu.wait {{\[}}%[[VAL_66]]]
-// CHECK: return %[[VAL_2]] : tensor<?x?xf64>
+// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
+// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>
+// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref<?xf64>
+// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref<?xi8>
+// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref<?x?xf64>
+// CHECK: %[[VAL_64:.*]] = gpu.memcpy async {{\[}}%[[VAL_63]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
+// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_38]] : memref<?x?xf64>
+// CHECK: gpu.wait {{\[}}%[[VAL_65]]]
+// CHECK: %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_38]] : memref<?x?xf64>
+// CHECK: return %[[VAL_66]] : tensor<?x?xf64>
 // CHECK: }
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul

mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir

Lines changed: 13 additions & 14 deletions
@@ -16,9 +16,9 @@ module {
 // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex, strided<[?], offset: ?>>
+// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex, strided<[?], offset: ?>>
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
 // CHECK: %[[VAL_11:.*]] = gpu.wait async
 // CHECK: %[[VAL_12:.*]] = memref.dim %[[VAL_8]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>
 // CHECK: %[[VAL_13:.*]], %[[VAL_14:.*]] = gpu.alloc async {{\[}}%[[VAL_11]]] (%[[VAL_12]]) : memref<?xindex>
@@ -54,17 +54,16 @@ module {
 // CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]]
 // CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]]
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
-// CHECK: gpu.wait {{\[}}%[[VAL_55]]]
-// CHECK: %[[VAL_56:.*]] = gpu.wait async
-// CHECK: %[[VAL_57:.*]] = gpu.memcpy async {{\[}}%[[VAL_56]]] %[[VAL_32]], %[[VAL_35]] : memref<?xf64>, memref<?xf64>
-// CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_13]] : memref<?xindex>
-// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_18]] : memref<?xindex>
-// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_23]] : memref<?xf64>
-// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_49]] : memref<?xi8>
-// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_29]] : memref<?xf64>
-// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_35]] : memref<?xf64>
-// CHECK: gpu.wait {{\[}}%[[VAL_63]]]
-// CHECK: return %[[VAL_2]] : tensor<?xf64>
+// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref<?xindex>
+// CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref<?xindex>
+// CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref<?xf64>
+// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_49]] : memref<?xi8>
+// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_29]] : memref<?xf64>
+// CHECK: %[[VAL_61:.*]] = gpu.memcpy async {{\[}}%[[VAL_60]]] %[[VAL_32]], %[[VAL_35]] : memref<?xf64>, memref<?xf64>
+// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_35]] : memref<?xf64>
+// CHECK: gpu.wait {{\[}}%[[VAL_62]]]
+// CHECK: %[[VAL_63:.*]] = bufferization.to_tensor %[[VAL_32]] : memref<?xf64>
+// CHECK: return %[[VAL_63]] : tensor<?xf64>
 // CHECK: }
 func.func @matvec(%A: tensor<?x?xf64, #SortedCOO>,
                   %x: tensor<?xf64>,
