Commit bcb698b

[mlir][sparse][gpu] various cuSparse refinements

(1) keep all cuSparse ops on single stream without wait() in right order
(2) use more type precise memref types for COO
(3) use ToTensor on resulting memref (even though it folds away again)

Reviewed By: K-Wu

Differential Revision: https://reviews.llvm.org/D151404
1 parent edfd360 commit bcb698b
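
Refinement (1) amounts to chaining every asynchronous GPU op on the token of the previous one, so the device-to-host copy and the deallocations are issued in order on a single stream and only one blocking wait is needed at the very end. A minimal standalone CUDA host sketch of that idea (illustrative only; the buffer name, size, and use of the stream-ordered allocator are assumptions, not the code the pass generates):

// Sketch: all async work ordered on one CUDA stream, one final synchronize.
// Requires CUDA 11.2+ for the stream-ordered allocator (cudaMallocAsync).
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  const size_t n = 1024;
  std::vector<double> hostY(n, 0.0);
  double *devY = nullptr;
  cudaMallocAsync(reinterpret_cast<void **>(&devY), n * sizeof(double), stream);

  // ... SpMV/SpMM work writing devY would be enqueued on the same stream here ...

  // Stream order already guarantees the copy sees the finished results and
  // the free happens after the copy, so no intermediate wait is required.
  cudaMemcpyAsync(hostY.data(), devY, n * sizeof(double),
                  cudaMemcpyDeviceToHost, stream);
  cudaFreeAsync(devY, stream);

  cudaStreamSynchronize(stream); // single blocking wait at the very end
  cudaStreamDestroy(stream);
  std::printf("y[0] = %f\n", hostY[0]);
  return 0;
}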

4 files changed: +51 -55 lines


mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Lines changed: 13 additions & 18 deletions
@@ -378,23 +378,26 @@ static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,
                            bool isCOO, bool enableRT) {
   if (isCOO && !enableRT)
     return Value(); // nothing needed
-  return genToCoordinates(builder, loc, a, 1, /*cooStart=*/0);
+  return genToCoordinates(builder, loc, a, 1, /*cooStart=*/isCOO ? 0 : 2);
 }
 
 /// Generates the sparse matrix multiplication.
 static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,
-                           Type tokenTp, Value token, Value szY, Value szX,
+                           Type tokenTp, Value token, Value sz1, Value sz2,
                            Value nseA, Value rowA, Value colA, Value valA,
                            bool isCOO, bool enableRT) {
   if (isCOO) {
     // Library uses SoA COO, direct IR uses AoS COO.
-    if (enableRT)
+    if (enableRT) {
+      assert(colA);
       return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,
-                                               szY, szX, nseA, rowA, colA, valA);
+                                               sz1, sz2, nseA, rowA, colA, valA);
+    }
     llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
   }
-  return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, szY,
-                                          szX, nseA, rowA, colA, valA);
+  assert(colA);
+  return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,
+                                          sz2, nseA, rowA, colA, valA);
 }
 
 /// Match and rewrite SpMV kernel.
@@ -482,24 +485,20 @@ static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
                .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();
-  tokens.push_back(token);
-  genBlockingWait(rewriter, loc, tokens);
-  tokens.clear();
-  token = genFirstWait(rewriter, loc);
-  token = genCopyMemRef(rewriter, loc, memY, vecY, token);
   token = genDeallocMemRef(rewriter, loc, rowA, token);
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
   token = genDeallocMemRef(rewriter, loc, buffer, token);
   token = genDeallocMemRef(rewriter, loc, vecX, token);
+  token = genCopyMemRef(rewriter, loc, memY, vecY, token);
   token = genDeallocMemRef(rewriter, loc, vecY, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
 
   // Done.
-  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);
   return success();
 }
 
@@ -589,24 +588,20 @@ static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
                .getAsyncToken();
   token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
               .getAsyncToken();
-  tokens.push_back(token);
-  genBlockingWait(rewriter, loc, tokens);
-  tokens.clear();
-  token = genFirstWait(rewriter, loc);
-  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
   token = genDeallocMemRef(rewriter, loc, rowA, token);
   if (colA)
     token = genDeallocMemRef(rewriter, loc, colA, token);
   token = genDeallocMemRef(rewriter, loc, valA, token);
   token = genDeallocMemRef(rewriter, loc, buffer, token);
   token = genDeallocMemRef(rewriter, loc, matB, token);
+  token = genCopyMemRef(rewriter, loc, bufC, matC, token);
   token = genDeallocMemRef(rewriter, loc, matC, token);
   tokens.push_back(token);
   genBlockingWait(rewriter, loc, tokens);
   tokens.clear();
 
   // Done.
-  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, matC);
   return success();
 }
 
mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Lines changed: 6 additions & 3 deletions
@@ -248,9 +248,12 @@ static inline cusparseIndexType_t idxTp(int32_t width) {
 
 // Some macro magic to get float/double alpha and beta on host.
 #define ALPHABETA(w, alpha, beta) \
-  float(alpha##f) = 1.0, (beta##f) = 1.0; \
-  double(alpha##d) = 1.0, (beta##d) = 1.0; \
-  void *(alpha##p), *(beta##p); \
+  float(alpha##f) = 1.0f; \
+  float(beta##f) = 1.0f; \
+  double(alpha##d) = 1.0; \
+  double(beta##d) = 1.0; \
+  const void *(alpha##p) = nullptr; \
+  const void *(beta##p) = nullptr; \
   if ((w) == 32) { \
     (alpha##p) = reinterpret_cast<void *>(&(alpha##f)); \
     (beta##p) = reinterpret_cast<void *>(&(beta##f)); \
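
The refined macro gives alpha and beta separate float and double host copies, each explicitly initialized, plus null-initialized const void pointers that are later pointed at the matching copy once the element width is known. A standalone, simplified sketch of that selection pattern (hypothetical helper; the real ALPHABETA macro in CudaRuntimeWrappers.cpp also covers the 64-bit branch that the hunk above truncates):

// Simplified, self-contained illustration of the alpha/beta selection the
// ALPHABETA macro performs; names here are hypothetical, not the wrapper's API.
#include <cstdint>
#include <cstdio>

static void pickAlphaBeta(int32_t width) {
  float alphaf = 1.0f, betaf = 1.0f;  // host copies used when width == 32
  double alphad = 1.0, betad = 1.0;   // host copies used when width == 64
  const void *alphap = nullptr;
  const void *betap = nullptr;
  if (width == 32) {
    alphap = &alphaf;
    betap = &betaf;
  } else {
    alphap = &alphad;
    betap = &betad;
  }
  // In the real wrappers these pointers are handed to cuSPARSE (e.g. SpMV/SpMM).
  std::printf("width=%d alpha at %p, beta at %p\n", width, alphap, betap);
}

int main() {
  pickAlphaBeta(32);
  pickAlphaBeta(64);
  return 0;
}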

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir

Lines changed: 19 additions & 20 deletions
@@ -14,19 +14,19 @@
 // CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
 // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>
-// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>
 // CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
-// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex, strided<[?], offset: ?>>
-// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
 // CHECK: %[[VAL_12:.*]] = gpu.wait async
 // CHECK: %[[VAL_13:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex>
 // CHECK: %[[VAL_14:.*]], %[[VAL_15:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]]) : memref<?xindex>
 // CHECK: %[[VAL_16:.*]] = gpu.memcpy async {{\[}}%[[VAL_15]]] %[[VAL_14]], %[[VAL_9]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_17:.*]] = gpu.wait async
-// CHECK: %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>
+// CHECK: %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex>
 // CHECK: %[[VAL_19:.*]], %[[VAL_20:.*]] = gpu.alloc async {{\[}}%[[VAL_17]]] (%[[VAL_18]]) : memref<?xindex>
-// CHECK: %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>
+// CHECK: %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex>
 // CHECK: %[[VAL_22:.*]] = gpu.wait async
 // CHECK: %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?xf64>
 // CHECK: %[[VAL_24:.*]], %[[VAL_25:.*]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref<?xf64>
@@ -46,27 +46,26 @@
 // CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
 // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
-// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_8]], %[[VAL_7]], %[[VAL_31]] : memref<?x?xf64>
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_38]] : memref<?x?xf64>
+// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?x?xf64>
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?x?xf64>
 // CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
 // CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
 // CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
-// CHECK: gpu.wait {{\[}}%[[VAL_58]]]
-// CHECK: %[[VAL_59:.*]] = gpu.wait async
-// CHECK: %[[VAL_60:.*]] = gpu.memcpy async {{\[}}%[[VAL_59]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
-// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_14]] : memref<?xindex>
-// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_19]] : memref<?xindex>
-// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_24]] : memref<?xf64>
-// CHECK: %[[VAL_64:.*]] = gpu.dealloc async {{\[}}%[[VAL_63]]] %[[VAL_52]] : memref<?xi8>
-// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_31]] : memref<?x?xf64>
-// CHECK: %[[VAL_66:.*]] = gpu.dealloc async {{\[}}%[[VAL_65]]] %[[VAL_38]] : memref<?x?xf64>
-// CHECK: gpu.wait {{\[}}%[[VAL_66]]]
-// CHECK: return %[[VAL_2]] : tensor<?x?xf64>
+// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
+// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>
+// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref<?xf64>
+// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref<?xi8>
+// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref<?x?xf64>
+// CHECK: %[[VAL_64:.*]] = gpu.memcpy async {{\[}}%[[VAL_63]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
+// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_38]] : memref<?x?xf64>
+// CHECK: gpu.wait {{\[}}%[[VAL_65]]]
+// CHECK: %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_38]] : memref<?x?xf64>
+// CHECK: return %[[VAL_66]] : tensor<?x?xf64>
 // CHECK: }
 func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
   %C_out = linalg.matmul

mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir

Lines changed: 13 additions & 14 deletions
@@ -16,9 +16,9 @@ module {
 // CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
 // CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
-// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex, strided<[?], offset: ?>>
+// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex, strided<[?], offset: ?>>
+// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
 // CHECK: %[[VAL_11:.*]] = gpu.wait async
 // CHECK: %[[VAL_12:.*]] = memref.dim %[[VAL_8]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>
 // CHECK: %[[VAL_13:.*]], %[[VAL_14:.*]] = gpu.alloc async {{\[}}%[[VAL_11]]] (%[[VAL_12]]) : memref<?xindex>
@@ -54,17 +54,16 @@ module {
 // CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]]
 // CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]]
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
-// CHECK: gpu.wait {{\[}}%[[VAL_55]]]
-// CHECK: %[[VAL_56:.*]] = gpu.wait async
-// CHECK: %[[VAL_57:.*]] = gpu.memcpy async {{\[}}%[[VAL_56]]] %[[VAL_32]], %[[VAL_35]] : memref<?xf64>, memref<?xf64>
-// CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_13]] : memref<?xindex>
-// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_18]] : memref<?xindex>
-// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_23]] : memref<?xf64>
-// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_49]] : memref<?xi8>
-// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_29]] : memref<?xf64>
-// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_35]] : memref<?xf64>
-// CHECK: gpu.wait {{\[}}%[[VAL_63]]]
-// CHECK: return %[[VAL_2]] : tensor<?xf64>
+// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref<?xindex>
+// CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref<?xindex>
+// CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref<?xf64>
+// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_49]] : memref<?xi8>
+// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_29]] : memref<?xf64>
+// CHECK: %[[VAL_61:.*]] = gpu.memcpy async {{\[}}%[[VAL_60]]] %[[VAL_32]], %[[VAL_35]] : memref<?xf64>, memref<?xf64>
+// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_35]] : memref<?xf64>
+// CHECK: gpu.wait {{\[}}%[[VAL_62]]]
+// CHECK: %[[VAL_63:.*]] = bufferization.to_tensor %[[VAL_32]] : memref<?xf64>
+// CHECK: return %[[VAL_63]] : tensor<?xf64>
 // CHECK: }
 func.func @matvec(%A: tensor<?x?xf64, #SortedCOO>,
                   %x: tensor<?xf64>,
