Add CUDA sync inside host_task

joeatodd · joeatodd · commit 94b96b4212f8 · 2022-06-30T11:44:31.000+01:00
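Synchronization of the CUDA work launched inside a host_task used to be
handled inadvertently by an implementation detail which has since changed,
so the examples now synchronize explicitly before the host_task returns.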
diff --git a/examples/cuda_interop/vec_add.cu b/examples/cuda_interop/vec_add.cu
@@ -74,6 +74,9 @@ int main(int argc, char *argv[]) {
         gridSize = static_cast<int>(ceil(static_cast<float>(n) / blockSize));
         // Call the CUDA kernel directly from SYCL
         vecAdd<<<gridSize, blockSize>>>(dA, dB, dC, n);
+        // Interop with host_task doesn't add CUDA event to task graph
+        // so we must manually sync here.
+        cudaDeviceSynchronize();
       });
     });
 
diff --git a/examples/sgemm_interop/sycl_sgemm.cpp b/examples/sgemm_interop/sycl_sgemm.cpp
@@ -81,6 +81,8 @@ int main() {
       auto d_C = b_C.get_access<sycl::access::mode::write>(h);
 
       h.host_task([=](sycl::interop_handle ih) {
+        auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
+        cublasSetStream(handle, cuStream);
         cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
         cublasSetStream(handle, ih.get_native_queue<backend::cuda>());
         auto cuA = reinterpret_cast<float *>(ih.get_native_mem<backend::cuda>(d_A));
@@ -90,6 +92,7 @@ int main() {
         CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                                 WIDTH, &ALPHA, cuA, WIDTH, cuB, WIDTH, &BETA,
                                 cuC, WIDTH));
+        cuStreamSynchronize(cuStream);
       });
     });
   }
diff --git a/examples/sgemm_interop/sycl_sgemm_usm.cpp b/examples/sgemm_interop/sycl_sgemm_usm.cpp
@@ -86,13 +86,14 @@ int main() {
     h.host_task([=](sycl::interop_handle ih) {
 
       // Set the correct cuda context & stream
-      cuCtxSetCurrent(ih.get_native_context<backend::cuda>());
-      cublasSetStream(handle, ih.get_native_queue<backend::cuda>());
+      auto cuStream = ih.get_native_queue<backend::ext_oneapi_cuda>();
+      cublasSetStream(handle, cuStream);
 
       // Call generalised matrix-matrix multiply
       CHECK_ERROR(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, WIDTH, HEIGHT,
                               WIDTH, &ALPHA, d_A, WIDTH, d_B, WIDTH, &BETA,
                               d_C, WIDTH));
+      cuStreamSynchronize(cuStream);
     });
   }).wait();