@@ -659,6 +659,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
659
659
enqueueEventsWait (hQueue, Stream, numEventsInWaitList, phEventWaitList);
660
660
661
661
// We have to use a different copy function for each image dimensionality.
662
+ // All the async copy functions should be treated as synchronous because of
663
+ // the explicit call to cuStreamSynchronize at the end
662
664
663
665
if (imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_HOST_TO_DEVICE) {
664
666
if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
@@ -893,12 +895,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
893
895
cpy_desc.Depth = pImageDesc->arraySize ;
894
896
UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
895
897
}
898
+ // Synchronization is required here to handle the case of copying data
899
+ // from
900
+ // host to device, then device to device and finally device to host.
901
+ // Without it, there is a risk of the copies not being executed in the
902
+ // intended order.
903
+ cuStreamSynchronize (Stream);
896
904
}
897
- // Synchronization is required here to handle the case of copying data from
898
- // host to device, then device to device and finally device to host.
899
- // Without it, there is a risk of the copies not being executed in the
900
- // intended order.
901
- cuStreamSynchronize (Stream);
905
+
902
906
if (phEvent) {
903
907
auto NewEvent = ur_event_handle_t_::makeNative (UR_COMMAND_MEM_IMAGE_COPY,
904
908
hQueue, Stream);
0 commit comments