@@ -690,12 +690,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
690
690
}
691
691
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
692
692
CUDA_MEMCPY2D cpy_desc = {};
693
- cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
694
- cpy_desc.srcHost = pSrc;
695
693
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
696
694
cpy_desc.srcY = srcOffset.y ;
697
695
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
698
696
cpy_desc.dstY = dstOffset.y ;
697
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
698
+ cpy_desc.srcHost = pSrc;
699
699
cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes;
700
700
if (pImageDesc->rowPitch == 0 ) {
701
701
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
@@ -781,8 +781,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
781
781
cpy_desc.srcY = srcOffset.y ;
782
782
cpy_desc.dstXInBytes = dstOffset.x ;
783
783
cpy_desc.dstY = dstOffset.y ;
784
- cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
785
- cpy_desc.dstHost = pDst;
786
784
if (pImageDesc->rowPitch == 0 ) {
787
785
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
788
786
cpy_desc.srcArray = (CUarray)pSrc;
@@ -792,6 +790,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
792
790
cpy_desc.srcPitch = pImageDesc->rowPitch ;
793
791
cpy_desc.srcDevice = (CUdeviceptr)pSrc;
794
792
}
793
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
794
+ cpy_desc.dstHost = pDst;
795
795
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
796
796
cpy_desc.Height = copyExtent.height ;
797
797
UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
@@ -831,10 +831,79 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
831
831
UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
832
832
}
833
833
} else {
834
- // / imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835
- // / TODO: implemet device to device copy
836
- return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
834
+ // imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835
+
836
+ // All the following async copy function calls should be treated as
837
+ // synchronous because of the explicit call to cuStreamSynchronize at
838
+ // the end
839
+ if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
840
+ CUDA_MEMCPY2D cpy_desc = {};
841
+ cpy_desc.srcXInBytes = srcOffset.x ;
842
+ cpy_desc.srcY = 0 ;
843
+ cpy_desc.dstXInBytes = dstOffset.x ;
844
+ cpy_desc.dstY = 0 ;
845
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
846
+ cpy_desc.srcArray = (CUarray)pSrc;
847
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
848
+ cpy_desc.dstArray = (CUarray)pDst;
849
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
850
+ cpy_desc.Height = 1 ;
851
+ UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
852
+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
853
+ CUDA_MEMCPY2D cpy_desc = {};
854
+ cpy_desc.srcXInBytes = srcOffset.x ;
855
+ cpy_desc.srcY = srcOffset.y ;
856
+ cpy_desc.dstXInBytes = dstOffset.x ;
857
+ cpy_desc.dstY = dstOffset.y ;
858
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
859
+ cpy_desc.srcArray = (CUarray)pSrc;
860
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
861
+ cpy_desc.dstArray = (CUarray)pDst;
862
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
863
+ cpy_desc.Height = copyExtent.height ;
864
+ UR_CHECK_ERROR (cuMemcpy2DAsync (&cpy_desc, Stream));
865
+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
866
+ CUDA_MEMCPY3D cpy_desc = {};
867
+ cpy_desc.srcXInBytes = srcOffset.x ;
868
+ cpy_desc.srcY = srcOffset.y ;
869
+ cpy_desc.srcZ = srcOffset.z ;
870
+ cpy_desc.dstXInBytes = dstOffset.x ;
871
+ cpy_desc.dstY = dstOffset.y ;
872
+ cpy_desc.dstZ = dstOffset.z ;
873
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
874
+ cpy_desc.srcArray = (CUarray)pSrc;
875
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
876
+ cpy_desc.dstArray = (CUarray)pDst;
877
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
878
+ cpy_desc.Height = copyExtent.height ;
879
+ cpy_desc.Depth = copyExtent.depth ;
880
+ UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
881
+ } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
882
+ pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
883
+ pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
884
+ CUDA_MEMCPY3D cpy_desc = {};
885
+ cpy_desc.srcXInBytes = srcOffset.x ;
886
+ cpy_desc.srcY = srcOffset.y ;
887
+ cpy_desc.srcZ = srcOffset.z ;
888
+ cpy_desc.dstXInBytes = dstOffset.x ;
889
+ cpy_desc.dstY = dstOffset.y ;
890
+ cpy_desc.dstZ = dstOffset.z ;
891
+ cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
892
+ cpy_desc.srcArray = (CUarray)pSrc;
893
+ cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
894
+ cpy_desc.dstArray = (CUarray)pDst;
895
+ cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width ;
896
+ cpy_desc.Height = std::max (uint64_t {1 }, copyExtent.height );
897
+ cpy_desc.Depth = pImageDesc->arraySize ;
898
+ UR_CHECK_ERROR (cuMemcpy3DAsync (&cpy_desc, Stream));
899
+ }
900
+ // Synchronization is required here to handle the case of copying data
901
+ // from host to device, then device to device and finally device to host.
902
+ // Without it, there is a risk of the copies not being executed in the
903
+ // intended order.
904
+ cuStreamSynchronize (Stream);
837
905
}
906
+
838
907
if (phEvent) {
839
908
auto NewEvent = ur_event_handle_t_::makeNative (UR_COMMAND_MEM_IMAGE_COPY,
840
909
hQueue, Stream);
0 commit comments