Skip to content

Commit 7fcfe3a

Browse files
authored
Merge pull request #1265 from cppchedy/chedy/device-to-device-copy
[Bindless][Exp] Add support for device to device copies between CuArrays
2 parents b37fa2c + f9fb116 commit 7fcfe3a

File tree

1 file changed

+76
-7
lines changed

1 file changed

+76
-7
lines changed

source/adapters/cuda/image.cpp

Lines changed: 76 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -690,12 +690,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
690690
}
691691
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
692692
CUDA_MEMCPY2D cpy_desc = {};
693-
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
694-
cpy_desc.srcHost = pSrc;
695693
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
696694
cpy_desc.srcY = srcOffset.y;
697695
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
698696
cpy_desc.dstY = dstOffset.y;
697+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
698+
cpy_desc.srcHost = pSrc;
699699
cpy_desc.srcPitch = hostExtent.width * PixelSizeBytes;
700700
if (pImageDesc->rowPitch == 0) {
701701
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
@@ -781,8 +781,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
781781
cpy_desc.srcY = srcOffset.y;
782782
cpy_desc.dstXInBytes = dstOffset.x;
783783
cpy_desc.dstY = dstOffset.y;
784-
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
785-
cpy_desc.dstHost = pDst;
786784
if (pImageDesc->rowPitch == 0) {
787785
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
788786
cpy_desc.srcArray = (CUarray)pSrc;
@@ -792,6 +790,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
792790
cpy_desc.srcPitch = pImageDesc->rowPitch;
793791
cpy_desc.srcDevice = (CUdeviceptr)pSrc;
794792
}
793+
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
794+
cpy_desc.dstHost = pDst;
795795
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
796796
cpy_desc.Height = copyExtent.height;
797797
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
@@ -831,10 +831,79 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
831831
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
832832
}
833833
} else {
834-
/// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835-
/// TODO: implement device to device copy
836-
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
834+
// imageCopyFlags == UR_EXP_IMAGE_COPY_FLAG_DEVICE_TO_DEVICE
835+
836+
// All the following async copy function calls should be treated as
837+
// synchronous because of the explicit call to cuStreamSynchronize at
838+
// the end
839+
if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
840+
CUDA_MEMCPY2D cpy_desc = {};
841+
cpy_desc.srcXInBytes = srcOffset.x;
842+
cpy_desc.srcY = 0;
843+
cpy_desc.dstXInBytes = dstOffset.x;
844+
cpy_desc.dstY = 0;
845+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
846+
cpy_desc.srcArray = (CUarray)pSrc;
847+
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
848+
cpy_desc.dstArray = (CUarray)pDst;
849+
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
850+
cpy_desc.Height = 1;
851+
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
852+
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
853+
CUDA_MEMCPY2D cpy_desc = {};
854+
cpy_desc.srcXInBytes = srcOffset.x;
855+
cpy_desc.srcY = srcOffset.y;
856+
cpy_desc.dstXInBytes = dstOffset.x;
857+
cpy_desc.dstY = dstOffset.y;
858+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
859+
cpy_desc.srcArray = (CUarray)pSrc;
860+
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
861+
cpy_desc.dstArray = (CUarray)pDst;
862+
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
863+
cpy_desc.Height = copyExtent.height;
864+
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
865+
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
866+
CUDA_MEMCPY3D cpy_desc = {};
867+
cpy_desc.srcXInBytes = srcOffset.x;
868+
cpy_desc.srcY = srcOffset.y;
869+
cpy_desc.srcZ = srcOffset.z;
870+
cpy_desc.dstXInBytes = dstOffset.x;
871+
cpy_desc.dstY = dstOffset.y;
872+
cpy_desc.dstZ = dstOffset.z;
873+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
874+
cpy_desc.srcArray = (CUarray)pSrc;
875+
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
876+
cpy_desc.dstArray = (CUarray)pDst;
877+
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
878+
cpy_desc.Height = copyExtent.height;
879+
cpy_desc.Depth = copyExtent.depth;
880+
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
881+
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D_ARRAY ||
882+
pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
883+
pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
884+
CUDA_MEMCPY3D cpy_desc = {};
885+
cpy_desc.srcXInBytes = srcOffset.x;
886+
cpy_desc.srcY = srcOffset.y;
887+
cpy_desc.srcZ = srcOffset.z;
888+
cpy_desc.dstXInBytes = dstOffset.x;
889+
cpy_desc.dstY = dstOffset.y;
890+
cpy_desc.dstZ = dstOffset.z;
891+
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
892+
cpy_desc.srcArray = (CUarray)pSrc;
893+
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
894+
cpy_desc.dstArray = (CUarray)pDst;
895+
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
896+
cpy_desc.Height = std::max(uint64_t{1}, copyExtent.height);
897+
cpy_desc.Depth = pImageDesc->arraySize;
898+
UR_CHECK_ERROR(cuMemcpy3DAsync(&cpy_desc, Stream));
899+
}
900+
// Synchronization is required here to handle the case of copying data
901+
// from host to device, then device to device and finally device to host.
902+
// Without it, there is a risk of the copies not being executed in the
903+
// intended order.
904+
cuStreamSynchronize(Stream);
837905
}
906+
838907
if (phEvent) {
839908
auto NewEvent = ur_event_handle_t_::makeNative(UR_COMMAND_MEM_IMAGE_COPY,
840909
hQueue, Stream);

0 commit comments

Comments
 (0)