Skip to content

Commit 2f6ca9c

Browse files
committed
Merge branch 'main' into review/yang/misalign_access
2 parents 772ae05 + 187c2fa commit 2f6ca9c

28 files changed

+875
-193
lines changed

.github/workflows/e2e_core.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,11 @@ jobs:
174174
echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV
175175
echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV
176176
177+
# TODO: remove once intel/llvm lit tests can properly recognize the GPU
178+
- name: Configure hardware platform feature for L0
179+
if: matrix.adapter.name == 'L0'
180+
run: sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py
181+
177182
- name: Run e2e tests
178183
id: tests
179184
run: ninja -C build-e2e check-sycl-e2e

.github/workflows/e2e_level_zero.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,9 @@ jobs:
2121
config: ""
2222
unit: "gpu"
2323
# Failing tests
24-
xfail: "ESIMD/preemption.cpp;syclcompat/atomic/atomic_class.cpp;ProgramManager/uneven_kernel_split.cpp;Plugin/level_zero_ext_intel_queue_index.cpp;Plugin/level_zero_ext_intel_cslice.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;Matrix/element_wise_ops.cpp;Matrix/element_wise_all_ops.cpp;Matrix/SG32/element_wise_all_ops.cpp"
24+
xfail: "ESIMD/preemption.cpp;Matrix/SG32/element_wise_all_ops.cpp;Matrix/SG32/get_coord_int8_matB.cpp;Matrix/element_wise_all_ops.cpp;Matrix/element_wise_all_ops_1d.cpp;Matrix/element_wise_all_ops_1d_cont.cpp;Matrix/element_wise_all_ops_scalar.cpp;Matrix/element_wise_ops.cpp;Matrix/get_coord_int8_matB.cpp;Matrix/joint_matrix_apply_bf16.cpp;Matrix/joint_matrix_apply_two_matrices.cpp;Matrix/joint_matrix_bfloat16.cpp;Matrix/joint_matrix_bfloat16_array.cpp;Matrix/joint_matrix_rowmajorA_rowmajorB.cpp;ProgramManager/uneven_kernel_split.cpp"
2525
# Flaky tests
26-
filter_out: "GroupAlgorithm/root_group.cpp|Basic/exceptions-SYCL-2020.cpp|Graph/UnsupportedDevice/device_query.cpp|Graph/RecordReplay/exception_inconsistent_contexts.cpp"
27-
# These runners by default spawn upwards of 260 workers. That's too much for the GPU.
26+
filter_out: "UserDefinedReductions/user_defined_reductions.cpp"
27+
# These runners by default spawn upwards of 260 workers, so we cap the job count with -j.
2828
# We also add a timeout in case a test hangs
29-
extra_lit_flags: "-sv -j 50 --max-time 600"
29+
extra_lit_flags: "-sv -j 100 --max-time 600"

source/adapters/cuda/image.cpp

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -773,9 +773,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
773773
}
774774
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
775775
CUDA_MEMCPY2D cpy_desc = {};
776-
cpy_desc.srcXInBytes = srcOffset.x;
776+
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
777777
cpy_desc.srcY = srcOffset.y;
778-
cpy_desc.dstXInBytes = dstOffset.x;
778+
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
779779
cpy_desc.dstY = dstOffset.y;
780780
if (pImageDesc->rowPitch == 0) {
781781
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
@@ -788,21 +788,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
788788
}
789789
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
790790
cpy_desc.dstHost = pDst;
791+
cpy_desc.dstPitch = hostExtent.width * PixelSizeBytes;
791792
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
792793
cpy_desc.Height = copyExtent.height;
793794
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
794795
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
795796
CUDA_MEMCPY3D cpy_desc = {};
796-
cpy_desc.srcXInBytes = srcOffset.x;
797+
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
797798
cpy_desc.srcY = srcOffset.y;
798799
cpy_desc.srcZ = srcOffset.z;
799-
cpy_desc.dstXInBytes = dstOffset.x;
800+
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
800801
cpy_desc.dstY = dstOffset.y;
801802
cpy_desc.dstZ = dstOffset.z;
802803
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
803804
cpy_desc.srcArray = (CUarray)pSrc;
804805
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
805806
cpy_desc.dstHost = pDst;
807+
cpy_desc.dstPitch = hostExtent.width * PixelSizeBytes;
808+
cpy_desc.dstHeight = hostExtent.height;
806809
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
807810
cpy_desc.Height = copyExtent.height;
808811
cpy_desc.Depth = copyExtent.depth;
@@ -811,16 +814,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
811814
pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
812815
pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
813816
CUDA_MEMCPY3D cpy_desc = {};
814-
cpy_desc.srcXInBytes = srcOffset.x;
817+
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
815818
cpy_desc.srcY = srcOffset.y;
816819
cpy_desc.srcZ = srcOffset.z;
817-
cpy_desc.dstXInBytes = dstOffset.x;
820+
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
818821
cpy_desc.dstY = dstOffset.y;
819822
cpy_desc.dstZ = dstOffset.z;
820823
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
821824
cpy_desc.srcArray = (CUarray)pSrc;
822825
cpy_desc.dstMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_HOST;
823826
cpy_desc.dstHost = pDst;
827+
cpy_desc.dstPitch = hostExtent.width * PixelSizeBytes;
828+
cpy_desc.dstHeight = hostExtent.height;
824829
cpy_desc.WidthInBytes = PixelSizeBytes * copyExtent.width;
825830
cpy_desc.Height = std::max(uint64_t{1}, copyExtent.height);
826831
cpy_desc.Depth = pImageDesc->arraySize;
@@ -834,9 +839,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
834839
// the end
835840
if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) {
836841
CUDA_MEMCPY2D cpy_desc = {};
837-
cpy_desc.srcXInBytes = srcOffset.x;
842+
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
838843
cpy_desc.srcY = 0;
839-
cpy_desc.dstXInBytes = dstOffset.x;
844+
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
840845
cpy_desc.dstY = 0;
841846
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
842847
cpy_desc.srcArray = (CUarray)pSrc;
@@ -847,9 +852,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
847852
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
848853
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) {
849854
CUDA_MEMCPY2D cpy_desc = {};
850-
cpy_desc.srcXInBytes = srcOffset.x;
855+
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
851856
cpy_desc.srcY = srcOffset.y;
852-
cpy_desc.dstXInBytes = dstOffset.x;
857+
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
853858
cpy_desc.dstY = dstOffset.y;
854859
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
855860
cpy_desc.srcArray = (CUarray)pSrc;
@@ -860,10 +865,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
860865
UR_CHECK_ERROR(cuMemcpy2DAsync(&cpy_desc, Stream));
861866
} else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) {
862867
CUDA_MEMCPY3D cpy_desc = {};
863-
cpy_desc.srcXInBytes = srcOffset.x;
868+
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
864869
cpy_desc.srcY = srcOffset.y;
865870
cpy_desc.srcZ = srcOffset.z;
866-
cpy_desc.dstXInBytes = dstOffset.x;
871+
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
867872
cpy_desc.dstY = dstOffset.y;
868873
cpy_desc.dstZ = dstOffset.z;
869874
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;
@@ -878,10 +883,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesImageCopyExp(
878883
pImageDesc->type == UR_MEM_TYPE_IMAGE2D_ARRAY ||
879884
pImageDesc->type == UR_MEM_TYPE_IMAGE_CUBEMAP_EXP) {
880885
CUDA_MEMCPY3D cpy_desc = {};
881-
cpy_desc.srcXInBytes = srcOffset.x;
886+
cpy_desc.srcXInBytes = srcOffset.x * PixelSizeBytes;
882887
cpy_desc.srcY = srcOffset.y;
883888
cpy_desc.srcZ = srcOffset.z;
884-
cpy_desc.dstXInBytes = dstOffset.x;
889+
cpy_desc.dstXInBytes = dstOffset.x * PixelSizeBytes;
885890
cpy_desc.dstY = dstOffset.y;
886891
cpy_desc.dstZ = dstOffset.z;
887892
cpy_desc.srcMemoryType = CUmemorytype_enum::CU_MEMORYTYPE_ARRAY;

source/adapters/level_zero/adapter.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,9 @@ ur_result_t initPlatforms(PlatformVec &platforms) noexcept try {
4949
for (uint32_t I = 0; I < ZeDriverCount; ++I) {
5050
auto platform = std::make_unique<ur_platform_handle_t_>(ZeDrivers[I]);
5151
UR_CALL(platform->initialize());
52+
ZE2UR_CALL(zelLoaderTranslateHandle,
53+
(ZEL_HANDLE_DRIVER, platform->ZeDriver,
54+
(void **)&platform->ZeDriverHandleExpTranslated));
5255

5356
// Save a copy in the cache for future uses.
5457
platforms.push_back(std::move(platform));

source/adapters/level_zero/adapter.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "logger/ur_logger.hpp"
1313
#include <atomic>
14+
#include <loader/ze_loader.h>
1415
#include <mutex>
1516
#include <optional>
1617
#include <ur/ur.hpp>

source/adapters/level_zero/command_buffer.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,12 +56,14 @@ bool PreferCopyEngineForFill = [] {
5656
ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
5757
ur_context_handle_t Context, ur_device_handle_t Device,
5858
ze_command_list_handle_t CommandList,
59+
ze_command_list_handle_t CommandListTranslated,
5960
ze_command_list_handle_t CommandListResetEvents,
6061
ze_command_list_handle_t CopyCommandList,
6162
ZeStruct<ze_command_list_desc_t> ZeDesc,
6263
ZeStruct<ze_command_list_desc_t> ZeCopyDesc,
6364
const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList)
6465
: Context(Context), Device(Device), ZeComputeCommandList(CommandList),
66+
ZeComputeCommandListTranslated(CommandListTranslated),
6567
ZeCommandListResetEvents(CommandListResetEvents),
6668
ZeCommandListDesc(ZeDesc), ZeCopyCommandList(CopyCommandList),
6769
ZeCopyCommandListDesc(ZeCopyDesc), ZeFencesMap(), ZeActiveFence(nullptr),
@@ -605,11 +607,16 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
605607
&ZeCopyCommandList));
606608
}
607609

610+
ze_command_list_handle_t ZeComputeCommandListTranslated = nullptr;
611+
ZE2UR_CALL(zelLoaderTranslateHandle,
612+
(ZEL_HANDLE_COMMAND_LIST, ZeComputeCommandList,
613+
(void **)&ZeComputeCommandListTranslated));
614+
608615
try {
609616
*CommandBuffer = new ur_exp_command_buffer_handle_t_(
610-
Context, Device, ZeComputeCommandList, ZeCommandListResetEvents,
611-
ZeCopyCommandList, ZeCommandListDesc, ZeCopyCommandListDesc,
612-
CommandBufferDesc, IsInOrder);
617+
Context, Device, ZeComputeCommandList, ZeComputeCommandListTranslated,
618+
ZeCommandListResetEvents, ZeCopyCommandList, ZeCommandListDesc,
619+
ZeCopyCommandListDesc, CommandBufferDesc, IsInOrder);
613620
} catch (const std::bad_alloc &) {
614621
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
615622
} catch (...) {
@@ -791,8 +798,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendKernelLaunchExp(
791798
UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
792799
UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
793800
ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListGetNextCommandIdExp,
794-
(CommandBuffer->ZeComputeCommandList, &ZeMutableCommandDesc,
795-
&CommandId));
801+
(CommandBuffer->ZeComputeCommandListTranslated,
802+
&ZeMutableCommandDesc, &CommandId));
796803
DEBUG_LOG(CommandId);
797804
}
798805
try {
@@ -1619,8 +1626,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
16191626
auto Plt = CommandBuffer->Context->getPlatform();
16201627
UR_ASSERT(Plt->ZeMutableCmdListExt.Supported,
16211628
UR_RESULT_ERROR_UNSUPPORTED_FEATURE);
1622-
ZE2UR_CALL(Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,
1623-
(CommandBuffer->ZeComputeCommandList, &MutableCommandDesc));
1629+
ZE2UR_CALL(
1630+
Plt->ZeMutableCmdListExt.zexCommandListUpdateMutableCommandsExp,
1631+
(CommandBuffer->ZeComputeCommandListTranslated, &MutableCommandDesc));
16241632
ZE2UR_CALL(zeCommandListClose, (CommandBuffer->ZeComputeCommandList));
16251633

16261634
return UR_RESULT_SUCCESS;

source/adapters/level_zero/command_buffer.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
2828
ur_exp_command_buffer_handle_t_(
2929
ur_context_handle_t Context, ur_device_handle_t Device,
3030
ze_command_list_handle_t CommandList,
31+
ze_command_list_handle_t CommandListTranslated,
3132
ze_command_list_handle_t CommandListResetEvents,
3233
ze_command_list_handle_t CopyCommandList,
3334
ZeStruct<ze_command_list_desc_t> ZeDesc,
@@ -55,6 +56,9 @@ struct ur_exp_command_buffer_handle_t_ : public _ur_object {
5556
ur_device_handle_t Device;
5657
// Level Zero command list handle
5758
ze_command_list_handle_t ZeComputeCommandList;
59+
// Given a multi driver scenario, the driver handle must be translated to the
60+
// internal driver handle to allow calls to driver experimental apis.
61+
ze_command_list_handle_t ZeComputeCommandListTranslated;
5862
// Level Zero command list handle
5963
ze_command_list_handle_t ZeCommandListResetEvents;
6064
// Level Zero command list descriptor

source/adapters/level_zero/common.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,6 @@ ur_result_t ze2urResult(ze_result_t ZeResult) {
6666
}
6767
}
6868

69-
usm::DisjointPoolAllConfigs DisjointPoolConfigInstance =
70-
InitializeDisjointPoolConfig();
71-
7269
// This function will ensure compatibility with both Linux and Windows for
7370
// setting environment variables.
7471
bool setEnvVar(const char *name, const char *value) {

source/adapters/level_zero/common.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,6 +325,14 @@ ur_result_t ze2urResult(ze_result_t ZeResult);
325325
return ze2urResult(Result); \
326326
}
327327

328+
// Trace a call to Level-Zero RT; on error, throw the translated ur_result_t
329+
#define ZE2UR_CALL_THROWS(ZeName, ZeArgs) \
330+
{ \
331+
ze_result_t ZeResult = ZeName ZeArgs; \
332+
if (auto Result = ZeCall().doCall(ZeResult, #ZeName, #ZeArgs, true)) \
333+
throw ze2urResult(Result); \
334+
}
335+
328336
// Perform traced call to L0 without checking for errors
329337
#define ZE_CALL_NOCHECK(ZeName, ZeArgs) \
330338
ZeCall().doCall(ZeName ZeArgs, #ZeName, #ZeArgs, false)

source/adapters/level_zero/image.cpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,11 @@ ur_result_t bindlessImagesCreateImpl(ur_context_handle_t hContext,
488488
return UR_RESULT_ERROR_INVALID_OPERATION;
489489

490490
uint64_t DeviceOffset{};
491-
ZE2UR_CALL(zeImageGetDeviceOffsetExpFunctionPtr, (ZeImage, &DeviceOffset));
491+
ze_image_handle_t ZeImageTranslated;
492+
ZE2UR_CALL(zelLoaderTranslateHandle,
493+
(ZEL_HANDLE_IMAGE, ZeImage, (void **)&ZeImageTranslated));
494+
ZE2UR_CALL(zeImageGetDeviceOffsetExpFunctionPtr,
495+
(ZeImageTranslated, &DeviceOffset));
492496
*phImage = reinterpret_cast<ur_exp_image_handle_t>(DeviceOffset);
493497

494498
return UR_RESULT_SUCCESS;
@@ -652,8 +656,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMPitchedAllocExp(
652656

653657
size_t Width = widthInBytes / elementSizeBytes;
654658
size_t RowPitch;
659+
ze_device_handle_t ZeDeviceTranslated;
660+
ZE2UR_CALL(zelLoaderTranslateHandle, (ZEL_HANDLE_DEVICE, hDevice->ZeDevice,
661+
(void **)&ZeDeviceTranslated));
655662
ZE2UR_CALL(zeMemGetPitchFor2dImageFunctionPtr,
656-
(hContext->ZeContext, hDevice->ZeDevice, Width, height,
663+
(hContext->ZeContext, ZeDeviceTranslated, Width, height,
657664
elementSizeBytes, &RowPitch));
658665
*pResultPitch = RowPitch;
659666

0 commit comments

Comments
 (0)