Skip to content

Commit ad198f9

Browse files
committed
Merge branch 'main' into usm-p2p-add-test-and-macro
2 parents f39d41f + 6513abc commit ad198f9

File tree

12 files changed

+310
-214
lines changed

12 files changed

+310
-214
lines changed

.github/workflows/cmake.yml

Lines changed: 45 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -115,50 +115,51 @@ jobs:
115115
working-directory: ${{github.workspace}}/build
116116
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace"
117117

118-
fuzztest-build:
119-
name: Build and run quick fuzztest scenarios
120-
strategy:
121-
matrix:
122-
build_type: [Debug, Release]
123-
compiler: [{c: clang, cxx: clang++}]
124-
125-
runs-on: 'ubuntu-22.04'
126-
127-
steps:
128-
- uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
129-
130-
- name: Install pip packages
131-
run: pip install -r third_party/requirements.txt
132-
133-
- name: Download DPC++
134-
run: |
135-
sudo apt install libncurses5
136-
wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz
137-
tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz
138-
139-
- name: Setup DPC++
140-
run: |
141-
source ${{github.workspace}}/dpcpp_compiler/startup.sh
142-
143-
- name: Configure CMake
144-
run: >
145-
cmake
146-
-B${{github.workspace}}/build
147-
-DCMAKE_C_COMPILER=${{matrix.compiler.c}}
148-
-DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
149-
-DUR_ENABLE_TRACING=ON
150-
-DCMAKE_BUILD_TYPE=${{matrix.build_type}}
151-
-DUR_BUILD_TESTS=ON
152-
-DUR_USE_ASAN=ON
153-
-DUR_USE_UBSAN=ON
154-
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
155-
156-
- name: Build
157-
run: cmake --build ${{github.workspace}}/build -j $(nproc)
158-
159-
- name: Fuzz test
160-
working-directory: ${{github.workspace}}/build
161-
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short"
118+
# Disable short fuzz tests until the ubuntu-22.04 runner is fixed
119+
# fuzztest-build:
120+
# name: Build and run quick fuzztest scenarios
121+
# strategy:
122+
# matrix:
123+
# build_type: [Debug, Release]
124+
# compiler: [{c: clang, cxx: clang++}]
125+
126+
# runs-on: 'ubuntu-22.04'
127+
128+
# steps:
129+
# - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
130+
131+
# - name: Install pip packages
132+
# run: pip install -r third_party/requirements.txt
133+
134+
# - name: Download DPC++
135+
# run: |
136+
# sudo apt install libncurses5
137+
# wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/sycl-nightly%2F20230626/dpcpp-compiler.tar.gz
138+
# tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz
139+
140+
# - name: Setup DPC++
141+
# run: |
142+
# source ${{github.workspace}}/dpcpp_compiler/startup.sh
143+
144+
# - name: Configure CMake
145+
# run: >
146+
# cmake
147+
# -B${{github.workspace}}/build
148+
# -DCMAKE_C_COMPILER=${{matrix.compiler.c}}
149+
# -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}}
150+
# -DUR_ENABLE_TRACING=ON
151+
# -DCMAKE_BUILD_TYPE=${{matrix.build_type}}
152+
# -DUR_BUILD_TESTS=ON
153+
# -DUR_USE_ASAN=ON
154+
# -DUR_USE_UBSAN=ON
155+
# -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
156+
157+
# - name: Build
158+
# run: cmake --build ${{github.workspace}}/build -j $(nproc)
159+
160+
# - name: Fuzz test
161+
# working-directory: ${{github.workspace}}/build
162+
# run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-short"
162163

163164
adapter-build-hw:
164165
name: Build - Adapters on HW

.github/workflows/nightly.yml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ jobs:
4848
LD_LIBRARY_PATH=${{github.workspace}}/dpcpp_compiler/lib
4949
cmake --build ${{github.workspace}}/build -j $(nproc)
5050
51-
- name: Fuzz long test
52-
working-directory: ${{github.workspace}}/build
53-
run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-long"
51+
# Disable long fuzz tests until the ubuntu-22.04 runner is fixed
52+
# - name: Fuzz long test
53+
# working-directory: ${{github.workspace}}/build
54+
# run: ctest -C ${{matrix.build_type}} --output-on-failure -L "fuzz-long"

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,16 @@ set(UR_SYCL_LIBRARY_DIR "" CACHE PATH
5252
set(UR_CONFORMANCE_TARGET_TRIPLES "" CACHE STRING
5353
"List of sycl targets to build CTS device binaries for")
5454
set(UR_CONFORMANCE_AMD_ARCH "" CACHE STRING "AMD device target ID to build CTS binaries for")
55+
set(UR_ADAPTER_LEVEL_ZERO_SOURCE_DIR "" CACHE PATH
56+
"Path to external 'level_zero' adapter source dir")
57+
set(UR_ADAPTER_OPENCL_SOURCE_DIR "" CACHE PATH
58+
"Path to external 'opencl' adapter source dir")
59+
set(UR_ADAPTER_CUDA_SOURCE_DIR "" CACHE PATH
60+
"Path to external 'cuda' adapter source dir")
61+
set(UR_ADAPTER_HIP_SOURCE_DIR "" CACHE PATH
62+
"Path to external 'hip' adapter source dir")
63+
set(UR_ADAPTER_NATIVE_CPU_SOURCE_DIR "" CACHE PATH
64+
"Path to external 'native_cpu' adapter source dir")
5565

5666
# There's little reason not to generate the compile_commands.json
5767
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

source/adapters/CMakeLists.txt

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,21 +30,37 @@ endfunction()
3030

3131
add_subdirectory(null)
3232

33+
function(add_ur_adapter_subdirectory name)
34+
string(TOUPPER ${name} NAME)
35+
if(UR_ADAPTER_${NAME}_SOURCE_DIR)
36+
if(NOT IS_DIRECTORY ${UR_ADAPTER_${NAME}_SOURCE_DIR})
37+
message(FATAL_ERROR
38+
"UR_ADAPTER_${NAME}_SOURCE_DIR is not a directory: "
39+
"${UR_ADAPTER_${NAME}_SOURCE_DIR}")
40+
endif()
41+
add_subdirectory(
42+
"${UR_ADAPTER_${NAME}_SOURCE_DIR}"
43+
"${CMAKE_CURRENT_BINARY_DIR}/${name}")
44+
else()
45+
add_subdirectory(${name})
46+
endif()
47+
endfunction()
48+
3349
if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_ALL)
34-
add_subdirectory(level_zero)
50+
add_ur_adapter_subdirectory(level_zero)
3551
endif()
3652

3753
if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL)
38-
add_subdirectory(cuda)
54+
add_ur_adapter_subdirectory(cuda)
3955
endif()
4056

4157
if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL)
42-
add_subdirectory(hip)
58+
add_ur_adapter_subdirectory(hip)
4359
endif()
4460

4561
if(UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL)
46-
add_subdirectory(opencl)
62+
add_ur_adapter_subdirectory(opencl)
4763
endif()
4864
if(UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_ALL)
49-
add_subdirectory(native_cpu)
65+
add_ur_adapter_subdirectory(native_cpu)
5066
endif()

source/adapters/cuda/command_buffer.cpp

Lines changed: 61 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,6 @@ static ur_result_t enqueueCommandBufferFillHelper(
170170

171171
try {
172172
const size_t N = Size / PatternSize;
173-
auto Value = *static_cast<const uint32_t *>(Pattern);
174173
auto DstPtr = DstType == CU_MEMORYTYPE_DEVICE
175174
? *static_cast<CUdeviceptr *>(DstDevice)
176175
: (CUdeviceptr)DstDevice;
@@ -183,9 +182,27 @@ static ur_result_t enqueueCommandBufferFillHelper(
183182
NodeParams.elementSize = PatternSize;
184183
NodeParams.height = N;
185184
NodeParams.pitch = PatternSize;
186-
NodeParams.value = Value;
187185
NodeParams.width = 1;
188186

187+
// pattern size in bytes
188+
switch (PatternSize) {
189+
case 1: {
190+
auto Value = *static_cast<const uint8_t *>(Pattern);
191+
NodeParams.value = Value;
192+
break;
193+
}
194+
case 2: {
195+
auto Value = *static_cast<const uint16_t *>(Pattern);
196+
NodeParams.value = Value;
197+
break;
198+
}
199+
case 4: {
200+
auto Value = *static_cast<const uint32_t *>(Pattern);
201+
NodeParams.value = Value;
202+
break;
203+
}
204+
}
205+
189206
UR_CHECK_ERROR(cuGraphAddMemsetNode(
190207
&GraphNode, CommandBuffer->CudaGraph, DepsList.data(),
191208
DepsList.size(), &NodeParams, CommandBuffer->Device->getContext()));
@@ -198,29 +215,54 @@ static ur_result_t enqueueCommandBufferFillHelper(
198215
// CUDA has no memset functions that allow setting values more than 4
199216
// bytes. UR API lets you pass an arbitrary "pattern" to the buffer
200217
// fill, which can be more than 4 bytes. We must break up the pattern
201-
// into 4 byte values, and set the buffer using multiple strided calls.
202-
// This means that one cuGraphAddMemsetNode call is made for every 4 bytes
203-
// in the pattern.
218+
// into one 4-byte value followed by 1-byte values, and set the buffer
219+
// using multiple strided calls. This means that one cuGraphAddMemsetNode
220+
// call is made for the first 4 bytes, then one for every remaining byte.
221+
222+
size_t NumberOfSteps = PatternSize / sizeof(uint8_t);
204223

205-
size_t NumberOfSteps = PatternSize / sizeof(uint32_t);
224+
// Shared pointer that will point to the last node created
225+
std::shared_ptr<CUgraphNode> GraphNodePtr;
226+
// Create a new node
227+
CUgraphNode GraphNodeFirst;
228+
// Update NodeParam
229+
CUDA_MEMSET_NODE_PARAMS NodeParamsStepFirst = {};
230+
NodeParamsStepFirst.dst = DstPtr;
231+
NodeParamsStepFirst.elementSize = sizeof(uint32_t);
232+
NodeParamsStepFirst.height = Size / sizeof(uint32_t);
233+
NodeParamsStepFirst.pitch = sizeof(uint32_t);
234+
NodeParamsStepFirst.value = *static_cast<const uint32_t *>(Pattern);
235+
NodeParamsStepFirst.width = 1;
206236

207-
// we walk up the pattern in 4-byte steps, and call cuMemset for each
208-
// 4-byte chunk of the pattern.
209-
for (auto Step = 0u; Step < NumberOfSteps; ++Step) {
237+
UR_CHECK_ERROR(cuGraphAddMemsetNode(
238+
&GraphNodeFirst, CommandBuffer->CudaGraph, DepsList.data(),
239+
DepsList.size(), &NodeParamsStepFirst,
240+
CommandBuffer->Device->getContext()));
241+
242+
// Get sync point and register the cuNode with it.
243+
*SyncPoint = CommandBuffer->addSyncPoint(
244+
std::make_shared<CUgraphNode>(GraphNodeFirst));
245+
246+
DepsList.clear();
247+
DepsList.push_back(GraphNodeFirst);
248+
249+
// we walk up the pattern in 1-byte steps, and call cuMemset for each
250+
// 1-byte chunk of the pattern.
251+
for (auto Step = 4u; Step < NumberOfSteps; ++Step) {
210252
// take 1 byte of the pattern
211-
auto Value = *(static_cast<const uint32_t *>(Pattern) + Step);
253+
auto Value = *(static_cast<const uint8_t *>(Pattern) + Step);
212254

213255
// offset the pointer to the part of the buffer we want to write to
214-
auto OffsetPtr = DstPtr + (Step * sizeof(uint32_t));
256+
auto OffsetPtr = DstPtr + (Step * sizeof(uint8_t));
215257

216258
// Create a new node
217259
CUgraphNode GraphNode;
218260
// Update NodeParam
219261
CUDA_MEMSET_NODE_PARAMS NodeParamsStep = {};
220262
NodeParamsStep.dst = (CUdeviceptr)OffsetPtr;
221-
NodeParamsStep.elementSize = 4;
222-
NodeParamsStep.height = N;
223-
NodeParamsStep.pitch = PatternSize;
263+
NodeParamsStep.elementSize = sizeof(uint8_t);
264+
NodeParamsStep.height = Size / NumberOfSteps;
265+
NodeParamsStep.pitch = NumberOfSteps * sizeof(uint8_t);
224266
NodeParamsStep.value = Value;
225267
NodeParamsStep.width = 1;
226268

@@ -229,9 +271,12 @@ static ur_result_t enqueueCommandBufferFillHelper(
229271
DepsList.size(), &NodeParamsStep,
230272
CommandBuffer->Device->getContext()));
231273

274+
GraphNodePtr = std::make_shared<CUgraphNode>(GraphNode);
232275
// Get sync point and register the cuNode with it.
233-
*SyncPoint = CommandBuffer->addSyncPoint(
234-
std::make_shared<CUgraphNode>(GraphNode));
276+
*SyncPoint = CommandBuffer->addSyncPoint(GraphNodePtr);
277+
278+
DepsList.clear();
279+
DepsList.push_back(*GraphNodePtr.get());
235280
}
236281
}
237282
} catch (ur_result_t Err) {

source/adapters/cuda/image.cpp

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1006,17 +1006,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urBindlessImagesMapExternalArrayExp(
10061006
ArrayDesc.Format = format;
10071007

10081008
CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC mipmapDesc = {};
1009-
mipmapDesc.numLevels = 1;
1009+
mipmapDesc.numLevels = pImageDesc->numMipLevel;
10101010
mipmapDesc.arrayDesc = ArrayDesc;
10111011

1012+
// External memory is mapped to a CUmipmappedArray
1013+
// If desired, a CUarray is retrieved from the mipmaps 0th level
10121014
CUmipmappedArray memMipMap;
10131015
UR_CHECK_ERROR(cuExternalMemoryGetMappedMipmappedArray(
10141016
&memMipMap, (CUexternalMemory)hInteropMem, &mipmapDesc));
10151017

1016-
CUarray memArray;
1017-
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0));
1018+
if (pImageDesc->numMipLevel > 1) {
1019+
*phImageMem = (ur_exp_image_mem_handle_t)memMipMap;
1020+
} else {
1021+
CUarray memArray;
1022+
UR_CHECK_ERROR(cuMipmappedArrayGetLevel(&memArray, memMipMap, 0));
10181023

1019-
*phImageMem = (ur_exp_image_mem_handle_t)memArray;
1024+
*phImageMem = (ur_exp_image_mem_handle_t)memArray;
1025+
}
10201026

10211027
} catch (ur_result_t Err) {
10221028
return Err;

source/adapters/cuda/tracing.cpp

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,20 +27,20 @@
2727
using tracing_event_t = xpti_td *;
2828
using subscriber_handle_t = CUpti_SubscriberHandle;
2929

30-
using cuptiSubscribe_fn = CUPTIAPI
31-
CUptiResult (*)(CUpti_SubscriberHandle *subscriber, CUpti_CallbackFunc callback,
32-
void *userdata);
30+
using cuptiSubscribe_fn =
31+
CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle *subscriber,
32+
CUpti_CallbackFunc callback, void *userdata);
3333

34-
using cuptiUnsubscribe_fn = CUPTIAPI
35-
CUptiResult (*)(CUpti_SubscriberHandle subscriber);
34+
using cuptiUnsubscribe_fn =
35+
CUptiResult(CUPTIAPI *)(CUpti_SubscriberHandle subscriber);
3636

37-
using cuptiEnableDomain_fn = CUPTIAPI
38-
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
39-
CUpti_CallbackDomain domain);
37+
using cuptiEnableDomain_fn =
38+
CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber,
39+
CUpti_CallbackDomain domain);
4040

41-
using cuptiEnableCallback_fn = CUPTIAPI
42-
CUptiResult (*)(uint32_t enable, CUpti_SubscriberHandle subscriber,
43-
CUpti_CallbackDomain domain, CUpti_CallbackId cbid);
41+
using cuptiEnableCallback_fn =
42+
CUptiResult(CUPTIAPI *)(uint32_t enable, CUpti_SubscriberHandle subscriber,
43+
CUpti_CallbackDomain domain, CUpti_CallbackId cbid);
4444

4545
#define LOAD_CUPTI_SYM(p, lib, x) \
4646
p.x = (cupti##x##_fn)ur_loader::LibLoader::getFunctionPtr(lib.get(), \

0 commit comments

Comments
 (0)