Skip to content

Commit 088f12a

Browse files
author
Hugh Delaney
committed
Fix bug in CUDA local range calculation
The CUDA adapter sometimes generated local Y and Z ranges that did not evenly divide the global Y or Z dimension. This commit fixes that. It also moves some helper functions into ur/ur.hpp so that they can be reused by other adapters.
1 parent ec634ff commit 088f12a

File tree

2 files changed

+65
-28
lines changed

2 files changed

+65
-28
lines changed

source/adapters/cuda/enqueue.cpp

Lines changed: 12 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
#include <cmath>
2020
#include <cuda.h>
21+
#include <ur/ur.hpp>
2122

2223
ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue, CUstream Stream,
2324
uint32_t NumEventsInWaitList,
@@ -140,12 +141,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,
140141
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
141142
const size_t *GlobalWorkSize, const uint32_t WorkDim,
142143
const size_t MaxThreadsPerBlock[3],
143-
ur_kernel_handle_t Kernel, uint32_t LocalSize) {
144+
ur_kernel_handle_t Kernel) {
144145
assert(ThreadsPerBlock != nullptr);
145146
assert(GlobalWorkSize != nullptr);
146147
assert(Kernel != nullptr);
147-
int MinGrid, MaxBlockSize;
148-
size_t MaxBlockDim[3];
149148

150149
// The below assumes a three dimensional range but this is not guaranteed by
151150
// UR.
@@ -154,33 +153,18 @@ void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
154153
GlobalSizeNormalized[i] = GlobalWorkSize[i];
155154
}
156155

156+
size_t MaxBlockDim[3];
157+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
157158
MaxBlockDim[1] = Device->getMaxBlockDimY();
158159
MaxBlockDim[2] = Device->getMaxBlockDimZ();
159160

160-
UR_CHECK_ERROR(
161-
cuOccupancyMaxPotentialBlockSize(&MinGrid, &MaxBlockSize, Kernel->get(),
162-
NULL, LocalSize, MaxThreadsPerBlock[0]));
163-
164-
ThreadsPerBlock[2] = std::min(GlobalSizeNormalized[2], MaxBlockDim[2]);
165-
ThreadsPerBlock[1] =
166-
std::min(GlobalSizeNormalized[1],
167-
std::min(MaxBlockSize / ThreadsPerBlock[2], MaxBlockDim[1]));
168-
MaxBlockDim[0] = MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[2]);
169-
ThreadsPerBlock[0] = std::min(
170-
MaxThreadsPerBlock[0], std::min(GlobalSizeNormalized[0], MaxBlockDim[0]));
171-
172-
static auto IsPowerOf2 = [](size_t Value) -> bool {
173-
return Value && !(Value & (Value - 1));
174-
};
175-
176-
// Find a local work group size that is a divisor of the global
177-
// work group size to produce uniform work groups.
178-
// Additionally, for best compute utilisation, the local size has
179-
// to be a power of two.
180-
while (0u != (GlobalSizeNormalized[0] % ThreadsPerBlock[0]) ||
181-
!IsPowerOf2(ThreadsPerBlock[0])) {
182-
--ThreadsPerBlock[0];
183-
}
161+
int MinGrid, MaxBlockSize;
162+
UR_CHECK_ERROR(cuOccupancyMaxPotentialBlockSize(
163+
&MinGrid, &MaxBlockSize, Kernel->get(), NULL, Kernel->getLocalSize(),
164+
MaxThreadsPerBlock[0]));
165+
166+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
167+
MaxBlockDim, MaxBlockSize);
184168
}
185169

186170
// Helper to verify out-of-registers case (exceeded block max registers).
@@ -261,7 +245,7 @@ setKernelParams(const ur_context_handle_t Context,
261245
}
262246
} else {
263247
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
264-
MaxThreadsPerBlock, Kernel, LocalSize);
248+
MaxThreadsPerBlock, Kernel);
265249
}
266250
}
267251

source/ur/ur.hpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,3 +321,56 @@ template <typename T> class Result {
321321
private:
322322
std::variant<ur_result_t, T> value_or_err;
323323
};
324+
325+
// Rounds a work-group size in one dimension down to the largest value that
// evenly divides the global size in that dimension.
//
// In/Out: ThreadsPerBlockInDim - Candidate work-group size in one dimension.
//                                On return it holds the largest value that is
//                                not greater than the input, is at least 1,
//                                and divides GlobalWorkSizeInDim.
// In:     GlobalWorkSizeInDim  - The global size in the same dimension.
static inline void
roundToHighestFactorOfGlobalSize(size_t &ThreadsPerBlockInDim,
                                 const size_t GlobalWorkSizeInDim) {
  // A zero candidate can never be used as a work-group size, and callers that
  // divide by the result would divide by zero. 1 divides every global size,
  // so it is the natural floor.
  if (ThreadsPerBlockInDim == 0) {
    ThreadsPerBlockInDim = 1;
    return;
  }

  // Walk downwards until the candidate divides the global size. The search
  // always terminates because 1 is a factor of everything.
  while (ThreadsPerBlockInDim > 1 &&
         GlobalWorkSizeInDim % ThreadsPerBlockInDim != 0) {
    --ThreadsPerBlockInDim;
  }
}
337+
338+
// Returns whether or not Value is a power of 2.
//
// Zero and negative values are never powers of two. The positivity check
// comes first so that `Value - 1` is not evaluated for negative inputs; for
// the minimum value of a signed type that subtraction would overflow, which
// is undefined behaviour and would otherwise report the value as a power of
// two.
template <typename T> inline constexpr bool isPowerOf2(const T &Value) {
  // A power of two has exactly one bit set, so clearing the lowest set bit
  // (Value & (Value - 1)) must leave zero.
  return Value > 0 && !(Value & (Value - 1));
}
342+
343+
// Helper to make sure each x, y, z dim divide the global dimension.
344+
// Additionally it makes sure that the inner dimension always is a power of 2
345+
//
346+
// In/Out: ThreadsPerBlock - The size of wg in 3d
347+
// In: GlobalSize - The global size in 3d (if dim < 3 then outer
348+
// dims == 1)
349+
// In: MaxBlockDim - The max size of block in 3d
350+
// In: MaxBlockSize - The max total size of block in all dimensions
351+
// In: WorkDim - The workdim (1, 2 or 3)
352+
static inline void roundToHighestFactorOfGlobalSizeIn3d(
353+
size_t *ThreadsPerBlock, const size_t *GlobalSize,
354+
const size_t *MaxBlockDim, const size_t MaxBlockSize) {
355+
assert(GlobalSize[0] && "GlobalSize[0] cannot be zero");
356+
assert(GlobalSize[1] && "GlobalSize[1] cannot be zero");
357+
assert(GlobalSize[2] && "GlobalSize[2] cannot be zero");
358+
359+
ThreadsPerBlock[0] =
360+
std::min(GlobalSize[0], std::min(MaxBlockSize, MaxBlockDim[0]));
361+
do {
362+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[0], GlobalSize[0]);
363+
} while (!isPowerOf2(ThreadsPerBlock[0]) && ThreadsPerBlock[0] > 32 &&
364+
--ThreadsPerBlock[0]);
365+
366+
ThreadsPerBlock[1] =
367+
std::min(GlobalSize[1],
368+
std::min(MaxBlockSize / ThreadsPerBlock[0], MaxBlockDim[1]));
369+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[1], GlobalSize[1]);
370+
371+
ThreadsPerBlock[2] = std::min(
372+
GlobalSize[2],
373+
std::min(MaxBlockSize / (ThreadsPerBlock[1] * ThreadsPerBlock[0]),
374+
MaxBlockDim[2]));
375+
roundToHighestFactorOfGlobalSize(ThreadsPerBlock[2], GlobalSize[2]);
376+
}

0 commit comments

Comments
 (0)