Skip to content

Commit 7021af9

Browse files
author
Hugh Delaney
committed
Make the HIP adapter use complex subgroup size calculation
Previously, the HIP adapter only looked for a good subgroup (sg) size in the X dimension. This change makes it choose an sg size that divides the global size in each of the X, Y and Z dimensions. It also chooses a power-of-2 sg size in the X dimension, which is what the CUDA adapter does. This may give some performance improvements.
1 parent 088f12a commit 7021af9

File tree

1 file changed

+22
-14
lines changed

1 file changed

+22
-14
lines changed

source/adapters/hip/enqueue.cpp

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "memory.hpp"
1717
#include "queue.hpp"
1818

19+
#include <ur/ur.hpp>
20+
1921
extern size_t imageElementByteSize(hipArray_Format ArrayFormat);
2022

2123
ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
@@ -48,23 +50,29 @@ ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
4850
}
4951
}
5052

51-
void simpleGuessLocalWorkSize(size_t *ThreadsPerBlock,
52-
const size_t *GlobalWorkSize,
53-
const size_t MaxThreadsPerBlock[3],
54-
ur_kernel_handle_t Kernel) {
53+
// Determine local work sizes that result in uniform work groups.
54+
// The default threadsPerBlock only requires handling the first work_dim
55+
// dimension.
56+
void guessLocalWorkSize(ur_device_handle_t Device, size_t *ThreadsPerBlock,
57+
const size_t *GlobalWorkSize, const uint32_t WorkDim,
58+
const size_t MaxThreadsPerBlock[3]) {
5559
assert(ThreadsPerBlock != nullptr);
5660
assert(GlobalWorkSize != nullptr);
57-
assert(Kernel != nullptr);
5861

59-
std::ignore = Kernel;
62+
// FIXME: The below assumes a three dimensional range but this is not
63+
// guaranteed by UR.
64+
size_t GlobalSizeNormalized[3] = {1, 1, 1};
65+
for (uint32_t i = 0; i < WorkDim; i++) {
66+
GlobalSizeNormalized[i] = GlobalWorkSize[i];
67+
}
6068

61-
ThreadsPerBlock[0] = std::min(MaxThreadsPerBlock[0], GlobalWorkSize[0]);
69+
size_t MaxBlockDim[3];
70+
MaxBlockDim[0] = MaxThreadsPerBlock[0];
71+
MaxBlockDim[1] = Device->getMaxBlockDimY();
72+
MaxBlockDim[2] = Device->getMaxBlockDimZ();
6273

63-
// Find a local work group size that is a divisor of the global
64-
// work group size to produce uniform work groups.
65-
while (GlobalWorkSize[0] % ThreadsPerBlock[0]) {
66-
--ThreadsPerBlock[0];
67-
}
74+
roundToHighestFactorOfGlobalSizeIn3d(ThreadsPerBlock, GlobalSizeNormalized,
75+
MaxBlockDim, MaxThreadsPerBlock[0]);
6876
}
6977

7078
namespace {
@@ -1793,8 +1801,8 @@ setKernelParams(const ur_device_handle_t Device, const uint32_t WorkDim,
17931801
return err;
17941802
}
17951803
} else {
1796-
simpleGuessLocalWorkSize(ThreadsPerBlock, GlobalWorkSize,
1797-
MaxThreadsPerBlock, Kernel);
1804+
guessLocalWorkSize(Device, ThreadsPerBlock, GlobalWorkSize, WorkDim,
1805+
MaxThreadsPerBlock);
17981806
}
17991807
}
18001808

0 commit comments

Comments
 (0)