Skip to content

Commit 8ae8d31

Browse files
authored
[Offload] Add liboffload unit tests for shared/local memory (#147040)
1 parent 0f48baf commit 8ae8d31

File tree

5 files changed

+140
-17
lines changed

5 files changed

+140
-17
lines changed

offload/unittests/OffloadAPI/device_code/CMakeLists.txt

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@ add_offload_test_device_code(foo.c foo)
22
add_offload_test_device_code(bar.c bar)
33
# Compile with optimizations to eliminate AMDGPU implicit arguments.
44
add_offload_test_device_code(noargs.c noargs -O3)
5+
add_offload_test_device_code(localmem.c localmem)
6+
add_offload_test_device_code(localmem_reduction.c localmem_reduction)
7+
add_offload_test_device_code(localmem_static.c localmem_static)
58

6-
add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin)
9+
# Aggregate target: depends on the .bin output of every device-code source
# registered above with add_offload_test_device_code().
add_custom_target(offload_device_binaries
  DEPENDS
    foo.bin
    bar.bin
    noargs.bin
    localmem.bin
    localmem_reduction.bin
    localmem_static.bin
)
717
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
extern __gpu_local uint32_t shared_mem[];
5+
6+
// Test kernel: each thread writes its thread id into its own slot of the
// extern `shared_mem` array, doubles that slot in place, and copies the
// result to `out` at index thread_id + num_threads * block_id.
__gpu_kernel void localmem(uint32_t *out) {
  const uint32_t tid = __gpu_thread_id(0);
  const uint32_t out_idx = tid + (__gpu_num_threads(0) * __gpu_block_id(0));

  // The value is deliberately round-tripped through shared_mem; every thread
  // only ever touches its own element.
  shared_mem[tid] = tid;
  shared_mem[tid] *= 2;
  out[out_idx] = shared_mem[tid];
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
extern __gpu_local uint32_t shared_mem[];
5+
6+
// Test kernel: every thread stores 2 into its slot of the extern `shared_mem`
// array; after a group-wide sync, thread 0 sums the first
// __gpu_num_threads(0) slots and writes the total to out[block_id].
__gpu_kernel void localmem_reduction(uint32_t *out) {
  shared_mem[__gpu_thread_id(0)] = 2;

  // All threads must have stored their element before thread 0 reads them.
  __gpu_sync_threads();

  // Only thread 0 of each group performs the reduction.
  if (__gpu_thread_id(0) != 0)
    return;

  // Accumulate in a local and store once, instead of a read-modify-write on
  // out[] for every element; the final value is the same.
  const uint32_t num_threads = __gpu_num_threads(0);
  uint32_t sum = 0;
  for (uint32_t i = 0; i < num_threads; i++)
    sum += shared_mem[i];

  out[__gpu_block_id(0)] = sum;
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
[[clang::loader_uninitialized]]
5+
__gpu_local uint32_t shared_mem[64];
6+
7+
// Test kernel: same reduction as the dynamic variant, but over the statically
// sized `shared_mem` array declared above. Every thread stores 2 into its
// slot; after a group-wide sync, thread 0 sums the first
// __gpu_num_threads(0) slots and writes the total to out[block_id].
//
// NOTE: shared_mem has 64 elements and is indexed by thread id, so this
// kernel is only valid for launches with at most 64 threads per group.
__gpu_kernel void localmem_static(uint32_t *out) {
  shared_mem[__gpu_thread_id(0)] = 2;

  // All threads must have stored their element before thread 0 reads them.
  __gpu_sync_threads();

  // Only thread 0 of each group performs the reduction.
  if (__gpu_thread_id(0) != 0)
    return;

  // Accumulate in a local and store once, instead of a read-modify-write on
  // out[] for every element; the final value is the same.
  const uint32_t num_threads = __gpu_num_threads(0);
  uint32_t sum = 0;
  for (uint32_t i = 0; i < num_threads; i++)
    sum += shared_mem[i];

  out[__gpu_block_id(0)] = sum;
}

offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp

Lines changed: 85 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -38,21 +38,19 @@ struct LaunchKernelTestBase : OffloadQueueTest {
3838
ol_kernel_launch_size_args_t LaunchArgs{};
3939
};
4040

41-
struct olLaunchKernelTest : LaunchKernelTestBase {
42-
void SetUp() override {
43-
RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("foo"));
44-
}
45-
};
46-
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelTest);
47-
48-
struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
49-
void SetUp() override {
50-
RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("noargs"));
51-
}
52-
};
53-
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
54-
55-
TEST_P(olLaunchKernelTest, Success) {
41+
#define KERNEL_TEST(NAME, KERNEL) \
42+
struct olLaunchKernel##NAME##Test : LaunchKernelTestBase { \
43+
void SetUp() override { LaunchKernelTestBase::SetUpKernel(#KERNEL); } \
44+
}; \
45+
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
46+
47+
KERNEL_TEST(Foo, foo)
48+
KERNEL_TEST(NoArgs, noargs)
49+
KERNEL_TEST(LocalMem, localmem)
50+
KERNEL_TEST(LocalMemReduction, localmem_reduction)
51+
KERNEL_TEST(LocalMemStatic, localmem_static)
52+
53+
TEST_P(olLaunchKernelFooTest, Success) {
5654
void *Mem;
5755
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
5856
LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
@@ -80,7 +78,7 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
8078
ASSERT_SUCCESS(olWaitQueue(Queue));
8179
}
8280

83-
TEST_P(olLaunchKernelTest, SuccessSynchronous) {
81+
TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
8482
void *Mem;
8583
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
8684
LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));
@@ -99,3 +97,74 @@ TEST_P(olLaunchKernelTest, SuccessSynchronous) {
9997

10098
ASSERT_SUCCESS(olMemFree(Mem));
10199
}
100+
101+
// Launches the `localmem` kernel over 4 groups with one uint32_t of dynamic
// shared memory per thread, then checks that every output element equals
// twice the writing thread's id within its group.
TEST_P(olLaunchKernelLocalMemTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  // The kernel indexes shared memory by thread id, so size it from the group
  // size instead of a hard-coded element count.
  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);

  const auto GroupSize = LaunchArgs.GroupSize.x;
  const auto NumThreads = LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x;

  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x *
                                sizeof(uint32_t),
                            &Mem));
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs, nullptr));

  ASSERT_SUCCESS(olWaitQueue(Queue));

  // Element i was written by thread (i % GroupSize) of group (i / GroupSize).
  uint32_t *Data = (uint32_t *)Mem;
  for (uint32_t i = 0; i < NumThreads; i++)
    ASSERT_EQ(Data[i], (i % GroupSize) * 2);

  ASSERT_SUCCESS(olMemFree(Mem));
}
125+
126+
// Launches the `localmem_reduction` kernel over 4 groups with one uint32_t of
// dynamic shared memory per thread, then checks that each group's output
// element equals 2 * GroupSize.x (every thread contributes the value 2).
TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  // The kernel indexes shared memory by thread id, so size it from the group
  // size instead of a hard-coded element count.
  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);

  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.NumGroups.x * sizeof(uint32_t), &Mem));
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs, nullptr));

  ASSERT_SUCCESS(olWaitQueue(Queue));

  // One result per group, written by that group's thread 0.
  uint32_t *Data = (uint32_t *)Mem;
  for (uint32_t i = 0; i < LaunchArgs.NumGroups.x; i++)
    ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);

  ASSERT_SUCCESS(olMemFree(Mem));
}
148+
149+
// Launches the `localmem_static` kernel over 4 groups with no dynamic shared
// memory requested, then checks that each group's output element equals
// 2 * GroupSize.x.
TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  LaunchArgs.DynSharedMemory = 0;

  // One uint32_t result slot per group.
  void *Out;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.NumGroups.x * sizeof(uint32_t), &Out));

  // The kernel's only argument is the output pointer.
  struct {
    void *Mem;
  } KernelArgs{Out};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &KernelArgs,
                                sizeof(KernelArgs), &LaunchArgs, nullptr));

  ASSERT_SUCCESS(olWaitQueue(Queue));

  uint32_t *Results = static_cast<uint32_t *>(Out);
  for (uint32_t Group = 0; Group < LaunchArgs.NumGroups.x; ++Group)
    ASSERT_EQ(Results[Group], 2 * LaunchArgs.GroupSize.x);

  ASSERT_SUCCESS(olMemFree(Out));
}

0 commit comments

Comments
 (0)