Skip to content

Commit 9712d4f

Browse files
RossBrunton and github-actions[bot]
authored and committed
Automerge: [Offload] Tests for global memory and constructors (#147537)
Adds two "launch kernel" tests for lib offload, one testing that global memory works and persists between different kernels, and one verifying that `[[gnu::constructor]]` works correctly. Since we now have tests that contain multiple kernels in the same binary, the test framework has been updated a bit.
2 parents 5281e31 + bed9fe7 commit 9712d4f

File tree

5 files changed

+147
-6
lines changed

5 files changed

+147
-6
lines changed

offload/unittests/OffloadAPI/device_code/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ add_offload_test_device_code(noargs.c noargs -O3)
55
add_offload_test_device_code(localmem.c localmem)
66
add_offload_test_device_code(localmem_reduction.c localmem_reduction)
77
add_offload_test_device_code(localmem_static.c localmem_static)
8+
add_offload_test_device_code(global.c global)
9+
add_offload_test_device_code(global_ctor.c global_ctor)
10+
add_offload_test_device_code(global_dtor.c global_dtor)
811

912
add_custom_target(offload_device_binaries DEPENDS
1013
foo.bin
@@ -13,5 +16,8 @@ add_custom_target(offload_device_binaries DEPENDS
1316
localmem.bin
1417
localmem_reduction.bin
1518
localmem_static.bin
19+
global.bin
20+
global_ctor.bin
21+
global_dtor.bin
1622
)
1723
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
offload/unittests/OffloadAPI/device_code/global.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
uint32_t global[64];
5+
6+
// Kernel: each work-item stores its dimension-0 thread id into its own slot of
// `global`, then doubles that slot in place, so slot N ends up holding 2 * N.
__gpu_kernel void write() {
  const uint32_t Tid = __gpu_thread_id(0);
  global[Tid] = Tid;
  // Deliberately a second, separate update of the same slot.
  global[Tid] = global[Tid] * 2;
}
10+
11+
// Kernel: copies this work-item's slot of `global` into `out`.
//
// The destination index is the dimension-0 thread id plus
// __gpu_num_threads(0) * __gpu_block_id(0); the source index is the thread id
// alone.
__gpu_kernel void read(uint32_t *out) {
  const uint32_t Tid = __gpu_thread_id(0);
  const uint32_t Slot = Tid + (__gpu_num_threads(0) * __gpu_block_id(0));
  out[Slot] = global[Tid];
}
offload/unittests/OffloadAPI/device_code/global_ctor.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
uint32_t global[64];
5+
6+
[[gnu::constructor(202)]] void ctorc() {
7+
for (unsigned I = 0; I < 64; I++)
8+
global[I] += 20;
9+
}
10+
11+
[[gnu::constructor(200)]] void ctora() {
12+
for (unsigned I = 0; I < 64; I++)
13+
global[I] = 40;
14+
}
15+
16+
[[gnu::constructor(201)]] void ctorb() {
17+
for (unsigned I = 0; I < 64; I++)
18+
global[I] *= 2;
19+
}
20+
21+
// Kernel: adds the dimension-0 thread id to this work-item's slot of `global`
// and writes the updated value to `out`.
//
// The destination index is the thread id plus
// __gpu_num_threads(0) * __gpu_block_id(0).
__gpu_kernel void global_ctor(uint32_t *out) {
  const uint32_t Tid = __gpu_thread_id(0);
  global[Tid] += Tid;

  const uint32_t Slot = Tid + (__gpu_num_threads(0) * __gpu_block_id(0));
  out[Slot] = global[Tid];
}
offload/unittests/OffloadAPI/device_code/global_dtor.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
uint32_t global[64];
5+
6+
[[gnu::destructor]] void dtor() {
7+
for (unsigned I = 0; I < 64; I++)
8+
global[I] = 1;
9+
}
10+
11+
// Kernel with an intentionally empty body; it only gives the program
// something to launch.
__gpu_kernel void global_dtor() {
  // Nothing to do.
}

offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp

Lines changed: 89 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,13 @@
1111
#include <gtest/gtest.h>
1212

1313
struct LaunchKernelTestBase : OffloadQueueTest {
14-
void SetUpKernel(const char *kernel) {
14+
void SetUpProgram(const char *program) {
1515
RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
16-
ASSERT_TRUE(TestEnvironment::loadDeviceBinary(kernel, Device, DeviceBin));
16+
ASSERT_TRUE(TestEnvironment::loadDeviceBinary(program, Device, DeviceBin));
1717
ASSERT_GE(DeviceBin->getBufferSize(), 0lu);
1818
ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
1919
DeviceBin->getBufferSize(), &Program));
20-
ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
20+
2121
LaunchArgs.Dimensions = 1;
2222
LaunchArgs.GroupSize = {64, 1, 1};
2323
LaunchArgs.NumGroups = {1, 1, 1};
@@ -34,13 +34,21 @@ struct LaunchKernelTestBase : OffloadQueueTest {
3434

3535
std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
3636
ol_program_handle_t Program = nullptr;
37-
ol_kernel_handle_t Kernel = nullptr;
3837
ol_kernel_launch_size_args_t LaunchArgs{};
3938
};
4039

40+
struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
41+
void SetUpKernel(const char *kernel) {
42+
RETURN_ON_FATAL_FAILURE(SetUpProgram(kernel));
43+
ASSERT_SUCCESS(olGetKernel(Program, kernel, &Kernel));
44+
}
45+
46+
ol_kernel_handle_t Kernel = nullptr;
47+
};
48+
4149
#define KERNEL_TEST(NAME, KERNEL) \
42-
struct olLaunchKernel##NAME##Test : LaunchKernelTestBase { \
43-
void SetUp() override { LaunchKernelTestBase::SetUpKernel(#KERNEL); } \
50+
struct olLaunchKernel##NAME##Test : LaunchSingleKernelTestBase { \
51+
void SetUp() override { SetUpKernel(#KERNEL); } \
4452
}; \
4553
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
4654

@@ -49,6 +57,29 @@ KERNEL_TEST(NoArgs, noargs)
4957
KERNEL_TEST(LocalMem, localmem)
5058
KERNEL_TEST(LocalMemReduction, localmem_reduction)
5159
KERNEL_TEST(LocalMemStatic, localmem_static)
60+
KERNEL_TEST(GlobalCtor, global_ctor)
61+
KERNEL_TEST(GlobalDtor, global_dtor)
62+
63+
struct LaunchMultipleKernelTestBase : LaunchKernelTestBase {
64+
void SetUpKernels(const char *program, std::vector<const char *> kernels) {
65+
RETURN_ON_FATAL_FAILURE(SetUpProgram(program));
66+
67+
Kernels.resize(kernels.size());
68+
size_t I = 0;
69+
for (auto K : kernels)
70+
ASSERT_SUCCESS(olGetKernel(Program, K, &Kernels[I++]));
71+
}
72+
73+
std::vector<ol_kernel_handle_t> Kernels;
74+
};
75+
76+
// Declares the fixture olLaunchKernel<NAME>Test on top of
// LaunchMultipleKernelTestBase. Its SetUp() calls SetUpKernels() with the
// stringified PROGRAM and the remaining macro arguments as the kernel name
// list, and the fixture is then passed to
// OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE.
#define KERNEL_MULTI_TEST(NAME, PROGRAM, ...)                                  \
  struct olLaunchKernel##NAME##Test : LaunchMultipleKernelTestBase {           \
    void SetUp() override {                                                    \
      SetUpKernels(#PROGRAM, {__VA_ARGS__});                                   \
    }                                                                          \
  };                                                                           \
  OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernel##NAME##Test);
81+
82+
// Fixture for the "global" program: after SetUp, Kernels[0] is the "write"
// kernel and Kernels[1] is the "read" kernel.
KERNEL_MULTI_TEST(Global, global, "write", "read")
5283

5384
TEST_P(olLaunchKernelFooTest, Success) {
5485
void *Mem;
@@ -168,3 +199,55 @@ TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
168199

169200
ASSERT_SUCCESS(olMemFree(Mem));
170201
}
202+
203+
// Launches "write" and then "read" from the same program and checks that the
// values stored by the first kernel are the ones the second kernel copies out.
TEST_P(olLaunchKernelGlobalTest, Success) {
  // Output buffer: one uint32_t per work-item in the launch group.
  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));

  // Argument block for "read": a single pointer to the output buffer.
  struct {
    void *Mem;
  } Args{Mem};

  // Kernels[0] ("write") takes no arguments. Wait for it to finish before
  // launching the kernel that reads the values back.
  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[0], nullptr, 0,
                                &LaunchArgs, nullptr));
  ASSERT_SUCCESS(olWaitQueue(Queue));

  // Kernels[1] ("read") copies the global array into Mem.
  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernels[1], &Args, sizeof(Args),
                                &LaunchArgs, nullptr));
  ASSERT_SUCCESS(olWaitQueue(Queue));

  // "write" left 2 * N in slot N.
  auto *Data = static_cast<uint32_t *>(Mem);
  for (uint32_t i = 0; i != 64; ++i) {
    ASSERT_EQ(Data[i], i * 2);
  }

  ASSERT_SUCCESS(olMemFree(Mem));
}
225+
226+
// Launches the global_ctor kernel and checks that the device-side constructors
// initialised the global array before the kernel ran.
TEST_P(olLaunchKernelGlobalCtorTest, Success) {
  // Output buffer: one uint32_t per work-item in the launch group.
  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * sizeof(uint32_t), &Mem));

  // Argument block for the kernel: a single pointer to the output buffer.
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs, nullptr));
  ASSERT_SUCCESS(olWaitQueue(Queue));

  // The base value 100 is what the three constructors in global_ctor.c yield
  // when applied in priority order 200, 201, 202: 40, then * 2, then + 20.
  // The kernel then adds each work-item's thread id.
  auto *Data = static_cast<uint32_t *>(Mem);
  for (uint32_t i = 0; i != 64; ++i) {
    ASSERT_EQ(Data[i], i + 100);
  }

  ASSERT_SUCCESS(olMemFree(Mem));
}
245+
246+
// Launches the (empty) global_dtor kernel from a program that also defines a
// destructor.
TEST_P(olLaunchKernelGlobalDtorTest, Success) {
  // TODO: There is currently no way to inspect what a destructor did. Once one
  // is found or implemented, extend this test to check the result. Until then
  // the test only verifies that launching and waiting do not crash.
  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernel, nullptr, 0, &LaunchArgs, nullptr));
  ASSERT_SUCCESS(olWaitQueue(Queue));
}

0 commit comments

Comments
 (0)