Skip to content

Commit 38049dc

Browse files
authored
[libc] Handle differing wavefront sizes correctly in the AMDHSA loader (#117788)
Summary: The AMDGPU backend can handle wavefront sizes of 32 and 64, with the native hardware preferring one or the other. The user can override the hardware default with `-mwavefrontsize64` or `-mno-wavefrontsize64` (i.e. wave32), which previously wasn't handled. We need to know the wavefront size to know how much memory to allocate and how to index the RPC buffer. There isn't a good way to do this with ROCm, so we just use the LLVM support for offloading to check this from the image.
1 parent f67ba58 commit 38049dc

File tree

5 files changed

+101
-23
lines changed

5 files changed

+101
-23
lines changed

libc/cmake/modules/LLVMLibCTestRules.cmake

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -476,14 +476,14 @@ function(add_integration_test test_name)
476476

477477
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
478478
target_link_options(${fq_build_target_name} PRIVATE
479-
${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
480-
-mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
479+
${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS}
480+
-Wno-multi-gpu -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
481481
"-Wl,-mllvm,-amdgpu-lower-global-ctor-dtor=0" -nostdlib -static
482482
"-Wl,-mllvm,-amdhsa-code-object-version=${LIBC_GPU_CODE_OBJECT_VERSION}")
483483
elseif(LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
484484
target_link_options(${fq_build_target_name} PRIVATE
485-
${LIBC_COMPILE_OPTIONS_DEFAULT} -Wno-multi-gpu
486-
"-Wl,--suppress-stack-size-warning"
485+
${LIBC_COMPILE_OPTIONS_DEFAULT} ${INTEGRATION_TEST_COMPILE_OPTIONS}
486+
"-Wl,--suppress-stack-size-warning" -Wno-multi-gpu
487487
"-Wl,-mllvm,-nvptx-lower-global-ctor-dtor=1"
488488
"-Wl,-mllvm,-nvptx-emit-init-fini-kernel"
489489
-march=${LIBC_GPU_TARGET_ARCHITECTURE} -nostdlib -static

libc/test/integration/startup/gpu/CMakeLists.txt

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,3 +53,36 @@ add_integration_test(
5353
--threads 32
5454
--blocks 8
5555
)
56+
57+
if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU)
58+
add_integration_test(
59+
startup_rpc_lane_test_w32
60+
SUITE libc-startup-tests
61+
SRCS
62+
rpc_lane_test.cpp
63+
LOADER_ARGS
64+
--threads 32
65+
COMPILE_OPTIONS
66+
-mno-wavefrontsize64
67+
)
68+
69+
add_integration_test(
70+
startup_rpc_lane_test_w64
71+
SUITE libc-startup-tests
72+
SRCS
73+
rpc_lane_test.cpp
74+
LOADER_ARGS
75+
--threads 64
76+
COMPILE_OPTIONS
77+
-mwavefrontsize64
78+
)
79+
else()
80+
add_integration_test(
81+
startup_rpc_lane_test_w32
82+
SUITE libc-startup-tests
83+
SRCS
84+
rpc_lane_test.cpp
85+
LOADER_ARGS
86+
--threads 32
87+
)
88+
endif()

libc/test/integration/startup/gpu/rpc_lane_test.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
//===-- Loader test to check the RPC interface with the loader ------------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
10+
#include "src/__support/GPU/utils.h"
11+
#include "src/__support/RPC/rpc_client.h"
12+
#include "test/IntegrationTest/test.h"
13+
14+
using namespace LIBC_NAMESPACE;
15+
16+
static void test_add() {
17+
uint64_t cnt = gpu::get_lane_id();
18+
LIBC_NAMESPACE::rpc::Client::Port port =
19+
LIBC_NAMESPACE::rpc::client.open<RPC_TEST_INCREMENT>();
20+
port.send_and_recv(
21+
[=](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
22+
reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
23+
},
24+
[&](LIBC_NAMESPACE::rpc::Buffer *buffer, uint32_t) {
25+
cnt = reinterpret_cast<uint64_t *>(buffer->data)[0];
26+
});
27+
port.close();
28+
EXPECT_EQ(cnt, gpu::get_lane_id() + 1);
29+
EXPECT_EQ(gpu::get_thread_id(), gpu::get_lane_id());
30+
}
31+
32+
TEST_MAIN(int argc, char **argv, char **envp) {
33+
test_add();
34+
35+
return 0;
36+
}

libc/utils/gpu/loader/amdgpu/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ set(LLVM_LINK_COMPONENTS
33
Object
44
Option
55
Support
6+
FrontendOffloading
67
)
78

89
add_llvm_executable(amdhsa-loader amdhsa-loader.cpp)

libc/utils/gpu/loader/amdgpu/amdhsa-loader.cpp

Lines changed: 27 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include "hsa/hsa_ext_amd.h"
2929
#endif
3030

31+
#include "llvm/Frontend/Offloading/Utility.h"
32+
3133
#include <atomic>
3234
#include <cstdio>
3335
#include <cstdlib>
@@ -163,17 +165,13 @@ hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
163165
hsa_queue_t *queue, rpc::Server &server,
164166
const LaunchParameters &params,
165167
const char *kernel_name, args_t kernel_args,
166-
bool print_resource_usage) {
168+
uint32_t wavefront_size, bool print_resource_usage) {
167169
// Look up the kernel in the loaded executable.
168170
hsa_executable_symbol_t symbol;
169171
if (hsa_status_t err = hsa_executable_get_symbol_by_name(
170172
executable, kernel_name, &dev_agent, &symbol))
171173
return err;
172174

173-
uint32_t wavefront_size = 0;
174-
if (hsa_status_t err = hsa_agent_get_info(
175-
dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
176-
handle_error(err);
177175
// Retrieve different properties of the kernel symbol used for launch.
178176
uint64_t kernel;
179177
uint32_t args_size;
@@ -419,6 +417,16 @@ int load(int argc, const char **argv, const char **envp, void *image,
419417
dev_agent, &coarsegrained_pool))
420418
handle_error(err);
421419

420+
// The AMDGPU target can change its wavefront size. There currently isn't a
421+
// good way to look this up through the HSA API so we use the LLVM interface.
422+
uint16_t abi_version;
423+
llvm::StringRef image_ref(reinterpret_cast<char *>(image), size);
424+
llvm::StringMap<llvm::offloading::amdgpu::AMDGPUKernelMetaData> info_map;
425+
if (llvm::Error err = llvm::offloading::amdgpu::getAMDGPUMetaDataFromImage(
426+
llvm::MemoryBufferRef(image_ref, ""), info_map, abi_version)) {
427+
handle_error(llvm::toString(std::move(err)).c_str());
428+
}
429+
422430
// Allocate fine-grained memory on the host to hold the pointer array for the
423431
// copied argv and allow the GPU agent to access it.
424432
auto allocator = [&](uint64_t size) -> void * {
@@ -448,10 +456,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
448456
hsa_amd_memory_fill(dev_ret, 0, /*count=*/1);
449457

450458
// Allocate finegrained memory for the RPC server and client to share.
451-
uint32_t wavefront_size = 0;
452-
if (hsa_status_t err = hsa_agent_get_info(
453-
dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
454-
handle_error(err);
459+
uint32_t wavefront_size =
460+
llvm::max_element(info_map, [](auto &&x, auto &&y) {
461+
return x.second.WavefrontSize < y.second.WavefrontSize;
462+
})->second.WavefrontSize;
455463

456464
// Set up the RPC server.
457465
void *rpc_buffer;
@@ -513,7 +521,6 @@ int load(int argc, const char **argv, const char **envp, void *image,
513521
if (HSA_STATUS_SUCCESS ==
514522
hsa_executable_get_symbol_by_name(executable, "__llvm_libc_clock_freq",
515523
&dev_agent, &freq_sym)) {
516-
517524
void *host_clock_freq;
518525
if (hsa_status_t err =
519526
hsa_amd_memory_pool_allocate(finegrained_pool, sizeof(uint64_t),
@@ -553,16 +560,17 @@ int load(int argc, const char **argv, const char **envp, void *image,
553560

554561
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
555562
begin_args_t init_args = {argc, dev_argv, dev_envp};
556-
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
557-
coarsegrained_pool, queue, server,
558-
single_threaded_params, "_begin.kd",
559-
init_args, print_resource_usage))
563+
if (hsa_status_t err = launch_kernel(
564+
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
565+
server, single_threaded_params, "_begin.kd", init_args,
566+
info_map["_begin"].WavefrontSize, print_resource_usage))
560567
handle_error(err);
561568

562569
start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
563570
if (hsa_status_t err = launch_kernel(
564571
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
565-
server, params, "_start.kd", args, print_resource_usage))
572+
server, params, "_start.kd", args, info_map["_start"].WavefrontSize,
573+
print_resource_usage))
566574
handle_error(err);
567575

568576
void *host_ret;
@@ -580,10 +588,10 @@ int load(int argc, const char **argv, const char **envp, void *image,
580588
int ret = *static_cast<int *>(host_ret);
581589

582590
end_args_t fini_args = {ret};
583-
if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
584-
coarsegrained_pool, queue, server,
585-
single_threaded_params, "_end.kd",
586-
fini_args, print_resource_usage))
591+
if (hsa_status_t err = launch_kernel(
592+
dev_agent, executable, kernargs_pool, coarsegrained_pool, queue,
593+
server, single_threaded_params, "_end.kd", fini_args,
594+
info_map["_end"].WavefrontSize, print_resource_usage))
587595
handle_error(err);
588596

589597
if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_buffer))

0 commit comments

Comments
 (0)