Skip to content

Commit 24828c8

Browse files
authored
[libc] Efficiently implement aligned_alloc for AMDGPU (#146585)
Summary: This patch uses the actual allocator interface to implement `aligned_alloc`. We do this by simply rounding up the amount allocated. Because of how index calculation works, any offset within an allocated pointer will still map to the same chunk, so we can just adjust internally and it will free all the same.
1 parent bca79ec commit 24828c8

File tree

5 files changed

+81
-9
lines changed

5 files changed

+81
-9
lines changed

libc/src/__support/GPU/allocator.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) {
138138
s[i] = c;
139139
}
140140

141+
// Reports whether the value has exactly one bit set, i.e. whether it is a
// non-zero power of two.
static inline constexpr bool is_pow2(uint64_t x) {
  // Zero has no bits set and is never a power of two.
  if (x == 0)
    return false;

  // Subtracting one flips the lowest set bit and everything below it, so the
  // bitwise AND is zero only when that lowest bit was the sole bit set.
  return (x & (x - 1)) == 0;
}
145+
141146
} // namespace impl
142147

143148
/// A slab allocator used to hand out identically sized slabs of memory.
@@ -572,5 +577,27 @@ void *reallocate(void *ptr, uint64_t size) {
572577
return new_ptr;
573578
}
574579

580+
void *aligned_allocate(uint32_t alignment, uint64_t size) {
581+
// All alignment values must be a non-zero power of two.
582+
if (!impl::is_pow2(alignment))
583+
return nullptr;
584+
585+
// If the requested alignment is less than what we already provide this is
586+
// just a normal allocation.
587+
if (alignment < MIN_ALIGNMENT + 1)
588+
return gpu::allocate(size);
589+
590+
// We can't handle alignments greater than 2MiB so we simply fail.
591+
if (alignment > SLAB_ALIGNMENT + 1)
592+
return nullptr;
593+
594+
// Trying to handle allocation internally would break the assumption that each
595+
// chunk is identical to eachother. Allocate enough memory with worst-case
596+
// alignment and then round up. The index logic will round down properly.
597+
uint64_t rounded = size + alignment - 1;
598+
void *ptr = gpu::allocate(rounded);
599+
return __builtin_align_up(ptr, alignment);
600+
}
601+
575602
} // namespace gpu
576603
} // namespace LIBC_NAMESPACE_DECL

libc/src/__support/GPU/allocator.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ namespace gpu {
1818
void *allocate(uint64_t size);
1919
void deallocate(void *ptr);
2020
void *reallocate(void *ptr, uint64_t size);
21+
// Allocates 'size' bytes aligned to 'alignment'. The implementation returns
// nullptr when 'alignment' is not a non-zero power of two or is larger than
// the allocator can handle (documented there as 2MiB).
void *aligned_allocate(uint32_t alignment, uint64_t size);
2122

2223
} // namespace gpu
2324
} // namespace LIBC_NAMESPACE_DECL

libc/src/stdlib/gpu/aligned_alloc.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,15 +15,15 @@
1515
namespace LIBC_NAMESPACE_DECL {
1616

1717
LLVM_LIBC_FUNCTION(void *, aligned_alloc, (size_t alignment, size_t size)) {
18-
if ((alignment & -alignment) != alignment)
19-
return nullptr;
20-
21-
void *ptr = gpu::allocate(size);
22-
if ((reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) != 0) {
23-
gpu::deallocate(ptr);
24-
return nullptr;
25-
}
26-
return ptr;
18+
// FIXME: NVIDIA targets currently use the built-in 'malloc' which we cannot
19+
// reason with. But we still need to provide this function for compatibility.
20+
#ifndef LIBC_TARGET_ARCH_IS_NVPTX
21+
return gpu::aligned_allocate(static_cast<uint32_t>(alignment), size);
22+
#else
23+
(void)alignment;
24+
(void)size;
25+
return nullptr;
26+
#endif
2727
}
2828

2929
} // namespace LIBC_NAMESPACE_DECL

libc/test/integration/src/stdlib/gpu/CMakeLists.txt

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,21 @@ if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX)
3232
--blocks 1024
3333
)
3434

35+
# Registers the 'aligned_alloc' integration test in the
# stdlib-gpu-integration-tests suite. The loader is invoked with
# '--threads 256 --blocks 128'.
add_integration_test(
36+
aligned_alloc
37+
SUITE
38+
stdlib-gpu-integration-tests
39+
SRCS
40+
aligned_alloc.cpp
41+
DEPENDS
42+
libc.src.stdlib.aligned_alloc
43+
libc.src.stdlib.malloc
44+
libc.src.stdlib.free
45+
LOADER_ARGS
46+
--threads 256
47+
--blocks 128
48+
)
49+
3550
add_integration_test(
3651
malloc_stress
3752
SUITE
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#include "test/IntegrationTest/test.h"
2+
3+
#include "src/__support/GPU/utils.h"
4+
#include "src/stdlib/aligned_alloc.h" // Adjust path if needed
5+
#include "src/stdlib/free.h"
6+
7+
using namespace LIBC_NAMESPACE;
8+
9+
TEST_MAIN(int, char **, char **) {
10+
// aligned_alloc with valid alignment and size
11+
void *ptr = LIBC_NAMESPACE::aligned_alloc(32, 16);
12+
EXPECT_NE(ptr, nullptr);
13+
EXPECT_EQ(__builtin_is_aligned(ptr, 32), 0U);
14+
15+
LIBC_NAMESPACE::free(ptr);
16+
17+
// aligned_alloc fails if alignment is not power of two
18+
void *bad_align = LIBC_NAMESPACE::aligned_alloc(30, 99);
19+
EXPECT_EQ(bad_align, nullptr);
20+
21+
// aligned_alloc with a divergent size.
22+
size_t alignment = 1 << (__gpu_lane_id() % 8 + 1);
23+
void *div =
24+
LIBC_NAMESPACE::aligned_alloc(alignment, (gpu::get_thread_id() + 1) * 4);
25+
EXPECT_NE(div, nullptr);
26+
EXPECT_EQ(__builtin_is_aligned(div, alignment), 0U);
27+
28+
return 0;
29+
}

0 commit comments

Comments
 (0)