Skip to content

Commit 97c0138

Browse files
committed
Move page cache via mbind to prevent cross-NUMA access
page cache pages are retained in memory of the node after running llama-bench bound to a node on multi-node systems, incurring cross-NUMA memory access penalty for subsequent runs of llama-bench bound to a different node. This commit introduces an mbind call on a best-effort basis to move the pages to the target node where llama-bench is executed, ensuring optimal NUMA locality. Additionally, necessary NUMA headers are included and the build is updated to link against the NUMA library. Experiments: 1. Run llama-bench on node 1 (base) 2. Run llama-bench on node 0 (regression observed) 3. Run patched llama-bench on node 0 (throughput same as base) `+ /usr/bin/time -p numactl -N 1 -m 1 $llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24` | model | size | params | backend | threads | n_batch | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: | | llama 7B Q8_0 | 6.67 GiB | 6.74 B | CPU | 24 | 1 | pp512 | 5.39 ± 0.01 | | llama 7B Q8_0 | 6.67 GiB | 6.74 B | CPU | 24 | 1 | tg128 | 5.49 ± 0.03 | build: 35782ae (5014) real 687.60 user 15653.73 sys 42.67 `+ /usr/bin/time -p numactl -N 0 -m 0 $llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24` | model | size | params | backend | threads | n_batch | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: | | llama 7B Q8_0 | 6.67 GiB | 6.74 B | CPU | 24 | 1 | pp512 | 4.60 ± 0.01 | | llama 7B Q8_0 | 6.67 GiB | 6.74 B | CPU | 24 | 1 | tg128 | 4.67 ± 0.03 | build: 35782ae (5014) real 805.99 user 18187.26 sys 48.93 `+ /usr/bin/time -p numactl -N 0 -m 0 $patched-llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24` | model | size | params | backend | threads | n_batch | test | t/s | | ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: |
------------: | -------------------: | | llama 7B Q8_0 | 6.67 GiB | 6.74 B | CPU | 24 | 1 | pp512 | 5.35 ± 0.01 | | llama 7B Q8_0 | 6.67 GiB | 6.74 B | CPU | 24 | 1 | tg128 | 5.46 ± 0.02 | build: 35782ae (5014) real 696.12 user 15735.41 sys 44.08 Suggested-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com> Signed-off-by: Vishal Chourasia <vishalc@linux.ibm.com>
1 parent 91a8ee6 commit 97c0138

File tree

3 files changed

+52
-0
lines changed

3 files changed

+52
-0
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
8383

8484
# 3rd party libs
8585
option(LLAMA_CURL "llama: use libcurl to download model from an URL" ON)
86+
option(LLAMA_NUMA "llama: use libnuma to get memory policy of the llama-bench" ON)
8687
option(LLAMA_LLGUIDANCE "llama-common: include LLGuidance library for structured output in common utils" OFF)
8788

8889
# Required for relocatable CMake package

src/CMakeLists.txt

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,19 @@ target_compile_features (llama PRIVATE cxx_std_17) # don't bump
4242

4343
target_link_libraries(llama PUBLIC ggml)
4444

45+
if (LLAMA_NUMA)
    # check_include_file_cxx() is provided by this module; without the
    # include() the calls below error out on a clean configure.
    include(CheckIncludeFileCXX)

    find_library(NUMA_LIB numa)
    check_include_file_cxx("numa.h"   HAVE_NUMA_HEADERS)
    check_include_file_cxx("numaif.h" HAVE_NUMAIF_HEADERS)
    if (HAVE_NUMA_HEADERS AND HAVE_NUMAIF_HEADERS AND NUMA_LIB)
        target_compile_definitions(llama PRIVATE USE_LIBNUMA)
        # Link the library find_library() actually located, so installs
        # outside the default linker search path still work.
        target_link_libraries(llama PRIVATE ${NUMA_LIB})
        message(STATUS "libnuma found, page cache will be moved to the local node using mbind() syscall. Disable with LLAMA_NUMA=OFF")
    else()
        message(STATUS "Warning: libnuma headers or library not found - page cache migration disabled. Silence this warning with LLAMA_NUMA=OFF")
    endif()
endif()
57+
4558
if (BUILD_SHARED_LIBS)
4659
set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
4760
target_compile_definitions(llama PRIVATE LLAMA_BUILD)

src/llama-mmap.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,12 @@
1010
#include <cerrno>
1111
#include <algorithm>
1212

13+
#ifdef USE_LIBNUMA
14+
#include <numa.h>
15+
#include <numaif.h>
16+
#include <sched.h>
17+
#endif
18+
1319
#ifdef __has_include
1420
#if __has_include(<unistd.h>)
1521
#include <unistd.h>
@@ -273,6 +279,27 @@ struct llama_mmap::impl {
273279
#ifdef _POSIX_MAPPED_FILES
274280
std::vector<std::pair<size_t, size_t>> mapped_fragments;
275281

282+
#ifdef USE_LIBNUMA
    // Best-effort migration of the mapping [addr, addr + size) to the NUMA
    // node this thread's memory policy selects (e.g. set via numactl -m),
    // falling back to the node of the CPU the thread is running on. This
    // evicts page cache pages instantiated by a previous run on a different
    // node, which would otherwise incur cross-NUMA access penalties.
    static void move_pages(void *addr, size_t size) {
        // numa(3): numa_available() must be called before any other libnuma
        // function; all other calls are undefined if it returns -1.
        if (numa_available() < 0) {
            return;
        }

        struct bitmask *nodemask = numa_allocate_nodemask();
        if (nodemask == nullptr) {
            return; // allocation failed - silently skip (best effort)
        }

        // Prefer the memory policy of the calling thread.
        int ret = get_mempolicy(nullptr, nodemask->maskp, nodemask->size, nullptr, 0);
        if (ret || numa_bitmask_weight(nodemask) == 0) {
            // No usable policy - fall back to the node of the current CPU.
            int cpu = sched_getcpu();
            if (cpu >= 0) {
                int node = numa_node_of_cpu(cpu);
                // numa_node_of_cpu() returns -1 on error; passing a negative
                // bit index to numa_bitmask_setbit() is invalid.
                if (node >= 0) {
                    numa_bitmask_clearall(nodemask);
                    numa_bitmask_setbit(nodemask, node);
                }
            }
        }

        // Only migrate when the policy resolves to exactly one node; the
        // mbind() return value is deliberately ignored (best effort).
        if (numa_bitmask_weight(nodemask) == 1) {
            mbind(addr, size, MPOL_BIND, nodemask->maskp, nodemask->size, MPOL_MF_MOVE);
        }

        numa_free_nodemask(nodemask);
    }
#endif
302+
276303
impl(struct llama_file * file, size_t prefetch, bool numa) {
277304
size = file->size();
278305
int fd = file->file_id();
@@ -291,6 +318,17 @@ struct llama_mmap::impl {
291318
}
292319

293320
if (prefetch > 0) {
321+
#ifdef USE_LIBNUMA
322+
/*
323+
* Given that we already pre-fault all memory when prefetch > 0, it is
324+
* necessary to move any page cache pages that might have been
325+
* instantiated during previous runs on different NUMA nodes. This call
326+
* to move_pages() ensures that all memory-mapped pages are relocated
327+
* according to the calling thread's memory policy or the CPU on which
328+
* it is running.
329+
*/
330+
move_pages(addr, file->size());
331+
#endif
294332
if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
295333
LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
296334
strerror(errno));

0 commit comments

Comments
 (0)