From 57631b99403adb213b61b6330e34a961f0c217d0 Mon Sep 17 00:00:00 2001
From: Vishal Chourasia
Date: Tue, 6 May 2025 09:28:09 +0530
Subject: [PATCH] Move page cache via mbind to prevent cross-NUMA access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On multi-node systems, page cache pages are retained in the memory of the
node that llama-bench was bound to, incurring a cross-NUMA memory access
penalty for subsequent runs of llama-bench bound to a different node.

This commit introduces an mbind call, on a best-effort basis, to move the
pages to the target node where llama-bench is executed, ensuring NUMA
locality. Additionally, the necessary NUMA headers are included and the
build is updated to link against the NUMA library.

Experiments:
1. Run llama-bench on node 1 (base)
2. Run llama-bench on node 0 (regression observed)
3. Run patched llama-bench on node 0 (throughput same as base)

+ /usr/bin/time -p numactl -N 1 -m 1 $llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24

| model                          |       size |     params | backend    | threads | n_batch |          test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         pp512 |          5.39 ± 0.01 |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         tg128 |          5.49 ± 0.03 |

build: 35782aee (5014)
real 687.60
user 15653.73
sys 42.67

+ /usr/bin/time -p numactl -N 0 -m 0 $llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24

| model                          |       size |     params | backend    | threads | n_batch |          test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         pp512 |          4.60 ± 0.01 |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         tg128 |          4.67 ± 0.03 |

build: 35782aee (5014)
real 805.99
user 18187.26
sys 48.93

+ /usr/bin/time -p numactl -N 0 -m 0 $patched-llama-bench -m $models/llama-2-7b-chat.Q8_0.gguf -ngl 0 --prio 0 -b 1 -t 24

| model                          |       size |     params | backend    | threads | n_batch |          test |                  t/s |
| ------------------------------ | ---------: | ---------: | ---------- | ------: | ------: | ------------: | -------------------: |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         pp512 |          5.35 ± 0.01 |
| llama 7B Q8_0                  |   6.67 GiB |     6.74 B | CPU        |      24 |       1 |         tg128 |          5.46 ± 0.02 |

build: 35782aee (5014)
real 696.12
user 15735.41
sys 44.08

Suggested-by: Ritesh Harjani (IBM)
Signed-off-by: Vishal Chourasia
---
 src/CMakeLists.txt |  7 ++++++-
 src/llama-mmap.cpp | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1cd316b03e132..4d67dfc571d88 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -36,7 +36,12 @@ target_include_directories(llama PRIVATE .)
 target_include_directories(llama PUBLIC ../include)
 target_compile_features   (llama PRIVATE cxx_std_17) # don't bump
 
-target_link_libraries(llama PUBLIC ggml)
+# Conditionally link numa on Linux
+if (CMAKE_SYSTEM_NAME MATCHES "Linux")
+    target_link_libraries(llama PUBLIC ggml numa)
+else()
+    target_link_libraries(llama PUBLIC ggml)
+endif()
 
 if (BUILD_SHARED_LIBS)
     set_target_properties(llama PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp
index 9da97f1bc5057..d0d4a7df5ca7c 100644
--- a/src/llama-mmap.cpp
+++ b/src/llama-mmap.cpp
@@ -10,6 +10,12 @@
 #include <cerrno>
 #include <algorithm>
 
+#ifdef __linux__
+    #include <numa.h>
+    #include <numaif.h>
+    #include <sched.h>
+#endif
+
 #ifdef __has_include
     #if __has_include(<unistd.h>)
         #include <unistd.h>
@@ -273,6 +279,27 @@ struct llama_mmap::impl {
 #ifdef _POSIX_MAPPED_FILES
     std::vector<std::pair<size_t, size_t>> mapped_fragments;
 
+#ifdef __linux__
+    static void move_pages(void *addr, size_t size) {
+        int cpu, ret;
+        struct bitmask *nodemask = numa_allocate_nodemask();
+
+        /* Get memory policy of the calling thread. */
+        ret = get_mempolicy(nullptr, nodemask->maskp, nodemask->size, nullptr, 0);
+        if (ret || numa_bitmask_weight(nodemask) == 0) {
+            cpu = sched_getcpu();
+            if (cpu >= 0) {
+                numa_bitmask_clearall(nodemask);
+                numa_bitmask_setbit(nodemask, numa_node_of_cpu(cpu));
+            }
+        }
+        if (numa_bitmask_weight(nodemask) == 1) {
+            mbind(addr, size, MPOL_BIND, nodemask->maskp, nodemask->size, MPOL_MF_MOVE);
+        }
+        numa_free_nodemask(nodemask);
+    }
+#endif
+
     impl(struct llama_file * file, size_t prefetch, bool numa) {
         size = file->size();
         int fd = file->file_id();
@@ -291,6 +318,17 @@ struct llama_mmap::impl {
         }
 
         if (prefetch > 0) {
+#ifdef __linux__
+            /*
+             * Given that we already pre-fault all memory when prefetch > 0, it is
+             * necessary to move any page cache pages that might have been
+             * instantiated during previous runs on different NUMA nodes. This call
+             * to move_pages() ensures that all memory-mapped pages are relocated
+             * according to the calling thread's memory policy or the CPU on which
+             * it is running.
+             */
+            move_pages(addr, file->size());
+#endif
             if (posix_madvise(addr, std::min(file->size(), prefetch), POSIX_MADV_WILLNEED)) {
                 LLAMA_LOG_WARN("warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
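
For reviewers who want to exercise the relocation strategy outside the
llama.cpp tree, below is a minimal standalone sketch of the same logic.
Everything in it is illustrative and not part of the patch: the file name
numa-move-demo.cpp and the helper name move_pages_to_local_node are made up
(the helper is renamed here so it does not shadow the kernel's move_pages(2)
syscall), and it assumes libnuma and its headers are installed. Build with:
g++ -O2 numa-move-demo.cpp -o numa-move-demo -lnuma

#include <cstdio>
#include <fcntl.h>
#include <sched.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <numa.h>
#include <numaif.h>

/* Same best-effort logic as the patch: prefer the calling thread's memory
 * policy, fall back to the node of the CPU we are currently running on, and
 * only migrate when the target resolves to exactly one node. */
static void move_pages_to_local_node(void *addr, size_t size) {
    struct bitmask *nodemask = numa_allocate_nodemask();

    if (get_mempolicy(nullptr, nodemask->maskp, nodemask->size, nullptr, 0) != 0 ||
            numa_bitmask_weight(nodemask) == 0) {
        int cpu = sched_getcpu();
        if (cpu >= 0) {
            numa_bitmask_clearall(nodemask);
            numa_bitmask_setbit(nodemask, numa_node_of_cpu(cpu));
        }
    }
    if (numa_bitmask_weight(nodemask) == 1) {
        /* MPOL_MF_MOVE also migrates pages that are already resident, which
         * is what relocates stale page cache left over from another node. */
        mbind(addr, size, MPOL_BIND, nodemask->maskp, nodemask->size, MPOL_MF_MOVE);
    }
    numa_free_nodemask(nodemask);
}

int main(int argc, char **argv) {
    if (argc != 2 || numa_available() < 0) {
        fprintf(stderr, "usage: %s <file>  (needs libnuma and a NUMA system)\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDONLY);
    struct stat st{};
    if (fd < 0 || fstat(fd, &st) != 0) {
        perror("open/fstat");
        return 1;
    }

    void *addr = mmap(nullptr, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    move_pages_to_local_node(addr, (size_t) st.st_size);

    /* Touch every page so the (now node-local) copies are actually read. */
    long page = sysconf(_SC_PAGESIZE);
    volatile unsigned char sink = 0;
    for (off_t i = 0; i < st.st_size; i += page) {
        sink += ((unsigned char *) addr)[i];
    }
    (void) sink;

    munmap(addr, (size_t) st.st_size);
    close(fd);
    return 0;
}

One way to observe the effect is to read the file while bound to node 1,
run the demo under numactl -N 0 -m 0, and compare the per-node FilePages
counters reported by numastat -m before and after.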