From adfc523e917b6f8aaf55e1e8c5e6ddf78db86c66 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Wed, 16 Jul 2025 15:50:50 +1000 Subject: [PATCH 1/5] Add jemalloc profiling support via HTTP API. Co-authored-by: Paul Hauner --- .cargo/config.toml | 5 ++- Cargo.lock | 2 + beacon_node/http_api/Cargo.toml | 1 + beacon_node/http_api/src/lib.rs | 49 ++++++++++++++++++++++ common/malloc_utils/Cargo.toml | 5 ++- common/malloc_utils/src/jemalloc.rs | 64 +++++++++++++++++++++++++++++ common/malloc_utils/src/lib.rs | 31 ++++++++++++++ 7 files changed, 154 insertions(+), 3 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index a408305c4d1..8cd4b4b39da 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -1,3 +1,6 @@ [env] # Set the number of arenas to 16 when using jemalloc. -JEMALLOC_SYS_WITH_MALLOC_CONF = "abort_conf:true,narenas:16" +# +# Provide `prof:true` to allow profiling, but `prof_active:false` to require +# profiling to be explicitly activated at runtime (possible via the BN HTTP API). 
+JEMALLOC_SYS_WITH_MALLOC_CONF = "abort_conf:true,narenas:16,prof:true,prof_active:false" \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index af5d63e97f4..2179aab19bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4290,6 +4290,7 @@ dependencies = [ "lighthouse_version", "logging", "lru", + "malloc_utils", "metrics", "network", "operation_pool", @@ -5904,6 +5905,7 @@ dependencies = [ "metrics", "parking_lot 0.12.3", "tikv-jemalloc-ctl", + "tikv-jemalloc-sys", "tikv-jemallocator", ] diff --git a/beacon_node/http_api/Cargo.toml b/beacon_node/http_api/Cargo.toml index 781a4cfa44e..2b4188f378c 100644 --- a/beacon_node/http_api/Cargo.toml +++ b/beacon_node/http_api/Cargo.toml @@ -23,6 +23,7 @@ lighthouse_network = { workspace = true } lighthouse_version = { workspace = true } logging = { workspace = true } lru = { workspace = true } +malloc_utils = { workspace = true } metrics = { workspace = true } network = { workspace = true } operation_pool = { workspace = true } diff --git a/beacon_node/http_api/src/lib.rs b/beacon_node/http_api/src/lib.rs index cacdd4a44c5..013e2388ab9 100644 --- a/beacon_node/http_api/src/lib.rs +++ b/beacon_node/http_api/src/lib.rs @@ -4631,6 +4631,53 @@ pub fn serve( }, ); + // POST lighthouse/malloc/prof_dump + let post_lighthouse_malloc_prof_dump = warp::path("lighthouse") + .and(warp::path("malloc")) + .and(warp::path("prof_dump")) + .and(warp::body::json()) + .and(warp::path::end()) + // Skip the `BeaconProcessor` for memory dumps so we can execute them as + // quickly as possible. Memory dumps should be uncommon and very + // deliberate. 
+ .then(|filename: String| { + let dump = || { + let path = PathBuf::from_str(&filename).map_err(|e| { + warp_utils::reject::custom_bad_request(format!( + "Unable to parse {filename} as path: {e:?}" + )) + })?; + if path.exists() { + Err(warp_utils::reject::custom_bad_request(format!( + "{filename} already exists" + ))) + } else { + malloc_utils::prof_dump(&filename) + .map(|()| warp::reply::json(&filename).into_response()) + .map_err(warp_utils::reject::custom_bad_request) + } + }; + + convert_rejection(dump()) + }); + + // POST lighthouse/malloc/prof_active + let post_lighthouse_malloc_prof_active = warp::path("lighthouse") + .and(warp::path("malloc")) + .and(warp::path("prof_active")) + .and(warp::body::json()) + .and(warp::path::end()) + // Skip the `BeaconProcessor` for profiling so we can execute it as + // quickly as possible. Memory dumps should be uncommon and very + // deliberate. + .then(|enable: bool| { + let result = malloc_utils::prof_active(enable) + .map(|()| warp::reply::json(&enable).into_response()) + .map_err(warp_utils::reject::custom_bad_request); + + convert_rejection(result) + }); + let get_events = eth_v1 .and(warp::path("events")) .and(warp::path::end()) @@ -4908,6 +4955,8 @@ pub fn serve( .uor(post_lighthouse_compaction) .uor(post_lighthouse_add_peer) .uor(post_lighthouse_remove_peer) + .uor(post_lighthouse_malloc_prof_dump) + .uor(post_lighthouse_malloc_prof_active) .recover(warp_utils::reject::handle_rejection), ), ) diff --git a/common/malloc_utils/Cargo.toml b/common/malloc_utils/Cargo.toml index 89973493b4e..25aa3b064b9 100644 --- a/common/malloc_utils/Cargo.toml +++ b/common/malloc_utils/Cargo.toml @@ -6,14 +6,14 @@ edition = { workspace = true } [features] mallinfo2 = [] -jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl"] -jemalloc-profiling = ["tikv-jemallocator/profiling"] +jemalloc = ["tikv-jemallocator", "tikv-jemalloc-ctl", "tikv-jemalloc-sys"] [dependencies] libc = "0.2.79" metrics = { workspace = true } parking_lot = { 
workspace = true } tikv-jemalloc-ctl = { version = "0.6.0", optional = true, features = ["stats"] } +tikv-jemalloc-sys = { version = "0.6.0", optional = true } [target.'cfg(not(target_os = "linux"))'.dependencies] tikv-jemallocator = { version = "0.6.0", optional = true, features = ["stats"] } @@ -23,4 +23,5 @@ tikv-jemallocator = { version = "0.6.0", optional = true, features = ["stats"] } tikv-jemallocator = { version = "0.6.0", optional = true, features = [ "stats", "background_threads", + "profiling" ] } diff --git a/common/malloc_utils/src/jemalloc.rs b/common/malloc_utils/src/jemalloc.rs index 2e90c0ddf33..4ab0b86ba48 100644 --- a/common/malloc_utils/src/jemalloc.rs +++ b/common/malloc_utils/src/jemalloc.rs @@ -7,10 +7,13 @@ //! //! A) `JEMALLOC_SYS_WITH_MALLOC_CONF` at compile-time. //! B) `_RJEM_MALLOC_CONF` at runtime. + use metrics::{ set_gauge, set_gauge_vec, try_create_int_gauge, try_create_int_gauge_vec, IntGauge, IntGaugeVec, }; +use std::ffi::{c_char, c_int}; use std::sync::LazyLock; +use std::{mem, ptr}; use tikv_jemalloc_ctl::{arenas, epoch, raw, stats, Access, AsName, Error}; #[global_allocator] @@ -124,6 +127,67 @@ pub fn page_size() -> Result { "arenas.page\0".name().read() } +/// A convenience wrapper around `mallctl` for writing `value` to `name`. +/// +/// # Safety +/// +/// - `name` must be a valid, null-terminated jemalloc control name. +/// - `value` must match the expected type for the specified control. +/// - The jemalloc allocator must be initialised. +/// +/// Incorrect usage may cause undefined behaviour or allocator corruption. +unsafe fn mallctl_write(name: &[u8], mut value: T) -> Result<(), c_int> { + // Use `tikv_jemalloc_sys::mallctl` directly since the `jemalloc_ctl::raw` + // functions artificially limit the `name` values. 
+ let status = tikv_jemalloc_sys::mallctl( + name as *const _ as *const c_char, + ptr::null_mut(), + ptr::null_mut(), + &mut value as *mut _ as *mut _, + mem::size_of::(), + ); + + if status == 0 { + Ok(()) + } else { + Err(status) + } +} + +/// Add a C-style `0x00` terminator to the string and return it as a `Vec` of +/// bytes. +#[allow(dead_code)] +fn terminate_string_for_c(s: &str) -> Vec { + let mut terminated = vec![0x00_u8; s.len() + 1]; + terminated[..s.len()].copy_from_slice(s.as_ref()); + terminated +} + +/// Uses `mallctl` to call `"prof.dump"`. +/// +/// This generates a heap profile at `filename`. +#[allow(dead_code)] +pub fn prof_dump(filename: &str) -> Result<(), String> { + let terminated_filename = terminate_string_for_c(filename); + + unsafe { + mallctl_write( + "prof.dump\0".as_ref(), + terminated_filename.as_ptr() as *const c_char, + ) + } + .map_err(|e| format!("Failed to call prof.dump on mallctl: {e:?}")) +} + +/// Uses `mallctl` to call `"prof.active"`. +/// +/// Controls whether profile sampling is active. 
+#[allow(dead_code)] +pub fn prof_active(enable: bool) -> Result<(), String> { + unsafe { mallctl_write("prof.active\0".as_ref(), enable) } + .map_err(|e| format!("Failed to call prof.active on mallctl with code {e:?}")) +} + #[cfg(test)] mod test { use super::*; diff --git a/common/malloc_utils/src/lib.rs b/common/malloc_utils/src/lib.rs index 50d2785a744..45139fd9072 100644 --- a/common/malloc_utils/src/lib.rs +++ b/common/malloc_utils/src/lib.rs @@ -44,6 +44,11 @@ pub use interface::*; mod interface { pub use crate::glibc::configure_glibc_malloc as configure_memory_allocator; pub use crate::glibc::scrape_mallinfo_metrics as scrape_allocator_metrics; + + #[allow(dead_code)] + pub use super::prof_active_unsupported as prof_active; + #[allow(dead_code)] + pub use super::prof_dump_unsupported as prof_dump; } #[cfg(feature = "jemalloc")] @@ -53,6 +58,8 @@ mod interface { Ok(()) } + pub use crate::jemalloc::prof_active; + pub use crate::jemalloc::prof_dump; pub use crate::jemalloc::scrape_jemalloc_metrics as scrape_allocator_metrics; } @@ -68,4 +75,28 @@ mod interface { #[allow(dead_code)] pub fn scrape_allocator_metrics() {} + + #[allow(dead_code)] + pub use super::prof_dump_unsupported as prof_dump; + + #[allow(dead_code)] + pub use super::prof_active_unsupported as prof_active; +} + +#[allow(dead_code)] +pub fn prof_dump_unsupported(_: &str) -> Result<(), String> { + Err( + "Profile dumps are only supported when Lighthouse is built for Linux \ + using the `jemalloc` feature." + .to_string(), + ) +} + +#[allow(dead_code)] +pub fn prof_active_unsupported(_: bool) -> Result<(), String> { + Err( + "Enabling profiling is only supported when Lighthouse is built for Linux \ + using the `jemalloc` feature." + .to_string(), + ) } From c072e5d6086bd591fc91d10433f077477edc87d5 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Thu, 17 Jul 2025 15:26:07 +1000 Subject: [PATCH 2/5] Fix startup error on macos and include profiling feature. 
--- common/malloc_utils/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/malloc_utils/Cargo.toml b/common/malloc_utils/Cargo.toml index 25aa3b064b9..b3b363708e6 100644 --- a/common/malloc_utils/Cargo.toml +++ b/common/malloc_utils/Cargo.toml @@ -16,7 +16,7 @@ tikv-jemalloc-ctl = { version = "0.6.0", optional = true, features = ["stats"] } tikv-jemalloc-sys = { version = "0.6.0", optional = true } [target.'cfg(not(target_os = "linux"))'.dependencies] -tikv-jemallocator = { version = "0.6.0", optional = true, features = ["stats"] } +tikv-jemallocator = { version = "0.6.0", optional = true, features = ["stats", "profiling"] } # Jemalloc's background_threads feature requires Linux (pthreads). [target.'cfg(target_os = "linux")'.dependencies] From 66594d833f103af592837189f38a17f01ff03566 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 18 Jul 2025 13:15:36 +1000 Subject: [PATCH 3/5] Add memory profiling doc. --- book/src/SUMMARY.md | 1 + book/src/memory_profiling.md | 51 ++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 book/src/memory_profiling.md diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index feecd4b6894..0fb2e0f0b08 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -63,6 +63,7 @@ * [FAQs](./faq.md) * [Protocol Developers](./developers.md) * [Lighthouse Architecture](./developers_architecture.md) + * [Memory Profiling](./memory_profiling.md) * [Security Researchers](./security.md) * [Archived](./archived.md) * [Merge Migration](./archived_merge_migration.md) diff --git a/book/src/memory_profiling.md b/book/src/memory_profiling.md new file mode 100644 index 00000000000..46d971a18f6 --- /dev/null +++ b/book/src/memory_profiling.md @@ -0,0 +1,51 @@ +# Memory Profiling in Lighthouse + +Lighthouse ships with jemalloc enabled by default on Linux, with heap profiling (`prof:true,prof_active:false`) already configured. 
This guide explains how to capture and inspect heap profiles using `jeprof` to help diagnose memory issues. Use this profiling setup to catch leaks, regressions, or bloated allocation paths. + +## 1. Build Lighthouse with Debug Symbols + +To make the profiling data readable, build Lighthouse with debug symbols: + +```bash +RUSTFLAGS="-C debuginfo=2" make +``` + +This ensures the installed `lighthouse` binary includes symbol information. Like `debug = true` in a Cargo profile, `-C debuginfo=2` emits full debug info and does not change the optimization level, so the binary you profile is still an optimized build. + +## 2. Run the Beacon Node + +Run the node as usual: + +```bash +lighthouse bn ... +``` + +Let it run for a while to accumulate allocations if desired. Note that jemalloc only records allocations after profiling is activated - consider this when deciding when to start profiling. + +> **Be consistent:** When analyzing a profile dump, `jeprof` must be given the exact path to the binary used to launch the process. In this setup, it's simply `$(which lighthouse)`. + +## 3. Start Profiling and Dump Memory + +Enable jemalloc profiling: + +```bash +curl -X POST http://localhost:5052/lighthouse/malloc/prof_active -H "Content-Type: application/json" -d "true" +``` + +Trigger a memory profile dump: + +```bash +curl -X POST http://localhost:5052/lighthouse/malloc/prof_dump -H "Content-Type: application/json" -d '"/home/ubuntu/prof.dump"' +``` + +## 4. Analyze with `jeprof` + +Generate a visualization: + +```bash +jeprof --svg $(which lighthouse) /home/ubuntu/prof.dump > profile.svg +``` + +Open `profile.svg` in a browser to inspect memory hotspots. + +> **Important:** Symbol resolution will fail if the path to `lighthouse` doesn't exactly match how it was invoked. Stick to `$(which lighthouse)` if that's how the binary was executed. 
From 535d1808cde4869127ea4e0b6727fa569ca944e1 Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Fri, 18 Jul 2025 15:03:21 +1000 Subject: [PATCH 4/5] Fix spellcheck. --- book/src/memory_profiling.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/book/src/memory_profiling.md b/book/src/memory_profiling.md index 46d971a18f6..7a666eddec5 100644 --- a/book/src/memory_profiling.md +++ b/book/src/memory_profiling.md @@ -46,6 +46,6 @@ Generate a visualization: jeprof --svg $(which lighthouse) /home/ubuntu/prof.dump > profile.svg ``` -Open `profile.svg` in a browser to inspect memory hotspots. +Open `profile.svg` in a browser to inspect memory usage. > **Important:** Symbol resolution will fail if the path to `lighthouse` doesn't exactly match how it was invoked. Stick to `$(which lighthouse)` if that's how the binary was executed. From 3b01dd72f19418704c1778c48a5b81a1e63e7bfa Mon Sep 17 00:00:00 2001 From: Jimmy Chen Date: Tue, 22 Jul 2025 14:50:06 +1000 Subject: [PATCH 5/5] Update docs --- book/src/SUMMARY.md | 2 +- ...{memory_profiling.md => developers_memory_profiling.md} | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) rename book/src/{memory_profiling.md => developers_memory_profiling.md} (94%) diff --git a/book/src/SUMMARY.md b/book/src/SUMMARY.md index 0fb2e0f0b08..c3207865f33 100644 --- a/book/src/SUMMARY.md +++ b/book/src/SUMMARY.md @@ -63,7 +63,7 @@ * [FAQs](./faq.md) * [Protocol Developers](./developers.md) * [Lighthouse Architecture](./developers_architecture.md) - * [Memory Profiling](./memory_profiling.md) + * [Memory Profiling](./developers_memory_profiling.md) * [Security Researchers](./security.md) * [Archived](./archived.md) * [Merge Migration](./archived_merge_migration.md) diff --git a/book/src/memory_profiling.md b/book/src/developers_memory_profiling.md similarity index 94% rename from book/src/memory_profiling.md rename to book/src/developers_memory_profiling.md index 7a666eddec5..49bb4a55b57 100644 --- 
a/book/src/memory_profiling.md +++ b/book/src/developers_memory_profiling.md @@ -40,6 +40,13 @@ curl -X POST http://localhost:5052/lighthouse/malloc/prof_dump -H "Content-Type: ## 4. Analyze with `jeprof` +Install `jeprof` and its dependencies: + +```bash +sudo apt update +sudo apt install libjemalloc-dev graphviz +``` + Generate a visualization: ```bash