diff --git a/bindings/profilers/wall.cc b/bindings/profilers/wall.cc index 938aa9e6..7ab05a9c 100644 --- a/bindings/profilers/wall.cc +++ b/bindings/profilers/wall.cc @@ -14,6 +14,9 @@ * limitations under the License. */ +#include +#include +#include #include #include #include @@ -22,10 +25,7 @@ #include #include -#include -#include -#include - +#include "defer.hh" #include "per-isolate-data.hh" #include "translate-time-profile.hh" #include "wall.hh" @@ -59,6 +59,67 @@ using namespace v8; namespace dd { +class SignalGuard { + std::atomic& guard_; + + inline void store(bool value) { + std::atomic_signal_fence(std::memory_order_release); + guard_.store(value, std::memory_order_relaxed); + } + + public: + inline SignalGuard(std::atomic& guard) : guard_(guard) { store(true); } + + inline ~SignalGuard() { store(false); } +}; + +void SetContextPtr(ContextPtr& contextPtr, + Isolate* isolate, + Local value) { + if (!value->IsNullOrUndefined()) { + contextPtr = std::make_shared>(isolate, value); + } else { + contextPtr.reset(); + } +} + +class PersistentContextPtr { + ContextPtr context; + std::vector* dead; + Persistent per; + + public: + PersistentContextPtr(std::vector* dead) : dead(dead) {} + + void UnregisterFromGC() { + if (!per.IsEmpty()) { + per.ClearWeak(); + per.Reset(); + } + } + + void MarkDead() { dead->push_back(this); } + + void RegisterForGC(Isolate* isolate, const Local& obj) { + // Register a callback to delete this object when the object is GCed + per.Reset(isolate, obj); + per.SetWeak( + this, + [](const WeakCallbackInfo& data) { + auto ptr = data.GetParameter(); + ptr->MarkDead(); + ptr->UnregisterFromGC(); + }, + WeakCallbackType::kParameter); + } + + void Set(Isolate* isolate, const Local& value) { + SetContextPtr(context, isolate, value); + } + + ContextPtr Get() const { return context; } +}; + // Maximum number of rounds in the GetV8ToEpochOffset static constexpr int MAX_EPOCH_OFFSET_ATTEMPTS = 20; @@ -332,8 +393,7 @@ void SignalHandler::HandleProfilerSignal(int sig, auto time_from = Now(); old_handler(sig, info, context); auto time_to = Now(); - auto async_id = prof->GetAsyncId(isolate); - prof->PushContext(time_from, time_to, cpu_time, async_id); + prof->PushContext(time_from, time_to, cpu_time, isolate); } #else class SignalHandler { @@ -544,8 +604,10 @@ WallProfiler::WallProfiler(std::chrono::microseconds samplingPeriod, bool workaroundV8Bug, bool collectCpuTime, bool collectAsyncId, - bool isMainThread) + bool isMainThread, + bool useCPED) : samplingPeriod_(samplingPeriod), + useCPED_(useCPED), includeLines_(includeLines), withContexts_(withContexts), isMainThread_(isMainThread) { @@ -556,12 +618,16 @@ WallProfiler::WallProfiler(std::chrono::microseconds samplingPeriod, workaroundV8Bug_ = workaroundV8Bug && DD_WALL_USE_SIGPROF && detectV8Bug_; collectCpuTime_ = collectCpuTime && withContexts; collectAsyncId_ = collectAsyncId && withContexts; +#if NODE_MAJOR_VERSION >= 23 + useCPED_ = useCPED && withContexts; +#else + useCPED_ = false; +#endif if (withContexts_) { contexts_.reserve(duration * 2 / samplingPeriod); } - curContext_.store(&context1_, std::memory_order_relaxed); collectionMode_.store(CollectionMode::kNoCollect, std::memory_order_relaxed); gcCount.store(0, std::memory_order_relaxed); @@ -576,10 +642,26 @@ WallProfiler::WallProfiler(std::chrono::microseconds samplingPeriod, jsArray_ = v8::Global(isolate, jsArray); std::fill(fields_, fields_ + kFieldCount, 0); - if (collectAsyncId_) { + if (collectAsyncId_ || useCPED_) { 
isolate->AddGCPrologueCallback(&GCPrologueCallback, this); isolate->AddGCEpilogueCallback(&GCEpilogueCallback, this); } + + if (useCPED_) { + cpedSymbol_.Reset( + isolate, + Private::ForApi(isolate, + String::NewFromUtf8Literal( + isolate, "dd::WallProfiler::cpedSymbol_"))); + } +} + +void WallProfiler::UpdateContextCount() { + std::atomic_store_explicit( + reinterpret_cast*>( + &fields_[WallProfiler::Fields::kCPEDContextCount]), + liveContextPtrs_.size(), + std::memory_order_relaxed); } void WallProfiler::Dispose(Isolate* isolate, bool removeFromMap) { @@ -591,13 +673,21 @@ void WallProfiler::Dispose(Isolate* isolate, bool removeFromMap) { g_profilers.RemoveProfiler(isolate, this); } - if (collectAsyncId_) { + if (collectAsyncId_ || useCPED_) { isolate->RemoveGCPrologueCallback(&GCPrologueCallback, this); isolate->RemoveGCEpilogueCallback(&GCEpilogueCallback, this); } node::RemoveEnvironmentCleanupHook( isolate, &WallProfiler::CleanupHook, isolate); + + for (auto ptr : liveContextPtrs_) { + ptr->UnregisterFromGC(); + delete ptr; + } + liveContextPtrs_.clear(); + deadContextPtrs_.clear(); + UpdateContextCount(); } } @@ -653,6 +743,14 @@ NAN_METHOD(WallProfiler::New) { DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(collectCpuTime); DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(collectAsyncId); DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(isMainThread); + DD_WALL_PROFILER_GET_BOOLEAN_CONFIG(useCPED); + +#if NODE_MAJOR_VERSION < 23 + if (useCPED) { + return Nan::ThrowTypeError( + "useCPED is not supported on this Node.js version."); + } +#endif if (withContexts && !DD_WALL_USE_SIGPROF) { return Nan::ThrowTypeError("Contexts are not supported."); @@ -692,7 +790,8 @@ NAN_METHOD(WallProfiler::New) { workaroundV8Bug, collectCpuTime, collectAsyncId, - isMainThread); + isMainThread, + useCPED); obj->Wrap(info.This()); info.GetReturnValue().Set(info.This()); } else { @@ -763,6 +862,7 @@ v8::ProfilerId WallProfiler::StartInternal() { if (withContexts_ || workaroundV8Bug_) { SignalHandler::IncreaseUseCount(); fields_[kSampleCount] = 0; + fields_[kCPEDContextCount] = 0; } if (collectCpuTime_) { @@ -982,6 +1082,11 @@ NAN_MODULE_INIT(WallProfiler::Init) { Nan::New(kSampleCount), ReadOnlyDontDelete) .FromJust(); + Nan::DefineOwnProperty(constants, + Nan::New("kCPEDContextCount").ToLocalChecked(), + Nan::New(kCPEDContextCount), + ReadOnlyDontDelete) + .FromJust(); Nan::DefineOwnProperty(target, Nan::New("constants").ToLocalChecked(), constants, @@ -1006,29 +1111,134 @@ v8::CpuProfiler* WallProfiler::CreateV8CpuProfiler() { return cpuProfiler_; } -v8::Local WallProfiler::GetContext(Isolate* isolate) { - auto context = *curContext_.load(std::memory_order_relaxed); - if (!context) return v8::Undefined(isolate); - return context->Get(isolate); +Local WallProfiler::GetContext(Isolate* isolate) { + auto context = GetContextPtr(isolate); + if (context) { + return context->Get(isolate); + } + return Undefined(isolate); +} + +void WallProfiler::SetCurrentContextPtr(Isolate* isolate, Local value) { + SignalGuard m(setInProgress_); + SetContextPtr(curContext_, isolate, value); } void WallProfiler::SetContext(Isolate* isolate, Local value) { - // Need to be careful here, because we might be interrupted by a - // signal handler that will make use of curContext_. - // Update of shared_ptr is not atomic, so instead we use a pointer - // (curContext_) that points on two shared_ptr (context1_ and context2_), - // update the shared_ptr that is not currently in use and then atomically - // update curContext_. 
- auto newCurContext = curContext_.load(std::memory_order_relaxed) == &context1_ - ? &context2_ - : &context1_; - if (!value->IsNullOrUndefined()) { - *newCurContext = std::make_shared>(isolate, value); +#if NODE_MAJOR_VERSION >= 23 + if (!useCPED_) { + SetCurrentContextPtr(isolate, value); + return; + } + + defer { + UpdateContextCount(); + }; + + // Clean up dead context pointers + for (auto ptr : deadContextPtrs_) { + liveContextPtrs_.erase(ptr); + delete ptr; + } + deadContextPtrs_.clear(); + + auto cped = isolate->GetContinuationPreservedEmbedderData(); + // No Node AsyncContextFrame in this continuation yet + if (!cped->IsObject()) return; + + auto v8Ctx = isolate->GetCurrentContext(); + // This should always be called from a V8 context, but check just in case. + if (v8Ctx.IsEmpty()) return; + + auto cpedObj = cped.As(); + auto localSymbol = cpedSymbol_.Get(isolate); + auto maybeProfData = cpedObj->GetPrivate(v8Ctx, localSymbol); + if (maybeProfData.IsEmpty()) return; + + PersistentContextPtr* contextPtr = nullptr; + auto profData = maybeProfData.ToLocalChecked(); + SignalGuard m(setInProgress_); + if (profData->IsUndefined()) { + if (value->IsNullOrUndefined()) { + // Don't go to the trouble of mutating the CPED for null or undefined as + // the absence of a sample context will be interpreted as undefined in + // GetContextPtr anyway. + return; + } + contextPtr = new PersistentContextPtr(&deadContextPtrs_); + + auto external = External::New(isolate, contextPtr); + auto maybeSetResult = cpedObj->SetPrivate(v8Ctx, localSymbol, external); + if (maybeSetResult.IsNothing()) { + delete contextPtr; + return; + } + liveContextPtrs_.insert(contextPtr); + contextPtr->RegisterForGC(isolate, cpedObj); } else { - newCurContext->reset(); + contextPtr = + static_cast(profData.As()->Value()); } - std::atomic_signal_fence(std::memory_order_release); - curContext_.store(newCurContext, std::memory_order_relaxed); + + contextPtr->Set(isolate, value); +#else + SetCurrentContextPtr(isolate, value); +#endif +} + +ContextPtr WallProfiler::GetContextPtrSignalSafe(Isolate* isolate) { + auto isSetInProgress = setInProgress_.load(std::memory_order_relaxed); + std::atomic_signal_fence(std::memory_order_acquire); + if (isSetInProgress) { + // New sample context is being set. Safe behavior is to not try attempt + // Object::Get on it and just return empty right now. + return ContextPtr(); + } + + if (useCPED_) { + auto curGcCount = gcCount.load(std::memory_order_relaxed); + std::atomic_signal_fence(std::memory_order_acquire); + if (curGcCount > 0) { + return gcContext_; + } + } + + return GetContextPtr(isolate); +} + +ContextPtr WallProfiler::GetContextPtr(Isolate* isolate) { +#if NODE_MAJOR_VERSION >= 23 + if (!useCPED_) { + return curContext_; + } + + if (!isolate->IsInUse()) { + // Must not try to create a handle scope if isolate is not in use. 
+ return ContextPtr(); + } + HandleScope scope(isolate); + + auto cped = isolate->GetContinuationPreservedEmbedderData(); + if (cped->IsObject()) { + auto v8Ctx = isolate->GetEnteredOrMicrotaskContext(); + if (!v8Ctx.IsEmpty()) { + auto cpedObj = cped.As(); + auto localSymbol = cpedSymbol_.Get(isolate); + auto maybeProfData = cpedObj->GetPrivate(v8Ctx, localSymbol); + if (!maybeProfData.IsEmpty()) { + auto profData = maybeProfData.ToLocalChecked(); + if (!profData->IsUndefined()) { + return static_cast( + profData.As()->Value()) + ->Get(); + } + } + } + } + return ContextPtr(); +#else + return curContext_; +#endif } NAN_GETTER(WallProfiler::GetContext) { @@ -1061,6 +1271,10 @@ NAN_METHOD(WallProfiler::Dispose) { } double GetAsyncIdNoGC(v8::Isolate* isolate) { + if (!isolate->IsInUse()) { + // Must not try to create a handle scope if isolate is not in use. + return -1; + } #if NODE_MAJOR_VERSION >= 24 HandleScope scope(isolate); auto context = isolate->GetEnteredOrMicrotaskContext(); @@ -1086,27 +1300,39 @@ void WallProfiler::OnGCStart(v8::Isolate* isolate) { auto curCount = gcCount.load(std::memory_order_relaxed); std::atomic_signal_fence(std::memory_order_acquire); if (curCount == 0) { - gcAsyncId = GetAsyncIdNoGC(isolate); + if (collectAsyncId_) { + gcAsyncId = GetAsyncIdNoGC(isolate); + } + if (useCPED_) { + gcContext_ = GetContextPtrSignalSafe(isolate); + } } std::atomic_signal_fence(std::memory_order_release); gcCount.store(curCount + 1, std::memory_order_relaxed); } void WallProfiler::OnGCEnd() { - gcCount.fetch_sub(1, std::memory_order_relaxed); + auto oldCount = gcCount.fetch_sub(1, std::memory_order_relaxed); + if (oldCount == 1 && useCPED_) { + // Not strictly necessary, as we'll reset it to something else on next GC, + // but why retain it longer than needed? + gcContext_.reset(); + } } void WallProfiler::PushContext(int64_t time_from, int64_t time_to, int64_t cpu_time, - double async_id) { + Isolate* isolate) { // Be careful this is called in a signal handler context therefore all // operations must be async signal safe (in particular no allocations). // Our ring buffer avoids allocations. - auto context = curContext_.load(std::memory_order_relaxed); - std::atomic_signal_fence(std::memory_order_acquire); if (contexts_.size() < contexts_.capacity()) { - contexts_.push_back({*context, time_from, time_to, cpu_time, async_id}); + contexts_.push_back({GetContextPtrSignalSafe(isolate), + time_from, + time_to, + cpu_time, + GetAsyncId(isolate)}); std::atomic_fetch_add_explicit( reinterpret_cast*>(&fields_[kSampleCount]), 1U, diff --git a/bindings/profilers/wall.hh b/bindings/profilers/wall.hh index caea8b5c..15d5c17e 100644 --- a/bindings/profilers/wall.hh +++ b/bindings/profilers/wall.hh @@ -36,29 +36,36 @@ struct Result { std::string msg; }; +using ContextPtr = std::shared_ptr>; + +class PersistentContextPtr; + class WallProfiler : public Nan::ObjectWrap { public: enum class CollectionMode { kNoCollect, kPassThrough, kCollectContexts }; + enum Fields { kSampleCount, kCPEDContextCount, kFieldCount }; private: - enum Fields { kSampleCount, kFieldCount }; - - using ContextPtr = std::shared_ptr>; - std::chrono::microseconds samplingPeriod_{0}; v8::CpuProfiler* cpuProfiler_ = nullptr; - // TODO: Investigate use of v8::Persistent instead of shared_ptr to - // avoid heap allocation. Need to figure out the right move/copy semantics in - // and out of the ring buffer. - // We're using a pair of shared pointers and an atomic pointer-to-current as - // a way to ensure signal safety on update. 
-  ContextPtr context1_;
-  ContextPtr context2_;
-  std::atomic<ContextPtr*> curContext_;
+  bool useCPED_ = false;
+  // If we aren't using the CPED, we use a single context ptr stored here.
+  ContextPtr curContext_;
+  // Otherwise we'll use a private symbol to store the context in CPED objects.
+  v8::Global<v8::Private> cpedSymbol_;
+  // We track live context pointers in a set to avoid memory leaks. They will
+  // be deleted when the profiler is disposed.
+  std::unordered_set<PersistentContextPtr*> liveContextPtrs_;
+  // Context pointers belonging to GC'd CPED objects register themselves here.
+  // They will be deleted and removed from liveContextPtrs_ next time SetContext
+  // is invoked.
+  std::vector<PersistentContextPtr*> deadContextPtrs_;
   std::atomic<int> gcCount = 0;
+  std::atomic<bool> setInProgress_ = false;
   double gcAsyncId;
+  ContextPtr gcContext_;
   std::atomic<CollectionMode> collectionMode_;
   std::atomic<int> noCollectCallCount_;
@@ -107,6 +114,12 @@ class WallProfiler : public Nan::ObjectWrap {
   static void CleanupHook(void* data);
   void Cleanup(v8::Isolate* isolate);
+  ContextPtr GetContextPtr(v8::Isolate* isolate);
+  ContextPtr GetContextPtrSignalSafe(v8::Isolate* isolate);
+
+  void SetCurrentContextPtr(v8::Isolate* isolate, v8::Local<v8::Value> context);
+  void UpdateContextCount();
+
  public:
   /**
    * @param samplingPeriodMicros sampling interval, in microseconds
    * @param durationMicros the duration of workload execution for which the data
    * parameter is informative; it is up to the caller to call the Stop method
    * every period. The parameter is used to preallocate data structures that
    * should not be reallocated in async signal safe code.
+   * @param useCPED whether to use the V8 ContinuationPreservedEmbedderData to
+   * store the current sampling context. It can be used if AsyncLocalStorage
+   * uses the AsyncContextFrame implementation (experimental in Node 23, default
+   * in Node 24.)
    */
   explicit WallProfiler(std::chrono::microseconds samplingPeriod,
                         std::chrono::microseconds duration,
                         bool includeLines,
                         bool withContexts,
                         bool workaroundV8bug,
                         bool collectCpuTime,
                         bool collectAsyncId,
-                        bool isMainThread);
+                        bool isMainThread,
+                        bool useCPED);
   v8::Local<v8::Value> GetContext(v8::Isolate*);
   void SetContext(v8::Isolate*, v8::Local<v8::Value>);
   void PushContext(int64_t time_from,
                    int64_t time_to,
                    int64_t cpu_time,
-                   double async_id);
+                   v8::Isolate* isolate);
   Result StartImpl();
   v8::ProfilerId StartInternal();
   Result StopImpl(bool restart, v8::Local<v8::Value>& profile);
diff --git a/doc/sample_context_in_cped.md b/doc/sample_context_in_cped.md
new file mode 100644
index 00000000..259c7749
--- /dev/null
+++ b/doc/sample_context_in_cped.md
@@ -0,0 +1,190 @@
+# Storing Sample Context in V8 Continuation-Preserved Embedder Data
+
+## What is the Sample Context?
+Datadog's Node.js profiler has the ability to store a custom object that it
+will then associate with collected CPU samples. We refer to this object as the
+"sample context." A higher-level embedding (typically, dd-trace-js) will then
+update the sample context to keep it current with changes in the execution. A
+typical piece of data the sample context stores is the tracing span ID, so
+whenever it changes, the sample context needs to be updated.
+
+## How is the Sample Context stored and updated?
+Before Node 23, the sample context would be stored in a
+`std::shared_ptr<v8::Global<v8::Value>>` field on the C++ `WallProfiler`
+instance. (In fact, since updates need to be atomic and shared pointers can't
+be effectively updated atomically, it is actually a pair of fields with an
+atomic pointer-to-shared-pointer switching between them, but I digress.)
+Due to it being a single piece of instance state, it had to be updated every
+time the active span changed: possibly on every invocation of
+`AsyncLocalStorage.enterWith` and `.run`, but even more importantly on every
+async context change, and for that we needed to register a "before" callback
+with `async_hooks.createHook`. In other words, we not only had to update the
+sample context on every async context change, we also had to rely on
+`async_hooks.createHook`, which is being deprecated in Node. Current
+documentation for it is not exactly a shining endorsement:
+> Please migrate away from this API, if you can. We do not recommend using the
+> createHook, AsyncHook, and executionAsyncResource APIs as they have usability
+> issues, safety risks, and performance implications.
+
+Fortunately, first the V8 engine and then Node.js gave us building blocks for a
+better solution.
+
+## V8 Continuation-Preserved Embedder Data and Node.js Async Context Frame
+Starting with version 12 of the V8 engine (the one shipping in Node 22),
+`v8::Isolate` exposes an API to set and get embedder-specific data on it so
+that it is preserved across executions that are logical continuations of each
+other (essentially: across promise chains; this includes await expressions.)
+Even though the APIs are exposed on the isolate, the data is stored on a
+per-continuation basis and the engine takes care to return the right one when
+the `Isolate::GetContinuationPreservedEmbedderData()` method is invoked. We
+will refer to continuation-preserved embedder data as "CPED" from now on.
+
+Starting with Node.js 23, CPED is used to implement the data storage behind
+the Node.js `AsyncLocalStorage` API. This dovetails nicely with our needs, as
+all the span-related data we set on the sample context is normally managed in
+an async local storage (ALS) by the tracer. An application can create any
+number of ALSes, and each ALS manages a single value per async context. This
+value is somewhat confusingly called the "store" of the async local storage,
+so it is important not to confuse the terms "storage" (an entity with multiple
+values, one per async context) and "store" (the value of a storage within a
+particular async context.)
+
+The new implementation for storing ALS stores introduces an internal Node.js
+class named `AsyncContextFrame` (ACF), which is a map that uses ALSes as keys
+and their stores as the map values, essentially providing a mapping from an
+ALS to its store in the current async context. (This implementation is very
+similar to how e.g. Java implements `ThreadLocal`, which is a close analogue
+to ALS in Node.js.) ACF instances are then stored in the CPED.
+
+## Storing the Sample Context in CPED
+Node.js – as the embedder of V8 – commandeers the CPED to store instances of
+ACF in it. This means that our profiler can't directly store our sample context
+in the CPED, because then we'd overwrite the ACF reference already in there and
+break Node.js. Fortunately, since ACF is "just" an ordinary JavaScript object,
+we can define a new property on it and store our sample context there.
+JavaScript properties can have strings, numbers, or symbols as their keys;
+symbols are the recommended way to define properties hidden from unrelated
+code, as they are private to their creator and only compare equal to
+themselves.
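+
+To make this concrete, here is a condensed sketch of the V8 calls involved,
+simplified from the `WallProfiler::SetContext` implementation in `wall.cc`; the
+standalone `StoreOnCurrentACF` helper is a made-up name for illustration, and
+error handling is elided:
+```
+// Store a value on the ACF object found in the CPED, keyed by a private symbol.
+void StoreOnCurrentACF(v8::Isolate* isolate, v8::Local<v8::Value> value) {
+  auto cped = isolate->GetContinuationPreservedEmbedderData();
+  if (!cped->IsObject()) return;  // no AsyncContextFrame in this continuation
+  auto acf = cped.As<v8::Object>();
+  auto ctx = isolate->GetCurrentContext();
+  // Private::ForApi returns the same symbol for the same name within an isolate.
+  auto key = v8::Private::ForApi(
+      isolate,
+      v8::String::NewFromUtf8Literal(isolate, "dd::WallProfiler::cpedSymbol_"));
+  acf->SetPrivate(ctx, key, value).Check();
+}
+```
+In the real code the symbol is created once in the `WallProfiler` constructor
+and cached in the `cpedSymbol_` member.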
+
+Thus, we create a private symbol in the profiler instance to use as our
+property key, and our logic for storing the sample context becomes:
+* get the CPED from the V8 isolate
+* if it is not an object, do nothing (we can't set the sample context)
+* otherwise set the sample context as a value in the object under our property
+  key.
+
+The reality is a bit thornier, though. Imagine what happens if, while we're
+setting the property, we get interrupted by a PROF signal and the signal
+handler tries to read the property value. It could easily observe an
+inconsistent state and crash. But even if it reads a property value, which one
+did it read? Still the old one, already the new one, or maybe a torn value
+between the two?
+
+Fortunately, we had the exact same problem with our previous approach, where
+we only stored one sample context in the profiler instance, and the solution
+is the same. We encapsulate the pair of shared pointers to a V8 `Global` and
+an atomic pointer-to-pointer in a class named `AtomicContextPtr`, which looks
+like this:
+```
+using ContextPtr = std::shared_ptr<v8::Global<v8::Value>>;
+
+class AtomicContextPtr {
+  ContextPtr ptr1;
+  ContextPtr ptr2;
+  std::atomic<ContextPtr*> currentPtr = &ptr1;
+  ...
+```
+A `Set` method on this class first stores the newly passed sample context in
+either `ptr1` or `ptr2` – whichever one `currentPtr` is _not_ pointing to at
+the moment – and then atomically updates `currentPtr` to point to it (see the
+sketch at the end of this section.)
+
+Instead of storing the current sample context in the ACF property directly, we
+want to store an `AtomicContextPtr` (ACP.) The only problem? This is a C++
+class, and properties of JavaScript objects can only be JavaScript values.
+Fortunately, V8 gives us a solution for this as well: the `v8::External` type
+is a V8 value type that wraps a `void *`. So now the algorithm for setting a
+sample context is:
+* get the CPED from the V8 isolate
+* if it is not an object, do nothing (we can't set the sample context)
+* retrieve the property value. If there is one, it's the `External` wrapping
+  the pointer to the ACP we use.
+* if there is none, allocate a new ACP on the C++ heap, create a `v8::External`
+  to hold its pointer, and store it as a property on the ACF.
+* set the sample context as a value on the retrieved or newly created ACP.
+
+The chain of data now looks something like this:
+```
+v8::Isolate (from Isolate::GetCurrent())
+ +-> current continuation (internally managed by V8)
+   +-> node::AsyncContextFrame (in continuation's CPED field)
+     +-> v8::External (in AsyncContextFrame's private property)
+       +-> dd::AtomicContextPtr (in External's data field)
+         +-> std::shared_ptr<v8::Global<v8::Value>> (in either AtomicContextPtr::ptr1 or ptr2)
+           +-> v8::Global<v8::Value> (in shared_ptr)
+             +-> v8::Value (the actual sample context object)
+```
+The last 3-4 steps were the same in the previous code version as well, except
+`ptr1` and `ptr2` were directly represented in the `WallProfiler`, so then it
+looked like this:
+```
+dd::WallProfiler
+ +-> std::shared_ptr<v8::Global<v8::Value>> (in either WallProfiler::ptr1 or ptr2)
+   +-> v8::Global<v8::Value> (in shared_ptr)
+     +-> v8::Value (the actual sample context object)
+```
+The difference between the two diagrams shows how we encapsulated the
+`(ptr1, ptr2, currentPtr)` tuple into a separate class and moved it out of
+`WallProfiler` instance state into a property of every ACF we encounter.
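+
+For illustration, here is a sketch of what that `Set` method might look like,
+assuming the `ptr1`/`ptr2`/`currentPtr` layout from the snippet above; it
+mirrors the pre-CPED `WallProfiler::SetContext` code removed in the `wall.cc`
+diff:
+```
+void AtomicContextPtr::Set(v8::Isolate* isolate, v8::Local<v8::Value> value) {
+  // Write into whichever slot the signal handler is *not* currently reading.
+  auto next =
+      currentPtr.load(std::memory_order_relaxed) == &ptr1 ? &ptr2 : &ptr1;
+  if (!value->IsNullOrUndefined()) {
+    *next = std::make_shared<v8::Global<v8::Value>>(isolate, value);
+  } else {
+    next->reset();
+  }
+  // Publish the new shared_ptr before flipping the current pointer, so the
+  // signal handler never dereferences a half-updated slot.
+  std::atomic_signal_fence(std::memory_order_release);
+  currentPtr.store(next, std::memory_order_relaxed);
+}
+```
+The signal-handler read path does the inverse: it loads `currentPtr` with an
+acquire fence and then copies the `shared_ptr` it points to.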
+
+## Odds and ends
+And that's mostly it! There are a few more small odds and ends needed to make
+it work safely. We still need to guard writing the property value to the ACF
+against concurrent access by the signal handler, but now this happens only
+once for every ACF, when we create its ACP. We guard it with an atomic boolean
+and proper signal fencing.
+
+The signal handler also needs to be prevented from touching the data while a
+GC is in progress. With this new model, the signal handler unfortunately needs
+to make a small number of V8 API invocations: it needs to retrieve the current
+V8 `Context`, obtain a `Local` for the property key, and finally use both in
+an `Object::Get` call on the CPED. Calling a property getter on an object is
+reentrancy into V8, which is advised against, but since this is an ordinary
+property it ends up being a single dependent load, which turns out to work
+safely… unless a GC is happening. For this reason, we register GC prologue and
+epilogue callbacks with the V8 isolate so we know when GCs are ongoing, and
+the signal handler refrains from touching the CPED during them. We do,
+however, grab the current sample context from the CPED in the GC prologue,
+store it in a profiler instance field, and use it for any samples taken during
+the GC.
+
+Speaking of GC, we can now have an unbounded number of ACPs – one for each
+live ACF. Each ACP is allocated on the C++ heap and needs to be deleted
+eventually. The profiler tracks every ACP it creates in an internal set of
+live ACPs and deletes them all when it itself gets disposed. This alone would
+still allow for unbounded growth, so we additionally register a V8 GC
+finalization callback for every ACF. When V8 collects an ACF instance, its
+finalization callback puts that ACF's ACP into the profiler's internal vector
+of ready-to-delete ACPs, and the profiler processes that vector (deleting each
+ACP and removing it from the live set) on each call to `SetContext`. (A sketch
+of this registration appears at the end of this document.)
+
+## Changes in dd-trace-js
+For completeness, we'll describe the changes in dd-trace-js here as well. The
+main change is that with Node 24 we no longer require async hooks. The
+instrumentation points for `AsyncLocalStorage.enterWith` and
+`AsyncLocalStorage.run` remain in place – they are the only ones needed now.
+
+There are some small performance optimizations that no longer apply with the
+new approach, though. For one, with the old approach we did some data
+conversions (span IDs to string, a tag array to an endpoint string) when a
+sample was captured. With the new approach, we do these conversions for all
+sample contexts during profile serialization. Doing them after each sample
+capture amortized their cost, possibly slightly reducing the latency induced
+at serialization time. With the old approach we also called `SetContext` only
+once per sample – we'd install a sample context to be used for the next sample
+and then keep updating a `ref` field in it with a reference to the actual
+data. Since we no longer have a single sample context (but one per
+continuation) we can't do this anymore, and we need to call `SetContext` on
+every ACF change. The cost of this (basically, going into a native call from
+JavaScript) is still well offset by not having to use async hooks and do work
+on every async context change. We could arguably simplify the code by removing
+those small optimizations.
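+
+As a concrete illustration of the GC-finalization cleanup described in the
+"Odds and ends" section, here is a condensed sketch of the registration,
+modeled on `PersistentContextPtr::RegisterForGC` in the `wall.cc` diff (the
+per-ACF context wrapper is named `PersistentContextPtr` in the code):
+```
+void PersistentContextPtr::RegisterForGC(v8::Isolate* isolate,
+                                         const v8::Local<v8::Object>& acf) {
+  // Hold a weak persistent handle to the ACF object; the callback fires when
+  // V8 collects the ACF.
+  per.Reset(isolate, acf);
+  per.SetWeak(
+      this,
+      [](const v8::WeakCallbackInfo<PersistentContextPtr>& data) {
+        auto self = data.GetParameter();
+        self->MarkDead();          // queue for deletion on the next SetContext
+        self->UnregisterFromGC();  // clear the weak persistent handle
+      },
+      v8::WeakCallbackType::kParameter);
+}
+```
+This keeps the number of live ACPs bounded by the number of live ACFs without
+adding any work to the signal handler itself.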
diff --git a/package.json b/package.json index 0ba2a0f4..6a5b6e02 100644 --- a/package.json +++ b/package.json @@ -57,6 +57,7 @@ "mocha": "^10.2.0", "nan": "^2.22.2", "nyc": "^15.1.0", + "semver": "^7.7.2", "sinon": "^15.2.0", "source-map-support": "^0.5.21", "tmp": "0.2.1", diff --git a/ts/src/time-profiler.ts b/ts/src/time-profiler.ts index b9cd299d..caf50301 100644 --- a/ts/src/time-profiler.ts +++ b/ts/src/time-profiler.ts @@ -30,7 +30,7 @@ import { import {GenerateTimeLabelsFunction} from './v8-types'; import {isMainThread} from 'worker_threads'; -const {kSampleCount} = profilerConstants; +const {kSampleCount, kCPEDContextCount} = profilerConstants; const DEFAULT_INTERVAL_MICROS: Microseconds = 1000; const DEFAULT_DURATION_MILLIS: Milliseconds = 60000; @@ -67,6 +67,7 @@ export interface TimeProfilerOptions { workaroundV8Bug?: boolean; collectCpuTime?: boolean; collectAsyncId?: boolean; + useCPED?: boolean; } const DEFAULT_OPTIONS: TimeProfilerOptions = { @@ -77,6 +78,7 @@ const DEFAULT_OPTIONS: TimeProfilerOptions = { workaroundV8Bug: true, collectCpuTime: false, collectAsyncId: false, + useCPED: false, }; export async function profile(options: TimeProfilerOptions = {}) { @@ -100,8 +102,8 @@ export function start(options: TimeProfilerOptions = {}) { gProfiler.start(); - // If contexts are enabled, set an initial empty context - if (options.withContexts) { + // If contexts are enabled without using CPED, set an initial empty context + if (options.withContexts && !options.useCPED) { setContext({}); } } @@ -176,6 +178,7 @@ export function v8ProfilerStuckEventLoopDetected() { export const constants = { kSampleCount, + kCPEDContextCount, GARBAGE_COLLECTION_FUNCTION_NAME, NON_JS_THREADS_FUNCTION_NAME, }; diff --git a/ts/test/test-time-profiler.ts b/ts/test/test-time-profiler.ts index 2434c813..5a14e8cf 100644 --- a/ts/test/test-time-profiler.ts +++ b/ts/test/test-time-profiler.ts @@ -23,9 +23,15 @@ import {hrtime} from 'process'; import {Label, Profile} from 'pprof-format'; import {AssertionError} from 'assert'; import {GenerateTimeLabelsArgs, LabelSet} from '../src/v8-types'; +import {AsyncLocalStorage} from 'async_hooks'; +import {satisfies} from 'semver'; const assert = require('assert'); +const useCPED = + satisfies(process.versions.node, '>=24.0.0') && + !process.execArgv.includes('--no-async-context-frame'); + const PROFILE_OPTIONS = { durationMillis: 500, intervalMicros: 1000, @@ -44,11 +50,16 @@ describe('Time Profiler', () => { this.skip(); } const startTime = BigInt(Date.now()) * 1000n; + if (useCPED) { + // Ensure an async context frame is created to hold the profiler context. + new AsyncLocalStorage().enterWith(1); + } time.start({ intervalMicros: 20 * 1_000, durationMillis: PROFILE_OPTIONS.durationMillis, withContexts: true, lineNumbers: false, + useCPED, }); const initialContext: {[key: string]: string} = {}; time.setContext(initialContext); @@ -97,11 +108,16 @@ describe('Time Profiler', () => { this.timeout(3000); const intervalNanos = PROFILE_OPTIONS.intervalMicros * 1_000; + if (useCPED) { + // Ensure an async context frame is created to hold the profiler context. + new AsyncLocalStorage().enterWith(1); + } time.start({ intervalMicros: PROFILE_OPTIONS.intervalMicros, durationMillis: PROFILE_OPTIONS.durationMillis, withContexts: true, lineNumbers: false, + useCPED, }); // By repeating the test few times, we also exercise the profiler // start-stop overlap behavior. 
diff --git a/ts/test/worker.ts b/ts/test/worker.ts index a5f5a1bd..155c89f5 100644 --- a/ts/test/worker.ts +++ b/ts/test/worker.ts @@ -3,13 +3,18 @@ import {pbkdf2} from 'crypto'; import {time} from '../src/index'; import {Profile, ValueType} from 'pprof-format'; import {getAndVerifyPresence, getAndVerifyString} from './profiles-for-tests'; +import {satisfies} from 'semver'; import assert from 'assert'; +import {AsyncLocalStorage} from 'async_hooks'; const DURATION_MILLIS = 1000; const intervalMicros = 10000; const withContexts = process.platform === 'darwin' || process.platform === 'linux'; +const useCPED = + satisfies(process.versions.node, '>=24.0.0') && + !process.execArgv.includes('--no-async-context-frame'); function createWorker(durationMs: number): Promise { return new Promise((resolve, reject) => { @@ -52,15 +57,20 @@ function getCpuUsage() { } async function main(durationMs: number) { + if (useCPED) new AsyncLocalStorage().enterWith(1); time.start({ durationMillis: durationMs * 3, intervalMicros, withContexts, collectCpuTime: withContexts, + useCPED: useCPED, }); + if (withContexts) { + time.setContext({}); + } const cpu0 = getCpuUsage(); - const nbWorkers = Number(process.argv[2] || 2); + const nbWorkers = Number(process.argv[2] ?? 2); // start workers const workers = executeWorkers(nbWorkers, durationMs); @@ -91,12 +101,17 @@ async function main(durationMs: number) { } async function worker(durationMs: number) { + if (useCPED) new AsyncLocalStorage().enterWith(1); time.start({ durationMillis: durationMs, intervalMicros, withContexts, collectCpuTime: withContexts, + useCPED: useCPED, }); + if (withContexts) { + time.setContext({}); + } const deadline = Date.now() + durationMs; await Promise.all([bar(deadline), foo(deadline)]);