Skip to content

Commit 0272ba4

Browse files
authored
Add crashtracker support for sidecar (#1265)
Add crashtracker support for sidecar This also switches from a stack pointer based approach to an instruction pointer based approach to remove stack frames. It seems like in rust code - at least in my test rust code - the crashing frame was always omitted; e.g.: datadog_crashtracker::collector::crash_handler::handle_posix_sigaction -> sp: 0xffffb763ed20 __kernel_rt_sigreturn -> sp: 0xffffb763eda0 datadog_sidecar::unix::test -> sp: 0xffffb763eed0 < dummy function to test a crash - here it crashes, but it's omitted, because the sp is right below __kernel_rt_sigreturn frame... for reasons? ddog_daemon_entry_point -> sp: 0xffffdb23c560 < and that's now above the fault_sp. Signed-off-by: Bob Weinand <bob.weinand@datadoghq.com>
1 parent ba89553 commit 0272ba4

File tree

15 files changed

+278
-114
lines changed

15 files changed

+278
-114
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bin_tests/tests/crashtracker_bin_test.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -200,17 +200,13 @@ fn test_crash_tracking_callstack() {
200200

201201
// Note: in Release, we do not have the crate and module name prepended to the function name
202202
// Here we compile the crashing app in Debug.
203-
let mut expected_functions = Vec::new();
204-
// It seems that on arm/arm64, fn3 is inlined in fn2, so not present.
205-
// Add fn3 only for x86_64 arch
206-
#[cfg(target_arch = "x86_64")]
207-
expected_functions.push("crashing_test_app::unix::fn3");
208-
expected_functions.extend_from_slice(&[
203+
let expected_functions = [
204+
"crashing_test_app::unix::fn3",
209205
"crashing_test_app::unix::fn2",
210206
"crashing_test_app::unix::fn1",
211207
"crashing_test_app::unix::main",
212208
"crashing_test_app::main",
213-
]);
209+
];
214210

215211
let crashing_callstack = &crash_payload["error"]["stack"]["frames"];
216212
assert!(

datadog-crashtracker/src/collector/emitters.rs

Lines changed: 82 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ pub enum EmitterError {
4545
unsafe fn emit_backtrace_by_frames(
4646
w: &mut impl Write,
4747
resolve_frames: StacktraceCollection,
48-
fault_rsp: usize,
48+
fault_ip: usize,
4949
) -> Result<(), EmitterError> {
5050
// https://docs.rs/backtrace/latest/backtrace/index.html
5151
writeln!(w, "{DD_CRASHTRACK_BEGIN_STACKTRACE}")?;
@@ -61,56 +61,68 @@ unsafe fn emit_backtrace_by_frames(
6161
Ok(())
6262
}
6363

64-
backtrace::trace_unsynchronized(|frame| {
65-
// Skip all stack frames whose stack pointer is less than to the determined crash stack
66-
// pointer (fault_rsp). These frames belong exclusively to the crash tracker and the
67-
// backtrace functionality and are therefore not relevant for troubleshooting.
68-
let sp = frame.sp();
69-
if !sp.is_null() && (sp as usize) < fault_rsp {
70-
return true;
71-
}
72-
if resolve_frames == StacktraceCollection::EnabledWithInprocessSymbols {
73-
backtrace::resolve_frame_unsynchronized(frame, |symbol| {
74-
#[allow(clippy::unwrap_used)]
75-
write!(w, "{{").unwrap();
76-
#[allow(clippy::unwrap_used)]
77-
emit_absolute_addresses(w, frame).unwrap();
78-
if let Some(column) = symbol.colno() {
64+
let mut ip_found = false;
65+
loop {
66+
backtrace::trace_unsynchronized(|frame| {
67+
// Skip all stack frames until we encounter the determined crash instruction pointer
68+
// (fault_ip). These initial frames belong exclusively to the crash tracker and the
69+
// backtrace functionality and are therefore not relevant for troubleshooting.
70+
let ip = frame.ip();
71+
if ip as usize == fault_ip {
72+
ip_found = true;
73+
}
74+
if !ip_found {
75+
return true;
76+
}
77+
if resolve_frames == StacktraceCollection::EnabledWithInprocessSymbols {
78+
backtrace::resolve_frame_unsynchronized(frame, |symbol| {
7979
#[allow(clippy::unwrap_used)]
80-
write!(w, ", \"column\": {column}").unwrap();
81-
}
82-
if let Some(file) = symbol.filename() {
83-
// The debug printer for path already wraps it in `"` marks.
80+
write!(w, "{{").unwrap();
8481
#[allow(clippy::unwrap_used)]
85-
write!(w, ", \"file\": {file:?}").unwrap();
86-
}
87-
if let Some(function) = symbol.name() {
82+
emit_absolute_addresses(w, frame).unwrap();
83+
if let Some(column) = symbol.colno() {
84+
#[allow(clippy::unwrap_used)]
85+
write!(w, ", \"column\": {column}").unwrap();
86+
}
87+
if let Some(file) = symbol.filename() {
88+
// The debug printer for path already wraps it in `"` marks.
89+
#[allow(clippy::unwrap_used)]
90+
write!(w, ", \"file\": {file:?}").unwrap();
91+
}
92+
if let Some(function) = symbol.name() {
93+
#[allow(clippy::unwrap_used)]
94+
write!(w, ", \"function\": \"{function}\"").unwrap();
95+
}
96+
if let Some(line) = symbol.lineno() {
97+
#[allow(clippy::unwrap_used)]
98+
write!(w, ", \"line\": {line}").unwrap();
99+
}
88100
#[allow(clippy::unwrap_used)]
89-
write!(w, ", \"function\": \"{function}\"").unwrap();
90-
}
91-
if let Some(line) = symbol.lineno() {
101+
writeln!(w, "}}").unwrap();
102+
// Flush eagerly to ensure that each frame gets emitted even if the next one
103+
// fails
92104
#[allow(clippy::unwrap_used)]
93-
write!(w, ", \"line\": {line}").unwrap();
94-
}
105+
w.flush().unwrap();
106+
});
107+
} else {
108+
#[allow(clippy::unwrap_used)]
109+
write!(w, "{{").unwrap();
110+
#[allow(clippy::unwrap_used)]
111+
emit_absolute_addresses(w, frame).unwrap();
95112
#[allow(clippy::unwrap_used)]
96113
writeln!(w, "}}").unwrap();
97114
// Flush eagerly to ensure that each frame gets emitted even if the next one fails
98115
#[allow(clippy::unwrap_used)]
99116
w.flush().unwrap();
100-
});
101-
} else {
102-
#[allow(clippy::unwrap_used)]
103-
write!(w, "{{").unwrap();
104-
#[allow(clippy::unwrap_used)]
105-
emit_absolute_addresses(w, frame).unwrap();
106-
#[allow(clippy::unwrap_used)]
107-
writeln!(w, "}}").unwrap();
108-
// Flush eagerly to ensure that each frame gets emitted even if the next one fails
109-
#[allow(clippy::unwrap_used)]
110-
w.flush().unwrap();
117+
}
118+
true // keep going to the next frame
119+
});
120+
if ip_found {
121+
break;
111122
}
112-
true // keep going to the next frame
113-
});
123+
// emit anything at all, if the crashing frame is not found for some reason
124+
ip_found = true;
125+
}
114126
writeln!(w, "{DD_CRASHTRACK_END_STACKTRACE}")?;
115127
w.flush()?;
116128
Ok(())
@@ -145,8 +157,8 @@ pub(crate) fn emit_crashreport(
145157
// https://doc.rust-lang.org/src/std/backtrace.rs.html#332
146158
// Do this last, so even if it crashes, we still get the other info.
147159
if config.resolve_frames() != StacktraceCollection::Disabled {
148-
let fault_rsp = extract_rsp(ucontext);
149-
unsafe { emit_backtrace_by_frames(pipe, config.resolve_frames(), fault_rsp)? };
160+
let fault_ip = extract_ip(ucontext);
161+
unsafe { emit_backtrace_by_frames(pipe, config.resolve_frames(), fault_ip)? };
150162
}
151163
writeln!(pipe, "{DD_CRASHTRACK_DONE}")?;
152164
pipe.flush()?;
@@ -307,17 +319,17 @@ fn emit_text_file(w: &mut impl Write, path: &str) -> Result<(), EmitterError> {
307319
Ok(())
308320
}
309321

310-
fn extract_rsp(ucontext: *const ucontext_t) -> usize {
322+
fn extract_ip(ucontext: *const ucontext_t) -> usize {
311323
unsafe {
312324
#[cfg(all(target_os = "macos", target_arch = "x86_64"))]
313-
return (*(*ucontext).uc_mcontext).__ss.__rsp as usize;
325+
return (*(*ucontext).uc_mcontext).__ss.__rip as usize;
314326
#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
315-
return (*(*ucontext).uc_mcontext).__ss.__sp as usize;
327+
return (*(*ucontext).uc_mcontext).__ss.__pc as usize;
316328

317329
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
318-
return (*ucontext).uc_mcontext.gregs[libc::REG_RSP as usize] as usize;
330+
return (*ucontext).uc_mcontext.gregs[libc::REG_RIP as usize] as usize;
319331
#[cfg(all(target_os = "linux", target_arch = "aarch64"))]
320-
return (*ucontext).uc_mcontext.sp as usize;
332+
return (*ucontext).uc_mcontext.pc as usize;
321333
}
322334
}
323335

@@ -326,14 +338,28 @@ mod tests {
326338
use super::*;
327339
use std::str;
328340

329-
#[test]
330-
#[cfg_attr(miri, ignore)]
331-
fn test_emit_backtrace_disabled() {
341+
#[inline(never)]
342+
fn inner_test_emit_backtrace_with_symbols(collection: StacktraceCollection) -> Vec<u8> {
343+
let mut ip_of_test_fn = 0;
344+
let mut skip = 3;
345+
unsafe {
346+
backtrace::trace_unsynchronized(|frame| {
347+
ip_of_test_fn = frame.ip() as usize;
348+
skip -= 1;
349+
skip > 0
350+
})
351+
};
332352
let mut buf = Vec::new();
333353
unsafe {
334-
emit_backtrace_by_frames(&mut buf, StacktraceCollection::Disabled, 0)
335-
.expect("to work ;-)");
354+
emit_backtrace_by_frames(&mut buf, collection, ip_of_test_fn).expect("to work ;-)");
336355
}
356+
buf
357+
}
358+
359+
#[test]
360+
#[cfg_attr(miri, ignore)]
361+
fn test_emit_backtrace_disabled() {
362+
let buf = inner_test_emit_backtrace_with_symbols(StacktraceCollection::Disabled);
337363
let out = str::from_utf8(&buf).expect("to be valid UTF8");
338364
assert!(out.contains("BEGIN_STACKTRACE"));
339365
assert!(out.contains("END_STACKTRACE"));
@@ -353,18 +379,10 @@ mod tests {
353379
#[test]
354380
#[cfg_attr(miri, ignore)]
355381
fn test_emit_backtrace_with_symbols() {
356-
let dummy = 0u8;
382+
let buf = inner_test_emit_backtrace_with_symbols(
383+
StacktraceCollection::EnabledWithInprocessSymbols,
384+
);
357385
// retrieve stack pointer for this function
358-
let sp_of_test_fn = &dummy as *const u8 as usize;
359-
let mut buf = Vec::new();
360-
unsafe {
361-
emit_backtrace_by_frames(
362-
&mut buf,
363-
StacktraceCollection::EnabledWithInprocessSymbols,
364-
sp_of_test_fn,
365-
)
366-
.expect("to work ;-)");
367-
}
368386
let out = str::from_utf8(&buf).expect("to be valid UTF8");
369387
assert!(out.contains("BEGIN_STACKTRACE"));
370388
assert!(out.contains("END_STACKTRACE"));

datadog-ipc/tests/flock.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,15 @@ use std::{
99
};
1010

1111
use datadog_ipc::platform::locks::FLock;
12-
use spawn_worker::{assert_child_exit, entrypoint, fork::set_default_child_panic_handler, Stdio};
12+
use spawn_worker::{
13+
assert_child_exit, entrypoint, fork::set_default_child_panic_handler, Stdio, TrampolineData,
14+
};
1315
use tempfile::tempdir;
1416

1517
static ENV_LOCK_PATH: &str = "__LOCK_PATH";
1618

1719
#[no_mangle]
18-
pub extern "C" fn flock_test_entrypoint() {
20+
pub extern "C" fn flock_test_entrypoint(_trampoline_data: &TrampolineData) {
1921
set_default_child_panic_handler();
2022
let lock_path = std::env::var(ENV_LOCK_PATH).unwrap();
2123

datadog-sidecar-ffi/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ pub extern "C" fn ddog_sidecar_transport_drop(_: Box<SidecarTransport>) {}
309309
/// Caller must ensure the process is safe to fork, at the time when this method is called
310310
#[no_mangle]
311311
pub extern "C" fn ddog_sidecar_connect(connection: &mut *mut SidecarTransport) -> MaybeError {
312-
let cfg = datadog_sidecar::config::Config::get();
312+
let cfg = datadog_sidecar::config::FromEnv::config();
313313

314314
let stream = Box::new(try_c!(datadog_sidecar::start_or_connect_to_sidecar(cfg)));
315315
*connection = Box::into_raw(stream);

datadog-sidecar/src/config.rs

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
// Copyright 2021-Present Datadog, Inc. https://www.datadoghq.com/
22
// SPDX-License-Identifier: Apache-2.0
33

4+
use ddcommon::Endpoint;
45
use http::uri::{PathAndQuery, Scheme};
56
use serde::{Deserialize, Serialize};
6-
use std::{collections::HashMap, path::PathBuf, time::Duration};
7-
8-
use ddcommon::Endpoint;
97
use spawn_worker::LibDependency;
8+
use std::sync::LazyLock;
9+
use std::{collections::HashMap, path::PathBuf, time::Duration};
1010

1111
const ENV_SIDECAR_IPC_MODE: &str = "_DD_DEBUG_SIDECAR_IPC_MODE";
1212
const SIDECAR_IPC_MODE_SHARED: &str = "shared";
@@ -28,6 +28,8 @@ const ENV_SIDECAR_SELF_TELEMETRY: &str = "_DD_SIDECAR_SELF_TELEMETRY";
2828

2929
const ENV_SIDECAR_WATCHDOG_MAX_MEMORY: &str = "_DD_SIDECAR_WATCHDOG_MAX_MEMORY";
3030

31+
const ENV_SIDECAR_CRASHTRACKER_ENDPOINT: &str = "_DD_SIDECAR_CRASHTRACKER_ENDPOINT";
32+
3133
const ENV_SIDECAR_APPSEC_SHARED_LIB_PATH: &str = "_DD_SIDECAR_APPSEC_SHARED_LIB_PATH";
3234
const ENV_SIDECAR_APPSEC_SOCKET_FILE_PATH: &str = "_DD_SIDECAR_APPSEC_SOCKET_FILE_PATH";
3335
const ENV_SIDECAR_APPSEC_LOCK_FILE_PATH: &str = "_DD_SIDECAR_APPSEC_LOCK_FILE_PATH";
@@ -89,6 +91,7 @@ pub struct Config {
8991
pub self_telemetry: bool,
9092
pub library_dependencies: Vec<LibDependency>,
9193
pub child_env: HashMap<std::ffi::OsString, std::ffi::OsString>,
94+
pub crashtracker_endpoint: Option<Endpoint>,
9295
pub appsec_config: Option<AppSecConfig>,
9396
pub max_memory: usize,
9497
}
@@ -102,9 +105,11 @@ pub struct AppSecConfig {
102105
pub log_level: String,
103106
}
104107

108+
static ENV_CONFIG: LazyLock<Config> = LazyLock::new(FromEnv::config);
109+
105110
impl Config {
106-
pub fn get() -> Self {
107-
FromEnv::config()
111+
pub fn get() -> &'static Self {
112+
&ENV_CONFIG
108113
}
109114

110115
pub fn to_env(&self) -> HashMap<&'static str, std::ffi::OsString> {
@@ -120,6 +125,9 @@ impl Config {
120125
self.self_telemetry.to_string().into(),
121126
),
122127
]);
128+
if let Ok(json) = serde_json::to_string(&self.crashtracker_endpoint) {
129+
res.insert(ENV_SIDECAR_CRASHTRACKER_ENDPOINT, json.into());
130+
}
123131
if self.appsec_config.is_some() {
124132
#[allow(clippy::unwrap_used)]
125133
res.extend(self.appsec_config.as_ref().unwrap().to_env());
@@ -225,6 +233,12 @@ impl FromEnv {
225233
.unwrap_or(0)
226234
}
227235

236+
fn crashtracker_endpoint() -> Option<Endpoint> {
237+
std::env::var(ENV_SIDECAR_CRASHTRACKER_ENDPOINT)
238+
.ok()
239+
.and_then(|json| serde_json::from_str(&json).ok())
240+
}
241+
228242
pub fn config() -> Config {
229243
Config {
230244
ipc_mode: Self::ipc_mode(),
@@ -234,12 +248,13 @@ impl FromEnv {
234248
self_telemetry: Self::self_telemetry(),
235249
library_dependencies: vec![],
236250
child_env: std::env::vars_os().collect(),
251+
crashtracker_endpoint: Self::crashtracker_endpoint(),
237252
appsec_config: Self::appsec_config(),
238253
max_memory: Self::max_memory(),
239254
}
240255
}
241256

242-
pub fn appsec_config() -> Option<AppSecConfig> {
257+
fn appsec_config() -> Option<AppSecConfig> {
243258
let shared_lib_path = std::env::var_os(ENV_SIDECAR_APPSEC_SHARED_LIB_PATH)?;
244259
let socket_file_path = std::env::var_os(ENV_SIDECAR_APPSEC_SOCKET_FILE_PATH)?;
245260
let lock_file_path = std::env::var_os(ENV_SIDECAR_APPSEC_LOCK_FILE_PATH)?;

datadog-sidecar/src/log.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ pub(crate) fn enable_logging() -> anyhow::Result<()> {
410410
let filter = MULTI_LOG_FILTER.add(config.log_level.clone());
411411
_ = PERMANENT_MIN_LOG_LEVEL.set(filter);
412412
}
413-
MULTI_LOG_WRITER.add(config.log_method); // same than MULTI_LOG_FILTER
413+
MULTI_LOG_WRITER.add(config.log_method.clone()); // same than MULTI_LOG_FILTER
414414

415415
LogTracer::init()?;
416416

0 commit comments

Comments
 (0)