From ae3eff1ad2d4fa2865439e2d26de98746803ff1f Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 17 Feb 2023 13:56:00 +0200 Subject: [PATCH] Tracing panic hook (#3475) Fixes #3468. This does change how the panics look, and most importantly, make sure they are not interleaved with other messages. Adds a `GET /v1/panic` endpoint for panic testing (useful for sentry dedup and this hook testing). The panics are now logged within a new error level span called `panic` which separates it from other error level events. The panic info is unpacked into span fields: - thread=mgmt request worker - location="pageserver/src/http/routes.rs:898:9" Co-authored-by: Christian Schwarz --- pageserver/src/bin/pageserver.rs | 54 ++++++++++++++++++++++++++++++-- pageserver/src/http/routes.rs | 12 +++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 50eefa8c77..c499fd8d74 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> { } }; + // Initialize logging, which must be initialized before the custom panic hook is installed. + logging::init(conf.log_format)?; + + // disable the default rust panic hook by using `set_hook`. sentry will install it's own on top + // of this, always processing the panic before we log it. + std::panic::set_hook(Box::new(tracing_panic_hook)); + // initialize sentry if SENTRY_DSN is provided let _sentry_guard = init_sentry( Some(GIT_VERSION.into()), @@ -210,9 +217,6 @@ fn start_pageserver( launch_ts: &'static LaunchTimestamp, conf: &'static PageServerConf, ) -> anyhow::Result<()> { - // Initialize logging - logging::init(conf.log_format)?; - // Print version and launch timestamp to the log, // and expose them as prometheus metrics. // A changed version string indicates changed software. @@ -495,6 +499,50 @@ fn cli() -> Command { ) } +/// Named symbol for our panic hook, which logs the panic. +fn tracing_panic_hook(info: &std::panic::PanicInfo) { + // following rust 1.66.1 std implementation: + // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288 + let location = info.location(); + + let msg = match info.payload().downcast_ref::<&'static str>() { + Some(s) => *s, + None => match info.payload().downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + + let thread = std::thread::current(); + let thread = thread.name().unwrap_or(""); + let backtrace = std::backtrace::Backtrace::capture(); + + struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>); + + impl std::fmt::Display for PrettyLocation<'_, '_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column()) + } + } + + let _entered = if let Some(location) = location { + tracing::error_span!("panic", %thread, location = %PrettyLocation(location)) + } else { + // very unlikely to hit here, but the guarantees of std could change + tracing::error_span!("panic", %thread) + } + .entered(); + + if backtrace.status() == std::backtrace::BacktraceStatus::Captured { + // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really + // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to + // string, maybe even to a TLS one but tracing already does that. + tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}"); + } else { + tracing::error!("{msg}"); + } +} + #[test] fn verify_cli() { cli().debug_assert(); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 7cd7e81fe1..71273159b7 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1029,6 +1029,17 @@ async fn active_timeline_of_active_tenant( .map_err(ApiError::NotFound) } +async fn always_panic_handler(req: Request) -> Result, ApiError> { + // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook(). + // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it. + // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic. + let query = req.uri().query(); + let _ = std::panic::catch_unwind(|| { + panic!("unconditional panic for testing panic hook integration; request query: {query:?}") + }); + json_response(StatusCode::NO_CONTENT, ()) +} + async fn handler_404(_: Request) -> Result, ApiError> { json_response( StatusCode::NOT_FOUND, @@ -1147,5 +1158,6 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name", evict_timeline_layer_handler, ) + .get("/v1/panic", always_panic_handler) .any(handler_404)) }