diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 36578ee4e0..50c57e3405 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -24,7 +24,7 @@ pub mod walredo; use std::path::Path; use crate::task_mgr::TaskKind; -use tracing::info; +use tracing::{info, instrument}; /// Current storage format version /// @@ -45,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]); pub use crate::metrics::preinitialize_metrics; +#[instrument(skip_all)] pub async fn shutdown_pageserver(exit_code: i32) { // Shut down the libpq endpoint task. This prevents new connections from // being accepted. diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cacf80b364..e2798d03a8 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1515,6 +1515,8 @@ impl Tenant { /// /// Used at graceful shutdown. /// + // don't have a tenant_id field, freeze_and_flush adds it + #[instrument(skip_all)] pub async fn freeze_and_flush(&self) -> anyhow::Result<()> { // Scan through the hashmap and collect a list of all the timelines, // while holding the lock. Then drop the lock and actually perform the @@ -1529,13 +1531,15 @@ impl Tenant { }; for timeline in &timelines_to_flush { - timeline.freeze_and_flush().await.with_context(|| { - format!( - "freeze_and_flush timeline {} (state={:?})", - timeline.timeline_id, - timeline.current_state() - ) - })?; + match timeline.freeze_and_flush().await { + Ok(()) => (), + Err(err) => { + tracing::error!( + timeline_id=%timeline.timeline_id, err=?err, + "freeze_and_flush timeline failed", + ); + } + } } Ok(()) diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 81bf967669..c088cfece2 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -226,6 +226,7 @@ pub fn schedule_local_tenant_processing( /// That could be easily misinterpreted by control plane, the consumer of the /// management API. For example, it could attach the tenant on a different pageserver. /// We would then be in split-brain once this pageserver restarts. +#[instrument(skip_all)] pub async fn shutdown_all_tenants() { // Prevent new tenants from being created. let tenants_to_shut_down = { diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 7135b621cb..00393d8d44 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -272,7 +272,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ".*Ignoring new state, equal to the existing one: Stopping" ) env.pageserver.allowed_errors.append( - ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited" + ".*shutdown_pageserver:.*freeze_and_flush.*cannot flush frozen layers when flush_loop is not running, state is Exited" ) ps_http = env.pageserver.http_client()