Instrument the shutdown_all_tenants code path; include timeline_id in the log when a timeline fails to flush

This can be extracted into an independent commit.
This commit is contained in:
Christian Schwarz
2023-05-26 10:11:46 +02:00
parent f2abc4c933
commit 609a929968
4 changed files with 15 additions and 9 deletions

View File

@@ -24,7 +24,7 @@ pub mod walredo;
use std::path::Path;
use crate::task_mgr::TaskKind;
use tracing::info;
use tracing::{info, instrument};
/// Current storage format version
///
@@ -45,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[instrument(skip_all)]
pub async fn shutdown_pageserver(exit_code: i32) {
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.

View File

@@ -1515,6 +1515,8 @@ impl Tenant {
///
/// Used at graceful shutdown.
///
// This span deliberately has no tenant_id field; the per-timeline freeze_and_flush called below is expected to add it.
#[instrument(skip_all)]
pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
// Scan through the hashmap and collect a list of all the timelines,
// while holding the lock. Then drop the lock and actually perform the
@@ -1529,13 +1531,15 @@ impl Tenant {
};
for timeline in &timelines_to_flush {
timeline.freeze_and_flush().await.with_context(|| {
format!(
"freeze_and_flush timeline {} (state={:?})",
timeline.timeline_id,
timeline.current_state()
)
})?;
match timeline.freeze_and_flush().await {
Ok(()) => (),
Err(err) => {
tracing::error!(
timeline_id=%timeline.timeline_id, err=?err,
"freeze_and_flush timeline failed",
);
}
}
}
Ok(())

View File

@@ -226,6 +226,7 @@ pub fn schedule_local_tenant_processing(
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub async fn shutdown_all_tenants() {
// Prevent new tenants from being created.
let tenants_to_shut_down = {

View File

@@ -272,7 +272,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
".*Ignoring new state, equal to the existing one: Stopping"
)
env.pageserver.allowed_errors.append(
".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited"
".*shutdown_pageserver:.*freeze_and_flush.*cannot flush frozen layers when flush_loop is not running, state is Exited"
)
ps_http = env.pageserver.http_client()