mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-16 12:40:36 +00:00
Instrument the shutdown_all_tenants code path; include timeline_id in the logs when a timeline fails to flush
This can be extracted into an independent commit.
This commit is contained in:
@@ -24,7 +24,7 @@ pub mod walredo;
|
||||
use std::path::Path;
|
||||
|
||||
use crate::task_mgr::TaskKind;
|
||||
use tracing::info;
|
||||
use tracing::{info, instrument};
|
||||
|
||||
/// Current storage format version
|
||||
///
|
||||
@@ -45,6 +45,7 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||
|
||||
pub use crate::metrics::preinitialize_metrics;
|
||||
|
||||
#[instrument(skip_all)]
|
||||
pub async fn shutdown_pageserver(exit_code: i32) {
|
||||
// Shut down the libpq endpoint task. This prevents new connections from
|
||||
// being accepted.
|
||||
|
||||
@@ -1515,6 +1515,8 @@ impl Tenant {
|
||||
///
|
||||
/// Used at graceful shutdown.
|
||||
///
|
||||
// This span has no tenant_id field; freeze_and_flush adds it
// (presumably the per-timeline freeze_and_flush called below -- TODO confirm)
|
||||
#[instrument(skip_all)]
|
||||
pub async fn freeze_and_flush(&self) -> anyhow::Result<()> {
|
||||
// Scan through the hashmap and collect a list of all the timelines,
|
||||
// while holding the lock. Then drop the lock and actually perform the
|
||||
@@ -1529,13 +1531,15 @@ impl Tenant {
|
||||
};
|
||||
|
||||
for timeline in &timelines_to_flush {
|
||||
timeline.freeze_and_flush().await.with_context(|| {
|
||||
format!(
|
||||
"freeze_and_flush timeline {} (state={:?})",
|
||||
timeline.timeline_id,
|
||||
timeline.current_state()
|
||||
)
|
||||
})?;
|
||||
match timeline.freeze_and_flush().await {
|
||||
Ok(()) => (),
|
||||
Err(err) => {
|
||||
tracing::error!(
|
||||
timeline_id=%timeline.timeline_id, err=?err,
|
||||
"freeze_and_flush timeline failed",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -226,6 +226,7 @@ pub fn schedule_local_tenant_processing(
|
||||
/// That could be easily misinterpreted by control plane, the consumer of the
|
||||
/// management API. For example, it could attach the tenant on a different pageserver.
|
||||
/// We would then be in split-brain once this pageserver restarts.
|
||||
#[instrument(skip_all)]
|
||||
pub async fn shutdown_all_tenants() {
|
||||
// Prevent new tenants from being created.
|
||||
let tenants_to_shut_down = {
|
||||
|
||||
@@ -272,7 +272,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
|
||||
".*Ignoring new state, equal to the existing one: Stopping"
|
||||
)
|
||||
env.pageserver.allowed_errors.append(
|
||||
".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited"
|
||||
".*shutdown_pageserver:.*freeze_and_flush.*cannot flush frozen layers when flush_loop is not running, state is Exited"
|
||||
)
|
||||
|
||||
ps_http = env.pageserver.http_client()
|
||||
|
||||
Reference in New Issue
Block a user