Christian Schwarz
2024-06-24 18:25:57 +00:00
parent 34f42669fa
commit 7507d137de
4 changed files with 19 additions and 21 deletions

View File

@@ -394,7 +394,7 @@ fn start_pageserver(
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
}
// Set up global walredo manager state
// Set up global tracking of walredo processes
let walredo_global_state = BACKGROUND_RUNTIME.block_on(
pageserver::walredo::GlobalState::spawn(conf, shutdown_pageserver.clone()),
);

View File

@@ -82,6 +82,17 @@ pub async fn shutdown_pageserver(
)
.await;
// In theory, walredo processes are tenant-scoped and should have been shut down after
// tenant manager shutdown above.
// In practice, we have lingering walredo processes even after pageserver shutdowns that
// don't hit the systemd TimeoutSec timeout of 10 seconds (i.e., shutdowns that log `Shut down successfully completed` below).
timed(
walredo_global_state.wait_shutdown_complete(),
"wait for walredo processes to exit",
Duration::from_secs(1),
)
.await;
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
// should already have been cancelled via mgr::shutdown_all_tenants
timed(
@@ -91,16 +102,6 @@ pub async fn shutdown_pageserver(
)
.await;
// The caller of this function already cancelled the `shutdown_pageserver` cancellation token,
// to which all the per-tenant walredo _manager_ methods are sensitive.
// This here is just to make sure the underlying walredo _processes_ are gone.
timed(
walredo_global_state.wait_shutdown_complete().await,
"wait for walredo processes to exit",
Duration::from_secs(1),
)
.await;
// Best effort to persist any outstanding deletions, to avoid leaking objects
deletion_queue.shutdown(Duration::from_secs(5)).await;
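Each step of this shutdown sequence is wrapped in a `timed(...)` helper. The helper's body is not part of this diff; the sketch below is only an assumption of what such a wrapper could look like, namely logging a warning when a step exceeds its soft deadline and then continuing to wait. The name `timed` and the `(future, name, Duration)` shape match the call sites above, but the behavior shown here is illustrative, not the pageserver's actual implementation.

use std::time::Duration;
use tracing::{info, warn};

// Illustrative sketch only; the real helper is not shown in this diff and may
// behave differently (for example, it might give up instead of waiting on).
async fn timed<F: std::future::Future>(fut: F, name: &str, warn_at: Duration) -> F::Output {
    let started = std::time::Instant::now();
    // Pin on the stack so the future can be polled again after a soft timeout.
    let mut fut = std::pin::pin!(fut);
    match tokio::time::timeout(warn_at, &mut fut).await {
        Ok(output) => output,
        Err(_elapsed) => {
            // Soft deadline exceeded: surface it in the logs, then keep waiting
            // so the shutdown step still runs to completion.
            warn!("{} is taking longer than expected ({:?})", name, warn_at);
            let output = fut.await;
            info!("{} completed after {:?}", name, started.elapsed());
            output
        }
    }
}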

View File

@@ -84,10 +84,8 @@ impl GlobalState {
self.spawn_gate.close().await;
info!("all walredo processes have been killed and no new ones will be spawned");
}
pub(self) fn is_shutdown_requested(self: &Arc<Self>) -> bool {
self.shutdown_bool.load(Ordering::Relaxed)
}
pub(crate) fn wait_shutdown_complete(self: &Arc<Self>) {
pub(crate) async fn wait_shutdown_complete(self: &Arc<Self>) {
assert!(
self.shutdown.is_cancelled(),
"must cancel the `shutdown` token before waiting, otherwise we will wait forever"
@@ -319,7 +317,10 @@ impl PostgresRedoManager {
let result = proc
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
.await
.context("apply_wal_records");
.map_err(|e| match e {
Error::Cancelled => Error::Cancelled,
Error::Other(e) => Error::Other(e.context("apply_wal_records")),
});
if matches!(result, Err(Error::Cancelled)) {
// bail asap and also avoid log noise due to the error reporting below
return Err(Error::Cancelled);
@@ -396,7 +397,7 @@ impl PostgresRedoManager {
}
n_attempts += 1;
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
return result.map_err(Error::Other);
return result;
}
}
}
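One detail worth spelling out about the `map_err` that replaced `.context("apply_wal_records")` above: adding anyhow context to the whole value would collapse the error enum and break the `matches!(result, Err(Error::Cancelled))` check right below it, so only the `Other` variant gets the context while `Cancelled` passes through untouched. A small standalone illustration of that pattern, using a hypothetical `Error` enum shaped like the one in this file:

// Hypothetical error enum for illustration only.
#[derive(Debug)]
enum Error {
    Cancelled,
    Other(anyhow::Error),
}

fn add_context(result: Result<(), Error>) -> Result<(), Error> {
    // Only `Other` gets extra context; `Cancelled` must stay recognizable so
    // the caller can bail out early without producing log noise.
    result.map_err(|e| match e {
        Error::Cancelled => Error::Cancelled,
        Error::Other(e) => Error::Other(e.context("apply_wal_records")),
    })
}

fn main() {
    let err = add_context(Err(Error::Other(anyhow::anyhow!("short read")))).unwrap_err();
    println!("{err:?}"); // prints the context chain: apply_wal_records, caused by: short read
    assert!(matches!(add_context(Err(Error::Cancelled)), Err(Error::Cancelled)));
}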

View File

@@ -218,10 +218,6 @@ impl WalRedoProcess {
) -> Result<Bytes, super::Error> {
debug_assert_current_span_has_tenant_id();
if self.global_state.is_shutdown_requested() {
return Err(super::Error::Cancelled);
}
let tag = protocol::BufferTag { rel, blknum };
// Serialize all the messages to send the WAL redo process first.