mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-16 18:02:56 +00:00
WIP
This commit is contained in:
@@ -394,7 +394,7 @@ fn start_pageserver(
|
||||
deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle());
|
||||
}
|
||||
|
||||
// Set up global walredo manager state
|
||||
// Set up global tracking of walredo processes
|
||||
let walredo_global_state = BACKGROUND_RUNTIME.block_on(
|
||||
pageserver::walredo::GlobalState::spawn(conf, shutdown_pageserver.clone()),
|
||||
);
|
||||
|
||||
@@ -82,6 +82,17 @@ pub async fn shutdown_pageserver(
|
||||
)
|
||||
.await;
|
||||
|
||||
// In theory, walredo processes are tenant-scoped and should have been shut down after
|
||||
// tenant manager shutdown above.
|
||||
// In practive, we have lingering walredo processes even after pageserver shutdowns that
|
||||
// don't hit the systemd TimeoutSec timeout of 10 seconds (i.e., that log `Shut down successfully completed` below).
|
||||
timed(
|
||||
walredo_global_state.wait_shutdown_complete(),
|
||||
"wait for walredo processes to exit",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Shut down any page service tasks: any in-progress work for particular timelines or tenants
|
||||
// should already have been canclled via mgr::shutdown_all_tenants
|
||||
timed(
|
||||
@@ -91,16 +102,6 @@ pub async fn shutdown_pageserver(
|
||||
)
|
||||
.await;
|
||||
|
||||
// The caller of this function already cancelled the `shutdown_pageserver` cancellation token,
|
||||
// to which all the per-tenant walredo _manager_ methods are sensitive.
|
||||
// This here is just to make sure the underlying walredo _processes_ are gone.
|
||||
timed(
|
||||
walredo_global_state.wait_shutdown_complete().await,
|
||||
"wait for walredo processes to exit",
|
||||
Duration::from_secs(1),
|
||||
)
|
||||
.await;
|
||||
|
||||
// Best effort to persist any outstanding deletions, to avoid leaking objects
|
||||
deletion_queue.shutdown(Duration::from_secs(5)).await;
|
||||
|
||||
|
||||
@@ -84,10 +84,8 @@ impl GlobalState {
|
||||
self.spawn_gate.close().await;
|
||||
info!("all walredo processes have been killed and no new ones will be spawned");
|
||||
}
|
||||
pub(self) fn is_shutdown_requested(self: &Arc<Self>) -> bool {
|
||||
self.shutdown_bool.load(Ordering::Relaxed)
|
||||
}
|
||||
pub(crate) fn wait_shutdown_complete(self: &Arc<Self>) {
|
||||
|
||||
pub(crate) async fn wait_shutdown_complete(self: &Arc<Self>) {
|
||||
assert!(
|
||||
self.shutdown.is_cancelled(),
|
||||
"must cancel the `shutdown` token before waiting, otherwise we will wait forever"
|
||||
@@ -319,7 +317,10 @@ impl PostgresRedoManager {
|
||||
let result = proc
|
||||
.apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
|
||||
.await
|
||||
.context("apply_wal_records");
|
||||
.map_err(|e| match e {
|
||||
Error::Cancelled => Error::Cancelled,
|
||||
Error::Other(e) => Error::Other(e.context("apply_wal_records")),
|
||||
});
|
||||
if matches!(result, Err(Error::Cancelled)) {
|
||||
// bail asap and also avoid log noise due to the error reporting below
|
||||
return Err(Error::Cancelled);
|
||||
@@ -396,7 +397,7 @@ impl PostgresRedoManager {
|
||||
}
|
||||
n_attempts += 1;
|
||||
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
|
||||
return result.map_err(Error::Other);
|
||||
return result;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,10 +218,6 @@ impl WalRedoProcess {
|
||||
) -> Result<Bytes, super::Error> {
|
||||
debug_assert_current_span_has_tenant_id();
|
||||
|
||||
if self.global_state.is_shutdown_requested() {
|
||||
return Err(super::Error::Cancelled);
|
||||
}
|
||||
|
||||
let tag = protocol::BufferTag { rel, blknum };
|
||||
|
||||
// Serialize all the messages to send the WAL redo process first.
|
||||
|
||||
Reference in New Issue
Block a user