mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 05:20:38 +00:00
fix: shutdown does not kill walredo processes (#8150)
While investigating Pageserver logs from the cases where systemd hangs during shutdown (https://github.com/neondatabase/cloud/issues/11387), I noticed that even if Pageserver shuts down cleanly[^1], there are lingering walredo processes. [^1]: Meaning, pageserver finishes its shutdown procedure and calls `exit(0)` on its own terms, instead of hitting the systemd unit's `TimeoutSec=` limit and getting SIGKILLed. While systemd should never lock up like it does, maybe we can avoid hitting that bug by cleaning up properly. Changes ------- This PR adds a shutdown method to `WalRedoManager` and hooks it up to tenant shutdown. We keep track of intent to shutdown through the new `enum ProcessOnceCell` stored inside the pre-existing `redo_process` field. A gate is added to keep track of running processes, using the new type `struct Process`. Future Work ----------- Requests that don't need the redo process will not observe the shutdown (see doc comment). Doing so would be nice for completeness sake, but doesn't provide much benefit because `Tenant` and `Timeline` already shut down all walredo users. Testing ------- I did manual testing to confirm that the problem exists before this PR and that it's gone after. Setup: * `neon_local` with a single tenant, create some data using `pgbench` * ensure walredo process is running, not pid * watch `strace -e kill,wait4 -f -p "$(pgrep pageserver)"` * `neon_local pageserver stop` With this PR, we always observe ``` $ strace -e kill,wait4 -f -p "$(pgrep pageserver)" ... [pid 591120] --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=591215, si_uid=1000} --- [pid 591134] kill(591174, SIGKILL) = 0 [pid 591134] wait4(591174, <unfinished ...> [pid 591142] --- SIGCHLD {si_signo=SIGCHLD, si_code=CLD_KILLED, si_pid=591174, si_uid=1000, si_status=SIGKILL, si_utime=0, si_stime=0} --- [pid 591134] <... wait4 resumed>[{WIFSIGNALED(s) && WTERMSIG(s) == SIGKILL}], 0, NULL) = 591174 ... +++ exited with 0 +++ ``` Before this PR, we'd usually observe just ``` ... [pid 596239] --- SIGTERM {si_signo=SIGTERM, si_code=SI_USER, si_pid=596455, si_uid=1000} --- ... +++ exited with 0 +++ ``` Refs ---- refs https://github.com/neondatabase/cloud/issues/11387
This commit is contained in:
committed by
GitHub
parent
89cf8df93b
commit
66b0bf41a1
@@ -48,6 +48,7 @@
|
||||
//! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms]
|
||||
//! ```
|
||||
|
||||
use anyhow::Context;
|
||||
use bytes::{Buf, Bytes};
|
||||
use criterion::{BenchmarkId, Criterion};
|
||||
use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
|
||||
@@ -188,6 +189,7 @@ impl Request {
|
||||
manager
|
||||
.request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version)
|
||||
.await
|
||||
.context("request_redo")
|
||||
}
|
||||
|
||||
fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord {
|
||||
|
||||
@@ -88,6 +88,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
|
||||
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||
use crate::tenant::storage_layer::DeltaLayer;
|
||||
use crate::tenant::storage_layer::ImageLayer;
|
||||
use crate::walredo;
|
||||
use crate::InitializationOrder;
|
||||
use std::collections::hash_map::Entry;
|
||||
use std::collections::BTreeSet;
|
||||
@@ -323,6 +324,16 @@ impl From<harness::TestRedoManager> for WalRedoManager {
|
||||
}
|
||||
|
||||
impl WalRedoManager {
|
||||
pub(crate) async fn shutdown(&self) {
|
||||
match self {
|
||||
Self::Prod(mgr) => mgr.shutdown().await,
|
||||
#[cfg(test)]
|
||||
Self::Test(_) => {
|
||||
// Not applicable to test redo manager
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) {
|
||||
match self {
|
||||
Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout),
|
||||
@@ -343,7 +354,7 @@ impl WalRedoManager {
|
||||
base_img: Option<(Lsn, bytes::Bytes)>,
|
||||
records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<bytes::Bytes> {
|
||||
) -> Result<bytes::Bytes, walredo::Error> {
|
||||
match self {
|
||||
Self::Prod(mgr) => {
|
||||
mgr.request_redo(key, lsn, base_img, records, pg_version)
|
||||
@@ -1853,6 +1864,10 @@ impl Tenant {
|
||||
tracing::debug!("Waiting for tasks...");
|
||||
task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await;
|
||||
|
||||
if let Some(walredo_mgr) = self.walredo_mgr.as_ref() {
|
||||
walredo_mgr.shutdown().await;
|
||||
}
|
||||
|
||||
// Wait for any in-flight operations to complete
|
||||
self.gate.close().await;
|
||||
|
||||
@@ -3854,7 +3869,7 @@ pub(crate) mod harness {
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
_pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, walredo::Error> {
|
||||
let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1));
|
||||
if records_neon {
|
||||
// For Neon wal records, we can decode without spawning postgres, so do so.
|
||||
|
||||
@@ -40,6 +40,7 @@ use std::time::Duration;
|
||||
use std::time::Instant;
|
||||
use tracing::*;
|
||||
use utils::lsn::Lsn;
|
||||
use utils::sync::gate::GateError;
|
||||
use utils::sync::heavier_once_cell;
|
||||
|
||||
///
|
||||
@@ -53,10 +54,18 @@ pub struct PostgresRedoManager {
|
||||
tenant_shard_id: TenantShardId,
|
||||
conf: &'static PageServerConf,
|
||||
last_redo_at: std::sync::Mutex<Option<Instant>>,
|
||||
/// The current [`process::WalRedoProcess`] that is used by new redo requests.
|
||||
/// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
|
||||
/// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
|
||||
/// We use [`heavier_once_cell`] for
|
||||
///
|
||||
/// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`])
|
||||
/// 2. prevent new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]).
|
||||
///
|
||||
/// # Spawning
|
||||
///
|
||||
/// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`].
|
||||
///
|
||||
/// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
|
||||
/// their process object; we use [`Arc::clone`] for that.
|
||||
///
|
||||
/// This is primarily because earlier implementations that didn't use [`heavier_once_cell`]
|
||||
/// had that behavior; it's probably unnecessary.
|
||||
/// The only merit of it is that if one walredo process encounters an error,
|
||||
@@ -65,7 +74,63 @@ pub struct PostgresRedoManager {
|
||||
/// still be using the old redo process. But, those other tasks will most likely
|
||||
/// encounter an error as well, and errors are an unexpected condition anyway.
|
||||
/// So, probably we could get rid of the `Arc` in the future.
|
||||
redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
|
||||
///
|
||||
/// # Shutdown
|
||||
///
|
||||
/// See [`Self::launched_processes`].
|
||||
redo_process: heavier_once_cell::OnceCell<ProcessOnceCell>,
|
||||
|
||||
/// Gate that is entered when launching a walredo process and held open
|
||||
/// until the process has been `kill()`ed and `wait()`ed upon.
|
||||
///
|
||||
/// Manager shutdown waits for this gate to close after setting the
|
||||
/// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`].
|
||||
///
|
||||
/// This type of usage is a bit unusual because gates usually keep track of
|
||||
/// concurrent operations, e.g., every [`Self::request_redo`] that is inflight.
|
||||
/// But we use it here to keep track of the _processes_ that we have launched,
|
||||
/// which may outlive any individual redo request because
|
||||
/// - we keep walredo process around until its quiesced to amortize spawn cost and
|
||||
/// - the Arc may be held by multiple concurrent redo requests, so, just because
|
||||
/// you replace the [`Self::redo_process`] cell's content doesn't mean the
|
||||
/// process gets killed immediately.
|
||||
///
|
||||
/// We could simplify this by getting rid of the [`Arc`].
|
||||
/// See the comment on [`Self::redo_process`] for more details.
|
||||
launched_processes: utils::sync::gate::Gate,
|
||||
}
|
||||
|
||||
/// See [`PostgresRedoManager::redo_process`].
|
||||
enum ProcessOnceCell {
|
||||
Spawned(Arc<Process>),
|
||||
ManagerShutDown,
|
||||
}
|
||||
|
||||
struct Process {
|
||||
_launched_processes_guard: utils::sync::gate::GateGuard,
|
||||
process: process::WalRedoProcess,
|
||||
}
|
||||
|
||||
impl std::ops::Deref for Process {
|
||||
type Target = process::WalRedoProcess;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.process
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum Error {
|
||||
#[error("cancelled")]
|
||||
Cancelled,
|
||||
#[error(transparent)]
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
macro_rules! bail {
|
||||
($($arg:tt)*) => {
|
||||
return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*)));
|
||||
}
|
||||
}
|
||||
|
||||
///
|
||||
@@ -88,9 +153,9 @@ impl PostgresRedoManager {
|
||||
base_img: Option<(Lsn, Bytes)>,
|
||||
records: Vec<(Lsn, NeonWalRecord)>,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, Error> {
|
||||
if records.is_empty() {
|
||||
anyhow::bail!("invalid WAL redo request with no records");
|
||||
bail!("invalid WAL redo request with no records");
|
||||
}
|
||||
|
||||
let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID);
|
||||
@@ -148,10 +213,10 @@ impl PostgresRedoManager {
|
||||
chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
|
||||
})
|
||||
},
|
||||
process: self
|
||||
.redo_process
|
||||
.get()
|
||||
.map(|p| WalRedoManagerProcessStatus { pid: p.id() }),
|
||||
process: self.redo_process.get().and_then(|p| match &*p {
|
||||
ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }),
|
||||
ProcessOnceCell::ManagerShutDown => None,
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -170,9 +235,39 @@ impl PostgresRedoManager {
|
||||
conf,
|
||||
last_redo_at: std::sync::Mutex::default(),
|
||||
redo_process: heavier_once_cell::OnceCell::default(),
|
||||
launched_processes: utils::sync::gate::Gate::default(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Shut down the WAL redo manager.
|
||||
///
|
||||
/// After this future completes
|
||||
/// - no redo process is running
|
||||
/// - no new redo process will be spawned
|
||||
/// - redo requests that need walredo process will fail with [`Error::Cancelled`]
|
||||
/// - [`apply_neon`]-only redo requests may still work, but this may change in the future
|
||||
///
|
||||
/// # Cancel-Safety
|
||||
///
|
||||
/// This method is cancellation-safe.
|
||||
pub async fn shutdown(&self) {
|
||||
// prevent new processes from being spawned
|
||||
let permit = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => {
|
||||
let (proc, permit) = guard.take_and_deinit();
|
||||
drop(proc); // this just drops the Arc, its refcount may not be zero yet
|
||||
permit
|
||||
}
|
||||
Err(permit) => permit,
|
||||
};
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::ManagerShutDown, permit);
|
||||
// wait for ongoing requests to drain and the refcounts of all Arc<WalRedoProcess> that
|
||||
// we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s
|
||||
// for the underlying process.
|
||||
self.launched_processes.close().await;
|
||||
}
|
||||
|
||||
/// This type doesn't have its own background task to check for idleness: we
|
||||
/// rely on our owner calling this function periodically in its own housekeeping
|
||||
/// loops.
|
||||
@@ -203,38 +298,48 @@ impl PostgresRedoManager {
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
wal_redo_timeout: Duration,
|
||||
pg_version: u32,
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, Error> {
|
||||
*(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
|
||||
|
||||
let (rel, blknum) = key.to_rel_block().context("invalid record")?;
|
||||
const MAX_RETRY_ATTEMPTS: u32 = 1;
|
||||
let mut n_attempts = 0u32;
|
||||
loop {
|
||||
let proc: Arc<process::WalRedoProcess> =
|
||||
match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => Arc::clone(&guard),
|
||||
Err(permit) => {
|
||||
// don't hold poison_guard, the launch code can bail
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(
|
||||
process::WalRedoProcess::launch(
|
||||
let proc: Arc<Process> = match self.redo_process.get_or_init_detached().await {
|
||||
Ok(guard) => match &*guard {
|
||||
ProcessOnceCell::Spawned(proc) => Arc::clone(proc),
|
||||
ProcessOnceCell::ManagerShutDown => {
|
||||
return Err(Error::Cancelled);
|
||||
}
|
||||
},
|
||||
Err(permit) => {
|
||||
let start = Instant::now();
|
||||
let proc = Arc::new(Process {
|
||||
_launched_processes_guard: match self.launched_processes.enter() {
|
||||
Ok(guard) => guard,
|
||||
Err(GateError::GateClosed) => unreachable!(
|
||||
"shutdown sets the once cell to `ManagerShutDown` state before closing the gate"
|
||||
),
|
||||
},
|
||||
process: process::WalRedoProcess::launch(
|
||||
self.conf,
|
||||
self.tenant_shard_id,
|
||||
pg_version,
|
||||
)
|
||||
.context("launch walredo process")?,
|
||||
);
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process.set(Arc::clone(&proc), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
});
|
||||
let duration = start.elapsed();
|
||||
WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
|
||||
info!(
|
||||
duration_ms = duration.as_millis(),
|
||||
pid = proc.id(),
|
||||
"launched walredo process"
|
||||
);
|
||||
self.redo_process
|
||||
.set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit);
|
||||
proc
|
||||
}
|
||||
};
|
||||
|
||||
let started_at = std::time::Instant::now();
|
||||
|
||||
@@ -299,12 +404,17 @@ impl PostgresRedoManager {
|
||||
match self.redo_process.get() {
|
||||
None => (),
|
||||
Some(guard) => {
|
||||
if Arc::ptr_eq(&proc, &*guard) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
guard.take_and_deinit();
|
||||
} else {
|
||||
// Another task already spawned another redo process (further up in this method)
|
||||
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||
match &*guard {
|
||||
ProcessOnceCell::ManagerShutDown => {}
|
||||
ProcessOnceCell::Spawned(guard_proc) => {
|
||||
if Arc::ptr_eq(&proc, guard_proc) {
|
||||
// We're the first to observe an error from `proc`, it's our job to take it out of rotation.
|
||||
guard.take_and_deinit();
|
||||
} else {
|
||||
// Another task already spawned another redo process (further up in this method)
|
||||
// and put it into `redo_process`. Do nothing, our view of the world is behind.
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -315,7 +425,7 @@ impl PostgresRedoManager {
|
||||
}
|
||||
n_attempts += 1;
|
||||
if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
|
||||
return result;
|
||||
return result.map_err(Error::Other);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -329,7 +439,7 @@ impl PostgresRedoManager {
|
||||
lsn: Lsn,
|
||||
base_img: Option<Bytes>,
|
||||
records: &[(Lsn, NeonWalRecord)],
|
||||
) -> anyhow::Result<Bytes> {
|
||||
) -> Result<Bytes, Error> {
|
||||
let start_time = Instant::now();
|
||||
|
||||
let mut page = BytesMut::new();
|
||||
@@ -338,7 +448,7 @@ impl PostgresRedoManager {
|
||||
page.extend_from_slice(&fpi[..]);
|
||||
} else {
|
||||
// All the current WAL record types that we can handle require a base image.
|
||||
anyhow::bail!("invalid neon WAL redo request with no base image");
|
||||
bail!("invalid neon WAL redo request with no base image");
|
||||
}
|
||||
|
||||
// Apply all the WAL records in the batch
|
||||
|
||||
Reference in New Issue
Block a user