mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-28 02:20:42 +00:00
walredo process management: handle every error on the kill() and drop path
If we're not calling kill() before dropping the PostgresRedoProcess, we currently leak it. That's most likely the root cause for #2761. This patch 1. adds an error log message for that case and 2. adds error handling for all errors on the kill() path. If we're a `testing` build, we panic. Otherwise, we log an error and leak the process. The error handling changes (2) are necessary to conclusively state that the root cause for #2761 is indeed (1). If we didn't have them, the root cause could be missing error handling instead. To make the log messages useful, I've added tracing::instrument attributes that log the tenant_id and PID. That helps mapping back the PID of `defunct` processes to pageserver log messages. Note that a defunct process's `/proc/$PID/` directory isn't very useful. We have left little more than its PID. Once we have validated the root cause, we'll find a fix, but that's still an ongoing discussion. refs https://github.com/neondatabase/neon/issues/2761 closes https://github.com/neondatabase/neon/pull/2769
This commit is contained in:
committed by
GitHub
parent
c1a76eb0e5
commit
c3a470a29b
@@ -229,7 +229,7 @@ impl PostgresRedoManager {
|
||||
|
||||
// launch the WAL redo process on first use
|
||||
if process_guard.is_none() {
|
||||
let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
|
||||
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
|
||||
*process_guard = Some(p);
|
||||
}
|
||||
let process = process_guard.as_mut().unwrap();
|
||||
@@ -579,26 +579,29 @@ impl<C: CommandExt> CloseFileDescriptors for C {
|
||||
/// Handle to the Postgres WAL redo process
|
||||
///
|
||||
struct PostgresRedoProcess {
|
||||
tenant_id: TenantId,
|
||||
child: Child,
|
||||
stdin: ChildStdin,
|
||||
stdout: ChildStdout,
|
||||
stderr: ChildStderr,
|
||||
called_kill: bool,
|
||||
}
|
||||
|
||||
impl PostgresRedoProcess {
|
||||
//
|
||||
// Start postgres binary in special WAL redo mode.
|
||||
//
|
||||
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
|
||||
fn launch(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
tenant_id: TenantId,
|
||||
pg_version: u32,
|
||||
) -> Result<PostgresRedoProcess, Error> {
|
||||
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
|
||||
// just create one with constant name. That fails if you try to launch more than
|
||||
// one WAL redo manager concurrently.
|
||||
let datadir = path_with_suffix_extension(
|
||||
conf.tenant_path(tenant_id).join("wal-redo-datadir"),
|
||||
conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
|
||||
TEMP_FILE_SUFFIX,
|
||||
);
|
||||
|
||||
@@ -681,6 +684,7 @@ impl PostgresRedoProcess {
|
||||
})?;
|
||||
|
||||
info!(
|
||||
pid = child.id(),
|
||||
"launched WAL redo postgres process on {}",
|
||||
datadir.display()
|
||||
);
|
||||
@@ -689,22 +693,59 @@ impl PostgresRedoProcess {
|
||||
let stdout = child.stdout.take().unwrap();
|
||||
let stderr = child.stderr.take().unwrap();
|
||||
|
||||
set_nonblock(stdin.as_raw_fd())?;
|
||||
set_nonblock(stdout.as_raw_fd())?;
|
||||
set_nonblock(stderr.as_raw_fd())?;
|
||||
macro_rules! set_nonblock_or_log_err {
|
||||
($file:ident) => {{
|
||||
let res = set_nonblock($file.as_raw_fd());
|
||||
if let Err(e) = &res {
|
||||
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
|
||||
}
|
||||
res
|
||||
}};
|
||||
}
|
||||
set_nonblock_or_log_err!(stdin)?;
|
||||
set_nonblock_or_log_err!(stdout)?;
|
||||
set_nonblock_or_log_err!(stderr)?;
|
||||
|
||||
Ok(PostgresRedoProcess {
|
||||
tenant_id,
|
||||
child,
|
||||
stdin,
|
||||
stdout,
|
||||
stderr,
|
||||
called_kill: false,
|
||||
})
|
||||
}
|
||||
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
|
||||
fn kill(mut self) {
|
||||
let _ = self.child.kill();
|
||||
if let Ok(exit_status) = self.child.wait() {
|
||||
error!("wal-redo-postgres exited with code {}", exit_status);
|
||||
info!("killing wal-redo-postgres process");
|
||||
self.called_kill = true;
|
||||
|
||||
let res = self.child.kill();
|
||||
if let Err(e) = res {
|
||||
// This branch is very unlikely because:
|
||||
// - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
|
||||
// - This is the only place that calls .kill()
|
||||
// - We consume `self`, so, .kill() can't be called twice.
|
||||
// - If the process exited by itself or was killed by someone else,
|
||||
// .kill() will still succeed because we haven't wait()'ed yet.
|
||||
//
|
||||
// So, if we arrive here, we have really no idea what happened,
|
||||
// whether the PID stored in self.child is still valid, etc.
|
||||
// If this function were fallible, we'd return an error, but
|
||||
// since it isn't, all we can do is log an error and proceed
|
||||
// with the wait().
|
||||
error!(error = %e, "failed to SIGKILL wal-redo-postgres; subsequent wait() might fail or wait for wrong process");
|
||||
}
|
||||
|
||||
match self.child.wait() {
|
||||
Ok(exit_status) => {
|
||||
// log at error level since .kill() is something we only do on errors ATM
|
||||
error!(exit_status = %exit_status, "wal-redo-postgres wait successful");
|
||||
}
|
||||
Err(e) => {
|
||||
error!(error = %e, "wal-redo-postgres wait error; might leak the child process; it will show as zombie (defunct)");
|
||||
}
|
||||
}
|
||||
drop(self);
|
||||
}
|
||||
@@ -713,6 +754,7 @@ impl PostgresRedoProcess {
|
||||
// Apply given WAL records ('records') over an old page image. Returns
|
||||
// new page image.
|
||||
//
|
||||
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
|
||||
fn apply_wal_records(
|
||||
&mut self,
|
||||
tag: BufferTag,
|
||||
@@ -838,6 +880,14 @@ impl PostgresRedoProcess {
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for PostgresRedoProcess {
|
||||
fn drop(&mut self) {
|
||||
if !self.called_kill {
|
||||
error!(tenant_id=%self.tenant_id, pid = %self.child.id(), "dropping PostgresRedoProcess that wasn't killed, likely causing defunct postgres process");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Functions for constructing messages to send to the postgres WAL redo
|
||||
// process. See pgxn/neon_walredo/walredoproc.c for
|
||||
// explanation of the protocol.
|
||||
|
||||
Reference in New Issue
Block a user