walredo process management: handle every error on the kill() and drop path

If we're not calling kill() before dropping the PostgresRedoProcess, we
currently leak it.
That's most likely the root cause for #2761.
This patch
1. adds an error log message for that case and
2. adds error handling for all errors on the kill() path. If we're a
`testing` build, we panic. Otherwise, we log an error and leak the
process.

The error handling changes (2) are necessary to conclusively state that
the root cause for #2761 is indeed (1). If we didn't have them, the root
cause could be missing error handling instead.

To make the log messages useful, I've added tracing::instrument
attributes that log the tenant_id and PID. That helps mapping back the
PID of `defunct` processes to pageserver log messages. Note that a
defunct process's `/proc/$PID/` directory isn't very useful. We have
left little more than its PID.

Once we have validated the root cause, we'll find a fix, but that's
still an ongoing discussion.

refs https://github.com/neondatabase/neon/issues/2761
closes https://github.com/neondatabase/neon/pull/2769
This commit is contained in:
Christian Schwarz
2022-11-08 14:03:13 +01:00
committed by GitHub
parent c1a76eb0e5
commit c3a470a29b

View File

@@ -229,7 +229,7 @@ impl PostgresRedoManager {
// launch the WAL redo process on first use
if process_guard.is_none() {
let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
*process_guard = Some(p);
}
let process = process_guard.as_mut().unwrap();
@@ -579,26 +579,29 @@ impl<C: CommandExt> CloseFileDescriptors for C {
/// Handle to the Postgres WAL redo process
///
struct PostgresRedoProcess {
tenant_id: TenantId,
child: Child,
stdin: ChildStdin,
stdout: ChildStdout,
stderr: ChildStderr,
called_kill: bool,
}
impl PostgresRedoProcess {
//
// Start postgres binary in special WAL redo mode.
//
#[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
fn launch(
conf: &PageServerConf,
tenant_id: &TenantId,
tenant_id: TenantId,
pg_version: u32,
) -> Result<PostgresRedoProcess, Error> {
// FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
// just create one with constant name. That fails if you try to launch more than
// one WAL redo manager concurrently.
let datadir = path_with_suffix_extension(
conf.tenant_path(tenant_id).join("wal-redo-datadir"),
conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
TEMP_FILE_SUFFIX,
);
@@ -681,6 +684,7 @@ impl PostgresRedoProcess {
})?;
info!(
pid = child.id(),
"launched WAL redo postgres process on {}",
datadir.display()
);
@@ -689,22 +693,59 @@ impl PostgresRedoProcess {
let stdout = child.stdout.take().unwrap();
let stderr = child.stderr.take().unwrap();
set_nonblock(stdin.as_raw_fd())?;
set_nonblock(stdout.as_raw_fd())?;
set_nonblock(stderr.as_raw_fd())?;
macro_rules! set_nonblock_or_log_err {
($file:ident) => {{
let res = set_nonblock($file.as_raw_fd());
if let Err(e) = &res {
error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
}
res
}};
}
set_nonblock_or_log_err!(stdin)?;
set_nonblock_or_log_err!(stdout)?;
set_nonblock_or_log_err!(stderr)?;
Ok(PostgresRedoProcess {
tenant_id,
child,
stdin,
stdout,
stderr,
called_kill: false,
})
}
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
fn kill(mut self) {
let _ = self.child.kill();
if let Ok(exit_status) = self.child.wait() {
error!("wal-redo-postgres exited with code {}", exit_status);
info!("killing wal-redo-postgres process");
self.called_kill = true;
let res = self.child.kill();
if let Err(e) = res {
// This branch is very unlikely because:
// - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
// - This is the only place that calls .kill()
// - We consume `self`, so, .kill() can't be called twice.
// - If the process exited by itself or was killed by someone else,
// .kill() will still succeed because we haven't wait()'ed yet.
//
// So, if we arrive here, we have really no idea what happened,
// whether the PID stored in self.child is still valid, etc.
// If this function were fallible, we'd return an error, but
// since it isn't, all we can do is log an error and proceed
// with the wait().
error!(error = %e, "failed to SIGKILL wal-redo-postgres; subsequent wait() might fail or wait for wrong process");
}
match self.child.wait() {
Ok(exit_status) => {
// log at error level since .kill() is something we only do on errors ATM
error!(exit_status = %exit_status, "wal-redo-postgres wait successful");
}
Err(e) => {
error!(error = %e, "wal-redo-postgres wait error; might leak the child process; it will show as zombie (defunct)");
}
}
drop(self);
}
@@ -713,6 +754,7 @@ impl PostgresRedoProcess {
// Apply given WAL records ('records') over an old page image. Returns
// new page image.
//
#[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
fn apply_wal_records(
&mut self,
tag: BufferTag,
@@ -838,6 +880,14 @@ impl PostgresRedoProcess {
}
}
impl Drop for PostgresRedoProcess {
fn drop(&mut self) {
if !self.called_kill {
error!(tenant_id=%self.tenant_id, pid = %self.child.id(), "dropping PostgresRedoProcess that wasn't killed, likely causing defunct postgres process");
}
}
}
// Functions for constructing messages to send to the postgres WAL redo
// process. See pgxn/neon_walredo/walredoproc.c for
// explanation of the protocol.