From c3a470a29b14e5d71476f683da48506b3bf198b2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 8 Nov 2022 14:03:13 +0100 Subject: [PATCH] walredo process management: handle every error on the kill() and drop path If we're not calling kill() before dropping the PostgresRedoProcess, we currently leak it. That's most likely the root cause for #2761. This patch 1. adds an error log message for that case and 2. adds error handling for all errors on the kill() path. If we're a `testing` build, we panic. Otherwise, we log an error and leak the process. The error handling changes (2) are necessary to conclusively state that the root cause for #2761 is indeed (1). If we didn't have them, the root cause could be missing error handling instead. To make the log messages useful, I've added tracing::instrument attributes that log the tenant_id and PID. That helps mapping back the PID of `defunct` processes to pageserver log messages. Note that a defunct process's `/proc/$PID/` directory isn't very useful. We have left little more than its PID. Once we have validated the root cause, we'll find a fix, but that's still an ongoing discussion. refs https://github.com/neondatabase/neon/issues/2761 closes https://github.com/neondatabase/neon/pull/2769 --- pageserver/src/walredo.rs | 68 +++++++++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 9 deletions(-) diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index e21ec4d742..a787da7069 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -229,7 +229,7 @@ impl PostgresRedoManager { // launch the WAL redo process on first use if process_guard.is_none() { - let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?; + let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?; *process_guard = Some(p); } let process = process_guard.as_mut().unwrap(); @@ -579,26 +579,29 @@ impl CloseFileDescriptors for C { /// Handle to the Postgres WAL redo process /// struct PostgresRedoProcess { + tenant_id: TenantId, child: Child, stdin: ChildStdin, stdout: ChildStdout, stderr: ChildStderr, + called_kill: bool, } impl PostgresRedoProcess { // // Start postgres binary in special WAL redo mode. // + #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))] fn launch( conf: &PageServerConf, - tenant_id: &TenantId, + tenant_id: TenantId, pg_version: u32, ) -> Result { // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we // just create one with constant name. That fails if you try to launch more than // one WAL redo manager concurrently. let datadir = path_with_suffix_extension( - conf.tenant_path(tenant_id).join("wal-redo-datadir"), + conf.tenant_path(&tenant_id).join("wal-redo-datadir"), TEMP_FILE_SUFFIX, ); @@ -681,6 +684,7 @@ impl PostgresRedoProcess { })?; info!( + pid = child.id(), "launched WAL redo postgres process on {}", datadir.display() ); @@ -689,22 +693,59 @@ impl PostgresRedoProcess { let stdout = child.stdout.take().unwrap(); let stderr = child.stderr.take().unwrap(); - set_nonblock(stdin.as_raw_fd())?; - set_nonblock(stdout.as_raw_fd())?; - set_nonblock(stderr.as_raw_fd())?; + macro_rules! set_nonblock_or_log_err { + ($file:ident) => {{ + let res = set_nonblock($file.as_raw_fd()); + if let Err(e) = &res { + error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed"); + } + res + }}; + } + set_nonblock_or_log_err!(stdin)?; + set_nonblock_or_log_err!(stdout)?; + set_nonblock_or_log_err!(stderr)?; Ok(PostgresRedoProcess { + tenant_id, child, stdin, stdout, stderr, + called_kill: false, }) } + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] fn kill(mut self) { - let _ = self.child.kill(); - if let Ok(exit_status) = self.child.wait() { - error!("wal-redo-postgres exited with code {}", exit_status); + info!("killing wal-redo-postgres process"); + self.called_kill = true; + + let res = self.child.kill(); + if let Err(e) = res { + // This branch is very unlikely because: + // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it. + // - This is the only place that calls .kill() + // - We consume `self`, so, .kill() can't be called twice. + // - If the process exited by itself or was killed by someone else, + // .kill() will still succeed because we haven't wait()'ed yet. + // + // So, if we arrive here, we have really no idea what happened, + // whether the PID stored in self.child is still valid, etc. + // If this function were fallible, we'd return an error, but + // since it isn't, all we can do is log an error and proceed + // with the wait(). + error!(error = %e, "failed to SIGKILL wal-redo-postgres; subsequent wait() might fail or wait for wrong process"); + } + + match self.child.wait() { + Ok(exit_status) => { + // log at error level since .kill() is something we only do on errors ATM + error!(exit_status = %exit_status, "wal-redo-postgres wait successful"); + } + Err(e) => { + error!(error = %e, "wal-redo-postgres wait error; might leak the child process; it will show as zombie (defunct)"); + } } drop(self); } @@ -713,6 +754,7 @@ impl PostgresRedoProcess { // Apply given WAL records ('records') over an old page image. Returns // new page image. // + #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))] fn apply_wal_records( &mut self, tag: BufferTag, @@ -838,6 +880,14 @@ impl PostgresRedoProcess { } } +impl Drop for PostgresRedoProcess { + fn drop(&mut self) { + if !self.called_kill { + error!(tenant_id=%self.tenant_id, pid = %self.child.id(), "dropping PostgresRedoProcess that wasn't killed, likely causing defunct postgres process"); + } + } +} + // Functions for constructing messages to send to the postgres WAL redo // process. See pgxn/neon_walredo/walredoproc.c for // explanation of the protocol.