From c3a470a29b14e5d71476f683da48506b3bf198b2 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 8 Nov 2022 14:03:13 +0100
Subject: [PATCH] walredo process management: handle every error on the kill()
 and drop path

If we're not calling kill() before dropping the PostgresRedoProcess, we
currently leak it.
That's most likely the root cause for #2761.
This patch
1. adds an error log message for that case and
2. adds error handling for all errors on the kill() path. If we're a
`testing` build, we panic. Otherwise, we log an error and leak the
process.

The error handling changes (2) are necessary to conclusively state that
the root cause for #2761 is indeed (1). If we didn't have them, the root
cause could be missing error handling instead.

To make the log messages useful, I've added tracing::instrument
attributes that log the tenant_id and PID. That helps mapping back the
PID of `defunct` processes to pageserver log messages. Note that a
defunct process's `/proc/$PID/` directory isn't very useful. We have
left little more than its PID.

Once we have validated the root cause, we'll find a fix, but that's
still an ongoing discussion.

refs https://github.com/neondatabase/neon/issues/2761
closes https://github.com/neondatabase/neon/pull/2769
---
 pageserver/src/walredo.rs | 68 +++++++++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 9 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index e21ec4d742..a787da7069 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -229,7 +229,7 @@ impl PostgresRedoManager {
 
         // launch the WAL redo process on first use
         if process_guard.is_none() {
-            let p = PostgresRedoProcess::launch(self.conf, &self.tenant_id, pg_version)?;
+            let p = PostgresRedoProcess::launch(self.conf, self.tenant_id, pg_version)?;
             *process_guard = Some(p);
         }
         let process = process_guard.as_mut().unwrap();
@@ -579,26 +579,29 @@ impl<C: CommandExt> CloseFileDescriptors for C {
 /// Handle to the Postgres WAL redo process
 ///
 struct PostgresRedoProcess {
+    tenant_id: TenantId,
     child: Child,
     stdin: ChildStdin,
     stdout: ChildStdout,
     stderr: ChildStderr,
+    called_kill: bool,
 }
 
 impl PostgresRedoProcess {
     //
     // Start postgres binary in special WAL redo mode.
     //
+    #[instrument(skip_all,fields(tenant_id=%tenant_id, pg_version=pg_version))]
     fn launch(
         conf: &PageServerConf,
-        tenant_id: &TenantId,
+        tenant_id: TenantId,
         pg_version: u32,
     ) -> Result<PostgresRedoProcess, Error> {
         // FIXME: We need a dummy Postgres cluster to run the process in. Currently, we
         // just create one with constant name. That fails if you try to launch more than
         // one WAL redo manager concurrently.
         let datadir = path_with_suffix_extension(
-            conf.tenant_path(tenant_id).join("wal-redo-datadir"),
+            conf.tenant_path(&tenant_id).join("wal-redo-datadir"),
             TEMP_FILE_SUFFIX,
         );
 
@@ -681,6 +684,7 @@ impl PostgresRedoProcess {
             })?;
 
         info!(
+            pid = child.id(),
             "launched WAL redo postgres process on {}",
             datadir.display()
         );
@@ -689,22 +693,59 @@ impl PostgresRedoProcess {
         let stdout = child.stdout.take().unwrap();
         let stderr = child.stderr.take().unwrap();
 
-        set_nonblock(stdin.as_raw_fd())?;
-        set_nonblock(stdout.as_raw_fd())?;
-        set_nonblock(stderr.as_raw_fd())?;
+        macro_rules! set_nonblock_or_log_err {
+            ($file:ident) => {{
+                let res = set_nonblock($file.as_raw_fd());
+                if let Err(e) = &res {
+                    error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+                }
+                res
+            }};
+        }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+        set_nonblock_or_log_err!(stderr)?;
 
         Ok(PostgresRedoProcess {
+            tenant_id,
             child,
             stdin,
             stdout,
             stderr,
+            called_kill: false,
         })
     }
 
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
     fn kill(mut self) {
-        let _ = self.child.kill();
-        if let Ok(exit_status) = self.child.wait() {
-            error!("wal-redo-postgres exited with code {}", exit_status);
+        info!("killing wal-redo-postgres process");
+        self.called_kill = true;
+
+        let res = self.child.kill();
+        if let Err(e) = res {
+            // This branch is very unlikely because:
+            // - We (= pageserver) spawned this process successfully, so, we're allowed to kill it.
+            // - This is the only place that calls .kill()
+            // - We consume `self`, so, .kill() can't be called twice.
+            // - If the process exited by itself or was killed by someone else,
+            //   .kill() will still succeed because we haven't wait()'ed yet.
+            //
+            // So, if we arrive here, we have really no idea what happened,
+            // whether the PID stored in self.child is still valid, etc.
+            // If this function were fallible, we'd return an error, but
+            // since it isn't, all we can do is log an error and proceed
+            // with the wait().
+            error!(error = %e, "failed to SIGKILL wal-redo-postgres; subsequent wait() might fail or wait for wrong process");
+        }
+
+        match self.child.wait() {
+            Ok(exit_status) => {
+                // log at error level since .kill() is something we only do on errors ATM
+                error!(exit_status = %exit_status, "wal-redo-postgres wait successful");
+            }
+            Err(e) => {
+                error!(error = %e, "wal-redo-postgres wait error; might leak the child process; it will show as zombie (defunct)");
+            }
         }
         drop(self);
     }
@@ -713,6 +754,7 @@ impl PostgresRedoProcess {
     // Apply given WAL records ('records') over an old page image. Returns
     // new page image.
     //
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%self.child.id()))]
     fn apply_wal_records(
         &mut self,
         tag: BufferTag,
@@ -838,6 +880,14 @@ impl PostgresRedoProcess {
     }
 }
 
+impl Drop for PostgresRedoProcess {
+    fn drop(&mut self) {
+        if !self.called_kill {
+            error!(tenant_id=%self.tenant_id, pid = %self.child.id(), "dropping PostgresRedoProcess that wasn't killed, likely causing defunct postgres process");
+        }
+    }
+}
+
 // Functions for constructing messages to send to the postgres WAL redo
 // process. See pgxn/neon_walredo/walredoproc.c for
 // explanation of the protocol.