From b37bb7d7edaab870d05bff7286e345066d49664e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 8 May 2025 20:48:24 +0200
Subject: [PATCH] pageserver: timeline shutdown: fully quiesce ingest path
 before `freeze_and_flush` (#11851)

# Problem

Before this PR, timeline shutdown would
- cancel the walreceiver cancellation token subtree (child token of
Timeline::cancel)
- call freeze_and_flush
- Timeline::cancel.cancel()
- ... bunch of waiting for things ...
- Timeline::gate.close()

As noted by the comment that is deleted by this PR, this left a window
where, after freeze_and_flush, walreceiver could still be running and
ingesting data into a new InMemoryLayer.

This presents a potential source of log noise during Timeline shutdown
where the InMemoryLayer created after the freeze_and_flush observes
that Timeline::cancel is cancelled, failing the ingest with some
anyhow::Error wrapping (deeply) a `FlushTaskError::Cancelled` instance
(`flush task cancelled` error message).

# Solution

It turns out that it is quite easy to shut down, not just cancel,
walreceiver completely, because the only subtask spawned by the
walreceiver connection manager is the `handle_walreceiver_connection`
task, which is properly shut down and waited upon when the manager task
observes cancellation and exits its retry loop.

The alternative is to replace all the usage of `anyhow` on the ingest
path with differentiated error types. A lot of busywork for little gain
to fix a potential logging noise nuisance, so, not doing that for now.

# Correctness / Risk

We do not risk leaking walreceiver child tasks because the existing
discipline is to hold a gate guard.

We will prolong `Timeline::shutdown` to the degree that we're no longer
making progress with the rest of shutdown while the walreceiver task
hasn't yet observed cancellation. In practice, this should be negligible.

`Timeline::shutdown` could fail to complete if there is a hidden
dependency of walreceiver shutdown on some subsystem. The code certainly
suggests there isn't, and I'm not aware of any such dependency. Anyway,
the impact will be low because we only shut down Timeline instances that
are obsolete, either because there is a newer attachment at a different
location, or because the timeline got deleted by the user. We would learn
about this through stuck cplane operations or stuck storcon
reconciliations. We would be able to mitigate by cancelling such stuck
operations/reconciliations and/or by rolling back pageserver.

# Refs
- identified this while investigating
https://github.com/neondatabase/neon/issues/11762
- PR that _does_ fix a bunch of _real_ `flush task cancelled` noise on the
compaction path: https://github.com/neondatabase/neon/pull/11853
---
 pageserver/src/tenant/timeline.rs             | 12 ++----------
 pageserver/src/tenant/timeline/walreceiver.rs | 16 ++++++++++++++--
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c8d897d074..d7f5958128 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2127,22 +2127,14 @@ impl Timeline {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         // Regardless of whether we're going to try_freeze_and_flush
-        // or not, stop ingesting any more data. Walreceiver only provides
-        // cancellation but no "wait until gone", because it uses the Timeline::gate.
-        // So, only after the self.gate.close() below will we know for sure that
-        // no walreceiver tasks are left.
-        // For `try_freeze_and_flush=true`, this means that we might still be ingesting
-        // data during the call to `self.freeze_and_flush()` below.
-        // That's not ideal, but, we don't have the concept of a ChildGuard,
-        // which is what we'd need to properly model early shutdown of the walreceiver
-        // task sub-tree before the other Timeline task sub-trees.
+        // or not, stop ingesting any more data.
         let walreceiver = self.walreceiver.lock().unwrap().take();
         tracing::debug!(
             is_some = walreceiver.is_some(),
             "Waiting for WalReceiverManager..."
         );
         if let Some(walreceiver) = walreceiver {
-            walreceiver.cancel();
+            walreceiver.shutdown().await;
         }
         // ... and inform any waiters for newer LSNs that there won't be any.
         self.last_record_lsn.shutdown();
diff --git a/pageserver/src/tenant/timeline/walreceiver.rs b/pageserver/src/tenant/timeline/walreceiver.rs
index 4f80073cc3..0f73eb839b 100644
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -63,6 +63,7 @@ pub struct WalReceiver {
     /// All task spawned by [`WalReceiver::start`] and its children are sensitive to this token.
     /// It's a child token of [`Timeline`] so that timeline shutdown can cancel WalReceiver tasks early for `freeze_and_flush=true`.
     cancel: CancellationToken,
+    task: tokio::task::JoinHandle<()>,
 }
 
 impl WalReceiver {
@@ -79,7 +80,7 @@ impl WalReceiver {
         let loop_status = Arc::new(std::sync::RwLock::new(None));
         let manager_status = Arc::clone(&loop_status);
         let cancel = timeline.cancel.child_token();
-        WALRECEIVER_RUNTIME.spawn({
+        let task = WALRECEIVER_RUNTIME.spawn({
             let cancel = cancel.clone();
             async move {
                 debug_assert_current_span_has_tenant_and_timeline_id();
@@ -120,14 +121,25 @@ impl WalReceiver {
         Self {
             manager_status,
             cancel,
+            task,
         }
     }
 
     #[instrument(skip_all, level = tracing::Level::DEBUG)]
-    pub fn cancel(&self) {
+    pub async fn shutdown(self) {
         debug_assert_current_span_has_tenant_and_timeline_id();
         debug!("cancelling walreceiver tasks");
         self.cancel.cancel();
+        match self.task.await {
+            Ok(()) => debug!("Shutdown success"),
+            Err(je) if je.is_cancelled() => unreachable!("not used"),
+            Err(je) if je.is_panic() => {
+                // already logged by panic hook
+            }
+            Err(je) => {
+                error!("shutdown walreceiver task join error: {je}")
+            }
+        }
     }
 
     pub(crate) fn status(&self) -> Option<ConnectionManagerStatus> {