From f561cbe1c709f07c507ffe642e975838ee430ef6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 8 Nov 2024 10:35:27 -0500
Subject: [PATCH] fix(pageserver): drain upload queue before detaching
 ancestor (#9651)

In INC-317
https://neondb.slack.com/archives/C033RQ5SPDH/p1730815677932209, we saw
an interesting series of operations that removed valid layer files that
were still present in the layer map.

* Timeline A starts compaction and generates an image layer Z, but does
  not upload it yet.
* Timeline B/C starts an ancestor detach (which should not affect
  timeline A).
* The tenant gets restarted as part of the ancestor detach process,
  without increasing the generation number.
* Timeline A reloads, discovers that layer Z is a future layer, and
  schedules a **deletion into the deletion queue**. This means the file
  can be deleted at any point in the future.
* Timeline A starts compaction and generates layer Z again, adding it to
  the layer map. Because we don't bump the generation number during
  ancestor detach, it has the same filename + generation number as the
  original Z.
* The earlier scheduled deletion removes layer Z from S3 + disk, and now
  we have a dangling reference in the layer map, blocking all compaction
  and logical_size_calculation processes.

## Summary of changes

* Wait for all layers to be uploaded before shutting down a tenant in
  `Flush` mode.
* Ancestor detach restarts now use this mode.
* Ancestor detach also waits for remote queue completion before starting
  the detach process.
* This patch ensures that we don't end up with a future image layer (or
  anything similar) after a restart, but it does not fix the underlying
  problem around generation numbers.

A simplified sketch of the drain-then-restart ordering follows the diff.

---------

Signed-off-by: Alex Chi Z
---
 pageserver/src/http/routes.rs            | 15 +++++++++++
 pageserver/src/tenant/mgr.rs             |  2 +-
 .../src/tenant/remote_timeline_client.rs | 12 +++++++++
 pageserver/src/tenant/timeline.rs        | 25 ++++++++++++++-----
 4 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 72eb3e7ade..d57bd98e95 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2169,6 +2169,21 @@ async fn timeline_detach_ancestor_handler(
     let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
     let ctx = &ctx;
 
+    // Flush the upload queues of all timelines before detaching the ancestor. We do the same thing
+    // again during shutdown. This early upload ensures the pageserver does not have too much left
+    // to upload at shutdown, which would otherwise create downtime during the timeline reloads.
+    for timeline in tenant.list_timelines() {
+        timeline
+            .remote_client
+            .wait_completion()
+            .await
+            .map_err(|e| {
+                ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
+            })?;
+    }
+
+    tracing::info!("all timeline upload queues are drained");
+
     let timeline = tenant.get_timeline(timeline_id, true)?;
 
     let progress = timeline
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index a4c458b737..4fc9d740c8 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1959,7 +1959,7 @@ impl TenantManager {
         attempt.before_reset_tenant();
 
         let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
+        match tenant.shutdown(progress, ShutdownMode::Flush).await {
             Ok(()) => {
                 slot_guard.drop_old_value().expect("it was just shutdown");
             }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 0aa8d61036..b37c16e133 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2201,6 +2201,18 @@ impl RemoteTimelineClient {
         inner.initialized_mut()?;
         Ok(UploadQueueAccessor { inner })
     }
+
+    pub(crate) fn no_pending_work(&self) -> bool {
+        let inner = self.upload_queue.lock().unwrap();
+        match &*inner {
+            UploadQueue::Uninitialized
+            | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
+            UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => {
+                x.upload_queue_for_deletion.no_pending_work()
+            }
+            UploadQueue::Initialized(x) => x.no_pending_work(),
+        }
+    }
 }
 
 pub(crate) struct UploadQueueAccessor<'a> {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6e082aecf5..4d086df2d1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -852,6 +852,10 @@ pub(crate) enum ShutdownMode {
     /// While we are flushing, we continue to accept read I/O for LSNs ingested before
     /// the call to [`Timeline::shutdown`].
     FreezeAndFlush,
+    /// Only flush the layers to remote storage without freezing any open layers. This is the
+    /// mode used by ancestor detach and any other operation that reloads a tenant without
+    /// increasing the generation number.
+    Flush,
     /// Shut down immediately, without waiting for any open layers to flush.
     Hard,
 }
@@ -1678,11 +1682,6 @@ impl Timeline {
     pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        let try_freeze_and_flush = match mode {
-            ShutdownMode::FreezeAndFlush => true,
-            ShutdownMode::Hard => false,
-        };
-
         // Regardless of whether we're going to try_freeze_and_flush
         // or not, stop ingesting any more data. Walreceiver only provides
         // cancellation but no "wait until gone", because it uses the Timeline::gate.
@@ -1704,7 +1703,7 @@ impl Timeline {
         // ... and inform any waiters for newer LSNs that there won't be any.
         self.last_record_lsn.shutdown();
 
-        if try_freeze_and_flush {
+        if let ShutdownMode::FreezeAndFlush = mode {
             if let Some((open, frozen)) = self
                 .layers
                 .read()
@@ -1746,6 +1745,20 @@ impl Timeline {
                     warn!("failed to freeze and flush: {e:#}");
                 }
             }
+
+            // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
+            // we also do a final check here to ensure that the queue is empty.
+            if !self.remote_client.no_pending_work() {
+                warn!("still have pending work in the remote upload queue, but continuing to shut down anyway");
+            }
+        }
+
+        if let ShutdownMode::Flush = mode {
+            // drain the upload queue
+            self.remote_client.shutdown().await;
+            if !self.remote_client.no_pending_work() {
+                warn!("still have pending work in the remote upload queue, but continuing to shut down anyway");
+            }
         }
 
         // Signal any subscribers to our cancellation token to drop out
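
Below is a minimal, self-contained Rust sketch of the ordering this patch enforces: drain every timeline's upload queue, then shut the tenant down in `Flush` mode, and warn rather than fail if work is still pending. The `Tenant`, `Timeline`, and `UploadQueue` types here are simplified stand-ins, not the pageserver's real APIs; only the call order loosely mirrors the change above.

```rust
/// Simplified stand-in for the pageserver's shutdown modes.
#[allow(dead_code)]
#[derive(PartialEq)]
enum ShutdownMode {
    FreezeAndFlush,
    Flush,
    Hard,
}

struct UploadQueue {
    pending_uploads: usize,
}

struct Timeline {
    queue: UploadQueue,
}

impl Timeline {
    /// Stand-in for waiting on the remote upload queue: here we just mark
    /// everything as uploaded.
    fn wait_completion(&mut self) -> Result<(), String> {
        self.queue.pending_uploads = 0;
        Ok(())
    }

    /// Stand-in for `no_pending_work`: true once the queue is empty.
    fn no_pending_work(&self) -> bool {
        self.queue.pending_uploads == 0
    }
}

struct Tenant {
    timelines: Vec<Timeline>,
}

impl Tenant {
    /// Drain all upload queues, as the detach-ancestor handler now does
    /// before triggering the tenant restart.
    fn drain_upload_queues(&mut self) -> Result<(), String> {
        for timeline in &mut self.timelines {
            timeline
                .wait_completion()
                .map_err(|e| format!("cannot drain upload queue: {e}"))?;
        }
        println!("all timeline upload queues are drained");
        Ok(())
    }

    /// Shut the tenant down. In `Flush` mode, drain once more and only warn
    /// if something is still pending instead of dropping it.
    fn shutdown(&mut self, mode: ShutdownMode) {
        if mode == ShutdownMode::Flush {
            let _ = self.drain_upload_queues();
            for timeline in &self.timelines {
                if !timeline.no_pending_work() {
                    eprintln!("still have pending work in remote upload queue, continuing anyway");
                }
            }
        }
        // FreezeAndFlush / Hard handling is elided in this sketch.
    }
}

fn main() -> Result<(), String> {
    let mut tenant = Tenant {
        timelines: vec![Timeline {
            queue: UploadQueue { pending_uploads: 3 },
        }],
    };

    // Order matters: drain before the detach-triggered restart so that a stale
    // "future layer" deletion cannot later race with a layer re-created under
    // the same name and generation.
    tenant.drain_upload_queues()?;
    tenant.shutdown(ShutdownMode::Flush);
    Ok(())
}
```

The property that matters is that the drain happens before the detach-triggered tenant restart, so a previously scheduled "future layer" deletion can no longer race with a layer that is re-created later under the same filename and generation number.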