From f561cbe1c709f07c507ffe642e975838ee430ef6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 8 Nov 2024 10:35:27 -0500
Subject: [PATCH] fix(pageserver): drain upload queue before detaching
 ancestor (#9651)

In INC-317
https://neondb.slack.com/archives/C033RQ5SPDH/p1730815677932209, we saw
an interesting series of operations that removed valid layer files that
were still present in the layer map.

* Timeline A starts compaction and generates an image layer Z, but does
  not upload it yet.
* Timeline B/C starts an ancestor detach (which should not affect
  timeline A).
* The tenant gets restarted as part of the ancestor detach process,
  without increasing the generation number.
* Timeline A reloads, discovers that layer Z is a future layer, and
  schedules a **deletion into the deletion queue**. This means the file
  can be deleted at any point in the future.
* Timeline A starts compaction and generates layer Z again, adding it to
  the layer map. Because we don't bump the generation number during
  ancestor detach, it has the same filename + generation number as the
  original Z.
* The earlier scheduled deletion removes layer Z from S3 + disk, and now
  we have a dangling reference in the layer map, blocking all compaction
  and logical_size_calculation processes.

## Summary of changes

* Wait for all layers to be uploaded before shutting down a tenant in
  `Flush` mode.
* Ancestor detach restarts now use this mode.
* Ancestor detach also waits for remote queue completion before starting
  the detach process.
* This patch ensures that we don't end up with a future image layer (or
  anything similar) after a restart, but it does not fix the underlying
  problem around generation numbers.

A simplified sketch of the drain-then-restart ordering follows the diff.

---------

Signed-off-by: Alex Chi Z
---
 pageserver/src/http/routes.rs            | 15 +++++++++++
 pageserver/src/tenant/mgr.rs             |  2 +-
 .../src/tenant/remote_timeline_client.rs | 12 +++++++++
 pageserver/src/tenant/timeline.rs        | 25 ++++++++++++++-----
 4 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 72eb3e7ade..d57bd98e95 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2169,6 +2169,21 @@ async fn timeline_detach_ancestor_handler(
     let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download);
     let ctx = &ctx;
 
+    // Flush the upload queues of all timelines before detaching the ancestor. We do the same thing
+    // again during shutdown. This early upload ensures the pageserver does not have too much left
+    // to upload at shutdown, which would otherwise create downtime during the timeline reloads.
+    for timeline in tenant.list_timelines() {
+        timeline
+            .remote_client
+            .wait_completion()
+            .await
+            .map_err(|e| {
+                ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into())
+            })?;
+    }
+
+    tracing::info!("all timeline upload queues are drained");
+
     let timeline = tenant.get_timeline(timeline_id, true)?;
 
     let progress = timeline
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index a4c458b737..4fc9d740c8 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1959,7 +1959,7 @@ impl TenantManager {
         attempt.before_reset_tenant();
 
         let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, ShutdownMode::Hard).await {
+        match tenant.shutdown(progress, ShutdownMode::Flush).await {
             Ok(()) => {
                 slot_guard.drop_old_value().expect("it was just shutdown");
             }
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 0aa8d61036..b37c16e133 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -2201,6 +2201,18 @@ impl RemoteTimelineClient {
         inner.initialized_mut()?;
         Ok(UploadQueueAccessor { inner })
     }
+
+    pub(crate) fn no_pending_work(&self) -> bool {
+        let inner = self.upload_queue.lock().unwrap();
+        match &*inner {
+            UploadQueue::Uninitialized
+            | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true,
+            UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => {
+                x.upload_queue_for_deletion.no_pending_work()
+            }
+            UploadQueue::Initialized(x) => x.no_pending_work(),
+        }
+    }
 }
 
 pub(crate) struct UploadQueueAccessor<'a> {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6e082aecf5..4d086df2d1 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -852,6 +852,10 @@ pub(crate) enum ShutdownMode {
     /// While we are flushing, we continue to accept read I/O for LSNs ingested before
     /// the call to [`Timeline::shutdown`].
     FreezeAndFlush,
+    /// Only flush the layers to remote storage without freezing any open layers. This is the
+    /// mode used by ancestor detach and any other operation that reloads a tenant without
+    /// increasing the generation number.
+    Flush,
     /// Shut down immediately, without waiting for any open layers to flush.
     Hard,
 }
@@ -1678,11 +1682,6 @@ impl Timeline {
     pub(crate) async fn shutdown(&self, mode: ShutdownMode) {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        let try_freeze_and_flush = match mode {
-            ShutdownMode::FreezeAndFlush => true,
-            ShutdownMode::Hard => false,
-        };
-
         // Regardless of whether we're going to try_freeze_and_flush
         // or not, stop ingesting any more data. Walreceiver only provides
         // cancellation but no "wait until gone", because it uses the Timeline::gate.
@@ -1704,7 +1703,7 @@ impl Timeline {
         // ... and inform any waiters for newer LSNs that there won't be any.
         self.last_record_lsn.shutdown();
 
-        if try_freeze_and_flush {
+        if let ShutdownMode::FreezeAndFlush = mode {
             if let Some((open, frozen)) = self
                 .layers
                 .read()
@@ -1746,6 +1745,20 @@ impl Timeline {
                     warn!("failed to freeze and flush: {e:#}");
                 }
             }
+
+            // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but
+            // we also do a final check here to ensure that the queue is empty.
+            if !self.remote_client.no_pending_work() {
+                warn!("still have pending work in the remote upload queue, but continuing to shut down anyway");
+            }
+        }
+
+        if let ShutdownMode::Flush = mode {
+            // drain the upload queue
+            self.remote_client.shutdown().await;
+            if !self.remote_client.no_pending_work() {
+                warn!("still have pending work in the remote upload queue, but continuing to shut down anyway");
+            }
         }
 
         // Signal any subscribers to our cancellation token to drop out
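
Below is a minimal, self-contained Rust sketch of the ordering this patch enforces: drain every timeline's upload queue, then shut the tenant down in `Flush` mode, and warn rather than fail if work is still pending. The `Tenant`, `Timeline`, and `UploadQueue` types here are simplified stand-ins, not the pageserver's real APIs; only the call order loosely mirrors the change above.

```rust
/// Simplified stand-in for the pageserver's shutdown modes.
#[allow(dead_code)]
#[derive(PartialEq)]
enum ShutdownMode {
    FreezeAndFlush,
    Flush,
    Hard,
}

struct UploadQueue {
    pending_uploads: usize,
}

struct Timeline {
    queue: UploadQueue,
}

impl Timeline {
    /// Stand-in for waiting on the remote upload queue: here we just mark
    /// everything as uploaded.
    fn wait_completion(&mut self) -> Result<(), String> {
        self.queue.pending_uploads = 0;
        Ok(())
    }

    /// Stand-in for `no_pending_work`: true once the queue is empty.
    fn no_pending_work(&self) -> bool {
        self.queue.pending_uploads == 0
    }
}

struct Tenant {
    timelines: Vec<Timeline>,
}

impl Tenant {
    /// Drain all upload queues, as the detach-ancestor handler now does
    /// before triggering the tenant restart.
    fn drain_upload_queues(&mut self) -> Result<(), String> {
        for timeline in &mut self.timelines {
            timeline
                .wait_completion()
                .map_err(|e| format!("cannot drain upload queue: {e}"))?;
        }
        println!("all timeline upload queues are drained");
        Ok(())
    }

    /// Shut the tenant down. In `Flush` mode, drain once more and only warn
    /// if something is still pending instead of dropping it.
    fn shutdown(&mut self, mode: ShutdownMode) {
        if mode == ShutdownMode::Flush {
            let _ = self.drain_upload_queues();
            for timeline in &self.timelines {
                if !timeline.no_pending_work() {
                    eprintln!("still have pending work in remote upload queue, continuing anyway");
                }
            }
        }
        // FreezeAndFlush / Hard handling is elided in this sketch.
    }
}

fn main() -> Result<(), String> {
    let mut tenant = Tenant {
        timelines: vec![Timeline {
            queue: UploadQueue { pending_uploads: 3 },
        }],
    };

    // Order matters: drain before the detach-triggered restart so that a stale
    // "future layer" deletion cannot later race with a layer re-created under
    // the same name and generation.
    tenant.drain_upload_queues()?;
    tenant.shutdown(ShutdownMode::Flush);
    Ok(())
}
```

The property that matters is that the drain happens before the detach-triggered tenant restart, so a previously scheduled "future layer" deletion can no longer race with a layer that is re-created later under the same filename and generation number.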