From 6d6e2c6a395d365852df478b37bbf50af1e48e6b Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 13 Aug 2024 20:51:51 +0300 Subject: [PATCH] feat(detach_ancestor): better retries with persistent gc blocking (#8430) With the persistent gc blocking, we can now retry reparenting timelines which had failed for whatever reason on the previous attempt(s). Restructure the detach_ancestor into three phases: - prepare (insert persistent gc blocking, copy lsn prefix, layers) - detach and reparent - reparenting can fail, so we might need to retry this portion - complete (remove persistent gc blocking) Cc: #6994 --- libs/utils/src/completion.rs | 31 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 12 +- pageserver/src/tenant/metadata.rs | 3 + pageserver/src/tenant/mgr.rs | 119 +++- .../src/tenant/remote_timeline_client.rs | 70 ++- .../tenant/remote_timeline_client/index.rs | 57 +- pageserver/src/tenant/tasks.rs | 14 +- pageserver/src/tenant/timeline.rs | 30 +- .../src/tenant/timeline/detach_ancestor.rs | 528 ++++++++++++++---- storage_controller/src/service.rs | 4 +- .../regress/test_timeline_detach_ancestor.py | 327 ++++++++++- 12 files changed, 960 insertions(+), 239 deletions(-) diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index 2fef8d35df..f65c080ad4 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -5,13 +5,40 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] pub struct Completion { - _token: TaskTrackerToken, + token: TaskTrackerToken, +} + +impl std::fmt::Debug for Completion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Completion") + .field("siblings", &self.token.task_tracker().len()) + .finish() + } +} + +impl Completion { + /// Returns true if this completion is associated with the given barrier. + pub fn blocks(&self, barrier: &Barrier) -> bool { + TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0) + } + + pub fn barrier(&self) -> Barrier { + Barrier(self.token.task_tracker().clone()) + } } /// Barrier will wait until all clones of [`Completion`] have been dropped. 
#[derive(Clone)] pub struct Barrier(TaskTracker); +impl std::fmt::Debug for Barrier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Barrier") + .field("remaining", &self.0.len()) + .finish() + } +} + impl Default for Barrier { fn default() -> Self { let (_, rx) = channel(); @@ -51,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion { _token: token }, Barrier(tracker)) + (Completion { token }, Barrier(tracker)) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2b0156079e..6f7480cc6c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1887,7 +1887,7 @@ async fn timeline_detach_ancestor_handler( // drop(tenant); let resp = match progress { - detach_ancestor::Progress::Prepared(_guard, prepared) => { + detach_ancestor::Progress::Prepared(attempt, prepared) => { // it would be great to tag the guard on to the tenant activation future let reparented_timelines = state .tenant_manager @@ -1895,10 +1895,10 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + attempt, ctx, ) .await - .context("timeline detach ancestor completion") .map_err(ApiError::InternalServerError)?; AncestorDetached { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a238004aad..b065f58382 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -302,7 +302,11 @@ pub struct Tenant { pub(crate) timeline_get_throttle: Arc>, - /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + /// An ongoing timeline detach concurrency limiter. + /// + /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense + /// to have two running at the same time. A different one can be started if an earlier one + /// has failed for whatever reason. ongoing_timeline_detach: std::sync::Mutex>, /// `index_part.json` based gc blocking reason tracking. @@ -833,9 +837,9 @@ impl Tenant { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), + "the attach task owns the tenant state until activation is complete" + ); *state = TenantState::broken_from_reason(err.to_string()); }); diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index bbc070a81b..6073abc8c3 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -285,12 +285,15 @@ impl TimelineMetadata { } /// When reparenting, the `ancestor_lsn` does not change. + /// + /// Returns true if anything was changed. 
pub fn reparent(&mut self, timeline: &TimelineId) { assert!(self.body.ancestor_timeline.is_some()); // no assertion for redoing this: it's fine, we may have to repeat this multiple times over self.body.ancestor_timeline = Some(*timeline); } + /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { assert_eq!(ancestor, branchpoint.0); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c8a11e88cc..5f2539d426 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -54,7 +54,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::PreparedTimelineDetach; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be @@ -1927,8 +1927,10 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, anyhow::Error> { + use crate::tenant::timeline::detach_ancestor::Error; // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); @@ -1977,43 +1979,98 @@ impl TenantManager { let timeline = tenant.get_timeline(timeline_id, true)?; - let reparented = timeline - .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + let resp = timeline + .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; let mut slot_guard = slot_guard.into_inner(); - let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { - Ok(()) => { - slot_guard.drop_old_value()?; + let tenant = if resp.reset_tenant_required() { + attempt.before_reset_tenant(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } } - Err(_barrier) => { - slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? - anyhow::bail!("Cannot restart Tenant, already shutting down"); + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + SpawnMode::Eager, + ctx, + )?; + + { + let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); + assert!( + g.is_none(), + "there cannot be any new timeline detach ancestor on newly created tenant" + ); + *g = Some((attempt.timeline_id, attempt.new_barrier())); } + + slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; + + if let Some(reparented) = resp.completed() { + // finally ask the restarted tenant to complete the detach + // + // rationale for 9999s: we don't really have a timetable here; if retried, the caller + // will get an 503. 
+ tenant + .wait_to_become_active(std::time::Duration::from_secs(9999)) + .await + .map_err(|e| { + use pageserver_api::models::TenantState; + use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + match e { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { + Error::ShuttingDown + } + other => Error::Unexpected(other.into()), + } + })?; + + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); + + let timeline = tenant + .get_timeline(attempt.timeline_id, true) + .map_err(|_| Error::DetachedNotFoundAfterRestart)?; + + timeline + .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) + .await + .map(|()| reparented) + .map_err(|e| e.into()) + } else { + // at least the latest versions have now been downloaded and refreshed; be ready to + // retry another time. + Err(anyhow::anyhow!( + "failed to reparent all candidate timelines, please retry" + )) } - - let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; - - let shard_identity = config.shard; - let tenant = tenant_spawn( - self.conf, - tenant_shard_id, - &tenant_path, - self.resources.clone(), - AttachedTenantConf::try_from(config)?, - shard_identity, - None, - SpawnMode::Eager, - ctx, - )?; - - slot_guard.upsert(TenantSlot::Attached(tenant))?; - - Ok(reparented) } /// A page service client sends a TenantId, and to look up the correct Tenant we must diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8a76d7532f..b4d7ad1e97 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -736,12 +736,13 @@ impl RemoteTimelineClient { Ok(()) } + /// Reparent this timeline to a new parent. + /// + /// A retryable step of timeline ancestor detach. 
pub(crate) async fn schedule_reparenting_and_wait( self: &Arc, new_parent: &TimelineId, ) -> anyhow::Result<()> { - // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing - // and reads the in-memory part we cannot do the detaching like this let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -752,17 +753,25 @@ impl RemoteTimelineClient { )); }; - upload_queue.dirty.metadata.reparent(new_parent); - upload_queue.dirty.lineage.record_previous_ancestor(&prev); + let uploaded = &upload_queue.clean.0.metadata; - self.schedule_index_upload(upload_queue)?; + if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() { + // nothing to do + None + } else { + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_barrier0(upload_queue) + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } }; - Self::wait_completion0(receiver) - .await - .context("wait completion") + if let Some(receiver) = receiver { + Self::wait_completion0(receiver).await?; + } + Ok(()) } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -778,26 +787,30 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.metadata.detach_from_ancestor(&adopted); - upload_queue.dirty.lineage.record_detaching(&adopted); + if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) { + None + } else { + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); - for layer in layers { - upload_queue - .dirty - .layer_metadata - .insert(layer.layer_desc().layer_name(), layer.metadata()); + for layer in layers { + let prev = upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + assert!(prev.is_none(), "copied layer existed already {layer}"); + } + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) } - - self.schedule_index_upload(upload_queue)?; - - let barrier = self.schedule_barrier0(upload_queue); - self.launch_queued_tasks(upload_queue); - barrier }; - Self::wait_completion0(barrier) - .await - .context("wait completion") + if let Some(barrier) = barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) } /// Adds a gc blocking reason for this timeline if one does not exist already. 
@@ -873,12 +886,7 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if let index::GcBlockingReason::DetachAncestor = reason { - if !upload_queue - .clean - .0 - .lineage - .is_detached_from_original_ancestor() - { + if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { drop(guard); panic!("cannot complete timeline_ancestor_detach while not detached"); } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 90453b1922..757fb9d032 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -216,26 +216,47 @@ fn is_false(b: &bool) -> bool { impl Lineage { const REMEMBER_AT_MOST: usize = 100; - pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool { if self.reparenting_history.last() == Some(old_ancestor) { // do not re-record it - return; - } + false + } else { + #[cfg(feature = "testing")] + { + let existing = self + .reparenting_history + .iter() + .position(|x| x == old_ancestor); + assert_eq!( + existing, None, + "we cannot reparent onto and off and onto the same timeline twice" + ); + } + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - - self.reparenting_history_truncated |= drop_oldest; - if drop_oldest { - self.reparenting_history.remove(0); + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + true } - self.reparenting_history.push(*old_ancestor); } - pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { - assert!(self.original_ancestor.is_none()); - - self.original_ancestor = - Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + /// Returns true if anything changed. + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool { + if let Some((id, lsn, _)) = self.original_ancestor { + assert_eq!( + &(id, lsn), + branchpoint, + "detaching attempt has to be for the same ancestor we are already detached from" + ); + false + } else { + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + true + } } /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed @@ -247,10 +268,16 @@ impl Lineage { .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } - pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + /// Returns true if the timeline originally had an ancestor, and no longer has one. + pub(crate) fn is_detached_from_ancestor(&self) -> bool { self.original_ancestor.is_some() } + /// Returns original ancestor timeline id and lsn that this timeline has been detached from. 
+ pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> { + self.original_ancestor.map(|(id, lsn, _)| (id, lsn)) + } + pub(crate) fn is_reparented(&self) -> bool { !self.reparenting_history.is_empty() } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 713845e9ac..dbcd704b4e 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -366,14 +366,13 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; - if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) - .await - .is_err() - { - break; - } + let delays = async { + delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; + random_init_delay(period, &cancel).await?; + Ok::<_, Cancelled>(()) + }; - if random_init_delay(period, &cancel).await.is_err() { + if delays.await.is_err() { break; } } @@ -425,7 +424,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 767f5969fc..f1587951c6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4328,18 +4328,34 @@ impl Timeline { detach_ancestor::prepare(self, tenant, options, ctx).await } - /// Completes the ancestor detach. This method is to be called while holding the - /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any - /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and + /// reparents any reparentable children of previous ancestor. /// - /// Pageserver receiving a SIGKILL during this operation is not supported (yet). - pub(crate) async fn complete_detaching_timeline_ancestor( + /// This method is to be called while holding the TenantManager's tenant slot, so during this + /// method we cannot be deleted nor can any timeline be deleted. After this method returns + /// successfully, tenant must be reloaded. + /// + /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally + /// resetting the tenant. + pub(crate) async fn detach_from_ancestor_and_reparent( self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - detach_ancestor::complete(self, tenant, prepared, ctx).await + ) -> Result { + detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + } + + /// Final step which unblocks the GC. + /// + /// The tenant must've been reset if ancestry was modified previously (in tenant manager). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + attempt: detach_ancestor::Attempt, + ctx: &RequestContext, + ) -> Result<(), detach_ancestor::Error> { + detach_ancestor::complete(self, tenant, attempt, ctx).await } /// Switch aux file policy and schedule upload to the index part. 
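The remote client and Lineage changes above all follow the same idempotency pattern: before scheduling an index upload, the requested change is compared against the last uploaded ("clean") index_part.json, and nothing is scheduled if a previous attempt already persisted it. Below is a small self-contained sketch of that pattern only; the Index/UploadQueue types and field names are illustrative stand-ins, not the pageserver's actual upload queue:

#[derive(Clone, Debug, Default, PartialEq)]
struct Index {
    ancestor: Option<u64>,
    detached_from: Option<(u64, u64)>, // (ancestor timeline id, branchpoint lsn)
}

struct UploadQueue {
    clean: Index,             // last successfully uploaded index_part.json
    dirty: Index,             // contents of the next upload
    scheduled_uploads: usize,
}

impl UploadQueue {
    /// Schedule the "detach from ancestor" index change, skipping the upload
    /// entirely when a previous attempt already persisted it.
    fn schedule_detach(&mut self, branchpoint: (u64, u64)) {
        if self.clean.detached_from == Some(branchpoint) {
            return; // already uploaded; retrying is a no-op
        }
        self.dirty.ancestor = None;
        self.dirty.detached_from = Some(branchpoint);
        self.scheduled_uploads += 1;
    }
}

fn main() {
    let start = Index { ancestor: Some(1), ..Default::default() };
    let mut q = UploadQueue { clean: start.clone(), dirty: start, scheduled_uploads: 0 };

    q.schedule_detach((1, 0xff00));
    q.clean = q.dirty.clone(); // pretend the scheduled upload completed

    // a retried attempt sees the change in the clean index and does nothing
    q.schedule_detach((1, 0xff00));
    assert_eq!(q.scheduled_uploads, 1);
}

This is why the scheduling helpers now return early (and skip the barrier) instead of unconditionally dirtying the metadata: a retried detach can replay every step without producing duplicate uploads or tripping asserts.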
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 3b52adc77b..969da2662b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -5,12 +5,16 @@ use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::{ + mgr::GetActiveTenantError, + remote_timeline_client::index::GcBlockingReason::DetachAncestor, storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use anyhow::Context; use pageserver_api::models::detach_ancestor::AncestorDetached; +use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -38,6 +42,12 @@ pub(crate) enum Error { #[error("remote copying layer failed")] CopyFailed(#[source] anyhow::Error), + #[error("wait for tenant to activate after restarting")] + WaitToActivate(#[source] GetActiveTenantError), + + #[error("detached timeline was not found after restart")] + DetachedNotFoundAfterRestart, + #[error("unexpected error")] Unexpected(#[source] anyhow::Error), @@ -55,6 +65,10 @@ impl From for ApiError { Error::OtherTimelineDetachOngoing(_) => { ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) } + e @ Error::WaitToActivate(_) => { + let s = utils::error::report_compact_sources(&e).to_string(); + ApiError::ResourceUnavailable(s.into()) + } // All of these contain shutdown errors, in fact, it's the most common e @ Error::FlushAncestor(_) | e @ Error::RewrittenDeltaDownloadFailed(_) @@ -63,6 +77,7 @@ impl From for ApiError { | e @ Error::CopyFailed(_) | e @ Error::Unexpected(_) | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), + Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()), } } } @@ -96,8 +111,25 @@ impl From for Error { } } +impl From for Error { + fn from(value: GetActiveTenantError) -> Self { + use pageserver_api::models::TenantState; + use GetActiveTenantError::*; + + match value { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => { + Error::ShuttingDown + } + WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => { + // NotFound seems out-of-place + Error::WaitToActivate(value) + } + } + } +} + pub(crate) enum Progress { - Prepared(completion::Completion, PreparedTimelineDetach), + Prepared(Attempt, PreparedTimelineDetach), Done(AncestorDetached), } @@ -121,6 +153,26 @@ impl Default for Options { } } +/// Represents an across tenant reset exclusive single attempt to detach ancestor. 
+#[derive(Debug)] +pub(crate) struct Attempt { + pub(crate) timeline_id: TimelineId, + + _guard: completion::Completion, + gate_entered: Option, +} + +impl Attempt { + pub(crate) fn before_reset_tenant(&mut self) { + let taken = self.gate_entered.take(); + assert!(taken.is_some()); + } + + pub(crate) fn new_barrier(&self) -> completion::Barrier { + self._guard.barrier() + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -135,15 +187,33 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - { + let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if !latest.lineage.is_detached_from_original_ancestor() { + if latest.lineage.detached_previous_ancestor().is_none() { return Err(NoAncestor); - } + }; + + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)) + }; + + if still_in_progress { + // gc is still blocked, we can still reparent and complete. + // we are safe to reparent remaining, because they were locked in in the beginning. + let attempt = continue_with_blocked_gc(detached, tenant).await?; + + // because the ancestor of detached is already set to none, we have published all + // of the layers, so we are still "prepared." + return Ok(Progress::Prepared( + attempt, + PreparedTimelineDetach { layers: Vec::new() }, + )); } let reparented_timelines = reparented_direct_children(detached, tenant)?; @@ -164,22 +234,7 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - // before we acquire the gate, we must mark the ancestor as having a detach operation - // ongoing which will block other concurrent detach operations so we don't get to ackward - // situations where there would be two branches trying to reparent earlier branches. - let (guard, barrier) = completion::channel(); - - { - let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); - if let Some((tl, other)) = guard.as_ref() { - if !other.is_ready() { - return Err(OtherTimelineDetachOngoing(*tl)); - } - } - *guard = Some((detached.timeline_id, barrier)); - } - - let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + let attempt = start_new_attempt(detached, tenant).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); @@ -245,7 +300,8 @@ pub(super) async fn prepare( }; // TODO: layers are already sorted by something: use that to determine how much of remote - // copies are already done. + // copies are already done -- gc is blocked, but a compaction could had happened on ancestor, + // which is something to keep in mind if copy skipping is implemented. 
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after @@ -259,29 +315,33 @@ pub(super) async fn prepare( let mut wrote_any = false; - let limiter = Arc::new(tokio::sync::Semaphore::new( - options.rewrite_concurrency.get(), - )); + let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get())); for layer in straddling_branchpoint { let limiter = limiter.clone(); let timeline = detached.clone(); let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); - tasks.spawn(async move { - let _permit = limiter.acquire().await; - let copied = - upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) - .await?; - Ok(copied) - }); + let span = tracing::info_span!("upload_rewritten_layer", %layer); + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + if let Some(copied) = copied.as_ref() { + tracing::info!(%copied, "rewrote and uploaded"); + } + Ok(copied) + } + .instrument(span), + ); } while let Some(res) = tasks.join_next().await { match res { Ok(Ok(Some(copied))) => { wrote_any = true; - tracing::info!(layer=%copied, "rewrote and uploaded"); new_layers.push(copied); } Ok(Ok(None)) => {} @@ -308,7 +368,7 @@ pub(super) async fn prepare( } let mut tasks = tokio::task::JoinSet::new(); - let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); for adopted in rest_of_historic { let limiter = limiter.clone(); @@ -342,7 +402,56 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok(Progress::Prepared(guard, prepared)) + Ok(Progress::Prepared(attempt, prepared)) +} + +async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant)?; + + // insert the block in the index_part.json, if not already there. + let _dont_care = tenant + .gc_block + .insert( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(attempt) +} + +async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result { + // FIXME: it would be nice to confirm that there is an in-memory version, since we've just + // verified there is a persistent one? 
+ obtain_exclusive_attempt(detached, tenant) +} + +fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + use Error::{OtherTimelineDetachOngoing, ShuttingDown}; + + // ensure we are the only active attempt for this tenant + let (guard, barrier) = completion::channel(); + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + // FIXME: no test enters here + } + *guard = Some((detached.timeline_id, barrier)); + } + + // ensure the gate is still open + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + Ok(Attempt { + timeline_id: detached.timeline_id, + _guard: guard, + gate_entered: Some(_gate_entered), + }) } fn reparented_direct_children( @@ -548,96 +657,207 @@ async fn remote_copy( .map_err(CopyFailed) } -/// See [`Timeline::complete_detaching_timeline_ancestor`]. -pub(super) async fn complete( +pub(crate) enum DetachingAndReparenting { + /// All of the following timeline ids were reparented and the timeline ancestor detach must be + /// marked as completed. + Reparented(HashSet), + + /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as + /// completed. + /// + /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made. + SomeReparentingFailed { must_reset_tenant: bool }, + + /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach + /// must be marked as completed. + AlreadyDone(HashSet), +} + +impl DetachingAndReparenting { + pub(crate) fn reset_tenant_required(&self) -> bool { + use DetachingAndReparenting::*; + match self { + Reparented(_) => true, + SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant, + AlreadyDone(_) => false, + } + } + + pub(crate) fn completed(self) -> Option> { + use DetachingAndReparenting::*; + match self { + Reparented(x) | AlreadyDone(x) => Some(x), + SomeReparentingFailed { .. } => None, + } + } +} + +/// See [`Timeline::detach_from_ancestor_and_reparent`]. +pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result { let PreparedTimelineDetach { layers } = prepared; - let ancestor = detached - .ancestor_timeline - .as_ref() - .expect("must still have a ancestor"); - let ancestor_lsn = detached.get_ancestor_lsn(); + #[derive(Debug)] + enum Ancestor { + NotDetached(Arc, Lsn), + Detached(Arc, Lsn), + } + + let (recorded_branchpoint, still_ongoing) = { + let access = detached.remote_client.initialized_upload_queue()?; + let latest = access.latest_uploaded_index_part(); + + ( + latest.lineage.detached_previous_ancestor(), + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)), + ) + }; + assert!( + still_ongoing, + "cannot (detach? reparent)? 
complete if the operation is not still ongoing" + ); + + let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + (Some(ancestor), None) => { + assert!( + !layers.is_empty(), + "there should always be at least one layer to inherit" + ); + Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn) + } + (Some(_), Some(_)) => { + panic!( + "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None" + ); + } + (None, Some((ancestor_id, ancestor_lsn))) => { + // it has been either: + // - detached but still exists => we can try reparenting + // - detached and deleted + // + // either way, we must complete + assert!( + layers.is_empty(), + "no layers should had been copied as detach is done" + ); + + let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned(); + + if let Some(ancestor) = existing { + Ancestor::Detached(ancestor, ancestor_lsn) + } else { + let direct_children = reparented_direct_children(detached, tenant)?; + return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); + } + } + (None, None) => { + // TODO: make sure there are no `?` before tenant_reset from after a questionmark from + // here. + panic!( + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + ); + } + }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. also do the actual detaching. // - // if we crash after this operation, we will at least come up having detached a timeline, but - // we cannot go back and reparent the timelines which would had been reparented in normal - // execution. - // - // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart - // which could give us a completely wrong layer combination. - detached - .remote_client - .schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + // if we crash after this operation, a retry will allow reparenting the remaining timelines as + // gc is blocked. + + let (ancestor, ancestor_lsn, was_detached) = match ancestor { + Ancestor::NotDetached(ancestor, ancestor_lsn) => { + // this has to complete before any reparentings because otherwise they would not have + // layers on the new parent. + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await + .context("publish layers and detach ancestor")?; + + tracing::info!( + ancestor=%ancestor.timeline_id, + %ancestor_lsn, + inherited_layers=%layers.len(), + "detached from ancestor" + ); + (ancestor, ancestor_lsn, true) + } + Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), + }; let mut tasks = tokio::task::JoinSet::new(); + // Returns a single permit semaphore which will be used to make one reparenting succeed, + // others will fail as if those timelines had been stopped for whatever reason. + #[cfg(feature = "testing")] + let failpoint_sem = || -> Option> { + fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some( + Arc::new(Semaphore::new(1)) + )); + None + }(); + // because we are now keeping the slot in progress, it is unlikely that there will be any // timeline deletions during this time. if we raced one, then we'll just ignore it. 
- tenant - .timelines - .lock() - .unwrap() - .values() - .filter_map(|tl| { - if Arc::ptr_eq(tl, detached) { - return None; - } + { + let g = tenant.timelines.lock().unwrap(); + reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn) + .cloned() + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + #[cfg(feature = "testing")] + let failpoint_sem = failpoint_sem.clone(); - if !tl.is_active() { - return None; - } + tasks.spawn( + async move { + let res = async { + #[cfg(feature = "testing")] + if let Some(failpoint_sem) = failpoint_sem { + let _permit = failpoint_sem.acquire().await.map_err(|_| { + anyhow::anyhow!( + "failpoint: timeline-detach-ancestor::allow_one_reparented", + ) + })?; + failpoint_sem.close(); + } - let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(ancestor, tl_ancestor); - let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; - - let is_deleting = tl - .delete_progress - .try_lock() - .map(|flow| !flow.is_not_started()) - .unwrap_or(true); - - if is_same && is_earlier && !is_deleting { - Some(tl.clone()) - } else { - None - } - }) - .for_each(|timeline| { - // important in this scope: we are holding the Tenant::timelines lock - let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); - let new_parent = detached.timeline_id; - - tasks.spawn( - async move { - let res = timeline - .remote_client - .schedule_reparenting_and_wait(&new_parent) + timeline + .remote_client + .schedule_reparenting_and_wait(&new_parent) + .await + } .await; - match res { - Ok(()) => Some(timeline), - Err(e) => { - // with the use of tenant slot, we no longer expect these. - tracing::warn!("reparenting failed: {e:#}"); - None + match res { + Ok(()) => { + tracing::info!("reparented"); + Some(timeline) + } + Err(e) => { + // with the use of tenant slot, raced timeline deletion is the most + // likely reason. + tracing::warn!("reparenting failed: {e:#}"); + None + } } } - } - .instrument(span), - ); - }); + .instrument(span), + ); + }); + } let reparenting_candidates = tasks.len(); let mut reparented = HashSet::with_capacity(tasks.len()); @@ -645,33 +865,103 @@ pub(super) async fn complete( while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { - tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - assert!( reparented.insert(timeline.timeline_id), "duplicate reparenting? timeline_id={}", timeline.timeline_id ); } - Ok(None) => { - // lets just ignore this for now. one or all reparented timelines could had - // started deletion, and that is fine. - } Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { - // ignore; it's better to continue with a single reparenting failing (or even - // all of them) in order to get to the goal state. - // - // these timelines will never be reparentable, but they can be always detached as - // separate tree roots. 
- } + // just ignore failures now, we can retry + Ok(None) => {} + Err(je) if je.is_panic() => {} Err(je) => tracing::error!("unexpected join error: {je:?}"), } } - if reparenting_candidates != reparented.len() { - tracing::info!("failed to reparent some candidates"); + let reparented_all = reparenting_candidates == reparented.len(); + + if reparented_all { + Ok(DetachingAndReparenting::Reparented(reparented)) + } else { + tracing::info!( + reparented = reparented.len(), + candidates = reparenting_candidates, + "failed to reparent all candidates; they can be retried after the tenant_reset", + ); + + let must_reset_tenant = !reparented.is_empty() || was_detached; + Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant }) + } +} + +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + mut attempt: Attempt, + _ctx: &RequestContext, +) -> Result<(), Error> { + assert_eq!(detached.timeline_id, attempt.timeline_id); + + if attempt.gate_entered.is_none() { + let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + } else { + // Some(gate_entered) means the tenant was not restarted, as is not required } - Ok(reparented) + assert!(detached.ancestor_timeline.is_none()); + + // this should be an 503 at least...? + fail::fail_point!( + "timeline-detach-ancestor::complete_before_uploading", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::complete_before_uploading" + )) + ); + + tenant + .gc_block + .remove( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(()) +} + +/// Query against a locked `Tenant::timelines`. +fn reparentable_timelines<'a, I>( + timelines: I, + detached: &'a Arc, + ancestor: &'a Arc, + ancestor_lsn: Lsn, +) -> impl Iterator> + 'a +where + I: Iterator> + 'a, +{ + timelines.filter_map(move |tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl) + } else { + None + } + }) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fe582cf0e2..ee8e9ac5a1 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2989,6 +2989,7 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client .timeline_detach_ancestor(tenant_shard_id, timeline_id) .await @@ -3005,7 +3006,8 @@ impl Service { Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) } - // rest can be mapped + // rest can be mapped as usual + // FIXME: this converts some 500 to 409 which is not per openapi other => passthrough_api_error(&node, other), } }) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 4e409eeb17..902457c2ac 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -5,7 +5,7 @@ import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier -from typing import List, Tuple +from typing import List, 
Set, Tuple import pytest from fixtures.common_types import Lsn, TimelineId @@ -807,22 +807,24 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( What remains not tested by this: - shutdown winning over complete - - Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. """ if sharded and mode == "delete_tenant": # the shared/exclusive lock for tenant is blocking this: # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip( - "tenant deletion while timeline ancestor detach is underway is not supported yet" - ) + pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count if sharded else None, + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + }, + ) for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -831,7 +833,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( detached_timeline = env.neon_cli.create_branch("detached soon", "main") - failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" env.storage_controller.reconcile_until_idle() shards = env.storage_controller.locate(env.initial_tenant) @@ -843,13 +845,20 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( victim = pageservers[int(shards[-1]["node_id"])] victim_http = victim.http_client() - victim_http.configure_failpoints((failpoint, "pause")) + victim_http.configure_failpoints((pausepoint, "pause")) def detach_ancestor(): target.detach_ancestor(env.initial_tenant, detached_timeline) - def at_failpoint() -> Tuple[str, LogCursor]: - return victim.assert_log_contains(f"at failpoint {failpoint}") + def at_failpoint() -> LogCursor: + msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}") + log.info(f"found {msg}") + msg, offset = victim.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + log.info(f"found {msg}") + return offset def start_delete(): if mode == "delete_timeline": @@ -882,23 +891,44 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - _, offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(10, 1.0, at_failpoint) delete = pool.submit(start_delete) - wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) delete.result() assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + # TODO: match the error with pytest.raises(PageserverApiException) as exc: fut.result() + log.info(f"TODO: match this error: {exc.value}") assert exc.value.status_code == 503 finally: - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + if mode != "delete_timeline": + return + + # make sure the gc is unblocked + time.sleep(2) + victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + if not sharded: + # we have the other node only while sharded + return + + other = 
pageservers[int(shards[0]["node_id"])] + log.info(f"other is {other.id}") + _, offset = other.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK", + ) + # this might be a lot earlier than the victims line, but that is okay. + _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) @pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) @@ -915,7 +945,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv assert ( mode == "delete_reparentable_timeline" - ), "only one now, but we could have the create just as well, need gc blocking" + ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking" + # perhaps it could be supported by always doing this for the shard0 first, and after that for others. + # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented. shard_count = 2 neon_env_builder.num_pageservers = shard_count @@ -1048,10 +1080,267 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv victim_http.configure_failpoints((pausepoint, "off")) +def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder): + """ + Using a failpoint, force the completion step of timeline ancestor detach to + fail after reparenting a single timeline. + + Retrying should try reparenting until all reparentings are done, all the + time blocking gc even across restarts (first round). + + A completion failpoint is used to inhibit completion on second to last + round. + + On last round, the completion uses a path where no reparentings can happen + because original ancestor is deleted, and there is a completion to unblock + gc without restart. 
+ """ + + # to get the remote storage metrics + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + env.pageserver.allowed_errors.extend( + [ + ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented", + ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry", + ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading", + ] + ) + + http = env.pageserver.http_client() + + def remote_storage_copy_requests(): + return http.get_metric_value( + "remote_storage_s3_request_seconds_count", + {"request_type": "copy_object", "result": "ok"}, + ) + + def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]: + reparented = 0 + not_reparented = set() + for timeline in timelines: + detail = http.timeline_detail(env.initial_tenant, timeline) + ancestor = TimelineId(detail["ancestor_timeline_id"]) + if ancestor == detached: + reparented += 1 + else: + not_reparented.add(timeline) + return (reparented, not_reparented) + + # main ------A-----B-----C-----D-----E> lsn + timelines = [] + with env.endpoints.create_start("main") as ep: + for counter in range(5): + ep.safe_psql( + f"create table foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + branch = env.neon_cli.create_branch( + f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn + ) + timelines.append(branch) + + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) + + # detach "E" which has most reparentable timelines under it + detached = timelines.pop() + assert len(timelines) == 4 + + http = http.without_status_retrying() + + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + not_reparented: Set[TimelineId] = set() + # tracked offset in the pageserver log which is at least at the most recent activation + offset = None + + def try_detach(): + with pytest.raises( + PageserverApiException, + match=".*failed to reparent all candidate timelines, please retry", + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + + # first round -- do more checking to make sure the gc gets paused + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == 1 + + time.sleep(2) + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + offset, + ) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + metric = remote_storage_copy_requests() + assert metric != 0 + # make sure the gc blocking is persistent over a restart + env.pageserver.restart() + env.pageserver.quiesce_tenants() + time.sleep(2) + _, offset = env.pageserver.assert_log_contains(".*: attach 
finished, activating", offset) + assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + # restore failpoint for the next reparented + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + reparented_before = reparented + + # do two more rounds + for _ in range(2): + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == reparented_before + 1 + reparented_before = reparented + + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + metric = remote_storage_copy_requests() + assert metric == 0, "copies happen in the first round" + + assert offset is not None + assert len(not_reparented) == 1 + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return")) + + # almost final round, the failpoint is hit no longer as there is only one reparented and one always gets to succeed. + # the tenant is restarted once more, but we fail during completing. + with pytest.raises( + PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading" + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + + # delete the previous ancestor to take a different path to completion. all + # other tests take the "detach? reparent complete", but this only hits + # "complete". + http.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) + + reparented_resp = http.detach_ancestor(env.initial_tenant, detached) + assert reparented_resp == set(timelines) + # no need to quiesce_tenants anymore, because completion does that + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == len(timelines) + + time.sleep(2) + assert ( + env.pageserver.log_contains(".*: attach finished, activating", offset) is None + ), "there should be no restart with the final detach_ancestor as it only completed" + + # gc is unblocked + env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset) + + metric = remote_storage_copy_requests() + assert metric == 0 + + +def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( + neon_env_builder: NeonEnvBuilder, +): + """ + Make sure that a timeline deleted after restart will unpause gc blocking. 
+ """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + detached = env.neon_cli.create_branch("detached") + + failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" + + http.configure_failpoints((failpoint, "pause")) + + def detach_and_get_stuck(): + return http.detach_ancestor(env.initial_tenant, detached) + + def request_processing_noted_in_log(): + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + ) + return offset + + def delete_detached(): + return http.timeline_delete(env.initial_tenant, detached) + + try: + with ThreadPoolExecutor(max_workers=1) as pool: + detach = pool.submit(detach_and_get_stuck) + + offset = wait_until(10, 1.0, request_processing_noted_in_log) + + # make this named fn tor more clear failure test output logging + def pausepoint_hit_with_gc_paused() -> LogCursor: + env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + _, at = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + return at + + offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + + delete_detached() + + wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + + http.configure_failpoints((failpoint, "off")) + + with pytest.raises(PageserverApiException) as exc: + detach.result() + + # FIXME: this should be 404 but because there is another Anyhow conversion it is 500 + assert exc.value.status_code == 500 + env.pageserver.allowed_errors.append( + ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart" + ) + finally: + http.configure_failpoints((failpoint, "off")) + + # make sure gc has been unblocked + time.sleep(2) + + env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + # TODO: -# - after starting the operation, pageserver is shutdown, restarted -# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited -# - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. #