From 6d6e2c6a395d365852df478b37bbf50af1e48e6b Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 13 Aug 2024 20:51:51 +0300 Subject: [PATCH] feat(detach_ancestor): better retries with persistent gc blocking (#8430) With the persistent gc blocking, we can now retry reparenting timelines which had failed for whatever reason on the previous attempt(s). Restructure the detach_ancestor into three phases: - prepare (insert persistent gc blocking, copy lsn prefix, layers) - detach and reparent - reparenting can fail, so we might need to retry this portion - complete (remove persistent gc blocking) Cc: #6994 --- libs/utils/src/completion.rs | 31 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 12 +- pageserver/src/tenant/metadata.rs | 3 + pageserver/src/tenant/mgr.rs | 119 +++- .../src/tenant/remote_timeline_client.rs | 70 ++- .../tenant/remote_timeline_client/index.rs | 57 +- pageserver/src/tenant/tasks.rs | 14 +- pageserver/src/tenant/timeline.rs | 30 +- .../src/tenant/timeline/detach_ancestor.rs | 528 ++++++++++++++---- storage_controller/src/service.rs | 4 +- .../regress/test_timeline_detach_ancestor.py | 327 ++++++++++- 12 files changed, 960 insertions(+), 239 deletions(-) diff --git a/libs/utils/src/completion.rs b/libs/utils/src/completion.rs index 2fef8d35df..f65c080ad4 100644 --- a/libs/utils/src/completion.rs +++ b/libs/utils/src/completion.rs @@ -5,13 +5,40 @@ use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker}; /// Can be cloned, moved and kept around in futures as "guard objects". #[derive(Clone)] pub struct Completion { - _token: TaskTrackerToken, + token: TaskTrackerToken, +} + +impl std::fmt::Debug for Completion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Completion") + .field("siblings", &self.token.task_tracker().len()) + .finish() + } +} + +impl Completion { + /// Returns true if this completion is associated with the given barrier. + pub fn blocks(&self, barrier: &Barrier) -> bool { + TaskTracker::ptr_eq(self.token.task_tracker(), &barrier.0) + } + + pub fn barrier(&self) -> Barrier { + Barrier(self.token.task_tracker().clone()) + } } /// Barrier will wait until all clones of [`Completion`] have been dropped. 
#[derive(Clone)] pub struct Barrier(TaskTracker); +impl std::fmt::Debug for Barrier { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("Barrier") + .field("remaining", &self.0.len()) + .finish() + } +} + impl Default for Barrier { fn default() -> Self { let (_, rx) = channel(); @@ -51,5 +78,5 @@ pub fn channel() -> (Completion, Barrier) { tracker.close(); let token = tracker.token(); - (Completion { _token: token }, Barrier(tracker)) + (Completion { token }, Barrier(tracker)) } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2b0156079e..6f7480cc6c 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1887,7 +1887,7 @@ async fn timeline_detach_ancestor_handler( // drop(tenant); let resp = match progress { - detach_ancestor::Progress::Prepared(_guard, prepared) => { + detach_ancestor::Progress::Prepared(attempt, prepared) => { // it would be great to tag the guard on to the tenant activation future let reparented_timelines = state .tenant_manager @@ -1895,10 +1895,10 @@ async fn timeline_detach_ancestor_handler( tenant_shard_id, timeline_id, prepared, + attempt, ctx, ) .await - .context("timeline detach ancestor completion") .map_err(ApiError::InternalServerError)?; AncestorDetached { diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a238004aad..b065f58382 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -302,7 +302,11 @@ pub struct Tenant { pub(crate) timeline_get_throttle: Arc>, - /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. + /// An ongoing timeline detach concurrency limiter. + /// + /// As a tenant will likely be restarted as part of timeline detach ancestor it makes no sense + /// to have two running at the same time. A different one can be started if an earlier one + /// has failed for whatever reason. ongoing_timeline_detach: std::sync::Mutex>, /// `index_part.json` based gc blocking reason tracking. @@ -833,9 +837,9 @@ impl Tenant { // The Stopping case is for when we have passed control on to DeleteTenantFlow: // if it errors, we will call make_broken when tenant is already in Stopping. assert!( - matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), - "the attach task owns the tenant state until activation is complete" - ); + matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }), + "the attach task owns the tenant state until activation is complete" + ); *state = TenantState::broken_from_reason(err.to_string()); }); diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index bbc070a81b..6073abc8c3 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -285,12 +285,15 @@ impl TimelineMetadata { } /// When reparenting, the `ancestor_lsn` does not change. + /// + /// Returns true if anything was changed. 
pub fn reparent(&mut self, timeline: &TimelineId) { assert!(self.body.ancestor_timeline.is_some()); // no assertion for redoing this: it's fine, we may have to repeat this multiple times over self.body.ancestor_timeline = Some(*timeline); } + /// Returns true if anything was changed pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) { if let Some(ancestor) = self.body.ancestor_timeline { assert_eq!(ancestor, branchpoint.0); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index c8a11e88cc..5f2539d426 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -54,7 +54,7 @@ use utils::id::{TenantId, TimelineId}; use super::remote_timeline_client::remote_tenant_path; use super::secondary::SecondaryTenant; -use super::timeline::detach_ancestor::PreparedTimelineDetach; +use super::timeline::detach_ancestor::{self, PreparedTimelineDetach}; use super::{GlobalShutDown, TenantSharedResources}; /// For a tenant that appears in TenantsMap, it may either be @@ -1927,8 +1927,10 @@ impl TenantManager { tenant_shard_id: TenantShardId, timeline_id: TimelineId, prepared: PreparedTimelineDetach, + mut attempt: detach_ancestor::Attempt, ctx: &RequestContext, ) -> Result, anyhow::Error> { + use crate::tenant::timeline::detach_ancestor::Error; // FIXME: this is unnecessary, slotguard already has these semantics struct RevertOnDropSlot(Option); @@ -1977,43 +1979,98 @@ impl TenantManager { let timeline = tenant.get_timeline(timeline_id, true)?; - let reparented = timeline - .complete_detaching_timeline_ancestor(&tenant, prepared, ctx) + let resp = timeline + .detach_from_ancestor_and_reparent(&tenant, prepared, ctx) .await?; let mut slot_guard = slot_guard.into_inner(); - let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { - Ok(()) => { - slot_guard.drop_old_value()?; + let tenant = if resp.reset_tenant_required() { + attempt.before_reset_tenant(); + + let (_guard, progress) = utils::completion::channel(); + match tenant.shutdown(progress, ShutdownMode::Hard).await { + Ok(()) => { + slot_guard.drop_old_value()?; + } + Err(_barrier) => { + slot_guard.revert(); + // this really should not happen, at all, unless shutdown was already going? + anyhow::bail!("Cannot restart Tenant, already shutting down"); + } } - Err(_barrier) => { - slot_guard.revert(); - // this really should not happen, at all, unless shutdown was already going? - anyhow::bail!("Cannot restart Tenant, already shutting down"); + + let tenant_path = self.conf.tenant_path(&tenant_shard_id); + let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; + + let shard_identity = config.shard; + let tenant = tenant_spawn( + self.conf, + tenant_shard_id, + &tenant_path, + self.resources.clone(), + AttachedTenantConf::try_from(config)?, + shard_identity, + None, + SpawnMode::Eager, + ctx, + )?; + + { + let mut g = tenant.ongoing_timeline_detach.lock().unwrap(); + assert!( + g.is_none(), + "there cannot be any new timeline detach ancestor on newly created tenant" + ); + *g = Some((attempt.timeline_id, attempt.new_barrier())); } + + slot_guard.upsert(TenantSlot::Attached(tenant.clone()))?; + tenant + } else { + tracing::info!("skipping tenant_reset as no changes made required it"); + tenant + }; + + if let Some(reparented) = resp.completed() { + // finally ask the restarted tenant to complete the detach + // + // rationale for 9999s: we don't really have a timetable here; if retried, the caller + // will get an 503. 
+ tenant + .wait_to_become_active(std::time::Duration::from_secs(9999)) + .await + .map_err(|e| { + use pageserver_api::models::TenantState; + use GetActiveTenantError::{Cancelled, WillNotBecomeActive}; + match e { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) => { + Error::ShuttingDown + } + other => Error::Unexpected(other.into()), + } + })?; + + utils::pausable_failpoint!( + "timeline-detach-ancestor::after_activating_before_finding-pausable" + ); + + let timeline = tenant + .get_timeline(attempt.timeline_id, true) + .map_err(|_| Error::DetachedNotFoundAfterRestart)?; + + timeline + .complete_detaching_timeline_ancestor(&tenant, attempt, ctx) + .await + .map(|()| reparented) + .map_err(|e| e.into()) + } else { + // at least the latest versions have now been downloaded and refreshed; be ready to + // retry another time. + Err(anyhow::anyhow!( + "failed to reparent all candidate timelines, please retry" + )) } - - let tenant_path = self.conf.tenant_path(&tenant_shard_id); - let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?; - - let shard_identity = config.shard; - let tenant = tenant_spawn( - self.conf, - tenant_shard_id, - &tenant_path, - self.resources.clone(), - AttachedTenantConf::try_from(config)?, - shard_identity, - None, - SpawnMode::Eager, - ctx, - )?; - - slot_guard.upsert(TenantSlot::Attached(tenant))?; - - Ok(reparented) } /// A page service client sends a TenantId, and to look up the correct Tenant we must diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 8a76d7532f..b4d7ad1e97 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -736,12 +736,13 @@ impl RemoteTimelineClient { Ok(()) } + /// Reparent this timeline to a new parent. + /// + /// A retryable step of timeline ancestor detach. 
pub(crate) async fn schedule_reparenting_and_wait( self: &Arc, new_parent: &TimelineId, ) -> anyhow::Result<()> { - // FIXME: because of how Timeline::schedule_uploads works when called from layer flushing - // and reads the in-memory part we cannot do the detaching like this let receiver = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; @@ -752,17 +753,25 @@ impl RemoteTimelineClient { )); }; - upload_queue.dirty.metadata.reparent(new_parent); - upload_queue.dirty.lineage.record_previous_ancestor(&prev); + let uploaded = &upload_queue.clean.0.metadata; - self.schedule_index_upload(upload_queue)?; + if uploaded.ancestor_timeline().is_none() && !uploaded.ancestor_lsn().is_valid() { + // nothing to do + None + } else { + upload_queue.dirty.metadata.reparent(new_parent); + upload_queue.dirty.lineage.record_previous_ancestor(&prev); - self.schedule_barrier0(upload_queue) + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) + } }; - Self::wait_completion0(receiver) - .await - .context("wait completion") + if let Some(receiver) = receiver { + Self::wait_completion0(receiver).await?; + } + Ok(()) } /// Schedules uploading a new version of `index_part.json` with the given layers added, @@ -778,26 +787,30 @@ impl RemoteTimelineClient { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.metadata.detach_from_ancestor(&adopted); - upload_queue.dirty.lineage.record_detaching(&adopted); + if upload_queue.clean.0.lineage.detached_previous_ancestor() == Some(adopted) { + None + } else { + upload_queue.dirty.metadata.detach_from_ancestor(&adopted); + upload_queue.dirty.lineage.record_detaching(&adopted); - for layer in layers { - upload_queue - .dirty - .layer_metadata - .insert(layer.layer_desc().layer_name(), layer.metadata()); + for layer in layers { + let prev = upload_queue + .dirty + .layer_metadata + .insert(layer.layer_desc().layer_name(), layer.metadata()); + assert!(prev.is_none(), "copied layer existed already {layer}"); + } + + self.schedule_index_upload(upload_queue)?; + + Some(self.schedule_barrier0(upload_queue)) } - - self.schedule_index_upload(upload_queue)?; - - let barrier = self.schedule_barrier0(upload_queue); - self.launch_queued_tasks(upload_queue); - barrier }; - Self::wait_completion0(barrier) - .await - .context("wait completion") + if let Some(barrier) = barrier { + Self::wait_completion0(barrier).await?; + } + Ok(()) } /// Adds a gc blocking reason for this timeline if one does not exist already. 
@@ -873,12 +886,7 @@ impl RemoteTimelineClient { let upload_queue = guard.initialized_mut()?; if let index::GcBlockingReason::DetachAncestor = reason { - if !upload_queue - .clean - .0 - .lineage - .is_detached_from_original_ancestor() - { + if !upload_queue.clean.0.lineage.is_detached_from_ancestor() { drop(guard); panic!("cannot complete timeline_ancestor_detach while not detached"); } diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 90453b1922..757fb9d032 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -216,26 +216,47 @@ fn is_false(b: &bool) -> bool { impl Lineage { const REMEMBER_AT_MOST: usize = 100; - pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) { + pub(crate) fn record_previous_ancestor(&mut self, old_ancestor: &TimelineId) -> bool { if self.reparenting_history.last() == Some(old_ancestor) { // do not re-record it - return; - } + false + } else { + #[cfg(feature = "testing")] + { + let existing = self + .reparenting_history + .iter() + .position(|x| x == old_ancestor); + assert_eq!( + existing, None, + "we cannot reparent onto and off and onto the same timeline twice" + ); + } + let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - let drop_oldest = self.reparenting_history.len() + 1 >= Self::REMEMBER_AT_MOST; - - self.reparenting_history_truncated |= drop_oldest; - if drop_oldest { - self.reparenting_history.remove(0); + self.reparenting_history_truncated |= drop_oldest; + if drop_oldest { + self.reparenting_history.remove(0); + } + self.reparenting_history.push(*old_ancestor); + true } - self.reparenting_history.push(*old_ancestor); } - pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) { - assert!(self.original_ancestor.is_none()); - - self.original_ancestor = - Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + /// Returns true if anything changed. + pub(crate) fn record_detaching(&mut self, branchpoint: &(TimelineId, Lsn)) -> bool { + if let Some((id, lsn, _)) = self.original_ancestor { + assert_eq!( + &(id, lsn), + branchpoint, + "detaching attempt has to be for the same ancestor we are already detached from" + ); + false + } else { + self.original_ancestor = + Some((branchpoint.0, branchpoint.1, chrono::Utc::now().naive_utc())); + true + } } /// The queried lsn is most likely the basebackup lsn, and this answers question "is it allowed @@ -247,10 +268,16 @@ impl Lineage { .is_some_and(|(_, ancestor_lsn, _)| ancestor_lsn == lsn) } - pub(crate) fn is_detached_from_original_ancestor(&self) -> bool { + /// Returns true if the timeline originally had an ancestor, and no longer has one. + pub(crate) fn is_detached_from_ancestor(&self) -> bool { self.original_ancestor.is_some() } + /// Returns original ancestor timeline id and lsn that this timeline has been detached from. 
+ pub(crate) fn detached_previous_ancestor(&self) -> Option<(TimelineId, Lsn)> { + self.original_ancestor.map(|(id, lsn, _)| (id, lsn)) + } + pub(crate) fn is_reparented(&self) -> bool { !self.reparenting_history.is_empty() } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 713845e9ac..dbcd704b4e 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -366,14 +366,13 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { if first { first = false; - if delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel) - .await - .is_err() - { - break; - } + let delays = async { + delay_by_lease_length(tenant.get_lsn_lease_length(), &cancel).await?; + random_init_delay(period, &cancel).await?; + Ok::<_, Cancelled>(()) + }; - if random_init_delay(period, &cancel).await.is_err() { + if delays.await.is_err() { break; } } @@ -425,7 +424,6 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Gc); - // Sleep if tokio::time::timeout(sleep_duration, cancel.cancelled()) .await .is_ok() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 767f5969fc..f1587951c6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4328,18 +4328,34 @@ impl Timeline { detach_ancestor::prepare(self, tenant, options, ctx).await } - /// Completes the ancestor detach. This method is to be called while holding the - /// TenantManager's tenant slot, so during this method we cannot be deleted nor can any - /// timeline be deleted. After this method returns successfully, tenant must be reloaded. + /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and + /// reparents any reparentable children of previous ancestor. /// - /// Pageserver receiving a SIGKILL during this operation is not supported (yet). - pub(crate) async fn complete_detaching_timeline_ancestor( + /// This method is to be called while holding the TenantManager's tenant slot, so during this + /// method we cannot be deleted nor can any timeline be deleted. After this method returns + /// successfully, tenant must be reloaded. + /// + /// Final step will be to [`Self::complete_detaching_timeline_ancestor`] after optionally + /// resetting the tenant. + pub(crate) async fn detach_from_ancestor_and_reparent( self: &Arc, tenant: &crate::tenant::Tenant, prepared: detach_ancestor::PreparedTimelineDetach, ctx: &RequestContext, - ) -> Result, anyhow::Error> { - detach_ancestor::complete(self, tenant, prepared, ctx).await + ) -> Result { + detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await + } + + /// Final step which unblocks the GC. + /// + /// The tenant must've been reset if ancestry was modified previously (in tenant manager). + pub(crate) async fn complete_detaching_timeline_ancestor( + self: &Arc, + tenant: &crate::tenant::Tenant, + attempt: detach_ancestor::Attempt, + ctx: &RequestContext, + ) -> Result<(), detach_ancestor::Error> { + detach_ancestor::complete(self, tenant, attempt, ctx).await } /// Switch aux file policy and schedule upload to the index part. 
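The remote client and Lineage changes above all follow the same idempotency pattern: before scheduling an index upload, the requested change is compared against the last uploaded ("clean") index_part.json, and nothing is scheduled if a previous attempt already persisted it. Below is a small self-contained sketch of that pattern only; the Index/UploadQueue types and field names are illustrative stand-ins, not the pageserver's actual upload queue:

#[derive(Clone, Debug, Default, PartialEq)]
struct Index {
    ancestor: Option<u64>,
    detached_from: Option<(u64, u64)>, // (ancestor timeline id, branchpoint lsn)
}

struct UploadQueue {
    clean: Index,             // last successfully uploaded index_part.json
    dirty: Index,             // contents of the next upload
    scheduled_uploads: usize,
}

impl UploadQueue {
    /// Schedule the "detach from ancestor" index change, skipping the upload
    /// entirely when a previous attempt already persisted it.
    fn schedule_detach(&mut self, branchpoint: (u64, u64)) {
        if self.clean.detached_from == Some(branchpoint) {
            return; // already uploaded; retrying is a no-op
        }
        self.dirty.ancestor = None;
        self.dirty.detached_from = Some(branchpoint);
        self.scheduled_uploads += 1;
    }
}

fn main() {
    let start = Index { ancestor: Some(1), ..Default::default() };
    let mut q = UploadQueue { clean: start.clone(), dirty: start, scheduled_uploads: 0 };

    q.schedule_detach((1, 0xff00));
    q.clean = q.dirty.clone(); // pretend the scheduled upload completed

    // a retried attempt sees the change in the clean index and does nothing
    q.schedule_detach((1, 0xff00));
    assert_eq!(q.scheduled_uploads, 1);
}

This is why the scheduling helpers now return early (and skip the barrier) instead of unconditionally dirtying the metadata: a retried detach can replay every step without producing duplicate uploads or tripping asserts.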
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 3b52adc77b..969da2662b 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -5,12 +5,16 @@ use crate::{ context::{DownloadBehavior, RequestContext}, task_mgr::TaskKind, tenant::{ + mgr::GetActiveTenantError, + remote_timeline_client::index::GcBlockingReason::DetachAncestor, storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer}, Tenant, }, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use anyhow::Context; use pageserver_api::models::detach_ancestor::AncestorDetached; +use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; use tracing::Instrument; use utils::{completion, generation::Generation, http::error::ApiError, id::TimelineId, lsn::Lsn}; @@ -38,6 +42,12 @@ pub(crate) enum Error { #[error("remote copying layer failed")] CopyFailed(#[source] anyhow::Error), + #[error("wait for tenant to activate after restarting")] + WaitToActivate(#[source] GetActiveTenantError), + + #[error("detached timeline was not found after restart")] + DetachedNotFoundAfterRestart, + #[error("unexpected error")] Unexpected(#[source] anyhow::Error), @@ -55,6 +65,10 @@ impl From for ApiError { Error::OtherTimelineDetachOngoing(_) => { ApiError::ResourceUnavailable("other timeline detach is already ongoing".into()) } + e @ Error::WaitToActivate(_) => { + let s = utils::error::report_compact_sources(&e).to_string(); + ApiError::ResourceUnavailable(s.into()) + } // All of these contain shutdown errors, in fact, it's the most common e @ Error::FlushAncestor(_) | e @ Error::RewrittenDeltaDownloadFailed(_) @@ -63,6 +77,7 @@ impl From for ApiError { | e @ Error::CopyFailed(_) | e @ Error::Unexpected(_) | e @ Error::Failpoint(_) => ApiError::InternalServerError(e.into()), + Error::DetachedNotFoundAfterRestart => ApiError::NotFound(value.into()), } } } @@ -96,8 +111,25 @@ impl From for Error { } } +impl From for Error { + fn from(value: GetActiveTenantError) -> Self { + use pageserver_api::models::TenantState; + use GetActiveTenantError::*; + + match value { + Cancelled | WillNotBecomeActive(TenantState::Stopping { .. }) | SwitchedTenant => { + Error::ShuttingDown + } + WaitForActiveTimeout { .. } | NotFound(_) | Broken(_) | WillNotBecomeActive(_) => { + // NotFound seems out-of-place + Error::WaitToActivate(value) + } + } + } +} + pub(crate) enum Progress { - Prepared(completion::Completion, PreparedTimelineDetach), + Prepared(Attempt, PreparedTimelineDetach), Done(AncestorDetached), } @@ -121,6 +153,26 @@ impl Default for Options { } } +/// Represents an across tenant reset exclusive single attempt to detach ancestor. 
+#[derive(Debug)] +pub(crate) struct Attempt { + pub(crate) timeline_id: TimelineId, + + _guard: completion::Completion, + gate_entered: Option, +} + +impl Attempt { + pub(crate) fn before_reset_tenant(&mut self) { + let taken = self.gate_entered.take(); + assert!(taken.is_some()); + } + + pub(crate) fn new_barrier(&self) -> completion::Barrier { + self._guard.barrier() + } +} + /// See [`Timeline::prepare_to_detach_from_ancestor`] pub(super) async fn prepare( detached: &Arc, @@ -135,15 +187,33 @@ pub(super) async fn prepare( .as_ref() .map(|tl| (tl.clone(), detached.ancestor_lsn)) else { - { + let still_in_progress = { let accessor = detached.remote_client.initialized_upload_queue()?; // we are safe to inspect the latest uploaded, because we can only witness this after // restart is complete and ancestor is no more. let latest = accessor.latest_uploaded_index_part(); - if !latest.lineage.is_detached_from_original_ancestor() { + if latest.lineage.detached_previous_ancestor().is_none() { return Err(NoAncestor); - } + }; + + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)) + }; + + if still_in_progress { + // gc is still blocked, we can still reparent and complete. + // we are safe to reparent remaining, because they were locked in in the beginning. + let attempt = continue_with_blocked_gc(detached, tenant).await?; + + // because the ancestor of detached is already set to none, we have published all + // of the layers, so we are still "prepared." + return Ok(Progress::Prepared( + attempt, + PreparedTimelineDetach { layers: Vec::new() }, + )); } let reparented_timelines = reparented_direct_children(detached, tenant)?; @@ -164,22 +234,7 @@ pub(super) async fn prepare( return Err(TooManyAncestors); } - // before we acquire the gate, we must mark the ancestor as having a detach operation - // ongoing which will block other concurrent detach operations so we don't get to ackward - // situations where there would be two branches trying to reparent earlier branches. - let (guard, barrier) = completion::channel(); - - { - let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); - if let Some((tl, other)) = guard.as_ref() { - if !other.is_ready() { - return Err(OtherTimelineDetachOngoing(*tl)); - } - } - *guard = Some((detached.timeline_id, barrier)); - } - - let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + let attempt = start_new_attempt(detached, tenant).await?; utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking_pausable"); @@ -245,7 +300,8 @@ pub(super) async fn prepare( }; // TODO: layers are already sorted by something: use that to determine how much of remote - // copies are already done. + // copies are already done -- gc is blocked, but a compaction could had happened on ancestor, + // which is something to keep in mind if copy skipping is implemented. 
tracing::info!(filtered=%filtered_layers, to_rewrite = straddling_branchpoint.len(), historic=%rest_of_historic.len(), "collected layers"); // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after @@ -259,29 +315,33 @@ pub(super) async fn prepare( let mut wrote_any = false; - let limiter = Arc::new(tokio::sync::Semaphore::new( - options.rewrite_concurrency.get(), - )); + let limiter = Arc::new(Semaphore::new(options.rewrite_concurrency.get())); for layer in straddling_branchpoint { let limiter = limiter.clone(); let timeline = detached.clone(); let ctx = ctx.detached_child(TaskKind::DetachAncestor, DownloadBehavior::Download); - tasks.spawn(async move { - let _permit = limiter.acquire().await; - let copied = - upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) - .await?; - Ok(copied) - }); + let span = tracing::info_span!("upload_rewritten_layer", %layer); + tasks.spawn( + async move { + let _permit = limiter.acquire().await; + let copied = + upload_rewritten_layer(end_lsn, &layer, &timeline, &timeline.cancel, &ctx) + .await?; + if let Some(copied) = copied.as_ref() { + tracing::info!(%copied, "rewrote and uploaded"); + } + Ok(copied) + } + .instrument(span), + ); } while let Some(res) = tasks.join_next().await { match res { Ok(Ok(Some(copied))) => { wrote_any = true; - tracing::info!(layer=%copied, "rewrote and uploaded"); new_layers.push(copied); } Ok(Ok(None)) => {} @@ -308,7 +368,7 @@ pub(super) async fn prepare( } let mut tasks = tokio::task::JoinSet::new(); - let limiter = Arc::new(tokio::sync::Semaphore::new(options.copy_concurrency.get())); + let limiter = Arc::new(Semaphore::new(options.copy_concurrency.get())); for adopted in rest_of_historic { let limiter = limiter.clone(); @@ -342,7 +402,56 @@ pub(super) async fn prepare( let prepared = PreparedTimelineDetach { layers: new_layers }; - Ok(Progress::Prepared(guard, prepared)) + Ok(Progress::Prepared(attempt, prepared)) +} + +async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + let attempt = obtain_exclusive_attempt(detached, tenant)?; + + // insert the block in the index_part.json, if not already there. + let _dont_care = tenant + .gc_block + .insert( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(attempt) +} + +async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result { + // FIXME: it would be nice to confirm that there is an in-memory version, since we've just + // verified there is a persistent one? 
+ obtain_exclusive_attempt(detached, tenant) +} + +fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result { + use Error::{OtherTimelineDetachOngoing, ShuttingDown}; + + // ensure we are the only active attempt for this tenant + let (guard, barrier) = completion::channel(); + { + let mut guard = tenant.ongoing_timeline_detach.lock().unwrap(); + if let Some((tl, other)) = guard.as_ref() { + if !other.is_ready() { + return Err(OtherTimelineDetachOngoing(*tl)); + } + // FIXME: no test enters here + } + *guard = Some((detached.timeline_id, barrier)); + } + + // ensure the gate is still open + let _gate_entered = detached.gate.enter().map_err(|_| ShuttingDown)?; + + Ok(Attempt { + timeline_id: detached.timeline_id, + _guard: guard, + gate_entered: Some(_gate_entered), + }) } fn reparented_direct_children( @@ -548,96 +657,207 @@ async fn remote_copy( .map_err(CopyFailed) } -/// See [`Timeline::complete_detaching_timeline_ancestor`]. -pub(super) async fn complete( +pub(crate) enum DetachingAndReparenting { + /// All of the following timeline ids were reparented and the timeline ancestor detach must be + /// marked as completed. + Reparented(HashSet), + + /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as + /// completed. + /// + /// Nested `must_reset_tenant` is set to true when any restart requiring changes were made. + SomeReparentingFailed { must_reset_tenant: bool }, + + /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach + /// must be marked as completed. + AlreadyDone(HashSet), +} + +impl DetachingAndReparenting { + pub(crate) fn reset_tenant_required(&self) -> bool { + use DetachingAndReparenting::*; + match self { + Reparented(_) => true, + SomeReparentingFailed { must_reset_tenant } => *must_reset_tenant, + AlreadyDone(_) => false, + } + } + + pub(crate) fn completed(self) -> Option> { + use DetachingAndReparenting::*; + match self { + Reparented(x) | AlreadyDone(x) => Some(x), + SomeReparentingFailed { .. } => None, + } + } +} + +/// See [`Timeline::detach_from_ancestor_and_reparent`]. +pub(super) async fn detach_and_reparent( detached: &Arc, tenant: &Tenant, prepared: PreparedTimelineDetach, _ctx: &RequestContext, -) -> Result, anyhow::Error> { +) -> Result { let PreparedTimelineDetach { layers } = prepared; - let ancestor = detached - .ancestor_timeline - .as_ref() - .expect("must still have a ancestor"); - let ancestor_lsn = detached.get_ancestor_lsn(); + #[derive(Debug)] + enum Ancestor { + NotDetached(Arc, Lsn), + Detached(Arc, Lsn), + } + + let (recorded_branchpoint, still_ongoing) = { + let access = detached.remote_client.initialized_upload_queue()?; + let latest = access.latest_uploaded_index_part(); + + ( + latest.lineage.detached_previous_ancestor(), + latest + .gc_blocking + .as_ref() + .is_some_and(|b| b.blocked_by(DetachAncestor)), + ) + }; + assert!( + still_ongoing, + "cannot (detach? reparent)? 
complete if the operation is not still ongoing" + ); + + let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) { + (Some(ancestor), None) => { + assert!( + !layers.is_empty(), + "there should always be at least one layer to inherit" + ); + Ancestor::NotDetached(ancestor.clone(), detached.ancestor_lsn) + } + (Some(_), Some(_)) => { + panic!( + "it should be impossible to get to here without having gone through the tenant reset; if the tenant was reset, then the ancestor_timeline would be None" + ); + } + (None, Some((ancestor_id, ancestor_lsn))) => { + // it has been either: + // - detached but still exists => we can try reparenting + // - detached and deleted + // + // either way, we must complete + assert!( + layers.is_empty(), + "no layers should had been copied as detach is done" + ); + + let existing = tenant.timelines.lock().unwrap().get(&ancestor_id).cloned(); + + if let Some(ancestor) = existing { + Ancestor::Detached(ancestor, ancestor_lsn) + } else { + let direct_children = reparented_direct_children(detached, tenant)?; + return Ok(DetachingAndReparenting::AlreadyDone(direct_children)); + } + } + (None, None) => { + // TODO: make sure there are no `?` before tenant_reset from after a questionmark from + // here. + panic!( + "bug: detach_and_reparent called on a timeline which has not been detached or which has no live ancestor" + ); + } + }; // publish the prepared layers before we reparent any of the timelines, so that on restart // reparented timelines find layers. also do the actual detaching. // - // if we crash after this operation, we will at least come up having detached a timeline, but - // we cannot go back and reparent the timelines which would had been reparented in normal - // execution. - // - // this is not perfect, but it avoids us a retry happening after a compaction or gc on restart - // which could give us a completely wrong layer combination. - detached - .remote_client - .schedule_adding_existing_layers_to_index_detach_and_wait( - &layers, - (ancestor.timeline_id, ancestor_lsn), - ) - .await?; + // if we crash after this operation, a retry will allow reparenting the remaining timelines as + // gc is blocked. + + let (ancestor, ancestor_lsn, was_detached) = match ancestor { + Ancestor::NotDetached(ancestor, ancestor_lsn) => { + // this has to complete before any reparentings because otherwise they would not have + // layers on the new parent. + detached + .remote_client + .schedule_adding_existing_layers_to_index_detach_and_wait( + &layers, + (ancestor.timeline_id, ancestor_lsn), + ) + .await + .context("publish layers and detach ancestor")?; + + tracing::info!( + ancestor=%ancestor.timeline_id, + %ancestor_lsn, + inherited_layers=%layers.len(), + "detached from ancestor" + ); + (ancestor, ancestor_lsn, true) + } + Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false), + }; let mut tasks = tokio::task::JoinSet::new(); + // Returns a single permit semaphore which will be used to make one reparenting succeed, + // others will fail as if those timelines had been stopped for whatever reason. + #[cfg(feature = "testing")] + let failpoint_sem = || -> Option> { + fail::fail_point!("timeline-detach-ancestor::allow_one_reparented", |_| Some( + Arc::new(Semaphore::new(1)) + )); + None + }(); + // because we are now keeping the slot in progress, it is unlikely that there will be any // timeline deletions during this time. if we raced one, then we'll just ignore it. 
- tenant - .timelines - .lock() - .unwrap() - .values() - .filter_map(|tl| { - if Arc::ptr_eq(tl, detached) { - return None; - } + { + let g = tenant.timelines.lock().unwrap(); + reparentable_timelines(g.values(), detached, &ancestor, ancestor_lsn) + .cloned() + .for_each(|timeline| { + // important in this scope: we are holding the Tenant::timelines lock + let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); + let new_parent = detached.timeline_id; + #[cfg(feature = "testing")] + let failpoint_sem = failpoint_sem.clone(); - if !tl.is_active() { - return None; - } + tasks.spawn( + async move { + let res = async { + #[cfg(feature = "testing")] + if let Some(failpoint_sem) = failpoint_sem { + let _permit = failpoint_sem.acquire().await.map_err(|_| { + anyhow::anyhow!( + "failpoint: timeline-detach-ancestor::allow_one_reparented", + ) + })?; + failpoint_sem.close(); + } - let tl_ancestor = tl.ancestor_timeline.as_ref()?; - let is_same = Arc::ptr_eq(ancestor, tl_ancestor); - let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; - - let is_deleting = tl - .delete_progress - .try_lock() - .map(|flow| !flow.is_not_started()) - .unwrap_or(true); - - if is_same && is_earlier && !is_deleting { - Some(tl.clone()) - } else { - None - } - }) - .for_each(|timeline| { - // important in this scope: we are holding the Tenant::timelines lock - let span = tracing::info_span!("reparent", reparented=%timeline.timeline_id); - let new_parent = detached.timeline_id; - - tasks.spawn( - async move { - let res = timeline - .remote_client - .schedule_reparenting_and_wait(&new_parent) + timeline + .remote_client + .schedule_reparenting_and_wait(&new_parent) + .await + } .await; - match res { - Ok(()) => Some(timeline), - Err(e) => { - // with the use of tenant slot, we no longer expect these. - tracing::warn!("reparenting failed: {e:#}"); - None + match res { + Ok(()) => { + tracing::info!("reparented"); + Some(timeline) + } + Err(e) => { + // with the use of tenant slot, raced timeline deletion is the most + // likely reason. + tracing::warn!("reparenting failed: {e:#}"); + None + } } } - } - .instrument(span), - ); - }); + .instrument(span), + ); + }); + } let reparenting_candidates = tasks.len(); let mut reparented = HashSet::with_capacity(tasks.len()); @@ -645,33 +865,103 @@ pub(super) async fn complete( while let Some(res) = tasks.join_next().await { match res { Ok(Some(timeline)) => { - tracing::info!(reparented=%timeline.timeline_id, "reparenting done"); - assert!( reparented.insert(timeline.timeline_id), "duplicate reparenting? timeline_id={}", timeline.timeline_id ); } - Ok(None) => { - // lets just ignore this for now. one or all reparented timelines could had - // started deletion, and that is fine. - } Err(je) if je.is_cancelled() => unreachable!("not used"), - Err(je) if je.is_panic() => { - // ignore; it's better to continue with a single reparenting failing (or even - // all of them) in order to get to the goal state. - // - // these timelines will never be reparentable, but they can be always detached as - // separate tree roots. 
- } + // just ignore failures now, we can retry + Ok(None) => {} + Err(je) if je.is_panic() => {} Err(je) => tracing::error!("unexpected join error: {je:?}"), } } - if reparenting_candidates != reparented.len() { - tracing::info!("failed to reparent some candidates"); + let reparented_all = reparenting_candidates == reparented.len(); + + if reparented_all { + Ok(DetachingAndReparenting::Reparented(reparented)) + } else { + tracing::info!( + reparented = reparented.len(), + candidates = reparenting_candidates, + "failed to reparent all candidates; they can be retried after the tenant_reset", + ); + + let must_reset_tenant = !reparented.is_empty() || was_detached; + Ok(DetachingAndReparenting::SomeReparentingFailed { must_reset_tenant }) + } +} + +pub(super) async fn complete( + detached: &Arc, + tenant: &Tenant, + mut attempt: Attempt, + _ctx: &RequestContext, +) -> Result<(), Error> { + assert_eq!(detached.timeline_id, attempt.timeline_id); + + if attempt.gate_entered.is_none() { + let entered = detached.gate.enter().map_err(|_| Error::ShuttingDown)?; + attempt.gate_entered = Some(entered); + } else { + // Some(gate_entered) means the tenant was not restarted, as is not required } - Ok(reparented) + assert!(detached.ancestor_timeline.is_none()); + + // this should be an 503 at least...? + fail::fail_point!( + "timeline-detach-ancestor::complete_before_uploading", + |_| Err(Error::Failpoint( + "timeline-detach-ancestor::complete_before_uploading" + )) + ); + + tenant + .gc_block + .remove( + detached, + crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor, + ) + .await + // FIXME: better error + .map_err(Error::Unexpected)?; + + Ok(()) +} + +/// Query against a locked `Tenant::timelines`. +fn reparentable_timelines<'a, I>( + timelines: I, + detached: &'a Arc, + ancestor: &'a Arc, + ancestor_lsn: Lsn, +) -> impl Iterator> + 'a +where + I: Iterator> + 'a, +{ + timelines.filter_map(move |tl| { + if Arc::ptr_eq(tl, detached) { + return None; + } + + let tl_ancestor = tl.ancestor_timeline.as_ref()?; + let is_same = Arc::ptr_eq(ancestor, tl_ancestor); + let is_earlier = tl.get_ancestor_lsn() <= ancestor_lsn; + + let is_deleting = tl + .delete_progress + .try_lock() + .map(|flow| !flow.is_not_started()) + .unwrap_or(true); + + if is_same && is_earlier && !is_deleting { + Some(tl) + } else { + None + } + }) } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index fe582cf0e2..ee8e9ac5a1 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2989,6 +2989,7 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); + client .timeline_detach_ancestor(tenant_shard_id, timeline_id) .await @@ -3005,7 +3006,8 @@ impl Service { Error::ApiError(StatusCode::BAD_REQUEST, msg) => { ApiError::BadRequest(anyhow::anyhow!("{node}: {msg}")) } - // rest can be mapped + // rest can be mapped as usual + // FIXME: this converts some 500 to 409 which is not per openapi other => passthrough_api_error(&node, other), } }) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 4e409eeb17..902457c2ac 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -5,7 +5,7 @@ import time from concurrent.futures import ThreadPoolExecutor from queue import Empty, Queue from threading import Barrier -from typing import List, Tuple +from typing import List, 
Set, Tuple import pytest from fixtures.common_types import Lsn, TimelineId @@ -807,22 +807,24 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( What remains not tested by this: - shutdown winning over complete - - Shutdown winning over complete needs gc blocking and reparenting any left-overs on retry. """ if sharded and mode == "delete_tenant": # the shared/exclusive lock for tenant is blocking this: # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip( - "tenant deletion while timeline ancestor detach is underway is not supported yet" - ) + pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count if sharded else None) + env = neon_env_builder.init_start( + initial_tenant_shard_count=shard_count if sharded else None, + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + }, + ) for ps in env.pageservers: ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) @@ -831,7 +833,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( detached_timeline = env.neon_cli.create_branch("detached soon", "main") - failpoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" + pausepoint = "timeline-detach-ancestor::before_starting_after_locking_pausable" env.storage_controller.reconcile_until_idle() shards = env.storage_controller.locate(env.initial_tenant) @@ -843,13 +845,20 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( victim = pageservers[int(shards[-1]["node_id"])] victim_http = victim.http_client() - victim_http.configure_failpoints((failpoint, "pause")) + victim_http.configure_failpoints((pausepoint, "pause")) def detach_ancestor(): target.detach_ancestor(env.initial_tenant, detached_timeline) - def at_failpoint() -> Tuple[str, LogCursor]: - return victim.assert_log_contains(f"at failpoint {failpoint}") + def at_failpoint() -> LogCursor: + msg, offset = victim.assert_log_contains(f"at failpoint {pausepoint}") + log.info(f"found {msg}") + msg, offset = victim.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + log.info(f"found {msg}") + return offset def start_delete(): if mode == "delete_timeline": @@ -882,23 +891,44 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - _, offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(10, 1.0, at_failpoint) delete = pool.submit(start_delete) - wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) delete.result() assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + # TODO: match the error with pytest.raises(PageserverApiException) as exc: fut.result() + log.info(f"TODO: match this error: {exc.value}") assert exc.value.status_code == 503 finally: - victim_http.configure_failpoints((failpoint, "off")) + victim_http.configure_failpoints((pausepoint, "off")) + + if mode != "delete_timeline": + return + + # make sure the gc is unblocked + time.sleep(2) + victim.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + if not sharded: + # we have the other node only while sharded + return + + other = 
pageservers[int(shards[0]["node_id"])] + log.info(f"other is {other.id}") + _, offset = other.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/\\S+/timeline/\\S+/detach_ancestor .*\\}: Request handled, status: 200 OK", + ) + # this might be a lot earlier than the victims line, but that is okay. + _, offset = other.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) @pytest.mark.parametrize("mode", ["delete_reparentable_timeline"]) @@ -915,7 +945,9 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv assert ( mode == "delete_reparentable_timeline" - ), "only one now, but we could have the create just as well, need gc blocking" + ), "only one now, but creating reparentable timelines cannot be supported even with gc blocking" + # perhaps it could be supported by always doing this for the shard0 first, and after that for others. + # when we run shard0 to completion, we can use it's timelines to restrict which can be reparented. shard_count = 2 neon_env_builder.num_pageservers = shard_count @@ -1048,10 +1080,267 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv victim_http.configure_failpoints((pausepoint, "off")) +def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: NeonEnvBuilder): + """ + Using a failpoint, force the completion step of timeline ancestor detach to + fail after reparenting a single timeline. + + Retrying should try reparenting until all reparentings are done, all the + time blocking gc even across restarts (first round). + + A completion failpoint is used to inhibit completion on second to last + round. + + On last round, the completion uses a path where no reparentings can happen + because original ancestor is deleted, and there is a completion to unblock + gc without restart. 
+ """ + + # to get the remote storage metrics + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.MOCK_S3) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + env.pageserver.allowed_errors.extend( + [ + ".* reparenting failed: failpoint: timeline-detach-ancestor::allow_one_reparented", + ".* Error processing HTTP request: InternalServerError\\(failed to reparent all candidate timelines, please retry", + ".* Error processing HTTP request: InternalServerError\\(failpoint: timeline-detach-ancestor::complete_before_uploading", + ] + ) + + http = env.pageserver.http_client() + + def remote_storage_copy_requests(): + return http.get_metric_value( + "remote_storage_s3_request_seconds_count", + {"request_type": "copy_object", "result": "ok"}, + ) + + def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[TimelineId]]: + reparented = 0 + not_reparented = set() + for timeline in timelines: + detail = http.timeline_detail(env.initial_tenant, timeline) + ancestor = TimelineId(detail["ancestor_timeline_id"]) + if ancestor == detached: + reparented += 1 + else: + not_reparented.add(timeline) + return (reparented, not_reparented) + + # main ------A-----B-----C-----D-----E> lsn + timelines = [] + with env.endpoints.create_start("main") as ep: + for counter in range(5): + ep.safe_psql( + f"create table foo_{counter} as select i::bigint from generate_series(1, 10000) t(i)" + ) + branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) + http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) + branch = env.neon_cli.create_branch( + f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn + ) + timelines.append(branch) + + flush_ep_to_pageserver(env, ep, env.initial_tenant, env.initial_timeline) + + # detach "E" which has most reparentable timelines under it + detached = timelines.pop() + assert len(timelines) == 4 + + http = http.without_status_retrying() + + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + not_reparented: Set[TimelineId] = set() + # tracked offset in the pageserver log which is at least at the most recent activation + offset = None + + def try_detach(): + with pytest.raises( + PageserverApiException, + match=".*failed to reparent all candidate timelines, please retry", + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + + # first round -- do more checking to make sure the gc gets paused + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == 1 + + time.sleep(2) + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + offset, + ) + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + metric = remote_storage_copy_requests() + assert metric != 0 + # make sure the gc blocking is persistent over a restart + env.pageserver.restart() + env.pageserver.quiesce_tenants() + time.sleep(2) + _, offset = env.pageserver.assert_log_contains(".*: attach 
finished, activating", offset) + assert env.pageserver.log_contains(".* gc_loop.*: [0-9] timelines need GC", offset) is None + _, offset = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + # restore failpoint for the next reparented + http.configure_failpoints(("timeline-detach-ancestor::allow_one_reparented", "return")) + + reparented_before = reparented + + # do two more rounds + for _ in range(2): + try_detach() + + assert ( + http.timeline_detail(env.initial_tenant, detached)["ancestor_timeline_id"] is None + ), "first round should had detached 'detached'" + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == reparented_before + 1 + reparented_before = reparented + + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + metric = remote_storage_copy_requests() + assert metric == 0, "copies happen in the first round" + + assert offset is not None + assert len(not_reparented) == 1 + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "return")) + + # almost final round, the failpoint is hit no longer as there is only one reparented and one always gets to succeed. + # the tenant is restarted once more, but we fail during completing. + with pytest.raises( + PageserverApiException, match=".* timeline-detach-ancestor::complete_before_uploading" + ) as exc: + http.detach_ancestor(env.initial_tenant, detached) + assert exc.value.status_code == 500 + _, offset = env.pageserver.assert_log_contains(".*: attach finished, activating", offset) + + # delete the previous ancestor to take a different path to completion. all + # other tests take the "detach? reparent complete", but this only hits + # "complete". + http.timeline_delete(env.initial_tenant, env.initial_timeline) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + + http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) + + reparented_resp = http.detach_ancestor(env.initial_tenant, detached) + assert reparented_resp == set(timelines) + # no need to quiesce_tenants anymore, because completion does that + + reparented, not_reparented = reparenting_progress(timelines) + assert reparented == len(timelines) + + time.sleep(2) + assert ( + env.pageserver.log_contains(".*: attach finished, activating", offset) is None + ), "there should be no restart with the final detach_ancestor as it only completed" + + # gc is unblocked + env.pageserver.assert_log_contains(".* gc_loop.*: 5 timelines need GC", offset) + + metric = remote_storage_copy_requests() + assert metric == 0 + + +def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( + neon_env_builder: NeonEnvBuilder, +): + """ + Make sure that a timeline deleted after restart will unpause gc blocking. 
+ """ + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "1s", + "lsn_lease_length": "0s", + } + ) + + env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) + + http = env.pageserver.http_client() + + detached = env.neon_cli.create_branch("detached") + + failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" + + http.configure_failpoints((failpoint, "pause")) + + def detach_and_get_stuck(): + return http.detach_ancestor(env.initial_tenant, detached) + + def request_processing_noted_in_log(): + _, offset = env.pageserver.assert_log_contains( + ".*INFO request\\{method=PUT path=/v1/tenant/[0-9a-f]{32}/timeline/[0-9a-f]{32}/detach_ancestor .*\\}: Handling request", + ) + return offset + + def delete_detached(): + return http.timeline_delete(env.initial_tenant, detached) + + try: + with ThreadPoolExecutor(max_workers=1) as pool: + detach = pool.submit(detach_and_get_stuck) + + offset = wait_until(10, 1.0, request_processing_noted_in_log) + + # make this named fn tor more clear failure test output logging + def pausepoint_hit_with_gc_paused() -> LogCursor: + env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + _, at = env.pageserver.assert_log_contains( + ".* gc_loop.*: Skipping GC: .*", + offset, + ) + return at + + offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + + delete_detached() + + wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + + http.configure_failpoints((failpoint, "off")) + + with pytest.raises(PageserverApiException) as exc: + detach.result() + + # FIXME: this should be 404 but because there is another Anyhow conversion it is 500 + assert exc.value.status_code == 500 + env.pageserver.allowed_errors.append( + ".*Error processing HTTP request: InternalServerError\\(detached timeline was not found after restart" + ) + finally: + http.configure_failpoints((failpoint, "off")) + + # make sure gc has been unblocked + time.sleep(2) + + env.pageserver.assert_log_contains(".* gc_loop.*: 1 timelines need GC", offset) + + # TODO: -# - after starting the operation, pageserver is shutdown, restarted -# - after starting the operation, bottom-most timeline is deleted, pageserver is restarted, gc is inhibited -# - deletion of reparented while reparenting should fail once, then succeed (?) # - branch near existing L1 boundary, image layers? # - investigate: why are layers started at uneven lsn? not just after branching, but in general. #