diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ba5fb521ff..44159aee0a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -72,6 +72,7 @@ use crate::tenant::remote_timeline_client::{
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
+use crate::tenant::timeline::detach_ancestor::DetachBehavior;
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
     CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout,
@@ -2505,6 +2506,8 @@ async fn timeline_detach_ancestor_handler(
     let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    let behavior: Option<DetachBehavior> = parse_query_param(&request, "detach_behavior")?;
+    let behavior = behavior.unwrap_or_default();
 
     let span = tracing::info_span!("detach_ancestor", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id);
 
@@ -2554,7 +2557,7 @@ async fn timeline_detach_ancestor_handler(
             let ctx = &ctx.with_scope_timeline(&timeline);
 
             let progress = timeline
-                .prepare_to_detach_from_ancestor(&tenant, options, ctx)
+                .prepare_to_detach_from_ancestor(&tenant, options, behavior, ctx)
                 .await?;
 
             // uncomment to allow early as possible Tenant::drop
@@ -2569,6 +2572,7 @@ async fn timeline_detach_ancestor_handler(
                     tenant_shard_id,
                     timeline_id,
                     prepared,
+                    behavior,
                     attempt,
                     ctx,
                 )
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 77f9a3579d..dceae89d1c 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -300,9 +300,8 @@ impl TimelineMetadata {
 
     /// Returns true if anything was changed
     pub fn detach_from_ancestor(&mut self, branchpoint: &(TimelineId, Lsn)) {
-        if let Some(ancestor) = self.body.ancestor_timeline {
-            assert_eq!(ancestor, branchpoint.0);
-        }
+        // Detaching no longer always detaches from the direct ancestor, but the branchpoint
+        // LSN must still match, so we check only the LSN and not the timeline ID.
         if self.body.ancestor_lsn != Lsn(0) {
             assert_eq!(self.body.ancestor_lsn, branchpoint.1);
         }
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 003f84e640..092bfdf6c1 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1914,6 +1914,7 @@ impl TenantManager {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         prepared: PreparedTimelineDetach,
+        behavior: detach_ancestor::DetachBehavior,
         mut attempt: detach_ancestor::Attempt,
         ctx: &RequestContext,
     ) -> Result<HashSet<TimelineId>, detach_ancestor::Error> {
@@ -1957,7 +1958,14 @@ impl TenantManager {
             .map_err(Error::NotFound)?;
 
         let resp = timeline
-            .detach_from_ancestor_and_reparent(&tenant, prepared, ctx)
+            .detach_from_ancestor_and_reparent(
+                &tenant,
+                prepared,
+                attempt.ancestor_timeline_id,
+                attempt.ancestor_lsn,
+                behavior,
+                ctx,
+            )
             .await?;
 
         let mut slot_guard = slot_guard;
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e01c3dbd4d..61542409f7 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5388,9 +5388,10 @@ impl Timeline {
         self: &Arc<Self>,
         tenant: &crate::tenant::Tenant,
         options: detach_ancestor::Options,
+        behavior: detach_ancestor::DetachBehavior,
         ctx: &RequestContext,
     ) -> Result<detach_ancestor::Progress, detach_ancestor::Error> {
-        detach_ancestor::prepare(self, tenant, options, ctx).await
+        detach_ancestor::prepare(self, tenant, behavior, options, ctx).await
     }
 
     /// Second step of detach from ancestor; detaches the `self` from it's current ancestor and
@@ -5406,9 +5407,21 @@ impl Timeline {
         self: &Arc<Self>,
         tenant: &crate::tenant::Tenant,
         prepared: detach_ancestor::PreparedTimelineDetach,
+        ancestor_timeline_id: TimelineId,
+        ancestor_lsn: Lsn,
+        behavior: detach_ancestor::DetachBehavior,
         ctx: &RequestContext,
     ) -> Result<detach_ancestor::DetachingAndReparenting, detach_ancestor::Error> {
-        detach_ancestor::detach_and_reparent(self, tenant, prepared, ctx).await
+        detach_ancestor::detach_and_reparent(
+            self,
+            tenant,
+            prepared,
+            ancestor_timeline_id,
+            ancestor_lsn,
+            behavior,
+            ctx,
+        )
+        .await
     }
 
     /// Final step which unblocks the GC.
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index b08003d04a..c3e4bedc50 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -32,6 +32,9 @@ pub(crate) enum Error {
     #[error("too many ancestors")]
     TooManyAncestors,
 
+    #[error("ancestor is not empty")]
+    AncestorNotEmpty,
+
     #[error("shutting down, please retry later")]
     ShuttingDown,
 
@@ -89,7 +92,9 @@ impl From<Error> for ApiError {
     fn from(value: Error) -> Self {
         match value {
             Error::NoAncestor => ApiError::Conflict(value.to_string()),
-            Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")),
+            Error::TooManyAncestors | Error::AncestorNotEmpty => {
+                ApiError::BadRequest(anyhow::anyhow!("{value}"))
+            }
             Error::ShuttingDown => ApiError::ShuttingDown,
             Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")),
             Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
@@ -127,13 +132,37 @@ pub(crate) struct PreparedTimelineDetach {
     layers: Vec<Layer>,
 }
 
-/// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
+// TODO: this should be part of PageserverConf because we cannot easily modify cplane arguments.
 #[derive(Debug)]
 pub(crate) struct Options {
     pub(crate) rewrite_concurrency: std::num::NonZeroUsize,
     pub(crate) copy_concurrency: std::num::NonZeroUsize,
 }
 
+/// Controls the detach ancestor behavior.
+/// - When set to `NoAncestorAndReparent`, we will only detach a branch if its ancestor is a root branch, and we will automatically reparent any children of the ancestor at or before the branch point.
+/// - When set to `MultiLevelAndNoReparent`, we will detach a branch from multiple levels of ancestors, and no reparenting will happen at all.
+#[derive(Debug, Clone, Copy, Default)]
+pub enum DetachBehavior {
+    #[default]
+    NoAncestorAndReparent,
+    MultiLevelAndNoReparent,
+}
+
+impl std::str::FromStr for DetachBehavior {
+    type Err = &'static str;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            "no_ancestor_and_reparent" => Ok(DetachBehavior::NoAncestorAndReparent),
+            "multi_level_and_no_reparent" => Ok(DetachBehavior::MultiLevelAndNoReparent),
+            "v1" => Ok(DetachBehavior::NoAncestorAndReparent),
+            "v2" => Ok(DetachBehavior::MultiLevelAndNoReparent),
+            _ => Err("cannot parse detach behavior"),
+        }
+    }
+}
+
 impl Default for Options {
     fn default() -> Self {
         Self {
@@ -147,7 +176,8 @@
 #[derive(Debug)]
 pub(crate) struct Attempt {
     pub(crate) timeline_id: TimelineId,
-
+    pub(crate) ancestor_timeline_id: TimelineId,
+    pub(crate) ancestor_lsn: Lsn,
     _guard: completion::Completion,
     gate_entered: Option<utils::sync::gate::GateGuard>,
 }
@@ -167,25 +197,30 @@ impl Attempt {
 pub(super) async fn prepare(
     detached: &Arc<Timeline>,
     tenant: &Tenant,
+    behavior: DetachBehavior,
     options: Options,
     ctx: &RequestContext,
 ) -> Result<Progress, Error> {
     use Error::*;
 
-    let Some((ancestor, ancestor_lsn)) = detached
+    let Some((mut ancestor, mut ancestor_lsn)) = detached
         .ancestor_timeline
         .as_ref()
         .map(|tl| (tl.clone(), detached.ancestor_lsn))
     else {
+        let ancestor_id;
+        let ancestor_lsn;
         let still_in_progress = {
             let accessor = detached.remote_client.initialized_upload_queue()?;
 
             // we are safe to inspect the latest uploaded, because we can only witness this after
            // restart is complete and ancestor is no more.
             let latest = accessor.latest_uploaded_index_part();
-            if latest.lineage.detached_previous_ancestor().is_none() {
+            let Some((id, lsn)) = latest.lineage.detached_previous_ancestor() else {
                 return Err(NoAncestor);
-            }
+            };
+            ancestor_id = id;
+            ancestor_lsn = lsn;
 
             latest
                 .gc_blocking
@@ -196,7 +231,8 @@ pub(super) async fn prepare(
         if still_in_progress {
             // gc is still blocked, we can still reparent and complete.
             // we are safe to reparent remaining, because they were locked in in the beginning.
-            let attempt = continue_with_blocked_gc(detached, tenant).await?;
+            let attempt =
+                continue_with_blocked_gc(detached, tenant, ancestor_id, ancestor_lsn).await?;
 
             // because the ancestor of detached is already set to none, we have published all
             // of the layers, so we are still "prepared."
@@ -224,13 +260,34 @@ pub(super) async fn prepare(
 
     check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
 
-    if ancestor.ancestor_timeline.is_some() {
+    if let DetachBehavior::MultiLevelAndNoReparent = behavior {
+        // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline.
+        while let Some(ancestor_of_ancestor) = ancestor.ancestor_timeline.clone() {
+            if ancestor_lsn != ancestor.ancestor_lsn {
+                // non-technical requirement; we could still flatten if the ancestor LSN does not
+                // match, but that needs us to copy and cut more layers.
+                return Err(AncestorNotEmpty);
+            }
+            // Use the ancestor of the ancestor as the new ancestor (only when the ancestor LSNs are the same)
+            ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable
+            ancestor = ancestor_of_ancestor;
+            // TODO: do we still need this check if we don't want to reparent?
+            check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+        }
+    } else if ancestor.ancestor_timeline.is_some() {
         // non-technical requirement; we could flatten N ancestors just as easily but we chose
         // not to, at least initially
         return Err(TooManyAncestors);
     }
 
-    let attempt = start_new_attempt(detached, tenant).await?;
+    tracing::info!(
+        "attempt to detach the timeline from the ancestor: {}@{}, behavior={:?}",
+        ancestor.timeline_id,
+        ancestor_lsn,
+        behavior
+    );
+
+    let attempt = start_new_attempt(detached, tenant, ancestor.timeline_id, ancestor_lsn).await?;
 
     utils::pausable_failpoint!("timeline-detach-ancestor::before_starting_after_locking-pausable");
 
@@ -450,8 +507,13 @@ pub(super) async fn prepare(
     Ok(Progress::Prepared(attempt, prepared))
 }
 
-async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
-    let attempt = obtain_exclusive_attempt(detached, tenant)?;
+async fn start_new_attempt(
+    detached: &Timeline,
+    tenant: &Tenant,
+    ancestor_timeline_id: TimelineId,
+    ancestor_lsn: Lsn,
+) -> Result<Attempt, Error> {
+    let attempt = obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)?;
 
     // insert the block in the index_part.json, if not already there.
     let _dont_care = tenant
@@ -466,13 +528,23 @@ async fn start_new_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
     Ok(attempt)
 }
 
-async fn continue_with_blocked_gc(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+async fn continue_with_blocked_gc(
+    detached: &Timeline,
+    tenant: &Tenant,
+    ancestor_timeline_id: TimelineId,
+    ancestor_lsn: Lsn,
+) -> Result<Attempt, Error> {
     // FIXME: it would be nice to confirm that there is an in-memory version, since we've just
     // verified there is a persistent one?
-    obtain_exclusive_attempt(detached, tenant)
+    obtain_exclusive_attempt(detached, tenant, ancestor_timeline_id, ancestor_lsn)
 }
 
-fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
+fn obtain_exclusive_attempt(
+    detached: &Timeline,
+    tenant: &Tenant,
+    ancestor_timeline_id: TimelineId,
+    ancestor_lsn: Lsn,
+) -> Result<Attempt, Error> {
     use Error::{OtherTimelineDetachOngoing, ShuttingDown};
 
     // ensure we are the only active attempt for this tenant
@@ -493,6 +565,8 @@ fn obtain_exclusive_attempt(detached: &Timeline, tenant: &Tenant) -> Result<Attempt, Error> {
 
     Ok(Attempt {
         timeline_id: detached.timeline_id,
+        ancestor_timeline_id,
+        ancestor_lsn,
         _guard: guard,
         gate_entered: Some(gate_entered),
     })
@@ -797,6 +871,9 @@ pub(super) async fn detach_and_reparent(
     detached: &Arc<Timeline>,
     tenant: &Tenant,
     prepared: PreparedTimelineDetach,
+    ancestor_timeline_id: TimelineId,
+    ancestor_lsn: Lsn,
+    behavior: DetachBehavior,
     _ctx: &RequestContext,
 ) -> Result<DetachingAndReparenting, Error> {
     let PreparedTimelineDetach { layers } = prepared;
@@ -822,7 +899,30 @@ pub(super) async fn detach_and_reparent(
         "cannot (detach? reparent)? complete if the operation is not still ongoing"
     );
 
-    let ancestor = match (detached.ancestor_timeline.as_ref(), recorded_branchpoint) {
+    let ancestor_to_detach = match detached.ancestor_timeline.as_ref() {
+        Some(mut ancestor) => {
+            while ancestor.timeline_id != ancestor_timeline_id {
+                match ancestor.ancestor_timeline.as_ref() {
+                    Some(found) => {
+                        if ancestor_lsn != ancestor.ancestor_lsn {
+                            return Err(Error::DetachReparent(anyhow::anyhow!(
+                                "cannot find the ancestor timeline to detach from: wrong ancestor lsn"
+                            )));
+                        }
+                        ancestor = found;
+                    }
+                    None => {
+                        return Err(Error::DetachReparent(anyhow::anyhow!(
+                            "cannot find the ancestor timeline to detach from"
+                        )));
+                    }
+                }
+            }
+            Some(ancestor)
+        }
+        None => None,
+    };
+    let ancestor = match (ancestor_to_detach, recorded_branchpoint) {
         (Some(ancestor), None) => {
             assert!(
                 !layers.is_empty(),
@@ -895,6 +995,11 @@ pub(super) async fn detach_and_reparent(
         Ancestor::Detached(ancestor, ancestor_lsn) => (ancestor, ancestor_lsn, false),
     };
 
+    if let DetachBehavior::MultiLevelAndNoReparent = behavior {
+        // Do not reparent if the user requests this behavior.
+        return Ok(DetachingAndReparenting::Reparented(HashSet::new()));
+    }
+
     let mut tasks = tokio::task::JoinSet::new();
 
     // Returns a single permit semaphore which will be used to make one reparenting succeed,
@@ -1032,6 +1137,11 @@ pub(super) async fn complete(
 }
 
 /// Query against a locked `Tenant::timelines`.
+///
+/// A timeline is reparentable if:
+///
+/// - It is not the timeline being detached.
+/// - It has the same ancestor as the timeline being detached. Note that the ancestor might not be the direct ancestor.
 fn reparentable_timelines<'a, I>(
     timelines: I,
     detached: &'a Arc<Timeline>,
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 0efe0b9575..61aab2213d 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1070,11 +1070,14 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         tenant_id: TenantId | TenantShardId,
         timeline_id: TimelineId,
         batch_size: int | None = None,
+        behavior_v2: bool = False,
         **kwargs,
     ) -> set[TimelineId]:
-        params = {}
+        params: dict[str, Any] = {}
         if batch_size is not None:
             params["batch_size"] = batch_size
+        if behavior_v2:
+            params["detach_behavior"] = "v2"
         res = self.put(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/detach_ancestor",
             params=params,
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index 612a767480..79537ba83a 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -319,8 +319,9 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
             # this does not contain Z in the end, so fromisoformat accepts it
             # it is to be in line with the deletion timestamp.. well, almost.
             when = original_ancestor[2][:26]
-            when_ts = datetime.datetime.fromisoformat(when)
-            assert when_ts < datetime.datetime.now()
+            when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC)
+            now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC)
+            assert when_ts < now
             assert len(lineage.get("reparenting_history", [])) == 0
         elif expected_ancestor == timeline_id:
             assert len(lineage.get("original_ancestor", [])) == 0
@@ -342,6 +343,138 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder):
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
 
 
+def test_ancestor_detach_behavior_v2(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the v2 behavior of ancestor detach.
+
+    old main -------|---------X--------->
+                    |         |        |
+                    |         |        +-> after
+                    |         +--X empty snapshot branch
+                    |         |
+                    |         +-> branch-to-detach
+                    |
+                    +-> earlier
+
+    Ends up as:
+
+    old main -------|---------X--------->
+                    |         |        |
+                    |         |        +-> after
+                    |         +--X empty snapshot branch
+                    |
+                    +-> earlier
+
+
+    new main -------|---------|----> branch-to-detach
+    """
+
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS)
+
+    client = env.pageserver.http_client()
+
+    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep:
+        ep.safe_psql("CREATE TABLE foo (i BIGINT);")
+        ep.safe_psql("CREATE TABLE audit AS SELECT 1 as starts;")
+
+        branchpoint_pipe = wait_for_last_flush_lsn(
+            env, ep, env.initial_tenant, env.initial_timeline
+        )
+
+        ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(0, 8191) g(i);")
+
+        branchpoint_x = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+        client.timeline_checkpoint(env.initial_tenant, env.initial_timeline)
+
+        ep.safe_psql("INSERT INTO foo SELECT i::bigint FROM generate_series(8192, 16383) g(i);")
+        wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline)
+
+    earlier = env.create_branch(
+        "earlier", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe
+    )
+
+    snapshot_branchpoint = env.create_branch(
+        "snapshot_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x
+    )
+
+    branch_to_detach = env.create_branch(
+        "branch_to_detach",
+        ancestor_branch_name="snapshot_branchpoint",
+        ancestor_start_lsn=branchpoint_x,
+    )
+
+    after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None)
+
+    all_reparented = client.detach_ancestor(env.initial_tenant, branch_to_detach, behavior_v2=True)
+    assert set(all_reparented) == set()
+
+    env.pageserver.quiesce_tenants()
+
+    # checking the ancestor afterwards is much faster than waiting for the endpoint not to start
+    expected_result = [
+        ("main", env.initial_timeline, None, 16384, 1),
+        ("after", after, env.initial_timeline, 16384, 1),
+        ("snapshot_branchpoint", snapshot_branchpoint, env.initial_timeline, 8192, 1),
+        ("branch_to_detach", branch_to_detach, None, 8192, 1),
+        ("earlier", earlier, env.initial_timeline, 0, 1),
+    ]
+
+    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
+
+    for branch_name, queried_timeline, expected_ancestor, _, _ in expected_result:
+        details = client.timeline_detail(env.initial_tenant, queried_timeline)
+        ancestor_timeline_id = details["ancestor_timeline_id"]
+        if expected_ancestor is None:
+            assert ancestor_timeline_id is None
+        else:
+            assert (
+                TimelineId(ancestor_timeline_id) == expected_ancestor
+            ), f"when checking branch {branch_name}, mapping={expected_result}"
+
+        index_part = env.pageserver_remote_storage.index_content(
+            env.initial_tenant, queried_timeline
+        )
+        lineage = index_part["lineage"]
+        assert lineage is not None
+
+        assert lineage.get("reparenting_history_overflown", "false") == "false"
+
+        if queried_timeline == branch_to_detach:
+            original_ancestor = lineage["original_ancestor"]
+            assert original_ancestor is not None
+            assert original_ancestor[0] == str(env.initial_timeline)
+            assert original_ancestor[1] == str(branchpoint_x)
+
+            # this does not contain Z in the end, so fromisoformat accepts it
+            # it is to be in line with the deletion timestamp.. well, almost.
+            when = original_ancestor[2][:26]
+            when_ts = datetime.datetime.fromisoformat(when).replace(tzinfo=datetime.UTC)
+            now = datetime.datetime.utcnow().replace(tzinfo=datetime.UTC)
+            assert when_ts < now
+            assert len(lineage.get("reparenting_history", [])) == 0
+        elif expected_ancestor == branch_to_detach:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert lineage["reparenting_history"] == [str(env.initial_timeline)]
+        else:
+            assert len(lineage.get("original_ancestor", [])) == 0
+            assert len(lineage.get("reparenting_history", [])) == 0
+
+    for name, _, _, rows, starts in expected_result:
+        with env.endpoints.create_start(name, tenant_id=env.initial_tenant) as ep:
+            assert ep.safe_psql("SELECT count(*) FROM foo;")[0][0] == rows
+            assert ep.safe_psql(f"SELECT count(*) FROM audit WHERE starts = {starts}")[0][0] == 1
+
+    # delete the new timeline to confirm it doesn't carry over anything from the old timeline
+    client.timeline_delete(env.initial_tenant, branch_to_detach)
+    wait_timeline_detail_404(client, env.initial_tenant, branch_to_detach)
+
+    # delete the after timeline
+    client.timeline_delete(env.initial_tenant, after)
+    wait_timeline_detail_404(client, env.initial_tenant, after)
+
+
 def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder):
     """
     Makes sure that the timeline is able to receive writes through-out the detach process.
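
Usage sketch: a minimal example of driving the new query parameter through the test fixture changed above, assuming a NeonEnv `env` and a `branch_to_detach` branch set up as in test_ancestor_detach_behavior_v2. `behavior_v2=True` is translated by the client into `detach_behavior=v2`, which DetachBehavior::from_str maps to MultiLevelAndNoReparent; since v2 never reparents, the returned set of reparented timeline IDs is expected to be empty.

    # Sketch, assuming `env` and `branch_to_detach` are created as in the test above.
    client = env.pageserver.http_client()

    # behavior_v2=True sends detach_behavior=v2 (MultiLevelAndNoReparent): the branch is
    # detached through any empty intermediate ancestors and nothing is reparented, so the
    # returned set of reparented timeline IDs comes back empty.
    reparented = client.detach_ancestor(env.initial_tenant, branch_to_detach, behavior_v2=True)
    assert reparented == set()

    # Omitting the flag keeps the default v1 (NoAncestorAndReparent) semantics: the ancestor
    # must be a root branch, and eligible siblings are reparented, so the set may be non-empty.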