diff --git a/libs/pageserver_api/src/models/detach_ancestor.rs b/libs/pageserver_api/src/models/detach_ancestor.rs
index ae5a21bab9..ad74d343ae 100644
--- a/libs/pageserver_api/src/models/detach_ancestor.rs
+++ b/libs/pageserver_api/src/models/detach_ancestor.rs
@@ -1,6 +1,8 @@
+use std::collections::HashSet;
+
 use utils::id::TimelineId;
 
 #[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct AncestorDetached {
-    pub reparented_timelines: Vec<TimelineId>,
+    pub reparented_timelines: HashSet<TimelineId>,
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ab2a3381e9..adc378cde7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1832,7 +1832,6 @@ async fn timeline_detach_ancestor_handler(
             detach_ancestor::Progress::Done(resp) => resp,
         };
 
-        // FIXME: if the ordering is really needed and not a hashset, move it here?
         json_response(StatusCode::OK, resp)
     }
     .instrument(span)
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index a1990234d4..9f81db9ed1 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -13,7 +13,7 @@ use pageserver_api::upcall_api::ReAttachResponseTenant;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::cmp::Ordering;
-use std::collections::{BTreeMap, HashMap};
+use std::collections::{BTreeMap, HashMap, HashSet};
 use std::ops::Deref;
 use std::sync::Arc;
 use std::time::Duration;
@@ -1975,7 +1975,7 @@ impl TenantManager {
         prepared: PreparedTimelineDetach,
         mut attempt: detach_ancestor::Attempt,
         ctx: &RequestContext,
-    ) -> Result<Vec<TimelineId>, anyhow::Error> {
+    ) -> Result<HashSet<TimelineId>, anyhow::Error> {
         // FIXME: this is unnecessary, slotguard already has these semantics
         struct RevertOnDropSlot(Option<TenantSlot>);
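A note on wire compatibility, since `AncestorDetached` is a serde type: serde serializes a `HashSet` as a JSON array, exactly like a `Vec`, so the response format does not change; only the element order becomes unspecified, and duplicates collapse on deserialize. A minimal standalone sketch of that behaviour, with `u64` standing in for `TimelineId` and `serde_json` assumed as the harness:

```rust
use std::collections::HashSet;

#[derive(Debug, Default, PartialEq, serde::Serialize, serde::Deserialize)]
struct AncestorDetached {
    reparented_timelines: HashSet<u64>, // u64 stands in for TimelineId here
}

fn main() {
    // A HashSet serializes as a JSON array, just like Vec: the wire format
    // is unchanged, only the element order is unspecified.
    let resp = AncestorDetached {
        reparented_timelines: HashSet::from([1, 2, 3]),
    };
    let json = serde_json::to_string(&resp).unwrap();

    // Deserializing the same ids in any order (even with duplicates)
    // yields an equal value, which is what makes comparisons order-free.
    let a: AncestorDetached = serde_json::from_str(&json).unwrap();
    let b: AncestorDetached =
        serde_json::from_str(r#"{"reparented_timelines":[3,2,1,1]}"#).unwrap();
    assert_eq!(a, b);
}
```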
diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs
index d0742399a4..6e66e36d82 100644
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -1,4 +1,4 @@
-use std::sync::Arc;
+use std::{collections::HashSet, sync::Arc};
 
 use super::{layer_manager::LayerManager, FlushLayerError, Timeline};
 use crate::{
@@ -647,16 +647,7 @@ pub(super) async fn prepare(
         ));
     }
 
-    // `detached` has previously been detached; let's inspect each of the current timelines and
-    // report back the timelines which have been reparented by our detach, or which are still
-    // reparentable
-    let reparented_timelines = reparented_direct_children(detached, tenant)?;
-
-    // IDEA? add the non-reparented in to the response -- these would be the reparentable, but
-    // no longer reparentable because they appeared *after* gc blocking was released.
-    //
-    // will not be needed once we have the locking in.
     return Ok(Progress::Done(AncestorDetached {
         reparented_timelines,
     }));
@@ -862,7 +853,7 @@ pub(super) async fn prepare(
 fn reparented_direct_children(
     detached: &Arc<Timeline>,
     tenant: &Tenant,
-) -> Result<Vec<TimelineId>, Error> {
+) -> Result<HashSet<TimelineId>, Error> {
     let mut all_direct_children = tenant
         .timelines
         .lock()
@@ -872,7 +863,7 @@ fn reparented_direct_children(
             let is_direct_child = matches!(tl.ancestor_timeline.as_ref(), Some(ancestor) if Arc::ptr_eq(ancestor, detached));
 
             if is_direct_child {
-                Some((tl.ancestor_lsn, tl.clone()))
+                Some(tl.clone())
             } else {
                 if let Some(timeline) = tl.ancestor_timeline.as_ref() {
                     assert_ne!(timeline.timeline_id, detached.timeline_id);
@@ -886,34 +877,27 @@ fn reparented_direct_children(
 
     let mut any_shutdown = false;
 
-    all_direct_children.retain(
-        |(_, tl)| match tl.remote_client.initialized_upload_queue() {
-            Ok(accessor) => accessor
-                .latest_uploaded_index_part()
-                .lineage
-                .is_reparented(),
-            Err(_shutdownalike) => {
-                // not 100% a shutdown, but let's bail early not to give inconsistent results in
-                // sharded environment.
-                any_shutdown = true;
-                true
-            }
-        },
-    );
+    all_direct_children.retain(|tl| match tl.remote_client.initialized_upload_queue() {
+        Ok(accessor) => accessor
+            .latest_uploaded_index_part()
+            .lineage
+            .is_reparented(),
+        Err(_shutdownalike) => {
+            // not 100% a shutdown, but let's bail early not to give inconsistent results in
+            // sharded environment.
+            any_shutdown = true;
+            true
+        }
+    });
 
     if any_shutdown {
         // it could be one or many being deleted; have client retry
         return Err(Error::ShuttingDown);
     }
 
-    let mut reparented = all_direct_children;
-    // why this instead of hashset? there is a reason, but I've forgotten it many times.
-    //
-    // maybe if this was a hashset we would not be able to distinguish some race condition.
-    reparented.sort_unstable_by_key(|(lsn, tl)| (*lsn, tl.timeline_id));
-
-    Ok(reparented
+    Ok(all_direct_children
         .into_iter()
-        .map(|(_, tl)| tl.timeline_id)
+        .map(|tl| tl.timeline_id)
         .collect())
 }
@@ -1070,7 +1054,7 @@ async fn remote_copy(
 pub(crate) enum DetachingAndReparenting {
     /// All of the following timeline ids were reparented and the timeline ancestor detach must be
    /// marked as completed.
-    Reparented(Vec<TimelineId>),
+    Reparented(HashSet<TimelineId>),
 
     /// Some of the reparentings failed. The timeline ancestor detach must **not** be marked as
     /// completed.
@@ -1080,7 +1064,7 @@
     /// Detaching and reparentings were completed in a previous attempt. Timeline ancestor detach
     /// must be marked as completed.
-    AlreadyDone(Vec<TimelineId>),
+    AlreadyDone(HashSet<TimelineId>),
 }
 
 impl DetachingAndReparenting {
@@ -1093,7 +1077,7 @@ impl DetachingAndReparenting {
         }
     }
 
-    pub(crate) fn completed(self) -> Option<Vec<TimelineId>> {
+    pub(crate) fn completed(self) -> Option<HashSet<TimelineId>> {
         use DetachingAndReparenting::*;
         match self {
             Reparented(x) | AlreadyDone(x) => Some(x),
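With the return type now a `HashSet<TimelineId>`, `reparented_direct_children` no longer has to drag `(Lsn, Arc<Timeline>)` pairs through `retain` only to sort them at the end; the `any_shutdown` side-flag is the one remaining subtlety. A self-contained sketch of that retain-with-side-flag shape, using a hypothetical `Timeline` stand-in (`Option<bool>` models an initialized vs. shut-down upload queue), not the pageserver's actual types:

```rust
use std::collections::HashSet;

// Hypothetical stand-in: `reparented` is Some(flag) when the upload queue
// answered, None when it looked shut down.
struct Timeline {
    id: u64,
    reparented: Option<bool>,
}

fn reparented_ids(mut children: Vec<Timeline>) -> Result<HashSet<u64>, &'static str> {
    let mut any_shutdown = false;
    children.retain(|tl| match tl.reparented {
        Some(is_reparented) => is_reparented,
        None => {
            // keep it so the result is not silently incomplete, but flag
            // the whole call so the client retries
            any_shutdown = true;
            true
        }
    });
    if any_shutdown {
        return Err("shutting down");
    }
    // no sort needed: collecting straight into the set is enough
    Ok(children.into_iter().map(|tl| tl.id).collect())
}

fn main() {
    let set = reparented_ids(vec![
        Timeline { id: 1, reparented: Some(true) },
        Timeline { id: 2, reparented: Some(false) },
    ])
    .unwrap();
    assert_eq!(set, HashSet::from([1]));
}
```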
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 654969c978..8f74d77308 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -2868,7 +2868,6 @@ impl Service {
         }
 
         // no shard needs to go first/last; the operation should be idempotent
-        // TODO: it would be great to ensure that all shards return the same error
         let mut results = self
             .tenant_for_shards(targets, |tenant_shard_id, node| {
                 futures::FutureExt::boxed(detach_one(
@@ -2887,6 +2886,7 @@ impl Service {
             .filter(|(_, res)| res != &any.1)
             .collect::<Vec<_>>();
         if !mismatching.is_empty() {
+            // this can be hit by races, which should not happen because of the operation lock on cplane
             let matching = results.len() - mismatching.len();
             tracing::error!(
                 matching,
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index c6df6b5baf..27fc742bb2 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -837,7 +837,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         timeline_id: TimelineId,
         batch_size: int | None = None,
         **kwargs,
-    ) -> List[TimelineId]:
+    ) -> Set[TimelineId]:
         params = {}
         if batch_size is not None:
             params["batch_size"] = batch_size
@@ -848,7 +848,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
         json = res.json()
-        return list(map(TimelineId, json["reparented_timelines"]))
+        return set(map(TimelineId, json["reparented_timelines"]))
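Set semantics are also what the mismatch check above depends on: two shards reporting the same reparented timelines in different orders must compare equal, otherwise the `mismatching` branch fires spuriously. A toy illustration of the difference:

```rust
use std::collections::HashSet;

fn main() {
    // Two shards reporting the same reparented timelines in different orders.
    let shard_a = vec![1u64, 2, 3];
    let shard_b = vec![3u64, 2, 1];

    // As Vecs the comparison is order-sensitive and would look mismatching...
    assert_ne!(shard_a, shard_b);

    // ...as HashSets it is order-free, so equal responses compare equal.
    let a: HashSet<u64> = shard_a.into_iter().collect();
    let b: HashSet<u64> = shard_b.into_iter().collect();
    assert_eq!(a, b);
}
```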
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index b0dfec2e5c..5f919f1005 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -165,7 +165,7 @@ def test_ancestor_detach_branched_from(
     )
 
     all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
-    assert all_reparented == []
+    assert all_reparented == set()
 
     if restart_after:
         env.pageserver.stop()
@@ -534,7 +534,7 @@ def test_compaction_induced_by_detaches_in_history(
 
     for _, timeline_id in skip_main:
         reparented = client.detach_ancestor(env.initial_tenant, timeline_id)
-        assert reparented == [], "we have no earlier branches at any level"
+        assert reparented == set(), "we have no earlier branches at any level"
 
     post_detach_l0s = list(filter(lambda x: x.l0, delta_layers(branch_timeline_id)))
     assert len(post_detach_l0s) == 5, "should have inherited 4 L0s, have 5 in total"
@@ -774,7 +774,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
         else:
             break
 
-    assert reparented == [], "too many retries (None) or unexpected reparentings"
+    assert reparented == set(), "too many retries (None) or unexpected reparentings"
 
     for shard_info in shards:
         node_id = int(shard_info["node_id"])
@@ -1203,7 +1203,7 @@ def test_retried_detach_ancestor_after_failed_reparenting(neon_env_builder: Neon
     http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off"))
 
     reparented_resp = http.detach_ancestor(env.initial_tenant, detached)
-    assert reparented_resp == timelines
+    assert reparented_resp == set(timelines)
 
     # no need to quiesce_tenants anymore, because completion does that
     reparented, not_reparented = reparenting_progress(timelines)