diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index bdc315d985..3c828c8a9e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, timeline: &TimelineOrOffloaded, @@ -193,10 +193,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let allow_offloaded_children = false; - let set_stopping = true; let (timeline, mut guard) = - Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?; + make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?; guard.mark_in_progress()?; @@ -333,75 +331,6 @@ impl DeleteTimelineFlow { Ok(()) } - pub(super) fn prepare( - tenant: &Tenant, - timeline_id: TimelineId, - allow_offloaded_children: bool, - set_stopping: bool, - ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { - // Note the interaction between this guard and deletion guard. - // Here we attempt to lock deletion guard when we're holding a lock on timelines. - // This is important because when you take into account `remove_timeline_from_tenant` - // we remove timeline from memory when we still hold the deletion guard. - // So here when timeline deletion is finished timeline wont be present in timelines map at all - // which makes the following sequence impossible: - // T1: get preempted right before the try_lock on `Timeline::delete_progress` - // T2: do a full deletion, acquire and drop `Timeline::delete_progress` - // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` - // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - - let timeline = match timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => match timelines_offloaded.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - }, - }; - - // Ensure that there are no child timelines, because we are about to remove files, - // which will break child branches - let mut children = Vec::new(); - if !allow_offloaded_children { - children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { - (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) - })); - } - children.extend(timelines.iter().filter_map(|(id, entry)| { - (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) - })); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - // Note that using try_lock here is important to avoid a deadlock. - // Here we take lock on timelines and then the deletion guard. - // At the end of the operation we're holding the guard and need to lock timelines map - // to remove the timeline from it. - // Always if you have two locks that are taken in different order this can result in a deadlock. - - let delete_progress = Arc::clone(timeline.delete_progress()); - let delete_lock_guard = match delete_progress.try_lock_owned() { - Ok(guard) => DeletionGuard(guard), - Err(_) => { - // Unfortunately if lock fails arc is consumed. - return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - timeline.delete_progress(), - ))); - } - }; - - if set_stopping { - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); - } - } - - Ok((timeline, delete_lock_guard)) - } - fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, @@ -483,6 +412,80 @@ impl DeleteTimelineFlow { } } +#[derive(Copy, Clone, PartialEq, Eq)] +pub(super) enum TimelineDeleteGuardKind { + Offload, + Delete, +} + +pub(super) fn make_timeline_delete_guard( + tenant: &Tenant, + timeline_id: TimelineId, + guard_kind: TimelineDeleteGuardKind, +) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. + // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, + }; + + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if guard_kind == TimelineDeleteGuardKind::Delete { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. + // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + + let delete_progress = Arc::clone(timeline.delete_progress()); + let delete_lock_guard = match delete_progress.try_lock_owned() { + Ok(guard) => DeletionGuard(guard), + Err(_) => { + // Unfortunately if lock fails arc is consumed. + return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( + timeline.delete_progress(), + ))); + } + }; + + if guard_kind == TimelineDeleteGuardKind::Delete { + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } + } + + Ok((timeline, delete_lock_guard)) +} + pub(super) struct DeletionGuard(OwnedMutexGuard); impl Deref for DeletionGuard { diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 6c6b19e8b1..3b5bf8290c 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,10 +2,11 @@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; +use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; +use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; #[derive(thiserror::Error, Debug)] @@ -36,13 +37,10 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let allow_offloaded_children = true; - let set_stopping = false; - let (timeline, guard) = DeleteTimelineFlow::prepare( + let (timeline, guard) = make_timeline_delete_guard( tenant, timeline.timeline_id, - allow_offloaded_children, - set_stopping, + TimelineDeleteGuardKind::Offload, ) .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; @@ -106,7 +104,7 @@ pub(crate) async fn offload_timeline( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] /// /// Returns the strong count of the timeline `Arc` fn remove_timeline_from_tenant(