From b312a3c320695a4b528968250225dfbd40af0e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 20 Jan 2025 13:50:44 +0100 Subject: [PATCH] Move DeleteTimelineFlow::prepare to separate function and use enum (#10334) It was requested by review in #10305 to use an enum or something like it for distinguishing the different modes instead of two parameters, because two flags allow four combinations, and two of them don't really make sense/ aren't used. follow-up of #10305 --- pageserver/src/tenant/timeline/delete.rs | 149 +++++++++++----------- pageserver/src/tenant/timeline/offload.rs | 12 +- 2 files changed, 81 insertions(+), 80 deletions(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index bdc315d985..3c828c8a9e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, timeline: &TimelineOrOffloaded, @@ -193,10 +193,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let allow_offloaded_children = false; - let set_stopping = true; let (timeline, mut guard) = - Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?; + make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?; guard.mark_in_progress()?; @@ -333,75 +331,6 @@ impl DeleteTimelineFlow { Ok(()) } - pub(super) fn prepare( - tenant: &Tenant, - timeline_id: TimelineId, - allow_offloaded_children: bool, - set_stopping: bool, - ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { - // Note the interaction between this guard and deletion guard. - // Here we attempt to lock deletion guard when we're holding a lock on timelines. - // This is important because when you take into account `remove_timeline_from_tenant` - // we remove timeline from memory when we still hold the deletion guard. - // So here when timeline deletion is finished timeline wont be present in timelines map at all - // which makes the following sequence impossible: - // T1: get preempted right before the try_lock on `Timeline::delete_progress` - // T2: do a full deletion, acquire and drop `Timeline::delete_progress` - // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` - // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - - let timeline = match timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => match timelines_offloaded.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - }, - }; - - // Ensure that there are no child timelines, because we are about to remove files, - // which will break child branches - let mut children = Vec::new(); - if !allow_offloaded_children { - children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { - (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) - })); - } - children.extend(timelines.iter().filter_map(|(id, entry)| { - (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) - })); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - // Note that using try_lock here is important to avoid a deadlock. - // Here we take lock on timelines and then the deletion guard. - // At the end of the operation we're holding the guard and need to lock timelines map - // to remove the timeline from it. - // Always if you have two locks that are taken in different order this can result in a deadlock. - - let delete_progress = Arc::clone(timeline.delete_progress()); - let delete_lock_guard = match delete_progress.try_lock_owned() { - Ok(guard) => DeletionGuard(guard), - Err(_) => { - // Unfortunately if lock fails arc is consumed. - return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - timeline.delete_progress(), - ))); - } - }; - - if set_stopping { - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); - } - } - - Ok((timeline, delete_lock_guard)) - } - fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, @@ -483,6 +412,80 @@ impl DeleteTimelineFlow { } } +#[derive(Copy, Clone, PartialEq, Eq)] +pub(super) enum TimelineDeleteGuardKind { + Offload, + Delete, +} + +pub(super) fn make_timeline_delete_guard( + tenant: &Tenant, + timeline_id: TimelineId, + guard_kind: TimelineDeleteGuardKind, +) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. + // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, + }; + + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if guard_kind == TimelineDeleteGuardKind::Delete { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. + // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + + let delete_progress = Arc::clone(timeline.delete_progress()); + let delete_lock_guard = match delete_progress.try_lock_owned() { + Ok(guard) => DeletionGuard(guard), + Err(_) => { + // Unfortunately if lock fails arc is consumed. + return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( + timeline.delete_progress(), + ))); + } + }; + + if guard_kind == TimelineDeleteGuardKind::Delete { + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } + } + + Ok((timeline, delete_lock_guard)) +} + pub(super) struct DeletionGuard(OwnedMutexGuard); impl Deref for DeletionGuard { diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 6c6b19e8b1..3b5bf8290c 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -2,10 +2,11 @@ use std::sync::Arc; use pageserver_api::models::{TenantState, TimelineState}; -use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; +use super::delete::{delete_local_timeline_directory, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::remote_timeline_client::ShutdownIfArchivedError; +use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind}; use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; #[derive(thiserror::Error, Debug)] @@ -36,13 +37,10 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let allow_offloaded_children = true; - let set_stopping = false; - let (timeline, guard) = DeleteTimelineFlow::prepare( + let (timeline, guard) = make_timeline_delete_guard( tenant, timeline.timeline_id, - allow_offloaded_children, - set_stopping, + TimelineDeleteGuardKind::Offload, ) .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; @@ -106,7 +104,7 @@ pub(crate) async fn offload_timeline( } /// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] /// /// Returns the strong count of the timeline `Arc` fn remove_timeline_from_tenant(