diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 0ef0464267..090912d276 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -24,12 +24,29 @@ pub async fn is_directory_empty(path: impl AsRef) -> anyhow::Result Ok(dir.next_entry().await?.is_none()) } +pub fn ignore_not_found(e: io::Error) -> io::Result<()> { + if e.kind() == io::ErrorKind::NotFound { + Ok(()) + } else { + Err(e) + } +} + +pub fn ignore_absent_files(fs_operation: F) -> io::Result<()> +where + F: Fn() -> io::Result<()>, +{ + fs_operation().or_else(ignore_not_found) +} + #[cfg(test)] mod test { use std::path::PathBuf; use crate::fs_ext::is_directory_empty; + use super::ignore_absent_files; + #[test] fn is_empty_dir() { use super::PathExt; @@ -75,4 +92,21 @@ mod test { std::fs::remove_file(&file_path).unwrap(); assert!(is_directory_empty(file_path).await.is_err()); } + + #[test] + fn ignore_absent_files_works() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + let file_path: PathBuf = dir_path.join("testfile"); + + ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); + + let f = std::fs::File::create(&file_path).unwrap(); + drop(f); + + ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); + + assert!(!file_path.exists()); + } } diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 20b601f68d..2ce92ee914 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -1,5 +1,7 @@ +use std::ffi::OsStr; use std::{fmt, str::FromStr}; +use anyhow::Context; use hex::FromHex; use rand::Rng; use serde::{Deserialize, Serialize}; @@ -213,6 +215,18 @@ pub struct TimelineId(Id); id_newtype!(TimelineId); +impl TryFrom> for TimelineId { + type Error = anyhow::Error; + + fn try_from(value: Option<&OsStr>) -> Result { + value + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| format!("Could not parse timeline id from {:?}", value)) + } +} + /// Neon Tenant Id represents identifiar of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. /// diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4c6df469aa..be806c77ec 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -33,7 +33,8 @@ use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, + TIMELINE_UNINIT_MARK_SUFFIX, }; pub mod defaults { @@ -601,6 +602,17 @@ impl PageServerConf { ) } + pub fn timeline_delete_mark_file_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> PathBuf { + path_with_suffix_extension( + self.timeline_path(&tenant_id, &timeline_id), + TIMELINE_DELETE_MARK_SUFFIX, + ) + } + pub fn traces_path(&self) -> PathBuf { self.workdir.join("traces") } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5831091098..f43651e931 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -109,6 +109,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp"; /// Full path: `tenants//timelines/___uninit`. 
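As a quick illustration of the `TryFrom<Option<&OsStr>>` impl above (a hypothetical test, not part of the patch; it assumes mark files are named `<timeline_id>.<suffix>`, matching the extension-style naming of `path_with_suffix_extension`):

```rust
#[cfg(test)]
mod timeline_id_from_path {
    use std::path::Path;

    use utils::id::TimelineId;

    #[test]
    fn parses_the_file_stem_of_a_mark_file() {
        // `Path::file_stem()` already returns `Option<&OsStr>`, so callers can feed it
        // straight into the new `TryFrom` impl instead of the previous
        // `.and_then(OsStr::to_str).unwrap_or_default().parse::<TimelineId>()` chain.
        let mark = Path::new("tenants/t/timelines/52d6aa8331a90a62e8a8ab8d58787478.___delete");
        let id = TimelineId::try_from(mark.file_stem()).expect("valid hex timeline id");
        assert_eq!(id.to_string(), "52d6aa8331a90a62e8a8ab8d58787478");

        // A non-hex stem surfaces as an error with the offending value attached.
        assert!(TimelineId::try_from(Path::new("not-a-timeline-id").file_stem()).is_err());
    }
}
```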
pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; + /// A marker file to prevent pageserver from loading a certain tenant on restart. /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding /// `ignore` management API command, that expects the ignored tenant to be properly loaded @@ -123,15 +125,30 @@ pub fn is_temporary(path: &Path) -> bool { } } -pub fn is_uninit_mark(path: &Path) -> bool { +fn ends_with_suffix(path: &Path, suffix: &str) -> bool { match path.file_name() { - Some(name) => name - .to_string_lossy() - .ends_with(TIMELINE_UNINIT_MARK_SUFFIX), + Some(name) => name.to_string_lossy().ends_with(suffix), None => false, } } +pub fn is_uninit_mark(path: &Path) -> bool { + ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) +} + +pub fn is_delete_mark(path: &Path) -> bool { + ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) +} + +fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { + if let Some(e) = e.io_error() { + if e.kind() == std::io::ErrorKind::NotFound { + return true; + } + } + false +} + /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. /// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0b1183fc50..67447bc45c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -18,22 +18,20 @@ use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use storage_broker::BrokerClientChannel; use tokio::sync::watch; -use tokio::sync::OwnedMutexGuard; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext; use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; -use std::ffi::OsStr; use std::fs; use std::fs::File; use std::fs::OpenOptions; -use std::io; use std::io::Write; use std::ops::Bound::Included; use std::path::Path; @@ -48,6 +46,7 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::config::TenantConf; +use self::delete::DeleteTimelineFlow; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; @@ -65,7 +64,6 @@ use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; @@ -118,6 +116,7 @@ mod remote_timeline_client; pub mod storage_layer; pub mod config; +pub mod delete; pub mod mgr; pub mod tasks; pub mod upload_queue; @@ -266,6 +265,14 @@ pub enum GetTimelineError { }, } +#[derive(Debug, thiserror::Error)] +pub enum LoadLocalTimelineError { + #[error("FailedToLoad")] + Load(#[source] anyhow::Error), + #[error("FailedToResumeDeletion")] + ResumeDeletion(#[source] anyhow::Error), +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] @@ -319,14 +326,6 @@ impl std::fmt::Display for WaitToBecomeActiveError { } } -struct DeletionGuard(OwnedMutexGuard); - -impl DeletionGuard { - fn is_deleted(&self) -> bool { - *self.0 - } -} - #[derive(thiserror::Error, Debug)] pub 
enum CreateTimelineError { #[error("a timeline with the given ID already exists")] @@ -337,6 +336,16 @@ pub enum CreateTimelineError { Other(#[from] anyhow::Error), } +struct TenantDirectoryScan { + sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, + timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)>, +} + +enum CreateTimelineCause { + Load, + Delete, +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -375,6 +384,7 @@ impl Tenant { ancestor.clone(), remote_client, init_order, + CreateTimelineCause::Load, )?; let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -803,11 +813,13 @@ impl Tenant { tenant } - pub fn scan_and_sort_timelines_dir( - self: Arc, - ) -> anyhow::Result> { - let timelines_dir = self.conf.timelines_path(&self.tenant_id); + fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { let mut timelines_to_load: HashMap = HashMap::new(); + // Note timelines_to_resume_deletion needs to be separate because it can be not sortable + // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion + // completed in non topological order (for example because parent has smaller number of layer files in it) + let mut timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)> = vec![]; + let timelines_dir = self.conf.timelines_path(&self.tenant_id); for entry in std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? @@ -835,16 +847,13 @@ impl Tenant { ); continue; } + let timeline_uninit_mark_file = &timeline_dir; info!( "Found an uninit mark file {}, removing the timeline and its uninit mark", timeline_uninit_mark_file.display() ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() + let timeline_id = TimelineId::try_from(timeline_uninit_mark_file.file_stem()) .with_context(|| { format!( "Could not parse timeline id out of the timeline uninit mark name {}", @@ -857,6 +866,44 @@ impl Tenant { { error!("Failed to clean up uninit marked timeline: {e:?}"); } + } else if crate::is_delete_mark(&timeline_dir) { + // If metadata exists, load as usual, continue deletion + // If metadata doesnt exist remove timeline dir and delete mark + let timeline_id = + TimelineId::try_from(timeline_dir.file_stem()).with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_dir.display() + ) + })?; + + let metadata_path = self.conf.metadata_path(&self.tenant_id, &timeline_id); + if metadata_path.exists() { + // Remote deletion did not finish. Need to resume. + timelines_to_resume_deletion.push(( + timeline_id, + load_metadata(self.conf, &self.tenant_id, &timeline_id)?, + )); + continue; + } + + // Missing metadata means that timeline directory should be empty at this point. + // Remove delete mark afterwards. + // Note that failure during the process wont prevent tenant from successfully loading. + // TODO: this is very much similar to DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces + // but here we're inside spawn_blocking. 
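For context on the helper used in the cleanup just below, this is roughly the pattern it enables (a standalone sketch; the function name and arguments are hypothetical, only `utils::fs_ext` and `anyhow` come from the patch):

```rust
use std::{fs, path::Path};

use anyhow::Context;
use utils::fs_ext;

// Hypothetical mirror of the cleanup below: remove a (presumed empty) timeline
// directory, then its delete mark, treating "already gone" as success so that a
// crash between the two steps simply turns the next attempt into a no-op.
fn cleanup_deleted_timeline(timeline_dir: &Path, delete_mark: &Path) -> anyhow::Result<()> {
    fs_ext::ignore_absent_files(|| fs::remove_dir(timeline_dir))
        .context("remove deleted timeline dir")?;
    fs_ext::ignore_absent_files(|| fs::remove_file(delete_mark))
        .context("remove delete mark")?;
    Ok(())
}
```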
+ if let Err(e) = fs_ext::ignore_absent_files(|| { + fs::remove_dir(self.conf.timeline_path(&self.tenant_id, &timeline_id)) + }) + .context("remove deleted timeline dir") + .and_then(|_| fs::remove_file(&timeline_dir).context("remove delete mark")) + { + warn!( + "cannot clean up deleted timeline dir at: {} error: {:#}", + timeline_dir.display(), + e + ); + }; } else { if !timeline_dir.exists() { warn!( @@ -865,12 +912,8 @@ impl Tenant { ); continue; } - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { + let timeline_id = + TimelineId::try_from(timeline_dir.file_name()).with_context(|| { format!( "Could not parse timeline id out of the timeline dir name {}", timeline_dir.display() @@ -892,6 +935,14 @@ impl Tenant { continue; } + let timeline_delete_mark_file = self + .conf + .timeline_delete_mark_file_path(self.tenant_id, timeline_id); + if timeline_delete_mark_file.exists() { + // Cleanup should be done in `is_delete_mark` branch above + continue; + } + let file_name = entry.file_name(); if let Ok(timeline_id) = file_name.to_str().unwrap_or_default().parse::() @@ -911,7 +962,10 @@ impl Tenant { // Sort the array of timeline IDs into tree-order, so that parent comes before // all its children. - tree_sort_timelines(timelines_to_load) + tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan { + sorted_timelines_to_load: sorted_timelines, + timelines_to_resume_deletion, + }) } /// @@ -937,7 +991,7 @@ impl Tenant { let span = info_span!("blocking"); let cloned = Arc::clone(self); - let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { + let scan = tokio::task::spawn_blocking(move || { let _g = span.entered(); cloned.scan_and_sort_timelines_dir() }) @@ -948,10 +1002,44 @@ impl Tenant { // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" - for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata, init_order, ctx) + // Process loadable timelines first + for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { + if let Err(e) = self + .load_local_timeline(timeline_id, local_metadata, init_order, ctx, false) .await - .with_context(|| format!("load local timeline {timeline_id}"))?; + { + match e { + LoadLocalTimelineError::Load(source) => { + return Err(anyhow::anyhow!(source) + .context("Failed to load local timeline: {timeline_id}")) + } + LoadLocalTimelineError::ResumeDeletion(source) => { + // Make sure resumed deletion wont fail loading for entire tenant. + error!("Failed to resume timeline deletion: {source:#}") + } + } + } + } + + // Resume deletion ones with deleted_mark + for (timeline_id, local_metadata) in scan.timelines_to_resume_deletion { + if let Err(e) = self + .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true) + .await + { + match e { + LoadLocalTimelineError::Load(source) => { + // We tried to load deleted timeline, this is a bug. + return Err(anyhow::anyhow!(source).context( + "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}" + )); + } + LoadLocalTimelineError::ResumeDeletion(source) => { + // Make sure resumed deletion wont fail loading for entire tenant. 
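Stepping back to `tree_sort_timelines` above: the reason delete-pending timelines are excluded is that topological ordering needs every referenced ancestor to be present in the input. A minimal sketch of such an ordering (illustrative only, not the pageserver's implementation; `u64` stands in for `TimelineId`, `M` for the metadata):

```rust
use std::collections::{HashMap, HashSet};

/// Order timelines so that every ancestor comes before its children. A timeline
/// whose ancestor is missing from the input (for example, a parent that was
/// already deleted) can never be placed, which is why timelines queued for
/// deletion resumption are kept in a separate list.
fn tree_sort<M>(
    mut unsorted: HashMap<u64, (Option<u64>, M)>, // id -> (ancestor id, metadata)
) -> Result<Vec<(u64, M)>, String> {
    let mut sorted = Vec::with_capacity(unsorted.len());
    let mut placed: HashSet<u64> = HashSet::new();
    while !unsorted.is_empty() {
        // A timeline is ready once it has no ancestor, or its ancestor is already placed.
        let ready: Vec<u64> = unsorted
            .iter()
            .filter(|(_, (ancestor, _))| match ancestor {
                None => true,
                Some(a) => placed.contains(a),
            })
            .map(|(id, _)| *id)
            .collect();
        if ready.is_empty() {
            return Err("cycle or missing ancestor in the timeline tree".to_string());
        }
        for id in ready {
            let (_, metadata) = unsorted.remove(&id).expect("id was just taken from the map");
            placed.insert(id);
            sorted.push((id, metadata));
        }
    }
    Ok(sorted)
}
```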
+ error!("Failed to resume timeline deletion: {source:#}") + } + } + } } trace!("Done"); @@ -969,7 +1057,8 @@ impl Tenant { local_metadata: TimelineMetadata, init_order: Option<&InitializationOrder>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + found_delete_mark: bool, + ) -> Result<(), LoadLocalTimelineError> { span::debug_assert_current_span_has_tenant_id(); let remote_client = self.remote_storage.as_ref().map(|remote_storage| { @@ -981,14 +1070,6 @@ impl Tenant { ) }); - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; - Some(ancestor_timeline) - } else { - None - }; - let (remote_startup_data, remote_client) = match remote_client { Some(remote_client) => match remote_client.download_index_file().await { Ok(index_part) => { @@ -1008,45 +1089,29 @@ impl Tenant { info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler"); remote_client - .init_upload_queue_stopped_to_continue_deletion(&index_part)?; + .init_upload_queue_stopped_to_continue_deletion(&index_part) + .context("init queue stopped") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; - let timeline = self - .create_timeline_struct( - timeline_id, - &local_metadata, - ancestor, - Some(remote_client), - init_order, - ) - .context("create_timeline_struct")?; - - let guard = DeletionGuard( - Arc::clone(&timeline.delete_lock) - .try_lock_owned() - .expect("cannot happen because we're the only owner"), - ); - - // Note: here we even skip populating layer map. Timeline is essentially uninitialized. - // RemoteTimelineClient is the only functioning part. - timeline.set_state(TimelineState::Stopping); - // We meed to do this because when console retries delete request we shouldnt answer with 404 - // because 404 means successful deletion. - // FIXME consider TimelineState::Deleting. - let mut locked = self.timelines.lock().unwrap(); - locked.insert(timeline_id, Arc::clone(&timeline)); - - Tenant::schedule_delete_timeline( + DeleteTimelineFlow::resume_deletion( Arc::clone(self), timeline_id, - timeline, - guard, - ); + &local_metadata, + Some(remote_client), + init_order, + ) + .await + .context("resume deletion") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; return Ok(()); } }; - let remote_metadata = index_part.parse_metadata().context("parse_metadata")?; + let remote_metadata = index_part + .parse_metadata() + .context("parse_metadata") + .map_err(LoadLocalTimelineError::Load)?; ( Some(RemoteStartupData { index_part, @@ -1056,12 +1121,54 @@ impl Tenant { ) } Err(DownloadError::NotFound) => { - info!("no index file was found on the remote"); + info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}"); + + if found_delete_mark { + // We could've resumed at a point where remote index was deleted, but metadata file wasnt. + // Cleanup: + return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces( + self, + timeline_id, + ) + .await + .context("cleanup_remaining_timeline_fs_traces") + .map_err(LoadLocalTimelineError::ResumeDeletion); + } + + // We're loading fresh timeline that didnt yet make it into remote. 
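The `found_delete_mark` branches above and the no-remote-client case below reduce to a small decision table; an illustrative condensation (these enum and function names are mine, not the patch's):

```rust
// What the remote index looks like when the timeline is loaded.
enum IndexState {
    DeletedAtSet,    // index_part.json exists and has `deleted_at` set
    Live,            // index_part.json exists, no deletion in progress
    NotFound,        // remote storage configured, but no index_part.json
    NoRemoteStorage, // pageserver runs without remote storage
}

enum ResumeAction {
    ResumeDeletion,   // DeleteTimelineFlow::resume_deletion
    CleanupLeftovers, // only local metadata / timeline dir / delete mark remain
    LoadNormally,
}

fn decide(index: IndexState, found_delete_mark: bool) -> ResumeAction {
    match (index, found_delete_mark) {
        (IndexState::DeletedAtSet, _) => ResumeAction::ResumeDeletion,
        (IndexState::NotFound, true) => ResumeAction::CleanupLeftovers,
        (IndexState::NoRemoteStorage, true) => ResumeAction::ResumeDeletion,
        (IndexState::Live, _)
        | (IndexState::NotFound, false)
        | (IndexState::NoRemoteStorage, false) => ResumeAction::LoadNormally,
    }
}
```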
(None, Some(remote_client)) } - Err(e) => return Err(anyhow::anyhow!(e)), + Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))), }, - None => (None, remote_client), + None => { + // No remote client + if found_delete_mark { + // There is no remote client, we found local metadata. + // Continue cleaning up local disk. + DeleteTimelineFlow::resume_deletion( + Arc::clone(self), + timeline_id, + &local_metadata, + None, + init_order, + ) + .await + .context("resume deletion") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; + return Ok(()); + } + + (None, remote_client) + } + }; + + let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { + let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) + .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) + .map_err(LoadLocalTimelineError::Load)?; + Some(ancestor_timeline) + } else { + None }; self.timeline_init_and_sync( @@ -1075,6 +1182,7 @@ impl Tenant { ctx, ) .await + .map_err(LoadLocalTimelineError::Load) } pub fn tenant_id(&self) -> TenantId { @@ -1435,269 +1543,6 @@ impl Tenant { } } - /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its - /// data from both disk and s3. - async fn delete_timeline( - &self, - timeline_id: TimelineId, - timeline: Arc, - guard: DeletionGuard, - ) -> anyhow::Result<()> { - { - // Grab the layer_removal_cs lock, and actually perform the deletion. - // - // This lock prevents prevents GC or compaction from running at the same time. - // The GC task doesn't register itself with the timeline it's operating on, - // so it might still be running even though we called `shutdown_tasks`. - // - // Note that there are still other race conditions between - // GC, compaction and timeline deletion. See - // https://github.com/neondatabase/neon/issues/2671 - // - // No timeout here, GC & Compaction should be responsive to the - // `TimelineState::Stopping` change. - info!("waiting for layer_removal_cs.lock()"); - let layer_removal_guard = timeline.layer_removal_cs.lock().await; - info!("got layer_removal_cs.lock(), deleting layer files"); - - // NB: remote_timeline_client upload tasks that reference these layers have been cancelled - // by the caller. - - let local_timeline_directory = self - .conf - .timeline_path(&self.tenant_id, &timeline.timeline_id); - - fail::fail_point!("timeline-delete-before-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? - }); - - // NB: This need not be atomic because the deleted flag in the IndexPart - // will be observed during tenant/timeline load. The deletion will be resumed there. - // - // For configurations without remote storage, we tolerate that we're not crash-safe here. - // The timeline may come up Active but with missing layer files, in such setups. - // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720 - match std::fs::remove_dir_all(&local_timeline_directory) { - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - // This can happen if we're called a second time, e.g., - // because of a previous failure/cancellation at/after - // failpoint timeline-delete-after-rm. - // - // It can also happen if we race with tenant detach, because, - // it doesn't grab the layer_removal_cs lock. - // - // For now, log and continue. - // warn! level is technically not appropriate for the - // first case because we should expect retries to happen. 
- // But the error is so rare, it seems better to get attention if it happens. - let tenant_state = self.current_state(); - warn!( - timeline_dir=?local_timeline_directory, - ?tenant_state, - "timeline directory not found, proceeding anyway" - ); - // continue with the rest of the deletion - } - res => res.with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?, - } - - info!("finished deleting layer files, releasing layer_removal_cs.lock()"); - drop(layer_removal_guard); - } - - fail::fail_point!("timeline-delete-after-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? - }); - - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - pausable_failpoint!("in_progress_delete"); - - { - // Remove the timeline from the map. - let mut timelines = self.timelines.lock().unwrap(); - let children_exist = timelines - .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { - panic!("Timeline grew children while we removed layer files"); - } - - timelines.remove(&timeline_id).expect( - "timeline that we were deleting was concurrently removed from 'timelines' map", - ); - - drop(timelines); - } - - drop(guard); - - Ok(()) - } - - /// Removes timeline-related in-memory data and schedules removal from remote storage. - #[instrument(skip(self, _ctx))] - pub async fn prepare_and_schedule_delete_timeline( - self: Arc, - timeline_id: TimelineId, - _ctx: &RequestContext, - ) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - - // Transition the timeline into TimelineState::Stopping. - // This should prevent new operations from starting. - // - // Also grab the Timeline's delete_lock to prevent another deletion from starting. - let timeline; - let delete_lock_guard; - { - let mut timelines = self.timelines.lock().unwrap(); - - // Ensure that there are no child timelines **attached to that pageserver**, - // because detach removes files, which will break child branches - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() == Some(timeline_id) { - Some(*id) - } else { - None - } - }) - .collect(); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - let timeline_entry = match timelines.entry(timeline_id) { - Entry::Occupied(e) => e, - Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), - }; - - timeline = Arc::clone(timeline_entry.get()); - - // Prevent two tasks from trying to delete the timeline at the same time. - delete_lock_guard = DeletionGuard( - Arc::clone(&timeline.delete_lock) - .try_lock_owned() - .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, - ); - - // If another task finished the deletion just before we acquired the lock, - // return success. - if delete_lock_guard.is_deleted() { - return Ok(()); - } - - timeline.set_state(TimelineState::Stopping); - - drop(timelines); - } - - // Now that the Timeline is in Stopping state, request all the related tasks to - // shut down. 
- // - // NB: If this fails half-way through, and is retried, the retry will go through - // all the same steps again. Make sure the code here is idempotent, and don't - // error out if some of the shutdown tasks have already been completed! - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); - match res { - Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; - - // Mark timeline as deleted in S3 so we won't pick it up next time - // during attach or pageserver restart. - // See comment in persist_index_part_with_deleted_flag. - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. 
- Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } - } - } - self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard); - - Ok(()) - } - - fn schedule_delete_timeline( - self: Arc, - timeline_id: TimelineId, - timeline: Arc, - guard: DeletionGuard, - ) { - let tenant_id = self.tenant_id; - let timeline_clone = Arc::clone(&timeline); - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::TimelineDeletionWorker, - Some(self.tenant_id), - Some(timeline_id), - "timeline_delete", - false, - async move { - if let Err(err) = self.delete_timeline(timeline_id, timeline, guard).await { - error!("Error: {err:#}"); - timeline_clone.set_broken(err.to_string()) - }; - Ok(()) - } - .instrument({ - let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); - span.follows_from(Span::current()); - span - }), - ); - } - pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -2154,6 +1999,10 @@ impl Tenant { /// The returned Timeline is in Loading state. The caller is responsible for /// initializing any on-disk state, and for inserting the Timeline to the 'timelines' /// map. + /// + /// `validate_ancestor == false` is used when a timeline is created for deletion + /// and we might not have the ancestor present anymore which is fine for to be + /// deleted timelines. fn create_timeline_struct( &self, new_timeline_id: TimelineId, @@ -2161,12 +2010,14 @@ impl Tenant { ancestor: Option>, remote_client: Option, init_order: Option<&InitializationOrder>, + cause: CreateTimelineCause, ) -> anyhow::Result> { - if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { + if matches!(cause, CreateTimelineCause::Load) { + let ancestor_id = new_metadata.ancestor_timeline(); anyhow::ensure!( - ancestor.is_some(), - "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" - ) + ancestor_id == ancestor.as_ref().map(|t| t.timeline_id), + "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found" + ); } let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start); @@ -2852,7 +2703,14 @@ impl Tenant { }; let timeline_struct = self - .create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None) + .create_timeline_struct( + new_timeline_id, + new_metadata, + ancestor, + remote_client, + None, + CreateTimelineCause::Load, + ) .context("Failed to create timeline data structure")?; timeline_struct.init_empty_layer_map(start_lsn); @@ -3270,19 +3128,6 @@ pub async fn dump_layerfile_from_path( Ok(()) } -fn ignore_absent_files(fs_operation: F) -> io::Result<()> -where - F: Fn() -> io::Result<()>, -{ - fs_operation().or_else(|e| { - if e.kind() == io::ErrorKind::NotFound { - Ok(()) - } else { - Err(e) - } - }) -} - #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs new file mode 100644 index 0000000000..02d1c997a3 --- /dev/null +++ b/pageserver/src/tenant/delete.rs @@ -0,0 +1,575 @@ +use std::{ + ops::{Deref, DerefMut}, + sync::Arc, +}; + +use anyhow::Context; +use pageserver_api::models::TimelineState; +use tokio::sync::OwnedMutexGuard; +use tracing::{debug, error, info, instrument, warn, Instrument, Span}; +use utils::{ + crashsafe, fs_ext, + id::{TenantId, TimelineId}, +}; + +use crate::{ + 
config::PageServerConf, + task_mgr::{self, TaskKind}, + tenant::{remote_timeline_client, DeleteTimelineError}, + InitializationOrder, +}; + +use super::{ + metadata::TimelineMetadata, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, + CreateTimelineCause, Tenant, Timeline, +}; + +/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. +async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; + if let Some(walreceiver) = maybe_started_walreceiver { + walreceiver.stop().await; + } + debug!("wal receiver shutdown confirmed"); + + // Prevent new uploads from starting. + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.stop(); + match res { + Ok(()) => {} + Err(e) => match e { + remote_timeline_client::StopError::QueueUninitialized => { + // This case shouldn't happen currently because the + // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. + // That is, before we declare the Tenant as Active. + // But we only allow calls to delete_timeline on Active tenants. + return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); + } + }, + } + } + + // Stop & wait for the remaining timeline tasks, including upload tasks. + // NB: This and other delete_timeline calls do not run as a task_mgr task, + // so, they are not affected by this shutdown_tasks() call. + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? + }); + Ok(()) +} + +/// Mark timeline as deleted in S3 so we won't pick it up next time +/// during attach or pageserver restart. +/// See comment in persist_index_part_with_deleted_flag. +async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { + if let Some(remote_client) = timeline.remote_client.as_ref() { + match remote_client.persist_index_part_with_deleted_flag().await { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); + } + } + } + Ok(()) +} + +// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued. +// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json +// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote" +// and "this timeline is deleted we should continue with removal of local state". 
So to avoid the ambiguity we use a mark file. +// After index part is deleted presence of this mark file indentifies that it was a deletion intention. +// So we can just remove the mark file. +async fn create_delete_mark( + conf: &PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Result<(), DeleteTimelineError> { + fail::fail_point!("timeline-delete-before-delete-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-delete-mark" + ))? + }); + let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id); + + // Note: we're ok to replace existing file. + let _ = std::fs::OpenOptions::new() + .write(true) + .create(true) + .open(&marker_path) + .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; + + crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; + Ok(()) +} + +/// Grab the layer_removal_cs lock, and actually perform the deletion. +/// +/// This lock prevents prevents GC or compaction from running at the same time. +/// The GC task doesn't register itself with the timeline it's operating on, +/// so it might still be running even though we called `shutdown_tasks`. +/// +/// Note that there are still other race conditions between +/// GC, compaction and timeline deletion. See +/// +/// +/// No timeout here, GC & Compaction should be responsive to the +/// `TimelineState::Stopping` change. +async fn delete_local_layer_files( + conf: &PageServerConf, + tenant_id: TenantId, + timeline: &Timeline, +) -> anyhow::Result<()> { + info!("waiting for layer_removal_cs.lock()"); + let layer_removal_guard = timeline.layer_removal_cs.lock().await; + info!("got layer_removal_cs.lock(), deleting layer files"); + + // NB: storage_sync upload tasks that reference these layers have been cancelled + // by the caller. + + let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id); + + fail::fail_point!("timeline-delete-before-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? + }); + + // NB: This need not be atomic because the deleted flag in the IndexPart + // will be observed during tenant/timeline load. The deletion will be resumed there. + // + // For configurations without remote storage, we guarantee crash-safety by persising delete mark file. + // + // Note that here we do not bail out on std::io::ErrorKind::NotFound. + // This can happen if we're called a second time, e.g., + // because of a previous failure/cancellation at/after + // failpoint timeline-delete-after-rm. + // + // It can also happen if we race with tenant detach, because, + // it doesn't grab the layer_removal_cs lock. + // + // For now, log and continue. + // warn! level is technically not appropriate for the + // first case because we should expect retries to happen. + // But the error is so rare, it seems better to get attention if it happens. + // + // Note that metadata removal is skipped, this is not technically needed, + // but allows to reuse timeline loading code during resumed deletion. + // (we always expect that metadata is in place when timeline is being loaded) + + #[cfg(feature = "testing")] + let mut counter = 0; + + // Timeline directory may not exist if we failed to delete mark file and request was retried. 
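The traversal below follows a reusable pattern: depth-first removal that spares one file and tolerates entries vanishing underneath it. A self-contained sketch, assuming only the `walkdir` and `anyhow` crates already used in this file (`remove_all_except` is hypothetical and assumes `keep` is a direct child of `dir`):

```rust
use std::path::Path;

use anyhow::Context;

// Depth-first removal of everything under `dir` except `keep`, tolerating files
// that disappear while we iterate (e.g. because a previous, interrupted attempt
// already removed them).
fn remove_all_except(dir: &Path, keep: &Path) -> anyhow::Result<()> {
    for entry in walkdir::WalkDir::new(dir).contents_first(true) {
        let entry = match entry {
            Ok(entry) => entry,
            // Entries can vanish between listing and stat'ing them; treat that as progress.
            Err(e) if e.io_error().map(std::io::Error::kind) == Some(std::io::ErrorKind::NotFound) => {
                continue
            }
            Err(e) => return Err(e).context("walk directory"),
        };
        if entry.path() == keep || entry.path() == dir {
            // Keep the surviving file and the directory that contains it.
            continue;
        }
        let removal = if entry.file_type().is_dir() {
            std::fs::remove_dir(entry.path())
        } else {
            std::fs::remove_file(entry.path())
        };
        match removal {
            Ok(()) => {}
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => {
                return Err(e).context(format!("failed to remove {}", entry.path().display()))
            }
        }
    }
    Ok(())
}
```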
+ if !local_timeline_directory.exists() { + return Ok(()); + } + + let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id); + + for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { + #[cfg(feature = "testing")] + { + counter += 1; + if counter == 2 { + fail::fail_point!("timeline-delete-during-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))? + }); + } + } + + let entry = entry?; + if entry.path() == metadata_path { + debug!("found metadata, skipping"); + continue; + } + + if entry.path() == local_timeline_directory { + // Keeping directory because metedata file is still there + debug!("found timeline dir itself, skipping"); + continue; + } + + let metadata = match entry.metadata() { + Ok(metadata) => metadata, + Err(e) => { + if crate::is_walkdir_io_not_found(&e) { + warn!( + timeline_dir=?local_timeline_directory, + path=?entry.path().display(), + "got not found err while removing timeline dir, proceeding anyway" + ); + continue; + } + anyhow::bail!(e); + } + }; + + let r = if metadata.is_dir() { + // There shouldnt be any directories inside timeline dir as of current layout. + tokio::fs::remove_dir(entry.path()).await + } else { + tokio::fs::remove_file(entry.path()).await + }; + + if let Err(e) = r { + if e.kind() == std::io::ErrorKind::NotFound { + warn!( + timeline_dir=?local_timeline_directory, + path=?entry.path().display(), + "got not found err while removing timeline dir, proceeding anyway" + ); + continue; + } + anyhow::bail!(anyhow::anyhow!( + "Failed to remove: {}. Error: {e}", + entry.path().display() + )); + } + } + + info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + drop(layer_removal_guard); + + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? + }); + + Ok(()) +} + +/// Removes remote layers and an index file after them. +async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { + if let Some(remote_client) = &timeline.remote_client { + remote_client.delete_all().await.context("delete_all")? + }; + + Ok(()) +} + +// This function removs remaining traces of a timeline on disk. +// Namely: metadata file, timeline directory, delete mark. +// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir. +// delete mark should be present because it is the last step during deletion. +// (nothing can fail after its deletion) +async fn cleanup_remaining_timeline_fs_traces( + conf: &PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> anyhow::Result<()> { + // Remove local metadata + tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id)) + .await + .or_else(fs_ext::ignore_not_found) + .context("remove metadata")?; + + fail::fail_point!("timeline-delete-after-rm-metadata", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-after-rm-metadata" + ))? + }); + + // Remove timeline dir + tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id)) + .await + .or_else(fs_ext::ignore_not_found) + .context("timeline dir")?; + + fail::fail_point!("timeline-delete-after-rm-dir", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))? + }); + + // Remove delete mark + tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id)) + .await + .context("remove delete mark") +} + +/// It is important that this gets called when DeletionGuard is being held. 
+/// For more context see comments in [`DeleteTimelineFlow::prepare`] +async fn remove_timeline_from_tenant( + tenant: &Tenant, + timeline_id: TimelineId, + _: &DeletionGuard, // using it as a witness +) -> anyhow::Result<()> { + // Remove the timeline from the map. + let mut timelines = tenant.timelines.lock().unwrap(); + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines + .remove(&timeline_id) + .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + + drop(timelines); + + Ok(()) +} + +/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, +/// and deletes its data from both disk and s3. +/// The sequence of steps: +/// 1. Set deleted_at in remote index part. +/// 2. Create local mark file. +/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata) +/// 4. Delete remote layers +/// 5. Delete index part +/// 6. Delete meta, timeline directory +/// 7. Delete mark file +/// It is resumable from any step in case a crash/restart occurs. +/// There are three entrypoints to the process: +/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. +/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present +/// and we possibly neeed to continue deletion of remote files. +/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote +/// index but still have local metadata, timeline directory and delete mark. +/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. +#[derive(Default)] +pub enum DeleteTimelineFlow { + #[default] + NotStarted, + InProgress, + Finished, +} + +impl DeleteTimelineFlow { + // These steps are run in the context of management api request handler. + // Long running steps are continued to run in the background. + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! + #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))] + pub async fn run( + tenant: &Arc, + timeline_id: TimelineId, + ) -> Result<(), DeleteTimelineError> { + let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; + + guard.mark_in_progress()?; + + stop_tasks(&timeline).await?; + + set_deleted_in_remote_index(&timeline).await?; + + create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?; + + fail::fail_point!("timeline-delete-before-schedule", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-schedule" + ))? + }); + + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); + + Ok(()) + } + + fn mark_in_progress(&mut self) -> anyhow::Result<()> { + match self { + Self::Finished => anyhow::bail!("Bug. Is in finished state"), + Self::InProgress { .. 
} => { /* We're in a retry */ } + Self::NotStarted => { /* Fresh start */ } + } + + *self = Self::InProgress; + + Ok(()) + } + + /// Shortcut to create Timeline in stopping state and spawn deletion task. + pub async fn resume_deletion( + tenant: Arc, + timeline_id: TimelineId, + local_metadata: &TimelineMetadata, + remote_client: Option, + init_order: Option<&InitializationOrder>, + ) -> anyhow::Result<()> { + let timeline = tenant + .create_timeline_struct( + timeline_id, + local_metadata, + None, // Ancestor is not needed for deletion. + remote_client, + init_order, + // Important. We dont pass ancestor above because it can be missing. + // Thus we need to skip the validation here. + CreateTimelineCause::Delete, + ) + .context("create_timeline_struct")?; + + let mut guard = DeletionGuard( + Arc::clone(&timeline.delete_progress) + .try_lock_owned() + .expect("cannot happen because we're the only owner"), + ); + + // Note: here we even skip populating layer map. Timeline is essentially uninitialized. + // RemoteTimelineClient is the only functioning part. + timeline.set_state(TimelineState::Stopping); + // We meed to do this because when console retries delete request we shouldnt answer with 404 + // because 404 means successful deletion. + { + let mut locked = tenant.timelines.lock().unwrap(); + locked.insert(timeline_id, Arc::clone(&timeline)); + } + + guard.mark_in_progress()?; + + // Note that delete mark can be missing on resume + // because we create delete mark after we set deleted_at in the index part. + create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?; + + Self::schedule_background(guard, tenant.conf, tenant, timeline); + + Ok(()) + } + + pub async fn cleanup_remaining_timeline_fs_traces( + tenant: &Tenant, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await + } + + fn prepare( + tenant: &Tenant, + timeline_id: TimelineId, + ) -> Result<(Arc, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. + // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => t, + None => return Err(DeleteTimelineError::NotFound), + }; + + // Ensure that there are no child timelines **attached to that pageserver**, + // because detach removes files, which will break child branches + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() == Some(timeline_id) { + Some(*id) + } else { + None + } + }) + .collect(); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. 
+ // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + let delete_lock_guard = DeletionGuard( + Arc::clone(&timeline.delete_progress) + .try_lock_owned() + .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, + ); + + timeline.set_state(TimelineState::Stopping); + + Ok((Arc::clone(timeline), delete_lock_guard)) + } + + fn schedule_background( + guard: DeletionGuard, + conf: &'static PageServerConf, + tenant: Arc, + timeline: Arc, + ) { + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TimelineDeletionWorker, + Some(tenant_id), + Some(timeline_id), + "timeline_delete", + false, + async move { + if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { + error!("Error: {err:#}"); + timeline.set_broken(format!("{err:#}")) + }; + Ok(()) + } + .instrument({ + let span = + tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); + span.follows_from(Span::current()); + span + }), + ); + } + + async fn background( + mut guard: DeletionGuard, + conf: &PageServerConf, + tenant: &Tenant, + timeline: &Timeline, + ) -> Result<(), DeleteTimelineError> { + delete_local_layer_files(conf, tenant.tenant_id, timeline).await?; + + delete_remote_layers_and_index(timeline).await?; + + pausable_failpoint!("in_progress_delete"); + + cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?; + + remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + + *guard.0 = Self::Finished; + + Ok(()) + } +} + +struct DeletionGuard(OwnedMutexGuard); + +impl Deref for DeletionGuard { + type Target = DeleteTimelineFlow; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DeletionGuard { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index eeb84caf13..aeecc88602 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -26,6 +26,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; +use super::delete::DeleteTimelineFlow; + /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. enum TenantsMap { @@ -421,12 +423,10 @@ pub enum DeleteTimelineError { pub async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, - ctx: &RequestContext, + _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; - tenant - .prepare_and_schedule_delete_timeline(timeline_id, ctx) - .await?; + DeleteTimelineFlow::run(&tenant, timeline_id).await?; Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index fee7f0c28e..8d002a8570 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -827,7 +827,7 @@ impl RemoteTimelineClient { ) }; - receiver.changed().await?; + receiver.changed().await.context("upload queue shut down")?; // Do not delete index part yet, it is needed for possible retry. 
If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage @@ -855,11 +855,23 @@ impl RemoteTimelineClient { self.storage_impl.delete_objects(&remaining).await?; } + fail::fail_point!("timeline-delete-before-index-delete", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-delete" + ))? + }); + let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); debug!("deleting index part"); self.storage_impl.delete(&index_file_path).await?; + fail::fail_point!("timeline-delete-after-index-delete", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-after-index-delete" + ))? + }); + info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json"); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c663a4f9ad..af9edbf95e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -86,6 +86,7 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; +use super::delete::DeleteTimelineFlow; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{ @@ -237,11 +238,10 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. - /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Tenant::delete_timeline`]. This is an `Arc` lock because we need an owned + /// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`]. + /// This is an `Arc` lock because we need an owned /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). - /// - /// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline + /// Note that [`DeleteTimelineFlow`] uses `delete_progress` field. pub(super) layer_removal_cs: Arc>, // Needed to ensure that we can't create a branch at a point that was already garbage collected @@ -283,7 +283,7 @@ pub struct Timeline { /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. - pub delete_lock: Arc>, + pub delete_progress: Arc>, eviction_task_timeline_state: tokio::sync::Mutex, @@ -1453,7 +1453,7 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), - delete_lock: Arc::new(tokio::sync::Mutex::new(false)), + delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), initial_logical_size_can_start, initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), @@ -1918,6 +1918,15 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, lsn: Lsn, ctx: &RequestContext) { + let state = self.current_state(); + if matches!( + state, + TimelineState::Broken { .. } | TimelineState::Stopping + ) { + // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). 
+ return; + } + let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index b8cc65f4b1..5a15e86458 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -2,13 +2,9 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc}; use anyhow::Context; use tracing::{error, info, info_span, warn}; -use utils::{crashsafe, id::TimelineId, lsn::Lsn}; +use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn}; -use crate::{ - context::RequestContext, - import_datadir, - tenant::{ignore_absent_files, Tenant}, -}; +use crate::{context::RequestContext, import_datadir, tenant::Tenant}; use super::Timeline; @@ -141,7 +137,7 @@ impl Drop for UninitializedTimeline<'_> { pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { let timeline_path = &uninit_mark.timeline_path; - match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { Ok(()) => { info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") } @@ -185,7 +181,7 @@ impl TimelineUninitMark { let uninit_mark_parent = uninit_mark_file .parent() .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { + fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") })?; crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index ad89ebad00..f8a4423ffa 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -194,14 +194,18 @@ def wait_for_upload_queue_empty( def wait_timeline_detail_404( - pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + wait_longer: bool = False, ): last_exc = None - for _ in range(2): + iterations = 10 if wait_longer else 2 + for _ in range(iterations): time.sleep(0.250) try: data = pageserver_http.timeline_detail(tenant_id, timeline_id) - log.error(f"detail {data}") + log.info(f"detail {data}") except PageserverApiException as e: log.debug(e) if e.status_code == 404: @@ -216,7 +220,8 @@ def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, + wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3 **delete_args, ): pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a4c5bf626a..9226ca21d2 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,3 +1,4 @@ +import enum import os import queue import shutil @@ -11,9 +12,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PgBin, RemoteStorageKind, S3Storage, 
    available_remote_storages,
+    last_flush_lsn_upload,
+    wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
@@ -117,59 +121,183 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
    ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)


+class Check(enum.Enum):
+    RETRY_WITHOUT_RESTART = enum.auto()
+    RETRY_WITH_RESTART = enum.auto()
+
+
+DELETE_FAILPOINTS = [
+    "timeline-delete-before-index-deleted-at",
+    "timeline-delete-before-schedule",
+    "timeline-delete-before-rm",
+    "timeline-delete-during-rm",
+    "timeline-delete-after-rm",
+    "timeline-delete-before-index-delete",
+    "timeline-delete-after-index-delete",
+    "timeline-delete-after-rm-metadata",
+    "timeline-delete-after-rm-dir",
+]
+
+
+def combinations():
+    result = []
+
+    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
+    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
+        remotes.append(RemoteStorageKind.REAL_S3)
+
+    for remote_storage_kind in remotes:
+        for delete_failpoint in DELETE_FAILPOINTS:
+            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
+                "timeline-delete-before-index-delete",
+                "timeline-delete-after-index-delete",
+            ):
+                # the above failpoints are not relevant for configurations without remote storage
+                continue
+
+            result.append((remote_storage_kind, delete_failpoint))
+    return result
+
+
# cover the two cases: remote storage configured vs not configured
-@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
-def test_delete_timeline_post_rm_failure(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
+@pytest.mark.parametrize("check", list(Check))
+def test_delete_timeline_exercise_crash_safety_failpoints(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    failpoint: str,
+    check: Check,
+    pg_bin: PgBin,
):
    """
-    If there is a failure after removing the timeline directory, the delete operation
-    should be retryable.
+    If a failure occurs during deletion at one of the associated failpoints (or the pageserver crashes and
+    restarts at that point), the delete operation should be retryable and should resume successfully.
+
+    We iterate over the failpoint list, enabling one failpoint per test run.
+
+    1. Set tenant settings that generate many layers.
+    2. Create a branch.
+    3. Insert some data.
+    4. Enable the failpoint under test.
+    5. Execute the delete.
+    6. Ensure the failpoint is hit.
+    7. Retry, or restart the pageserver, without the failpoint.
+    8. Check the result.
""" if remote_storage_kind is not None: neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_post_rm_failure" + remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" ) - env = neon_env_builder.init_start() - assert env.initial_timeline - - env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm") - env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + "image_creation_threshold": "100", + } + ) ps_http = env.pageserver.http_client() - failpoint_name = "timeline-delete-after-rm" - ps_http.configure_failpoints((failpoint_name, "return")) + timeline_id = env.neon_cli.create_timeline("delete") + with env.endpoints.create_start("delete") as endpoint: + # generate enough layers + pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()]) + if remote_storage_kind is RemoteStorageKind.NOOP: + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id) + else: + last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) - ps_http.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_until_timeline_state( - pageserver_http=ps_http, - tenant_id=env.initial_tenant, - timeline_id=env.initial_timeline, - expected_state="Broken", - iterations=2, # effectively try immediately and retry once in one second - ) - - # FIXME: #4719 - # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" - - at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*" - env.pageserver.allowed_errors.append(at_failpoint_log_message) + env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") + # It appears when we stopped flush loop during deletion and then pageserver is stopped env.pageserver.allowed_errors.append( - f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}" + ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" ) - - # retry without failpoint, it should succeed - ps_http.configure_failpoints((failpoint_name, "off")) - - # this should succeed - # this also checks that delete can be retried even when timeline is in Broken state - timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline) + # This happens when we fail before scheduling background operation. + # Timeline is left in stopping state and retry tries to stop it again. env.pageserver.allowed_errors.append( - f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*" + ".*Ignoring new state, equal to the existing one: Stopping" ) + # This happens when we retry delete requests for broken timelines + env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") + # This happens when timeline remains are cleaned up during loading + env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*") + # In one of the branches we poll for tenant to become active. Polls can generate this log message: + env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*") + + ps_http.configure_failpoints((failpoint, "return")) + + # These failpoints are earlier than background task is spawned. + # so they result in api request failure. 
+    if failpoint in (
+        "timeline-delete-before-index-deleted-at",
+        "timeline-delete-before-schedule",
+    ):
+        with pytest.raises(PageserverApiException, match=failpoint):
+            ps_http.timeline_delete(env.initial_tenant, timeline_id)
+
+    else:
+        ps_http.timeline_delete(env.initial_tenant, timeline_id)
+        timeline_info = wait_until_timeline_state(
+            pageserver_http=ps_http,
+            tenant_id=env.initial_tenant,
+            timeline_id=timeline_id,
+            expected_state="Broken",
+            iterations=2,  # effectively try immediately and retry once in one second
+        )
+
+        reason = timeline_info["state"]["Broken"]["reason"]
+        log.info(f"timeline broken: {reason}")
+
+        # the failpoint may not be the only error in the stack
+        assert reason.endswith(f"failpoint: {failpoint}"), reason
+
+    wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3
+    if check is Check.RETRY_WITH_RESTART:
+        env.pageserver.stop()
+        env.pageserver.start()
+        if failpoint == "timeline-delete-before-index-deleted-at":
+            # We crashed before persisting the deletion to remote storage, so we need to retry the delete request
+
+            # Wait till the tenant is loaded. Shouldn't take longer than 2 seconds (we shouldn't block tenant loading)
+            wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2)
+
+            timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id)
+        else:
+            # Pageserver should've resumed deletion after restart.
+            wait_timeline_detail_404(
+                ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+            )
+    elif check is Check.RETRY_WITHOUT_RESTART:
+        # This should succeed.
+        # This also checks that the delete can be retried even when the timeline is in the Broken state.
+        ps_http.configure_failpoints((failpoint, "off"))
+
+        timeline_delete_wait_completed(
+            ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer
+        )
+
+    # Check that the remote prefix is empty
+    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
+
+    timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id)
+    # Check that the local timeline directory is gone
+    assert not timeline_dir.exists()
+    # Check that no delete mark is present
+    assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()


@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
@@ -327,7 +455,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
    )

    ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
-    wait_until_timeline_state(
+    timeline_info = wait_until_timeline_state(
        pageserver_http=ps_http,
        tenant_id=env.initial_tenant,
        timeline_id=leaf_timeline_id,
@@ -335,8 +463,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
        iterations=2,  # effectively try immediately and retry once in one second
    )

-    # FIXME: #4719
-    # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
+    assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm"

    assert leaf_timeline_path.exists(), "the failpoint didn't work"

@@ -588,6 +715,7 @@ def test_timeline_delete_works_for_remote_smoke(
    assert tenant_id == env.initial_tenant
    assert main_timeline_id == env.initial_timeline

+    assert env.initial_timeline is not None
    timeline_ids = [env.initial_timeline]
    for i in range(2):
        branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
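The crash-safety matrix above leans on the `fail` crate's closure-form failpoints (`fail::fail_point!(name, |_| ...)`) that the Rust hunks add, toggled at runtime by the test through `configure_failpoints((failpoint, "return"))` and `(failpoint, "off")`. As a rough, self-contained sketch of that mechanism — not part of this patch, using a made-up `delete_index_part` function and assuming the `fail` crate is built with its `failpoints` feature — the `return` action makes the closure run and the enclosing function return early with the `failpoint: <name>` error the test later asserts on:

// Sketch only. Assumed Cargo.toml deps: fail = { version = "0.5", features = ["failpoints"] }, anyhow = "1".
use anyhow::anyhow;

fn delete_index_part() -> anyhow::Result<()> {
    // Same pattern as in the patch: when the failpoint is configured with the
    // `return` action, the closure runs and its value is returned from this function.
    fail::fail_point!("timeline-delete-before-index-delete", |_| {
        Err(anyhow!("failpoint: timeline-delete-before-index-delete"))
    });
    // ... the real deletion work would go here ...
    Ok(())
}

fn main() -> anyhow::Result<()> {
    // Failpoint not configured: the function runs to completion.
    assert!(delete_index_part().is_ok());

    // Enable the failpoint, as the test does via the management API.
    fail::cfg("timeline-delete-before-index-delete", "return").map_err(|e| anyhow!(e))?;
    let err = delete_index_part().unwrap_err();
    // Mirrors `assert reason.endswith(f"failpoint: {failpoint}")` in the Python test.
    assert!(err.to_string().ends_with("timeline-delete-before-index-delete"));

    // Disable it again, mirroring `configure_failpoints((failpoint, "off"))`.
    fail::remove("timeline-delete-before-index-delete");
    assert!(delete_index_part().is_ok());
    Ok(())
}

In the pageserver the toggling happens over the HTTP management API rather than in-process, but the early return produced by this expansion is what surfaces either as the API error that the `pytest.raises` branch expects or as the Broken-state reason that the test inspects.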