From c178f34e21d7822d1545026a68f68ccb7c3f2e82 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 27 Oct 2023 15:53:39 +0100 Subject: [PATCH] pageserver: start simplifying tenant deletion --- pageserver/src/http/routes.rs | 3 +- pageserver/src/task_mgr.rs | 3 + pageserver/src/tenant.rs | 103 ++-- pageserver/src/tenant/delete.rs | 549 +++++++--------------- pageserver/src/tenant/mgr.rs | 137 ++++-- pageserver/src/tenant/timeline/delete.rs | 2 +- test_runner/regress/test_tenant_delete.py | 8 +- 7 files changed, 311 insertions(+), 494 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 988b148a1f..daa724decc 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -275,7 +275,6 @@ impl From for ApiError { use crate::tenant::delete::DeleteTenantError::*; match value { Get(g) => ApiError::from(g), - e @ AlreadyInProgress => ApiError::Conflict(e.to_string()), Timeline(t) => ApiError::from(t), NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()), SlotError(e) => e.into(), @@ -782,7 +781,7 @@ async fn tenant_delete_handler( let state = get_state(&request); - mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id) + mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id, false) .instrument(info_span!("tenant_delete_handler", %tenant_id)) .await?; diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs index 017322ffb2..2216af7544 100644 --- a/pageserver/src/task_mgr.rs +++ b/pageserver/src/task_mgr.rs @@ -291,6 +291,9 @@ pub enum TaskKind { // A request that comes in via the pageserver HTTP API. MgmtRequest, + // Tenant Manager servicing a channel that enables upcalls from Tenant + TenantManagerUpcall, + DebugTool, #[cfg(test)] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8f3d979e71..99b64ea376 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -51,10 +51,9 @@ use self::config::AttachedLocationConfig; use self::config::AttachmentMode; use self::config::LocationConf; use self::config::TenantConf; -use self::delete::DeleteTenantFlow; +use self::delete::should_resume_deletion; use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; -use self::mgr::TenantsMap; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; use self::timeline::uninit::UninitializedTimeline; @@ -250,8 +249,6 @@ pub struct Tenant { cached_synthetic_tenant_size: Arc, eviction_task_tenant_state: tokio::sync::Mutex, - - pub(crate) delete_progress: Arc>, } impl std::fmt::Debug for Tenant { @@ -537,8 +534,8 @@ impl Tenant { resources: TenantSharedResources, attached_conf: AttachedTenantConf, init_order: Option, - tenants: &'static std::sync::RwLock, mode: SpawnMode, + resume_deletion_upcall: Option>>, ctx: &RequestContext, ) -> anyhow::Result> { let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new( @@ -625,67 +622,48 @@ impl Tenant { // Remote preload is complete. drop(remote_load_completion); - let pending_deletion = { - match DeleteTenantFlow::should_resume_deletion( - conf, - preload.as_ref().map(|p| p.deleting).unwrap_or(false), - &tenant_clone, - ) - .await - { - Ok(should_resume_deletion) => should_resume_deletion, - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); + let resume_deletion = should_resume_deletion(conf, preload.as_ref().map(|p| p.deleting).unwrap_or(false), &tenant_id).await?; + if resume_deletion { + // We will wait until the background + info!("Tenant is partially deleted, will resume"); + + // Put the Tenant into a Stopping state so that it will not try to serve any I/O + tenant_clone + .set_stopping(false, true) + .await + .expect("cant be stopping or broken"); + + // Proceed with attaching the tenant to load the metadata we will use for remote deletion + match tenant_clone.attach(init_order, preload, &ctx).await { + Ok(()) => { + info!("attach finished, deletion will resume"); + } + Err(e) => { + make_broken(&tenant_clone, anyhow::anyhow!(e)); + } + } + + if let Some(resume_deletion_upcall) = resume_deletion_upcall { + // If this send() fails it means we're shutting down, so just drop it on the floor + resume_deletion_upcall.send(tenant_clone).await.ok(); + } else { + make_broken(&tenant_clone, anyhow::anyhow!( + "Attemped to attach a partially deleted tenant outside of pageserver startup" + )); + } + } else { + // Normal case: load all the metadata for the tenant. + match tenant_clone.attach(init_order, preload, &ctx).await { + Ok(()) => { + info!("attach finished, activating"); + tenant_clone.activate(broker_client, None, &ctx); + } + Err(e) => { + make_broken(&tenant_clone, anyhow::anyhow!(e)); } } }; - info!("pending_deletion {}", pending_deletion.is_some()); - - if let Some(deletion) = pending_deletion { - // as we are no longer loading, signal completion by dropping - // the completion while we resume deletion - drop(_completion); - // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout - let _ = init_order - .as_mut() - .and_then(|x| x.initial_logical_size_attempt.take()); - let background_jobs_can_start = - init_order.as_ref().map(|x| &x.background_jobs_can_start); - if let Some(background) = background_jobs_can_start { - info!("waiting for backgound jobs barrier"); - background.clone().wait().await; - info!("ready for backgound jobs barrier"); - } - - match DeleteTenantFlow::resume_from_attach( - deletion, - &tenant_clone, - preload, - tenants, - init_order, - &ctx, - ) - .await - { - Err(err) => { - make_broken(&tenant_clone, anyhow::anyhow!(err)); - return Ok(()); - } - Ok(()) => return Ok(()), - } - } - - match tenant_clone.attach(init_order, preload, &ctx).await { - Ok(()) => { - info!("attach finished, activating"); - tenant_clone.activate(broker_client, None, &ctx); - } - Err(e) => { - make_broken(&tenant_clone, anyhow::anyhow!(e)); - } - } Ok(()) } .instrument({ @@ -2362,7 +2340,6 @@ impl Tenant { cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()), cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)), eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()), - delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())), } } diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index fa63c83c17..a9ca33c26f 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -1,31 +1,27 @@ use std::sync::Arc; use anyhow::Context; -use camino::{Utf8Path, Utf8PathBuf}; +use camino::Utf8Path; use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemotePath}; -use tokio::sync::OwnedMutexGuard; use tokio_util::sync::CancellationToken; -use tracing::{error, instrument, warn, Instrument, Span}; +use tracing::error; use utils::{ - backoff, crashsafe, fs_ext, + backoff, crashsafe, id::{TenantId, TimelineId}, }; use crate::{ config::PageServerConf, - context::RequestContext, - task_mgr::{self, TaskKind}, - InitializationOrder, + tenant::{mgr::safe_rename_tenant_dir, ShutdownError}, }; use super::{ - mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap}, + mgr::{GetTenantError, SlotGuard, TenantSlotError, TenantSlotUpsertError}, remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - span, timeline::delete::DeleteTimelineFlow, - tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload, + tree_sort_timelines, DeleteTimelineError, Tenant, }; #[derive(Debug, thiserror::Error)] @@ -39,9 +35,6 @@ pub(crate) enum DeleteTenantError { #[error("Invalid state {0}. Expected Active or Broken")] InvalidState(TenantState), - #[error("Tenant deletion is already in progress")] - AlreadyInProgress, - #[error("Tenant map slot error {0}")] SlotError(#[from] TenantSlotError), @@ -55,7 +48,142 @@ pub(crate) enum DeleteTenantError { Other(#[from] anyhow::Error), } -type DeletionGuard = tokio::sync::OwnedMutexGuard; +/// The part of tenant deletion that must happen before acknowledging the deletion request +/// +/// Usually deletion requires that the tenant is not already in Stopping state: set +/// `resume` to true to skip this check if resuming deletion at startup, since we +/// would have loaded the tenant into Stopping mode already +pub(crate) async fn delete_tenant_foreground( + conf: &'static PageServerConf, + remote_storage: Option, + tenant: &Arc, + resume: bool, + // Witness that we are doing this work within a TenantSlot::InProgress state. + _slot_guard: &SlotGuard, +) -> Result<(), DeleteTenantError> { + // In resume mode, we must be stopping. Else, we must _not_ be Stopping. + if !(matches!(tenant.current_state(), TenantState::Stopping) ^ !resume) { + return Err(DeleteTenantError::InvalidState(tenant.current_state())); + } + + fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-remote-mark" + ))? + }); + + // Write persistent tombstone + if let Some(remote_storage) = &remote_storage { + create_remote_delete_mark(conf, remote_storage, &tenant.get_tenant_id()).await?; + } + + fail::fail_point!("tenant-delete-before-create-local-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-create-local-mark" + ))? + }); + + create_local_delete_mark(conf, &tenant.tenant_id) + .await + .context("local create mark")?; + + fail::fail_point!("tenant-delete-before-background", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-background" + ))? + }); + + Ok(()) +} + +/// The part of tenant deletion that happens after we have already acknowledged the request. +/// +/// This part may retried after pageserver restart if we see a tenant that has a deletion marker +/// but has not been completely deleted. +pub(crate) async fn delete_tenant_background( + conf: &'static PageServerConf, + remote_storage: Option, + tenant_id: TenantId, + tenant: &Arc, + resume: bool, + // Witness that we are doing this work within a TenantSlot::InProgress state. + _slot_guard: &SlotGuard, +) -> anyhow::Result<()> { + let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) + .await + .context("schedule_ordered_timeline_deletions")?; + + fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-polling-ongoing-deletions" + ))? + }); + + // Wait for deletions that were already running at the moment when tenant deletion was requested. + // When we can lock deletion guard it means that corresponding timeline deletion finished. + for (guard, timeline_id) in already_running_timeline_deletions { + let flow = guard.lock().await; + if !flow.is_finished() { + return Err(DeleteTenantError::Other(anyhow::anyhow!( + "already running timeline deletion failed: {timeline_id}" + )) + .into()); + } + } + + // For convenience, use the Tenant's deletion queue reference so that we don't have + // to take it as an argument. + let deletion_queue_client = tenant.deletion_queue_client.clone(); + + fail::fail_point!("tenant-delete-before-shutdown", |_| { + Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? + }); + + // Tear down local runtime state + if !resume { + tenant.shutdown(false).await.map_err(|e| match e { + ShutdownError::AlreadyStopping => { + DeleteTenantError::InvalidState(TenantState::Stopping) + } + })?; + } + + // Not necessary for correctness, but executing deletions before erasing local contents + // means that we will have a better chance to resume the delete if we crash. + deletion_queue_client.flush_execute().await?; + + fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-deleted-mark" + ))? + }); + + // Remove the deletion marker + remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant_id).await?; + + // Not necessary for correctness, but helps make it true that when we log that deletion is + // done, it really is. + deletion_queue_client.flush_execute().await?; + + fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { + Err(anyhow::anyhow!( + "failpoint: tenant-delete-before-remove-tenant-dir" + ))? + }); + + // Remove local storage contents. We do this last, so that if we crash during delete, on + // restart we will attempt to re-attach the tenant and resume the deletion. + let local_tenant_directory = conf.tenant_path(&tenant_id); + let tmp_path = safe_rename_tenant_dir(&local_tenant_directory) + .await + .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?; + + tokio::fs::remove_dir_all(tmp_path.as_path()) + .await + .with_context(|| format!("tenant directory {:?} deletion", tmp_path))?; + + Ok(()) +} fn remote_tenant_delete_mark_path( conf: &PageServerConf, @@ -70,7 +198,7 @@ fn remote_tenant_delete_mark_path( Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted"))) } -async fn create_remote_delete_mark( +pub(crate) async fn create_remote_delete_mark( conf: &PageServerConf, remote_storage: &GenericRemoteStorage, tenant_id: &TenantId, @@ -115,7 +243,26 @@ async fn create_local_delete_mark( Ok(()) } -async fn schedule_ordered_timeline_deletions( +pub(crate) async fn should_resume_deletion( + conf: &'static PageServerConf, + remote_mark_exists: bool, + tenant_id: &TenantId, +) -> Result { + if remote_mark_exists { + return Ok(true); + } + + // In the very last stage of deletion, we might have already removed the remote + // marker, but be attaching the tenant anyway on the basis of its local directory + // existing: to resume deletion in this case we need the local deletion marker. + if conf.tenant_deleted_mark_file_path(tenant_id).exists() { + Ok(true) + } else { + Ok(false) + } +} + +pub(crate) async fn schedule_ordered_timeline_deletions( tenant: &Arc, ) -> Result>, TimelineId)>, DeleteTenantError> { // Tenant is stopping at this point. We know it will be deleted. @@ -153,21 +300,7 @@ async fn schedule_ordered_timeline_deletions( Ok(already_running_deletions) } -async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> { - // Assert timelines dir is empty. - if !fs_ext::is_directory_empty(timelines_path).await? { - // Display first 10 items in directory - let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?; - let list = &list.into_iter().take(10).collect::>(); - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "Timelines directory is not empty after all timelines deletion: {list:?}" - ))); - } - - Ok(()) -} - -async fn remove_tenant_remote_delete_mark( +pub(crate) async fn remove_tenant_remote_delete_mark( conf: &PageServerConf, remote_storage: Option<&GenericRemoteStorage>, tenant_id: &TenantId, @@ -188,357 +321,3 @@ async fn remove_tenant_remote_delete_mark( } Ok(()) } - -// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir -async fn cleanup_remaining_fs_traces( - conf: &PageServerConf, - tenant_id: &TenantId, -) -> Result<(), DeleteTenantError> { - let rm = |p: Utf8PathBuf, is_dir: bool| async move { - if is_dir { - tokio::fs::remove_dir(&p).await - } else { - tokio::fs::remove_file(&p).await - } - .or_else(fs_ext::ignore_not_found) - .with_context(|| format!("failed to delete {p}")) - }; - - rm(conf.tenant_config_path(tenant_id), false).await?; - rm(conf.tenant_location_config_path(tenant_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-timelines-dir" - ))? - }); - - rm(conf.timelines_path(tenant_id), true).await?; - - fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-deleted-mark" - ))? - }); - - // Make sure previous deletions are ordered before mark removal. - // Otherwise there is no guarantee that they reach the disk before mark deletion. - // So its possible for mark to reach disk first and for other deletions - // to be reordered later and thus missed if a crash occurs. - // Note that we dont need to sync after mark file is removed - // because we can tolerate the case when mark file reappears on startup. - let tenant_path = &conf.tenant_path(tenant_id); - if tenant_path.exists() { - crashsafe::fsync_async(&conf.tenant_path(tenant_id)) - .await - .context("fsync_pre_mark_remove")?; - } - - rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?; - - fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-remove-tenant-dir" - ))? - }); - - rm(conf.tenant_path(tenant_id), true).await?; - - Ok(()) -} - -/// Orchestrates tenant shut down of all tasks, removes its in-memory structures, -/// and deletes its data from both disk and s3. -/// The sequence of steps: -/// 1. Upload remote deletion mark. -/// 2. Create local mark file. -/// 3. Shutdown tasks -/// 4. Run ordered timeline deletions -/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested -/// 6. Remove remote mark -/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark -/// It is resumable from any step in case a crash/restart occurs. -/// There are two entrypoints to the process: -/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler. -/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process. -/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function. -#[derive(Default)] -pub enum DeleteTenantFlow { - #[default] - NotStarted, - InProgress, - Finished, -} - -impl DeleteTenantFlow { - // These steps are run in the context of management api request handler. - // Long running steps are continued to run in the background. - // NB: If this fails half-way through, and is retried, the retry will go through - // all the same steps again. Make sure the code here is idempotent, and don't - // error out if some of the shutdown tasks have already been completed! - // NOTE: static needed for background part. - // We assume that calling code sets up the span with tenant_id. - #[instrument(skip_all)] - pub(crate) async fn run( - conf: &'static PageServerConf, - remote_storage: Option, - tenants: &'static std::sync::RwLock, - tenant: Arc, - ) -> Result<(), DeleteTenantError> { - span::debug_assert_current_span_has_tenant_id(); - - let mut guard = Self::prepare(&tenant).await?; - - if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await { - tenant.set_broken(format!("{e:#}")).await; - return Err(e); - } - - Self::schedule_background(guard, conf, remote_storage, tenants, tenant); - - Ok(()) - } - - // Helper function needed to be able to match once on returned error and transition tenant into broken state. - // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown - // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried. - // So the solution is to set tenant state to broken. - async fn run_inner( - guard: &mut OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: Option<&GenericRemoteStorage>, - tenant: &Tenant, - ) -> Result<(), DeleteTenantError> { - guard.mark_in_progress()?; - - fail::fail_point!("tenant-delete-before-create-remote-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-remote-mark" - ))? - }); - - // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend. - // Though sounds scary, different mark name? - // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state. - if let Some(remote_storage) = &remote_storage { - create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id) - .await - .context("remote_mark")? - } - - fail::fail_point!("tenant-delete-before-create-local-mark", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-create-local-mark" - ))? - }); - - create_local_delete_mark(conf, &tenant.tenant_id) - .await - .context("local delete mark")?; - - fail::fail_point!("tenant-delete-before-background", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-background" - ))? - }); - - Ok(()) - } - - fn mark_in_progress(&mut self) -> anyhow::Result<()> { - match self { - Self::Finished => anyhow::bail!("Bug. Is in finished state"), - Self::InProgress { .. } => { /* We're in a retry */ } - Self::NotStarted => { /* Fresh start */ } - } - - *self = Self::InProgress; - - Ok(()) - } - - pub(crate) async fn should_resume_deletion( - conf: &'static PageServerConf, - remote_mark_exists: bool, - tenant: &Tenant, - ) -> Result, DeleteTenantError> { - let acquire = |t: &Tenant| { - Some( - Arc::clone(&t.delete_progress) - .try_lock_owned() - .expect("we're the only owner during init"), - ) - }; - - if remote_mark_exists { - return Ok(acquire(tenant)); - } - - let tenant_id = tenant.tenant_id; - // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists. - if conf.tenant_deleted_mark_file_path(&tenant_id).exists() { - Ok(acquire(tenant)) - } else { - Ok(None) - } - } - - pub(crate) async fn resume_from_attach( - guard: DeletionGuard, - tenant: &Arc, - preload: Option, - tenants: &'static std::sync::RwLock, - init_order: Option, - ctx: &RequestContext, - ) -> Result<(), DeleteTenantError> { - tenant - .set_stopping(false, true) - .await - .expect("cant be stopping or broken"); - - tenant - .attach(init_order, preload, ctx) - .await - .context("attach")?; - - Self::background( - guard, - tenant.conf, - tenant.remote_storage.clone(), - tenants, - tenant, - ) - .await - } - - async fn prepare( - tenant: &Arc, - ) -> Result, DeleteTenantError> { - // FIXME: unsure about active only. Our init jobs may not be cancellable properly, - // so at least for now allow deletions only for active tenants. TODO recheck - // Broken and Stopping is needed for retries. - if !matches!( - tenant.current_state(), - TenantState::Active | TenantState::Broken { .. } - ) { - return Err(DeleteTenantError::InvalidState(tenant.current_state())); - } - - let guard = Arc::clone(&tenant.delete_progress) - .try_lock_owned() - .map_err(|_| DeleteTenantError::AlreadyInProgress)?; - - fail::fail_point!("tenant-delete-before-shutdown", |_| { - Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))? - }); - - // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent. - // i e it is an error to do: - // tenant.set_stopping - // tenant.shutdown - // Its also bad that we're holding tenants.read here. - // TODO relax set_stopping to be idempotent? - if tenant.shutdown(false).await.is_err() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "tenant shutdown is already in progress" - ))); - } - - Ok(guard) - } - - fn schedule_background( - guard: OwnedMutexGuard, - conf: &'static PageServerConf, - remote_storage: Option, - tenants: &'static std::sync::RwLock, - tenant: Arc, - ) { - let tenant_id = tenant.tenant_id; - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::TimelineDeletionWorker, - Some(tenant_id), - None, - "tenant_delete", - false, - async move { - if let Err(err) = - Self::background(guard, conf, remote_storage, tenants, &tenant).await - { - error!("Error: {err:#}"); - tenant.set_broken(format!("{err:#}")).await; - }; - Ok(()) - } - .instrument({ - let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id); - span.follows_from(Span::current()); - span - }), - ); - } - - async fn background( - mut guard: OwnedMutexGuard, - conf: &PageServerConf, - remote_storage: Option, - tenants: &'static std::sync::RwLock, - tenant: &Arc, - ) -> Result<(), DeleteTenantError> { - // Tree sort timelines, schedule delete for them. Mention retries from the console side. - // Note that if deletion fails we dont mark timelines as broken, - // the whole tenant will become broken as by `Self::schedule_background` logic - let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant) - .await - .context("schedule_ordered_timeline_deletions")?; - - fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-polling-ongoing-deletions" - ))? - }); - - // Wait for deletions that were already running at the moment when tenant deletion was requested. - // When we can lock deletion guard it means that corresponding timeline deletion finished. - for (guard, timeline_id) in already_running_timeline_deletions { - let flow = guard.lock().await; - if !flow.is_finished() { - return Err(DeleteTenantError::Other(anyhow::anyhow!( - "already running timeline deletion failed: {timeline_id}" - ))); - } - } - - let timelines_path = conf.timelines_path(&tenant.tenant_id); - // May not exist if we fail in cleanup_remaining_fs_traces after removing it - if timelines_path.exists() { - // sanity check to guard against layout changes - ensure_timelines_dir_empty(&timelines_path) - .await - .context("timelines dir not empty")?; - } - - remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?; - - fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| { - Err(anyhow::anyhow!( - "failpoint: tenant-delete-before-cleanup-remaining-fs-traces" - ))? - }); - - cleanup_remaining_fs_traces(conf, &tenant.tenant_id) - .await - .context("cleanup_remaining_fs_traces")?; - - let mut locked = tenants.write().unwrap(); - if locked.remove(&tenant.tenant_id).is_none() { - warn!("Tenant got removed from tenants map during deletion"); - }; - - *guard = Self::Finished; - - Ok(()) - } -} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 318f35075c..914098a1cf 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -26,7 +26,6 @@ use crate::control_plane_client::{ use crate::deletion_queue::DeletionQueueClient; use crate::task_mgr::{self, TaskKind}; use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt}; -use crate::tenant::delete::DeleteTenantFlow; use crate::tenant::{ create_tenant_files, AttachedTenantConf, ShutdownError, SpawnMode, Tenant, TenantState, }; @@ -37,7 +36,7 @@ use utils::fs_ext::PathExt; use utils::generation::Generation; use utils::id::{TenantId, TimelineId}; -use super::delete::DeleteTenantError; +use super::delete::{delete_tenant_background, delete_tenant_foreground, DeleteTenantError}; use super::timeline::delete::DeleteTimelineFlow; use super::TenantSharedResources; @@ -104,13 +103,6 @@ impl TenantsMap { } } } - - pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option { - match self { - TenantsMap::Initializing => None, - TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id), - } - } } /// This is "safe" in that that it won't leave behind a partially deleted directory @@ -119,12 +111,14 @@ impl TenantsMap { /// /// This is pageserver-specific, as it relies on future processes after a crash to check /// for TEMP_FILE_SUFFIX when loading things. -async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { +pub(crate) async fn safe_remove_tenant_dir_all(path: impl AsRef) -> std::io::Result<()> { let tmp_path = safe_rename_tenant_dir(path).await?; fs::remove_dir_all(tmp_path).await } -async fn safe_rename_tenant_dir(path: impl AsRef) -> std::io::Result { +pub(crate) async fn safe_rename_tenant_dir( + path: impl AsRef, +) -> std::io::Result { let parent = path .as_ref() .parent() @@ -370,6 +364,10 @@ pub async fn init_tenant_mgr( let tenant_generations = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; + // Tenants may send into this channel if they discover that they are in a partially + // deleted state. + let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::>(128); + // Construct `Tenant` objects and start them running for (tenant_id, location_conf) in tenant_configs { let tenant_dir_path = conf.tenant_path(&tenant_id); @@ -441,8 +439,8 @@ pub async fn init_tenant_mgr( resources.clone(), AttachedTenantConf::try_from(location_conf)?, Some(init_order.clone()), - &TENANTS, SpawnMode::Normal, + Some(deletion_upcall_tx.clone()), &ctx, ) { Ok(tenant) => { @@ -459,9 +457,41 @@ pub async fn init_tenant_mgr( let mut tenants_map = TENANTS.write().unwrap(); assert!(matches!(&*tenants_map, &TenantsMap::Initializing)); *tenants_map = TenantsMap::Open(tenants); + + spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx); + Ok(()) } +/// For some edge cases, like resuming their own deletion on attach, tenants +/// may submit mgr-level operations into a queue. +fn spawn_handle_upcalls_task( + conf: &'static PageServerConf, + remote_storage: &Option, + mut deletion_upcall_rx: tokio::sync::mpsc::Receiver>, +) { + let remote_storage = remote_storage.clone(); + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TenantManagerUpcall, + None, + None, + "tenant manager upcall", + false, + async move { + while let Some(tenant) = deletion_upcall_rx.recv().await { + if let Err(e) = + delete_tenant(conf, remote_storage.clone(), tenant.tenant_id, true).await + { + error!(tenant_id = %tenant.get_tenant_id(), "Failed to complete deletion of tenant: {e}"); + } + } + + Ok(()) + }, + ); +} + /// Wrapper for Tenant::spawn that checks invariants before running, and inserts /// a broken tenant in the map if Tenant::spawn fails. #[allow(clippy::too_many_arguments)] @@ -472,8 +502,8 @@ pub(crate) fn tenant_spawn( resources: TenantSharedResources, location_conf: AttachedTenantConf, init_order: Option, - tenants: &'static std::sync::RwLock, mode: SpawnMode, + resume_deletion_upcall: Option>>, ctx: &RequestContext, ) -> anyhow::Result> { anyhow::ensure!( @@ -504,8 +534,8 @@ pub(crate) fn tenant_spawn( resources, location_conf, init_order, - tenants, mode, + resume_deletion_upcall, ctx, ) { Ok(tenant) => tenant, @@ -601,8 +631,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { if let Err(e) = res { match e { ShutdownError::AlreadyStopping => { - // TODO: to ensure this can _never_ happen, we need to get rid of - // the horrible DeleteTenantFlow::should_resume_deletion tracing::warn!(%tenant_id, "Tenant already stopping during shutdown"); } } @@ -684,8 +712,8 @@ pub(crate) async fn create_tenant( resources, AttachedTenantConf::try_from(location_conf)?, None, - &TENANTS, SpawnMode::Create, + None, ctx, )?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. @@ -849,8 +877,8 @@ pub(crate) async fn upsert_location( }, AttachedTenantConf::try_from(new_location_config)?, None, - &TENANTS, SpawnMode::Normal, + None, ctx, )?; @@ -915,18 +943,11 @@ pub(crate) async fn delete_tenant( conf: &'static PageServerConf, remote_storage: Option, tenant_id: TenantId, + resume: bool, ) -> Result<(), DeleteTenantError> { // We acquire a SlotGuard during this function to protect against concurrent - // changes while the ::prepare phase of DeleteTenantFlow executes, but then - // have to return the Tenant to the map while the background deletion runs. - // - // TODO: refactor deletion to happen outside the lifetime of a Tenant. - // Currently, deletion requires a reference to the tenants map in order to - // keep the Tenant in the map until deletion is complete, and then remove - // it at the end. - // - // See https://github.com/neondatabase/neon/issues/5080 - + // operations, but also to ensure we do not permit re-creation of the same + // tenant ID until we are done with the deletion. let mut slot_guard = tenant_map_acquire_slot(&tenant_id, Some(true))?; // unwrap is safe because we used expect_exist=true when acquiring the slot @@ -940,11 +961,51 @@ pub(crate) async fn delete_tenant( } }; - let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await; + // Do the foreground part of deletion: if it fails, set the tenant broken + // and leave it in the tenants map + if let Err(e) = + delete_tenant_foreground(conf, remote_storage.clone(), &tenant, resume, &slot_guard).await + { + tenant.set_broken(format!("{e:#}")).await; + slot_guard.upsert(TenantSlot::Attached(tenant))?; + return Err(e); + } - // Replace our InProgress marker with the Tenant in attached state, after the prepare phase of deletion is done - slot_guard.upsert(slot)?; - result + // Do the actual deletion in the background, while holding the SlotGuard + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::MgmtRequest, + None, // This task runs without the tenant_id, as it should not be caught by Tenant::shutdown + None, + "delete_tenant_background", + false, + async move { + match delete_tenant_background( + conf, + remote_storage, + tenant_id, + &tenant, + resume, + &slot_guard, + ) + .instrument(info_span!("delete_tenant_background", %tenant_id)) + .await + { + Ok(()) => { + info!(%tenant_id, "Tenant deletion complete"); + } + Err(e) => { + warn!(% tenant_id, "Tenant deletion failed: {e:#}"); + tenant.set_broken(format!("{e:#}")).await; + slot_guard.upsert(TenantSlot::Attached(tenant))?; + } + } + + Ok(()) + }, + ); + + Ok(()) } #[derive(Debug, thiserror::Error)] @@ -1093,8 +1154,8 @@ pub(crate) async fn load_tenant( resources, AttachedTenantConf::try_from(location_conf)?, None, - &TENANTS, SpawnMode::Normal, + None, ctx, ) .with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?; @@ -1172,15 +1233,17 @@ pub(crate) async fn attach_tenant( // TODO: tenant directory remains on disk if we bail out from here on. // See https://github.com/neondatabase/neon/issues/4233 + let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::>(128); + let attached_tenant = tenant_spawn( conf, tenant_id, &tenant_dir, - resources, + resources.clone(), AttachedTenantConf::try_from(location_conf)?, None, - &TENANTS, SpawnMode::Normal, + Some(deletion_upcall_tx), ctx, )?; // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here. @@ -1193,6 +1256,8 @@ pub(crate) async fn attach_tenant( ))); } + spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx); + tenant_guard.upsert(TenantSlot::Attached(attached_tenant))?; Ok(()) } @@ -1260,7 +1325,7 @@ pub enum TenantMapError { /// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`] /// for this tenant, which acts as a marker for any operations targeting /// this tenant to retry later, or wait for the InProgress state to end. -pub struct SlotGuard { +pub(crate) struct SlotGuard { tenant_id: TenantId, old_value: Option, upserted: bool, diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6d30664515..f3201d4739 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -410,7 +410,7 @@ impl DeleteTimelineFlow { } /// Shortcut to create Timeline in stopping state and spawn deletion task. - /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`] + /// See corresponding parts of [`crate::tenant::timeline::delete::DeleteTimelineFlow`] #[instrument(skip_all, fields(%timeline_id))] pub async fn resume_deletion( tenant: Arc, diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index ae77197088..2ed30dbf97 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -116,8 +116,6 @@ FAILPOINTS = [ "tenant-delete-before-create-local-mark", "tenant-delete-before-background", "tenant-delete-before-polling-ongoing-deletions", - "tenant-delete-before-cleanup-remaining-fs-traces", - "tenant-delete-before-remove-timelines-dir", "tenant-delete-before-remove-deleted-mark", "tenant-delete-before-remove-tenant-dir", # Some failpoints from timeline deletion @@ -129,7 +127,6 @@ FAILPOINTS = [ FAILPOINTS_BEFORE_BACKGROUND = [ "timeline-delete-before-schedule", - "tenant-delete-before-shutdown", "tenant-delete-before-create-remote-mark", "tenant-delete-before-create-local-mark", "tenant-delete-before-background", @@ -243,10 +240,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints( if check is Check.RETRY_WITH_RESTART: env.pageserver.restart() - if failpoint in ( - "tenant-delete-before-shutdown", - "tenant-delete-before-create-remote-mark", - ): + if failpoint in ("tenant-delete-before-create-remote-mark",): wait_until_tenant_active( ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25 )