pageserver: start simplifying tenant deletion

This commit is contained in:
John Spray
2023-10-27 15:53:39 +01:00
parent 1874f9427a
commit c178f34e21
7 changed files with 311 additions and 494 deletions

View File

@@ -275,7 +275,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
use crate::tenant::delete::DeleteTenantError::*;
match value {
Get(g) => ApiError::from(g),
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
Timeline(t) => ApiError::from(t),
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
SlotError(e) => e.into(),
@@ -782,7 +781,7 @@ async fn tenant_delete_handler(
let state = get_state(&request);
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id, false)
.instrument(info_span!("tenant_delete_handler", %tenant_id))
.await?;

View File

@@ -291,6 +291,9 @@ pub enum TaskKind {
// A request that comes in via the pageserver HTTP API.
MgmtRequest,
// Tenant Manager servicing a channel that enables upcalls from Tenant
TenantManagerUpcall,
DebugTool,
#[cfg(test)]

View File

@@ -51,10 +51,9 @@ use self::config::AttachedLocationConfig;
use self::config::AttachmentMode;
use self::config::LocationConf;
use self::config::TenantConf;
use self::delete::DeleteTenantFlow;
use self::delete::should_resume_deletion;
use self::metadata::LoadMetadataError;
use self::metadata::TimelineMetadata;
use self::mgr::TenantsMap;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
@@ -250,8 +249,6 @@ pub struct Tenant {
cached_synthetic_tenant_size: Arc<AtomicU64>,
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
}
impl std::fmt::Debug for Tenant {
@@ -537,8 +534,8 @@ impl Tenant {
resources: TenantSharedResources,
attached_conf: AttachedTenantConf,
init_order: Option<InitializationOrder>,
tenants: &'static std::sync::RwLock<TenantsMap>,
mode: SpawnMode,
resume_deletion_upcall: Option<tokio::sync::mpsc::Sender<Arc<Tenant>>>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
@@ -625,67 +622,48 @@ impl Tenant {
// Remote preload is complete.
drop(remote_load_completion);
let pending_deletion = {
match DeleteTenantFlow::should_resume_deletion(
conf,
preload.as_ref().map(|p| p.deleting).unwrap_or(false),
&tenant_clone,
)
.await
{
Ok(should_resume_deletion) => should_resume_deletion,
Err(err) => {
make_broken(&tenant_clone, anyhow::anyhow!(err));
return Ok(());
let resume_deletion = should_resume_deletion(conf, preload.as_ref().map(|p| p.deleting).unwrap_or(false), &tenant_id).await?;
if resume_deletion {
// We will wait until the background
info!("Tenant is partially deleted, will resume");
// Put the Tenant into a Stopping state so that it will not try to serve any I/O
tenant_clone
.set_stopping(false, true)
.await
.expect("cant be stopping or broken");
// Proceed with attaching the tenant to load the metadata we will use for remote deletion
match tenant_clone.attach(init_order, preload, &ctx).await {
Ok(()) => {
info!("attach finished, deletion will resume");
}
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
}
if let Some(resume_deletion_upcall) = resume_deletion_upcall {
// If this send() fails it means we're shutting down, so just drop it on the floor
resume_deletion_upcall.send(tenant_clone).await.ok();
} else {
make_broken(&tenant_clone, anyhow::anyhow!(
"Attemped to attach a partially deleted tenant outside of pageserver startup"
));
}
} else {
// Normal case: load all the metadata for the tenant.
match tenant_clone.attach(init_order, preload, &ctx).await {
Ok(()) => {
info!("attach finished, activating");
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
}
};
info!("pending_deletion {}", pending_deletion.is_some());
if let Some(deletion) = pending_deletion {
// as we are no longer loading, signal completion by dropping
// the completion while we resume deletion
drop(_completion);
// do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
let _ = init_order
.as_mut()
.and_then(|x| x.initial_logical_size_attempt.take());
let background_jobs_can_start =
init_order.as_ref().map(|x| &x.background_jobs_can_start);
if let Some(background) = background_jobs_can_start {
info!("waiting for backgound jobs barrier");
background.clone().wait().await;
info!("ready for backgound jobs barrier");
}
match DeleteTenantFlow::resume_from_attach(
deletion,
&tenant_clone,
preload,
tenants,
init_order,
&ctx,
)
.await
{
Err(err) => {
make_broken(&tenant_clone, anyhow::anyhow!(err));
return Ok(());
}
Ok(()) => return Ok(()),
}
}
match tenant_clone.attach(init_order, preload, &ctx).await {
Ok(()) => {
info!("attach finished, activating");
tenant_clone.activate(broker_client, None, &ctx);
}
Err(e) => {
make_broken(&tenant_clone, anyhow::anyhow!(e));
}
}
Ok(())
}
.instrument({
@@ -2362,7 +2340,6 @@ impl Tenant {
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
}
}

View File

@@ -1,31 +1,27 @@
use std::sync::Arc;
use anyhow::Context;
use camino::{Utf8Path, Utf8PathBuf};
use camino::Utf8Path;
use pageserver_api::models::TenantState;
use remote_storage::{GenericRemoteStorage, RemotePath};
use tokio::sync::OwnedMutexGuard;
use tokio_util::sync::CancellationToken;
use tracing::{error, instrument, warn, Instrument, Span};
use tracing::error;
use utils::{
backoff, crashsafe, fs_ext,
backoff, crashsafe,
id::{TenantId, TimelineId},
};
use crate::{
config::PageServerConf,
context::RequestContext,
task_mgr::{self, TaskKind},
InitializationOrder,
tenant::{mgr::safe_rename_tenant_dir, ShutdownError},
};
use super::{
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
mgr::{GetTenantError, SlotGuard, TenantSlotError, TenantSlotUpsertError},
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
span,
timeline::delete::DeleteTimelineFlow,
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
tree_sort_timelines, DeleteTimelineError, Tenant,
};
#[derive(Debug, thiserror::Error)]
@@ -39,9 +35,6 @@ pub(crate) enum DeleteTenantError {
#[error("Invalid state {0}. Expected Active or Broken")]
InvalidState(TenantState),
#[error("Tenant deletion is already in progress")]
AlreadyInProgress,
#[error("Tenant map slot error {0}")]
SlotError(#[from] TenantSlotError),
@@ -55,7 +48,142 @@ pub(crate) enum DeleteTenantError {
Other(#[from] anyhow::Error),
}
type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
/// The part of tenant deletion that must happen before acknowledging the deletion request
///
/// Usually deletion requires that the tenant is not already in Stopping state: set
/// `resume` to true to skip this check if resuming deletion at startup, since we
/// would have loaded the tenant into Stopping mode already
pub(crate) async fn delete_tenant_foreground(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenant: &Arc<Tenant>,
resume: bool,
// Witness that we are doing this work within a TenantSlot::InProgress state.
_slot_guard: &SlotGuard,
) -> Result<(), DeleteTenantError> {
// In resume mode, we must be stopping. Else, we must _not_ be Stopping.
if !(matches!(tenant.current_state(), TenantState::Stopping) ^ !resume) {
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
}
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-create-remote-mark"
))?
});
// Write persistent tombstone
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.get_tenant_id()).await?;
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-create-local-mark"
))?
});
create_local_delete_mark(conf, &tenant.tenant_id)
.await
.context("local create mark")?;
fail::fail_point!("tenant-delete-before-background", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-background"
))?
});
Ok(())
}
/// The part of tenant deletion that happens after we have already acknowledged the request.
///
/// This part may retried after pageserver restart if we see a tenant that has a deletion marker
/// but has not been completely deleted.
pub(crate) async fn delete_tenant_background(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenant_id: TenantId,
tenant: &Arc<Tenant>,
resume: bool,
// Witness that we are doing this work within a TenantSlot::InProgress state.
_slot_guard: &SlotGuard,
) -> anyhow::Result<()> {
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
.await
.context("schedule_ordered_timeline_deletions")?;
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-polling-ongoing-deletions"
))?
});
// Wait for deletions that were already running at the moment when tenant deletion was requested.
// When we can lock deletion guard it means that corresponding timeline deletion finished.
for (guard, timeline_id) in already_running_timeline_deletions {
let flow = guard.lock().await;
if !flow.is_finished() {
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"already running timeline deletion failed: {timeline_id}"
))
.into());
}
}
// For convenience, use the Tenant's deletion queue reference so that we don't have
// to take it as an argument.
let deletion_queue_client = tenant.deletion_queue_client.clone();
fail::fail_point!("tenant-delete-before-shutdown", |_| {
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
});
// Tear down local runtime state
if !resume {
tenant.shutdown(false).await.map_err(|e| match e {
ShutdownError::AlreadyStopping => {
DeleteTenantError::InvalidState(TenantState::Stopping)
}
})?;
}
// Not necessary for correctness, but executing deletions before erasing local contents
// means that we will have a better chance to resume the delete if we crash.
deletion_queue_client.flush_execute().await?;
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-remove-deleted-mark"
))?
});
// Remove the deletion marker
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant_id).await?;
// Not necessary for correctness, but helps make it true that when we log that deletion is
// done, it really is.
deletion_queue_client.flush_execute().await?;
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-remove-tenant-dir"
))?
});
// Remove local storage contents. We do this last, so that if we crash during delete, on
// restart we will attempt to re-attach the tenant and resume the deletion.
let local_tenant_directory = conf.tenant_path(&tenant_id);
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
.await
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
tokio::fs::remove_dir_all(tmp_path.as_path())
.await
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))?;
Ok(())
}
fn remote_tenant_delete_mark_path(
conf: &PageServerConf,
@@ -70,7 +198,7 @@ fn remote_tenant_delete_mark_path(
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
}
async fn create_remote_delete_mark(
pub(crate) async fn create_remote_delete_mark(
conf: &PageServerConf,
remote_storage: &GenericRemoteStorage,
tenant_id: &TenantId,
@@ -115,7 +243,26 @@ async fn create_local_delete_mark(
Ok(())
}
async fn schedule_ordered_timeline_deletions(
pub(crate) async fn should_resume_deletion(
conf: &'static PageServerConf,
remote_mark_exists: bool,
tenant_id: &TenantId,
) -> Result<bool, DeleteTenantError> {
if remote_mark_exists {
return Ok(true);
}
// In the very last stage of deletion, we might have already removed the remote
// marker, but be attaching the tenant anyway on the basis of its local directory
// existing: to resume deletion in this case we need the local deletion marker.
if conf.tenant_deleted_mark_file_path(tenant_id).exists() {
Ok(true)
} else {
Ok(false)
}
}
pub(crate) async fn schedule_ordered_timeline_deletions(
tenant: &Arc<Tenant>,
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
// Tenant is stopping at this point. We know it will be deleted.
@@ -153,21 +300,7 @@ async fn schedule_ordered_timeline_deletions(
Ok(already_running_deletions)
}
async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
// Assert timelines dir is empty.
if !fs_ext::is_directory_empty(timelines_path).await? {
// Display first 10 items in directory
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
let list = &list.into_iter().take(10).collect::<Vec<_>>();
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"Timelines directory is not empty after all timelines deletion: {list:?}"
)));
}
Ok(())
}
async fn remove_tenant_remote_delete_mark(
pub(crate) async fn remove_tenant_remote_delete_mark(
conf: &PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant_id: &TenantId,
@@ -188,357 +321,3 @@ async fn remove_tenant_remote_delete_mark(
}
Ok(())
}
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
async fn cleanup_remaining_fs_traces(
conf: &PageServerConf,
tenant_id: &TenantId,
) -> Result<(), DeleteTenantError> {
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
if is_dir {
tokio::fs::remove_dir(&p).await
} else {
tokio::fs::remove_file(&p).await
}
.or_else(fs_ext::ignore_not_found)
.with_context(|| format!("failed to delete {p}"))
};
rm(conf.tenant_config_path(tenant_id), false).await?;
rm(conf.tenant_location_config_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-remove-timelines-dir"
))?
});
rm(conf.timelines_path(tenant_id), true).await?;
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-remove-deleted-mark"
))?
});
// Make sure previous deletions are ordered before mark removal.
// Otherwise there is no guarantee that they reach the disk before mark deletion.
// So its possible for mark to reach disk first and for other deletions
// to be reordered later and thus missed if a crash occurs.
// Note that we dont need to sync after mark file is removed
// because we can tolerate the case when mark file reappears on startup.
let tenant_path = &conf.tenant_path(tenant_id);
if tenant_path.exists() {
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
.await
.context("fsync_pre_mark_remove")?;
}
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-remove-tenant-dir"
))?
});
rm(conf.tenant_path(tenant_id), true).await?;
Ok(())
}
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
/// and deletes its data from both disk and s3.
/// The sequence of steps:
/// 1. Upload remote deletion mark.
/// 2. Create local mark file.
/// 3. Shutdown tasks
/// 4. Run ordered timeline deletions
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
/// 6. Remove remote mark
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
/// It is resumable from any step in case a crash/restart occurs.
/// There are two entrypoints to the process:
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
#[derive(Default)]
pub enum DeleteTenantFlow {
#[default]
NotStarted,
InProgress,
Finished,
}
impl DeleteTenantFlow {
// These steps are run in the context of management api request handler.
// Long running steps are continued to run in the background.
// NB: If this fails half-way through, and is retried, the retry will go through
// all the same steps again. Make sure the code here is idempotent, and don't
// error out if some of the shutdown tasks have already been completed!
// NOTE: static needed for background part.
// We assume that calling code sets up the span with tenant_id.
#[instrument(skip_all)]
pub(crate) async fn run(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
Ok(())
}
// Helper function needed to be able to match once on returned error and transition tenant into broken state.
// This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
// So the solution is to set tenant state to broken.
async fn run_inner(
guard: &mut OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant,
) -> Result<(), DeleteTenantError> {
guard.mark_in_progress()?;
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-create-remote-mark"
))?
});
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-create-local-mark"
))?
});
create_local_delete_mark(conf, &tenant.tenant_id)
.await
.context("local delete mark")?;
fail::fail_point!("tenant-delete-before-background", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-background"
))?
});
Ok(())
}
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
match self {
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
Self::InProgress { .. } => { /* We're in a retry */ }
Self::NotStarted => { /* Fresh start */ }
}
*self = Self::InProgress;
Ok(())
}
pub(crate) async fn should_resume_deletion(
conf: &'static PageServerConf,
remote_mark_exists: bool,
tenant: &Tenant,
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
let acquire = |t: &Tenant| {
Some(
Arc::clone(&t.delete_progress)
.try_lock_owned()
.expect("we're the only owner during init"),
)
};
if remote_mark_exists {
return Ok(acquire(tenant));
}
let tenant_id = tenant.tenant_id;
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
Ok(acquire(tenant))
} else {
Ok(None)
}
}
pub(crate) async fn resume_from_attach(
guard: DeletionGuard,
tenant: &Arc<Tenant>,
preload: Option<TenantPreload>,
tenants: &'static std::sync::RwLock<TenantsMap>,
init_order: Option<InitializationOrder>,
ctx: &RequestContext,
) -> Result<(), DeleteTenantError> {
tenant
.set_stopping(false, true)
.await
.expect("cant be stopping or broken");
tenant
.attach(init_order, preload, ctx)
.await
.context("attach")?;
Self::background(
guard,
tenant.conf,
tenant.remote_storage.clone(),
tenants,
tenant,
)
.await
}
async fn prepare(
tenant: &Arc<Tenant>,
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
// so at least for now allow deletions only for active tenants. TODO recheck
// Broken and Stopping is needed for retries.
if !matches!(
tenant.current_state(),
TenantState::Active | TenantState::Broken { .. }
) {
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
}
let guard = Arc::clone(&tenant.delete_progress)
.try_lock_owned()
.map_err(|_| DeleteTenantError::AlreadyInProgress)?;
fail::fail_point!("tenant-delete-before-shutdown", |_| {
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
});
// It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
// i e it is an error to do:
// tenant.set_stopping
// tenant.shutdown
// Its also bad that we're holding tenants.read here.
// TODO relax set_stopping to be idempotent?
if tenant.shutdown(false).await.is_err() {
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"tenant shutdown is already in progress"
)));
}
Ok(guard)
}
fn schedule_background(
guard: OwnedMutexGuard<Self>,
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
) {
let tenant_id = tenant.tenant_id;
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TimelineDeletionWorker,
Some(tenant_id),
None,
"tenant_delete",
false,
async move {
if let Err(err) =
Self::background(guard, conf, remote_storage, tenants, &tenant).await
{
error!("Error: {err:#}");
tenant.set_broken(format!("{err:#}")).await;
};
Ok(())
}
.instrument({
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
span.follows_from(Span::current());
span
}),
);
}
async fn background(
mut guard: OwnedMutexGuard<Self>,
conf: &PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: &Arc<Tenant>,
) -> Result<(), DeleteTenantError> {
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
// Note that if deletion fails we dont mark timelines as broken,
// the whole tenant will become broken as by `Self::schedule_background` logic
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
.await
.context("schedule_ordered_timeline_deletions")?;
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-polling-ongoing-deletions"
))?
});
// Wait for deletions that were already running at the moment when tenant deletion was requested.
// When we can lock deletion guard it means that corresponding timeline deletion finished.
for (guard, timeline_id) in already_running_timeline_deletions {
let flow = guard.lock().await;
if !flow.is_finished() {
return Err(DeleteTenantError::Other(anyhow::anyhow!(
"already running timeline deletion failed: {timeline_id}"
)));
}
}
let timelines_path = conf.timelines_path(&tenant.tenant_id);
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
if timelines_path.exists() {
// sanity check to guard against layout changes
ensure_timelines_dir_empty(&timelines_path)
.await
.context("timelines dir not empty")?;
}
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
Err(anyhow::anyhow!(
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
))?
});
cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
.await
.context("cleanup_remaining_fs_traces")?;
let mut locked = tenants.write().unwrap();
if locked.remove(&tenant.tenant_id).is_none() {
warn!("Tenant got removed from tenants map during deletion");
};
*guard = Self::Finished;
Ok(())
}
}

View File

@@ -26,7 +26,6 @@ use crate::control_plane_client::{
use crate::deletion_queue::DeletionQueueClient;
use crate::task_mgr::{self, TaskKind};
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
use crate::tenant::delete::DeleteTenantFlow;
use crate::tenant::{
create_tenant_files, AttachedTenantConf, ShutdownError, SpawnMode, Tenant, TenantState,
};
@@ -37,7 +36,7 @@ use utils::fs_ext::PathExt;
use utils::generation::Generation;
use utils::id::{TenantId, TimelineId};
use super::delete::DeleteTenantError;
use super::delete::{delete_tenant_background, delete_tenant_foreground, DeleteTenantError};
use super::timeline::delete::DeleteTimelineFlow;
use super::TenantSharedResources;
@@ -104,13 +103,6 @@ impl TenantsMap {
}
}
}
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
}
}
}
/// This is "safe" in that that it won't leave behind a partially deleted directory
@@ -119,12 +111,14 @@ impl TenantsMap {
///
/// This is pageserver-specific, as it relies on future processes after a crash to check
/// for TEMP_FILE_SUFFIX when loading things.
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
pub(crate) async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
let tmp_path = safe_rename_tenant_dir(path).await?;
fs::remove_dir_all(tmp_path).await
}
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
pub(crate) async fn safe_rename_tenant_dir(
path: impl AsRef<Utf8Path>,
) -> std::io::Result<Utf8PathBuf> {
let parent = path
.as_ref()
.parent()
@@ -370,6 +364,10 @@ pub async fn init_tenant_mgr(
let tenant_generations =
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
// Tenants may send into this channel if they discover that they are in a partially
// deleted state.
let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::<Arc<Tenant>>(128);
// Construct `Tenant` objects and start them running
for (tenant_id, location_conf) in tenant_configs {
let tenant_dir_path = conf.tenant_path(&tenant_id);
@@ -441,8 +439,8 @@ pub async fn init_tenant_mgr(
resources.clone(),
AttachedTenantConf::try_from(location_conf)?,
Some(init_order.clone()),
&TENANTS,
SpawnMode::Normal,
Some(deletion_upcall_tx.clone()),
&ctx,
) {
Ok(tenant) => {
@@ -459,9 +457,41 @@ pub async fn init_tenant_mgr(
let mut tenants_map = TENANTS.write().unwrap();
assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
*tenants_map = TenantsMap::Open(tenants);
spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx);
Ok(())
}
/// For some edge cases, like resuming their own deletion on attach, tenants
/// may submit mgr-level operations into a queue.
fn spawn_handle_upcalls_task(
conf: &'static PageServerConf,
remote_storage: &Option<GenericRemoteStorage>,
mut deletion_upcall_rx: tokio::sync::mpsc::Receiver<Arc<Tenant>>,
) {
let remote_storage = remote_storage.clone();
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::TenantManagerUpcall,
None,
None,
"tenant manager upcall",
false,
async move {
while let Some(tenant) = deletion_upcall_rx.recv().await {
if let Err(e) =
delete_tenant(conf, remote_storage.clone(), tenant.tenant_id, true).await
{
error!(tenant_id = %tenant.get_tenant_id(), "Failed to complete deletion of tenant: {e}");
}
}
Ok(())
},
);
}
/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
/// a broken tenant in the map if Tenant::spawn fails.
#[allow(clippy::too_many_arguments)]
@@ -472,8 +502,8 @@ pub(crate) fn tenant_spawn(
resources: TenantSharedResources,
location_conf: AttachedTenantConf,
init_order: Option<InitializationOrder>,
tenants: &'static std::sync::RwLock<TenantsMap>,
mode: SpawnMode,
resume_deletion_upcall: Option<tokio::sync::mpsc::Sender<Arc<Tenant>>>,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
anyhow::ensure!(
@@ -504,8 +534,8 @@ pub(crate) fn tenant_spawn(
resources,
location_conf,
init_order,
tenants,
mode,
resume_deletion_upcall,
ctx,
) {
Ok(tenant) => tenant,
@@ -601,8 +631,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
if let Err(e) = res {
match e {
ShutdownError::AlreadyStopping => {
// TODO: to ensure this can _never_ happen, we need to get rid of
// the horrible DeleteTenantFlow::should_resume_deletion
tracing::warn!(%tenant_id, "Tenant already stopping during shutdown");
}
}
@@ -684,8 +712,8 @@ pub(crate) async fn create_tenant(
resources,
AttachedTenantConf::try_from(location_conf)?,
None,
&TENANTS,
SpawnMode::Create,
None,
ctx,
)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
@@ -849,8 +877,8 @@ pub(crate) async fn upsert_location(
},
AttachedTenantConf::try_from(new_location_config)?,
None,
&TENANTS,
SpawnMode::Normal,
None,
ctx,
)?;
@@ -915,18 +943,11 @@ pub(crate) async fn delete_tenant(
conf: &'static PageServerConf,
remote_storage: Option<GenericRemoteStorage>,
tenant_id: TenantId,
resume: bool,
) -> Result<(), DeleteTenantError> {
// We acquire a SlotGuard during this function to protect against concurrent
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
// have to return the Tenant to the map while the background deletion runs.
//
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
// Currently, deletion requires a reference to the tenants map in order to
// keep the Tenant in the map until deletion is complete, and then remove
// it at the end.
//
// See https://github.com/neondatabase/neon/issues/5080
// operations, but also to ensure we do not permit re-creation of the same
// tenant ID until we are done with the deletion.
let mut slot_guard = tenant_map_acquire_slot(&tenant_id, Some(true))?;
// unwrap is safe because we used expect_exist=true when acquiring the slot
@@ -940,11 +961,51 @@ pub(crate) async fn delete_tenant(
}
};
let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
// Do the foreground part of deletion: if it fails, set the tenant broken
// and leave it in the tenants map
if let Err(e) =
delete_tenant_foreground(conf, remote_storage.clone(), &tenant, resume, &slot_guard).await
{
tenant.set_broken(format!("{e:#}")).await;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
return Err(e);
}
// Replace our InProgress marker with the Tenant in attached state, after the prepare phase of deletion is done
slot_guard.upsert(slot)?;
result
// Do the actual deletion in the background, while holding the SlotGuard
task_mgr::spawn(
task_mgr::BACKGROUND_RUNTIME.handle(),
TaskKind::MgmtRequest,
None, // This task runs without the tenant_id, as it should not be caught by Tenant::shutdown
None,
"delete_tenant_background",
false,
async move {
match delete_tenant_background(
conf,
remote_storage,
tenant_id,
&tenant,
resume,
&slot_guard,
)
.instrument(info_span!("delete_tenant_background", %tenant_id))
.await
{
Ok(()) => {
info!(%tenant_id, "Tenant deletion complete");
}
Err(e) => {
warn!(% tenant_id, "Tenant deletion failed: {e:#}");
tenant.set_broken(format!("{e:#}")).await;
slot_guard.upsert(TenantSlot::Attached(tenant))?;
}
}
Ok(())
},
);
Ok(())
}
#[derive(Debug, thiserror::Error)]
@@ -1093,8 +1154,8 @@ pub(crate) async fn load_tenant(
resources,
AttachedTenantConf::try_from(location_conf)?,
None,
&TENANTS,
SpawnMode::Normal,
None,
ctx,
)
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
@@ -1172,15 +1233,17 @@ pub(crate) async fn attach_tenant(
// TODO: tenant directory remains on disk if we bail out from here on.
// See https://github.com/neondatabase/neon/issues/4233
let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::<Arc<Tenant>>(128);
let attached_tenant = tenant_spawn(
conf,
tenant_id,
&tenant_dir,
resources,
resources.clone(),
AttachedTenantConf::try_from(location_conf)?,
None,
&TENANTS,
SpawnMode::Normal,
Some(deletion_upcall_tx),
ctx,
)?;
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
@@ -1193,6 +1256,8 @@ pub(crate) async fn attach_tenant(
)));
}
spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx);
tenant_guard.upsert(TenantSlot::Attached(attached_tenant))?;
Ok(())
}
@@ -1260,7 +1325,7 @@ pub enum TenantMapError {
/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
/// for this tenant, which acts as a marker for any operations targeting
/// this tenant to retry later, or wait for the InProgress state to end.
pub struct SlotGuard {
pub(crate) struct SlotGuard {
tenant_id: TenantId,
old_value: Option<TenantSlot>,
upserted: bool,

View File

@@ -410,7 +410,7 @@ impl DeleteTimelineFlow {
}
/// Shortcut to create Timeline in stopping state and spawn deletion task.
/// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
/// See corresponding parts of [`crate::tenant::timeline::delete::DeleteTimelineFlow`]
#[instrument(skip_all, fields(%timeline_id))]
pub async fn resume_deletion(
tenant: Arc<Tenant>,

View File

@@ -116,8 +116,6 @@ FAILPOINTS = [
"tenant-delete-before-create-local-mark",
"tenant-delete-before-background",
"tenant-delete-before-polling-ongoing-deletions",
"tenant-delete-before-cleanup-remaining-fs-traces",
"tenant-delete-before-remove-timelines-dir",
"tenant-delete-before-remove-deleted-mark",
"tenant-delete-before-remove-tenant-dir",
# Some failpoints from timeline deletion
@@ -129,7 +127,6 @@ FAILPOINTS = [
FAILPOINTS_BEFORE_BACKGROUND = [
"timeline-delete-before-schedule",
"tenant-delete-before-shutdown",
"tenant-delete-before-create-remote-mark",
"tenant-delete-before-create-local-mark",
"tenant-delete-before-background",
@@ -243,10 +240,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
if check is Check.RETRY_WITH_RESTART:
env.pageserver.restart()
if failpoint in (
"tenant-delete-before-shutdown",
"tenant-delete-before-create-remote-mark",
):
if failpoint in ("tenant-delete-before-create-remote-mark",):
wait_until_tenant_active(
ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
)