mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-20 06:30:43 +00:00
pageserver: start simplifying tenant deletion
This commit is contained in:
@@ -275,7 +275,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
|
||||
use crate::tenant::delete::DeleteTenantError::*;
|
||||
match value {
|
||||
Get(g) => ApiError::from(g),
|
||||
e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
|
||||
Timeline(t) => ApiError::from(t),
|
||||
NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
|
||||
SlotError(e) => e.into(),
|
||||
@@ -782,7 +781,7 @@ async fn tenant_delete_handler(
|
||||
|
||||
let state = get_state(&request);
|
||||
|
||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
|
||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id, false)
|
||||
.instrument(info_span!("tenant_delete_handler", %tenant_id))
|
||||
.await?;
|
||||
|
||||
|
||||
@@ -291,6 +291,9 @@ pub enum TaskKind {
|
||||
// A request that comes in via the pageserver HTTP API.
|
||||
MgmtRequest,
|
||||
|
||||
// Tenant Manager servicing a channel that enables upcalls from Tenant
|
||||
TenantManagerUpcall,
|
||||
|
||||
DebugTool,
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -51,10 +51,9 @@ use self::config::AttachedLocationConfig;
|
||||
use self::config::AttachmentMode;
|
||||
use self::config::LocationConf;
|
||||
use self::config::TenantConf;
|
||||
use self::delete::DeleteTenantFlow;
|
||||
use self::delete::should_resume_deletion;
|
||||
use self::metadata::LoadMetadataError;
|
||||
use self::metadata::TimelineMetadata;
|
||||
use self::mgr::TenantsMap;
|
||||
use self::remote_timeline_client::RemoteTimelineClient;
|
||||
use self::timeline::uninit::TimelineUninitMark;
|
||||
use self::timeline::uninit::UninitializedTimeline;
|
||||
@@ -250,8 +249,6 @@ pub struct Tenant {
|
||||
cached_synthetic_tenant_size: Arc<AtomicU64>,
|
||||
|
||||
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
|
||||
|
||||
pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for Tenant {
|
||||
@@ -537,8 +534,8 @@ impl Tenant {
|
||||
resources: TenantSharedResources,
|
||||
attached_conf: AttachedTenantConf,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
mode: SpawnMode,
|
||||
resume_deletion_upcall: Option<tokio::sync::mpsc::Sender<Arc<Tenant>>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
|
||||
@@ -625,67 +622,48 @@ impl Tenant {
|
||||
// Remote preload is complete.
|
||||
drop(remote_load_completion);
|
||||
|
||||
let pending_deletion = {
|
||||
match DeleteTenantFlow::should_resume_deletion(
|
||||
conf,
|
||||
preload.as_ref().map(|p| p.deleting).unwrap_or(false),
|
||||
&tenant_clone,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(should_resume_deletion) => should_resume_deletion,
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
let resume_deletion = should_resume_deletion(conf, preload.as_ref().map(|p| p.deleting).unwrap_or(false), &tenant_id).await?;
|
||||
if resume_deletion {
|
||||
// We will wait until the background
|
||||
info!("Tenant is partially deleted, will resume");
|
||||
|
||||
// Put the Tenant into a Stopping state so that it will not try to serve any I/O
|
||||
tenant_clone
|
||||
.set_stopping(false, true)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
// Proceed with attaching the tenant to load the metadata we will use for remote deletion
|
||||
match tenant_clone.attach(init_order, preload, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, deletion will resume");
|
||||
}
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(resume_deletion_upcall) = resume_deletion_upcall {
|
||||
// If this send() fails it means we're shutting down, so just drop it on the floor
|
||||
resume_deletion_upcall.send(tenant_clone).await.ok();
|
||||
} else {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(
|
||||
"Attemped to attach a partially deleted tenant outside of pageserver startup"
|
||||
));
|
||||
}
|
||||
} else {
|
||||
// Normal case: load all the metadata for the tenant.
|
||||
match tenant_clone.attach(init_order, preload, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
info!("pending_deletion {}", pending_deletion.is_some());
|
||||
|
||||
if let Some(deletion) = pending_deletion {
|
||||
// as we are no longer loading, signal completion by dropping
|
||||
// the completion while we resume deletion
|
||||
drop(_completion);
|
||||
// do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
|
||||
let _ = init_order
|
||||
.as_mut()
|
||||
.and_then(|x| x.initial_logical_size_attempt.take());
|
||||
let background_jobs_can_start =
|
||||
init_order.as_ref().map(|x| &x.background_jobs_can_start);
|
||||
if let Some(background) = background_jobs_can_start {
|
||||
info!("waiting for backgound jobs barrier");
|
||||
background.clone().wait().await;
|
||||
info!("ready for backgound jobs barrier");
|
||||
}
|
||||
|
||||
match DeleteTenantFlow::resume_from_attach(
|
||||
deletion,
|
||||
&tenant_clone,
|
||||
preload,
|
||||
tenants,
|
||||
init_order,
|
||||
&ctx,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Err(err) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(err));
|
||||
return Ok(());
|
||||
}
|
||||
Ok(()) => return Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
match tenant_clone.attach(init_order, preload, &ctx).await {
|
||||
Ok(()) => {
|
||||
info!("attach finished, activating");
|
||||
tenant_clone.activate(broker_client, None, &ctx);
|
||||
}
|
||||
Err(e) => {
|
||||
make_broken(&tenant_clone, anyhow::anyhow!(e));
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
@@ -2362,7 +2340,6 @@ impl Tenant {
|
||||
cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
|
||||
cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
|
||||
eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
|
||||
delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,31 +1,27 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use anyhow::Context;
|
||||
use camino::{Utf8Path, Utf8PathBuf};
|
||||
use camino::Utf8Path;
|
||||
use pageserver_api::models::TenantState;
|
||||
use remote_storage::{GenericRemoteStorage, RemotePath};
|
||||
use tokio::sync::OwnedMutexGuard;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tracing::{error, instrument, warn, Instrument, Span};
|
||||
use tracing::error;
|
||||
|
||||
use utils::{
|
||||
backoff, crashsafe, fs_ext,
|
||||
backoff, crashsafe,
|
||||
id::{TenantId, TimelineId},
|
||||
};
|
||||
|
||||
use crate::{
|
||||
config::PageServerConf,
|
||||
context::RequestContext,
|
||||
task_mgr::{self, TaskKind},
|
||||
InitializationOrder,
|
||||
tenant::{mgr::safe_rename_tenant_dir, ShutdownError},
|
||||
};
|
||||
|
||||
use super::{
|
||||
mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
|
||||
mgr::{GetTenantError, SlotGuard, TenantSlotError, TenantSlotUpsertError},
|
||||
remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
|
||||
span,
|
||||
timeline::delete::DeleteTimelineFlow,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant, TenantPreload,
|
||||
tree_sort_timelines, DeleteTimelineError, Tenant,
|
||||
};
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -39,9 +35,6 @@ pub(crate) enum DeleteTenantError {
|
||||
#[error("Invalid state {0}. Expected Active or Broken")]
|
||||
InvalidState(TenantState),
|
||||
|
||||
#[error("Tenant deletion is already in progress")]
|
||||
AlreadyInProgress,
|
||||
|
||||
#[error("Tenant map slot error {0}")]
|
||||
SlotError(#[from] TenantSlotError),
|
||||
|
||||
@@ -55,7 +48,142 @@ pub(crate) enum DeleteTenantError {
|
||||
Other(#[from] anyhow::Error),
|
||||
}
|
||||
|
||||
type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
|
||||
/// The part of tenant deletion that must happen before acknowledging the deletion request
|
||||
///
|
||||
/// Usually deletion requires that the tenant is not already in Stopping state: set
|
||||
/// `resume` to true to skip this check if resuming deletion at startup, since we
|
||||
/// would have loaded the tenant into Stopping mode already
|
||||
pub(crate) async fn delete_tenant_foreground(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenant: &Arc<Tenant>,
|
||||
resume: bool,
|
||||
// Witness that we are doing this work within a TenantSlot::InProgress state.
|
||||
_slot_guard: &SlotGuard,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// In resume mode, we must be stopping. Else, we must _not_ be Stopping.
|
||||
if !(matches!(tenant.current_state(), TenantState::Stopping) ^ !resume) {
|
||||
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-remote-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// Write persistent tombstone
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.get_tenant_id()).await?;
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-local-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_local_delete_mark(conf, &tenant.tenant_id)
|
||||
.await
|
||||
.context("local create mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-background", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-background"
|
||||
))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// The part of tenant deletion that happens after we have already acknowledged the request.
|
||||
///
|
||||
/// This part may retried after pageserver restart if we see a tenant that has a deletion marker
|
||||
/// but has not been completely deleted.
|
||||
pub(crate) async fn delete_tenant_background(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenant_id: TenantId,
|
||||
tenant: &Arc<Tenant>,
|
||||
resume: bool,
|
||||
// Witness that we are doing this work within a TenantSlot::InProgress state.
|
||||
_slot_guard: &SlotGuard,
|
||||
) -> anyhow::Result<()> {
|
||||
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
|
||||
.await
|
||||
.context("schedule_ordered_timeline_deletions")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-polling-ongoing-deletions"
|
||||
))?
|
||||
});
|
||||
|
||||
// Wait for deletions that were already running at the moment when tenant deletion was requested.
|
||||
// When we can lock deletion guard it means that corresponding timeline deletion finished.
|
||||
for (guard, timeline_id) in already_running_timeline_deletions {
|
||||
let flow = guard.lock().await;
|
||||
if !flow.is_finished() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"already running timeline deletion failed: {timeline_id}"
|
||||
))
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
// For convenience, use the Tenant's deletion queue reference so that we don't have
|
||||
// to take it as an argument.
|
||||
let deletion_queue_client = tenant.deletion_queue_client.clone();
|
||||
|
||||
fail::fail_point!("tenant-delete-before-shutdown", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
|
||||
});
|
||||
|
||||
// Tear down local runtime state
|
||||
if !resume {
|
||||
tenant.shutdown(false).await.map_err(|e| match e {
|
||||
ShutdownError::AlreadyStopping => {
|
||||
DeleteTenantError::InvalidState(TenantState::Stopping)
|
||||
}
|
||||
})?;
|
||||
}
|
||||
|
||||
// Not necessary for correctness, but executing deletions before erasing local contents
|
||||
// means that we will have a better chance to resume the delete if we crash.
|
||||
deletion_queue_client.flush_execute().await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-deleted-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// Remove the deletion marker
|
||||
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant_id).await?;
|
||||
|
||||
// Not necessary for correctness, but helps make it true that when we log that deletion is
|
||||
// done, it really is.
|
||||
deletion_queue_client.flush_execute().await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
// Remove local storage contents. We do this last, so that if we crash during delete, on
|
||||
// restart we will attempt to re-attach the tenant and resume the deletion.
|
||||
let local_tenant_directory = conf.tenant_path(&tenant_id);
|
||||
let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
|
||||
.await
|
||||
.with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
|
||||
|
||||
tokio::fs::remove_dir_all(tmp_path.as_path())
|
||||
.await
|
||||
.with_context(|| format!("tenant directory {:?} deletion", tmp_path))?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn remote_tenant_delete_mark_path(
|
||||
conf: &PageServerConf,
|
||||
@@ -70,7 +198,7 @@ fn remote_tenant_delete_mark_path(
|
||||
Ok(tenant_remote_path.join(Utf8Path::new("timelines/deleted")))
|
||||
}
|
||||
|
||||
async fn create_remote_delete_mark(
|
||||
pub(crate) async fn create_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: &GenericRemoteStorage,
|
||||
tenant_id: &TenantId,
|
||||
@@ -115,7 +243,26 @@ async fn create_local_delete_mark(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn schedule_ordered_timeline_deletions(
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_mark_exists: bool,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<bool, DeleteTenantError> {
|
||||
if remote_mark_exists {
|
||||
return Ok(true);
|
||||
}
|
||||
|
||||
// In the very last stage of deletion, we might have already removed the remote
|
||||
// marker, but be attaching the tenant anyway on the basis of its local directory
|
||||
// existing: to resume deletion in this case we need the local deletion marker.
|
||||
if conf.tenant_deleted_mark_file_path(tenant_id).exists() {
|
||||
Ok(true)
|
||||
} else {
|
||||
Ok(false)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn schedule_ordered_timeline_deletions(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
|
||||
// Tenant is stopping at this point. We know it will be deleted.
|
||||
@@ -153,21 +300,7 @@ async fn schedule_ordered_timeline_deletions(
|
||||
Ok(already_running_deletions)
|
||||
}
|
||||
|
||||
async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
|
||||
// Assert timelines dir is empty.
|
||||
if !fs_ext::is_directory_empty(timelines_path).await? {
|
||||
// Display first 10 items in directory
|
||||
let list = fs_ext::list_dir(timelines_path).await.context("list_dir")?;
|
||||
let list = &list.into_iter().take(10).collect::<Vec<_>>();
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"Timelines directory is not empty after all timelines deletion: {list:?}"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
async fn remove_tenant_remote_delete_mark(
|
||||
pub(crate) async fn remove_tenant_remote_delete_mark(
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant_id: &TenantId,
|
||||
@@ -188,357 +321,3 @@ async fn remove_tenant_remote_delete_mark(
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
|
||||
async fn cleanup_remaining_fs_traces(
|
||||
conf: &PageServerConf,
|
||||
tenant_id: &TenantId,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
let rm = |p: Utf8PathBuf, is_dir: bool| async move {
|
||||
if is_dir {
|
||||
tokio::fs::remove_dir(&p).await
|
||||
} else {
|
||||
tokio::fs::remove_file(&p).await
|
||||
}
|
||||
.or_else(fs_ext::ignore_not_found)
|
||||
.with_context(|| format!("failed to delete {p}"))
|
||||
};
|
||||
|
||||
rm(conf.tenant_config_path(tenant_id), false).await?;
|
||||
rm(conf.tenant_location_config_path(tenant_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-timelines-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.timelines_path(tenant_id), true).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-deleted-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// Make sure previous deletions are ordered before mark removal.
|
||||
// Otherwise there is no guarantee that they reach the disk before mark deletion.
|
||||
// So its possible for mark to reach disk first and for other deletions
|
||||
// to be reordered later and thus missed if a crash occurs.
|
||||
// Note that we dont need to sync after mark file is removed
|
||||
// because we can tolerate the case when mark file reappears on startup.
|
||||
let tenant_path = &conf.tenant_path(tenant_id);
|
||||
if tenant_path.exists() {
|
||||
crashsafe::fsync_async(&conf.tenant_path(tenant_id))
|
||||
.await
|
||||
.context("fsync_pre_mark_remove")?;
|
||||
}
|
||||
|
||||
rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-remove-tenant-dir"
|
||||
))?
|
||||
});
|
||||
|
||||
rm(conf.tenant_path(tenant_id), true).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
|
||||
/// and deletes its data from both disk and s3.
|
||||
/// The sequence of steps:
|
||||
/// 1. Upload remote deletion mark.
|
||||
/// 2. Create local mark file.
|
||||
/// 3. Shutdown tasks
|
||||
/// 4. Run ordered timeline deletions
|
||||
/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
|
||||
/// 6. Remove remote mark
|
||||
/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
|
||||
/// It is resumable from any step in case a crash/restart occurs.
|
||||
/// There are two entrypoints to the process:
|
||||
/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
|
||||
/// 2. [`DeleteTenantFlow::resume_from_attach`] is called when deletion is resumed tenant is found to be deleted during attach process.
|
||||
/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
|
||||
#[derive(Default)]
|
||||
pub enum DeleteTenantFlow {
|
||||
#[default]
|
||||
NotStarted,
|
||||
InProgress,
|
||||
Finished,
|
||||
}
|
||||
|
||||
impl DeleteTenantFlow {
|
||||
// These steps are run in the context of management api request handler.
|
||||
// Long running steps are continued to run in the background.
|
||||
// NB: If this fails half-way through, and is retried, the retry will go through
|
||||
// all the same steps again. Make sure the code here is idempotent, and don't
|
||||
// error out if some of the shutdown tasks have already been completed!
|
||||
// NOTE: static needed for background part.
|
||||
// We assume that calling code sets up the span with tenant_id.
|
||||
#[instrument(skip_all)]
|
||||
pub(crate) async fn run(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
span::debug_assert_current_span_has_tenant_id();
|
||||
|
||||
let mut guard = Self::prepare(&tenant).await?;
|
||||
|
||||
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// Helper function needed to be able to match once on returned error and transition tenant into broken state.
|
||||
// This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
|
||||
// will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
|
||||
// So the solution is to set tenant state to broken.
|
||||
async fn run_inner(
|
||||
guard: &mut OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<&GenericRemoteStorage>,
|
||||
tenant: &Tenant,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
guard.mark_in_progress()?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-remote-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
// IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
|
||||
// Though sounds scary, different mark name?
|
||||
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
|
||||
if let Some(remote_storage) = &remote_storage {
|
||||
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
|
||||
.await
|
||||
.context("remote_mark")?
|
||||
}
|
||||
|
||||
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-create-local-mark"
|
||||
))?
|
||||
});
|
||||
|
||||
create_local_delete_mark(conf, &tenant.tenant_id)
|
||||
.await
|
||||
.context("local delete mark")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-background", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-background"
|
||||
))?
|
||||
});
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn mark_in_progress(&mut self) -> anyhow::Result<()> {
|
||||
match self {
|
||||
Self::Finished => anyhow::bail!("Bug. Is in finished state"),
|
||||
Self::InProgress { .. } => { /* We're in a retry */ }
|
||||
Self::NotStarted => { /* Fresh start */ }
|
||||
}
|
||||
|
||||
*self = Self::InProgress;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) async fn should_resume_deletion(
|
||||
conf: &'static PageServerConf,
|
||||
remote_mark_exists: bool,
|
||||
tenant: &Tenant,
|
||||
) -> Result<Option<DeletionGuard>, DeleteTenantError> {
|
||||
let acquire = |t: &Tenant| {
|
||||
Some(
|
||||
Arc::clone(&t.delete_progress)
|
||||
.try_lock_owned()
|
||||
.expect("we're the only owner during init"),
|
||||
)
|
||||
};
|
||||
|
||||
if remote_mark_exists {
|
||||
return Ok(acquire(tenant));
|
||||
}
|
||||
|
||||
let tenant_id = tenant.tenant_id;
|
||||
// Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
|
||||
if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
|
||||
Ok(acquire(tenant))
|
||||
} else {
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) async fn resume_from_attach(
|
||||
guard: DeletionGuard,
|
||||
tenant: &Arc<Tenant>,
|
||||
preload: Option<TenantPreload>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
init_order: Option<InitializationOrder>,
|
||||
ctx: &RequestContext,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
tenant
|
||||
.set_stopping(false, true)
|
||||
.await
|
||||
.expect("cant be stopping or broken");
|
||||
|
||||
tenant
|
||||
.attach(init_order, preload, ctx)
|
||||
.await
|
||||
.context("attach")?;
|
||||
|
||||
Self::background(
|
||||
guard,
|
||||
tenant.conf,
|
||||
tenant.remote_storage.clone(),
|
||||
tenants,
|
||||
tenant,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
async fn prepare(
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
|
||||
// FIXME: unsure about active only. Our init jobs may not be cancellable properly,
|
||||
// so at least for now allow deletions only for active tenants. TODO recheck
|
||||
// Broken and Stopping is needed for retries.
|
||||
if !matches!(
|
||||
tenant.current_state(),
|
||||
TenantState::Active | TenantState::Broken { .. }
|
||||
) {
|
||||
return Err(DeleteTenantError::InvalidState(tenant.current_state()));
|
||||
}
|
||||
|
||||
let guard = Arc::clone(&tenant.delete_progress)
|
||||
.try_lock_owned()
|
||||
.map_err(|_| DeleteTenantError::AlreadyInProgress)?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-shutdown", |_| {
|
||||
Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
|
||||
});
|
||||
|
||||
// It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
|
||||
// i e it is an error to do:
|
||||
// tenant.set_stopping
|
||||
// tenant.shutdown
|
||||
// Its also bad that we're holding tenants.read here.
|
||||
// TODO relax set_stopping to be idempotent?
|
||||
if tenant.shutdown(false).await.is_err() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"tenant shutdown is already in progress"
|
||||
)));
|
||||
}
|
||||
|
||||
Ok(guard)
|
||||
}
|
||||
|
||||
fn schedule_background(
|
||||
guard: OwnedMutexGuard<Self>,
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: Arc<Tenant>,
|
||||
) {
|
||||
let tenant_id = tenant.tenant_id;
|
||||
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TimelineDeletionWorker,
|
||||
Some(tenant_id),
|
||||
None,
|
||||
"tenant_delete",
|
||||
false,
|
||||
async move {
|
||||
if let Err(err) =
|
||||
Self::background(guard, conf, remote_storage, tenants, &tenant).await
|
||||
{
|
||||
error!("Error: {err:#}");
|
||||
tenant.set_broken(format!("{err:#}")).await;
|
||||
};
|
||||
Ok(())
|
||||
}
|
||||
.instrument({
|
||||
let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
|
||||
span.follows_from(Span::current());
|
||||
span
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
async fn background(
|
||||
mut guard: OwnedMutexGuard<Self>,
|
||||
conf: &PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
tenant: &Arc<Tenant>,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// Tree sort timelines, schedule delete for them. Mention retries from the console side.
|
||||
// Note that if deletion fails we dont mark timelines as broken,
|
||||
// the whole tenant will become broken as by `Self::schedule_background` logic
|
||||
let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
|
||||
.await
|
||||
.context("schedule_ordered_timeline_deletions")?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-polling-ongoing-deletions"
|
||||
))?
|
||||
});
|
||||
|
||||
// Wait for deletions that were already running at the moment when tenant deletion was requested.
|
||||
// When we can lock deletion guard it means that corresponding timeline deletion finished.
|
||||
for (guard, timeline_id) in already_running_timeline_deletions {
|
||||
let flow = guard.lock().await;
|
||||
if !flow.is_finished() {
|
||||
return Err(DeleteTenantError::Other(anyhow::anyhow!(
|
||||
"already running timeline deletion failed: {timeline_id}"
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
let timelines_path = conf.timelines_path(&tenant.tenant_id);
|
||||
// May not exist if we fail in cleanup_remaining_fs_traces after removing it
|
||||
if timelines_path.exists() {
|
||||
// sanity check to guard against layout changes
|
||||
ensure_timelines_dir_empty(&timelines_path)
|
||||
.await
|
||||
.context("timelines dir not empty")?;
|
||||
}
|
||||
|
||||
remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
|
||||
|
||||
fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
|
||||
Err(anyhow::anyhow!(
|
||||
"failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
|
||||
))?
|
||||
});
|
||||
|
||||
cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
|
||||
.await
|
||||
.context("cleanup_remaining_fs_traces")?;
|
||||
|
||||
let mut locked = tenants.write().unwrap();
|
||||
if locked.remove(&tenant.tenant_id).is_none() {
|
||||
warn!("Tenant got removed from tenants map during deletion");
|
||||
};
|
||||
|
||||
*guard = Self::Finished;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -26,7 +26,6 @@ use crate::control_plane_client::{
|
||||
use crate::deletion_queue::DeletionQueueClient;
|
||||
use crate::task_mgr::{self, TaskKind};
|
||||
use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
|
||||
use crate::tenant::delete::DeleteTenantFlow;
|
||||
use crate::tenant::{
|
||||
create_tenant_files, AttachedTenantConf, ShutdownError, SpawnMode, Tenant, TenantState,
|
||||
};
|
||||
@@ -37,7 +36,7 @@ use utils::fs_ext::PathExt;
|
||||
use utils::generation::Generation;
|
||||
use utils::id::{TenantId, TimelineId};
|
||||
|
||||
use super::delete::DeleteTenantError;
|
||||
use super::delete::{delete_tenant_background, delete_tenant_foreground, DeleteTenantError};
|
||||
use super::timeline::delete::DeleteTimelineFlow;
|
||||
use super::TenantSharedResources;
|
||||
|
||||
@@ -104,13 +103,6 @@ impl TenantsMap {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
|
||||
match self {
|
||||
TenantsMap::Initializing => None,
|
||||
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is "safe" in that that it won't leave behind a partially deleted directory
|
||||
@@ -119,12 +111,14 @@ impl TenantsMap {
|
||||
///
|
||||
/// This is pageserver-specific, as it relies on future processes after a crash to check
|
||||
/// for TEMP_FILE_SUFFIX when loading things.
|
||||
async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
|
||||
pub(crate) async fn safe_remove_tenant_dir_all(path: impl AsRef<Utf8Path>) -> std::io::Result<()> {
|
||||
let tmp_path = safe_rename_tenant_dir(path).await?;
|
||||
fs::remove_dir_all(tmp_path).await
|
||||
}
|
||||
|
||||
async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<Utf8PathBuf> {
|
||||
pub(crate) async fn safe_rename_tenant_dir(
|
||||
path: impl AsRef<Utf8Path>,
|
||||
) -> std::io::Result<Utf8PathBuf> {
|
||||
let parent = path
|
||||
.as_ref()
|
||||
.parent()
|
||||
@@ -370,6 +364,10 @@ pub async fn init_tenant_mgr(
|
||||
let tenant_generations =
|
||||
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||
|
||||
// Tenants may send into this channel if they discover that they are in a partially
|
||||
// deleted state.
|
||||
let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::<Arc<Tenant>>(128);
|
||||
|
||||
// Construct `Tenant` objects and start them running
|
||||
for (tenant_id, location_conf) in tenant_configs {
|
||||
let tenant_dir_path = conf.tenant_path(&tenant_id);
|
||||
@@ -441,8 +439,8 @@ pub async fn init_tenant_mgr(
|
||||
resources.clone(),
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
Some(init_order.clone()),
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
Some(deletion_upcall_tx.clone()),
|
||||
&ctx,
|
||||
) {
|
||||
Ok(tenant) => {
|
||||
@@ -459,9 +457,41 @@ pub async fn init_tenant_mgr(
|
||||
let mut tenants_map = TENANTS.write().unwrap();
|
||||
assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
|
||||
*tenants_map = TenantsMap::Open(tenants);
|
||||
|
||||
spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// For some edge cases, like resuming their own deletion on attach, tenants
|
||||
/// may submit mgr-level operations into a queue.
|
||||
fn spawn_handle_upcalls_task(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: &Option<GenericRemoteStorage>,
|
||||
mut deletion_upcall_rx: tokio::sync::mpsc::Receiver<Arc<Tenant>>,
|
||||
) {
|
||||
let remote_storage = remote_storage.clone();
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::TenantManagerUpcall,
|
||||
None,
|
||||
None,
|
||||
"tenant manager upcall",
|
||||
false,
|
||||
async move {
|
||||
while let Some(tenant) = deletion_upcall_rx.recv().await {
|
||||
if let Err(e) =
|
||||
delete_tenant(conf, remote_storage.clone(), tenant.tenant_id, true).await
|
||||
{
|
||||
error!(tenant_id = %tenant.get_tenant_id(), "Failed to complete deletion of tenant: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
/// Wrapper for Tenant::spawn that checks invariants before running, and inserts
|
||||
/// a broken tenant in the map if Tenant::spawn fails.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
@@ -472,8 +502,8 @@ pub(crate) fn tenant_spawn(
|
||||
resources: TenantSharedResources,
|
||||
location_conf: AttachedTenantConf,
|
||||
init_order: Option<InitializationOrder>,
|
||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||
mode: SpawnMode,
|
||||
resume_deletion_upcall: Option<tokio::sync::mpsc::Sender<Arc<Tenant>>>,
|
||||
ctx: &RequestContext,
|
||||
) -> anyhow::Result<Arc<Tenant>> {
|
||||
anyhow::ensure!(
|
||||
@@ -504,8 +534,8 @@ pub(crate) fn tenant_spawn(
|
||||
resources,
|
||||
location_conf,
|
||||
init_order,
|
||||
tenants,
|
||||
mode,
|
||||
resume_deletion_upcall,
|
||||
ctx,
|
||||
) {
|
||||
Ok(tenant) => tenant,
|
||||
@@ -601,8 +631,6 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
|
||||
if let Err(e) = res {
|
||||
match e {
|
||||
ShutdownError::AlreadyStopping => {
|
||||
// TODO: to ensure this can _never_ happen, we need to get rid of
|
||||
// the horrible DeleteTenantFlow::should_resume_deletion
|
||||
tracing::warn!(%tenant_id, "Tenant already stopping during shutdown");
|
||||
}
|
||||
}
|
||||
@@ -684,8 +712,8 @@ pub(crate) async fn create_tenant(
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Create,
|
||||
None,
|
||||
ctx,
|
||||
)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
@@ -849,8 +877,8 @@ pub(crate) async fn upsert_location(
|
||||
},
|
||||
AttachedTenantConf::try_from(new_location_config)?,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
None,
|
||||
ctx,
|
||||
)?;
|
||||
|
||||
@@ -915,18 +943,11 @@ pub(crate) async fn delete_tenant(
|
||||
conf: &'static PageServerConf,
|
||||
remote_storage: Option<GenericRemoteStorage>,
|
||||
tenant_id: TenantId,
|
||||
resume: bool,
|
||||
) -> Result<(), DeleteTenantError> {
|
||||
// We acquire a SlotGuard during this function to protect against concurrent
|
||||
// changes while the ::prepare phase of DeleteTenantFlow executes, but then
|
||||
// have to return the Tenant to the map while the background deletion runs.
|
||||
//
|
||||
// TODO: refactor deletion to happen outside the lifetime of a Tenant.
|
||||
// Currently, deletion requires a reference to the tenants map in order to
|
||||
// keep the Tenant in the map until deletion is complete, and then remove
|
||||
// it at the end.
|
||||
//
|
||||
// See https://github.com/neondatabase/neon/issues/5080
|
||||
|
||||
// operations, but also to ensure we do not permit re-creation of the same
|
||||
// tenant ID until we are done with the deletion.
|
||||
let mut slot_guard = tenant_map_acquire_slot(&tenant_id, Some(true))?;
|
||||
|
||||
// unwrap is safe because we used expect_exist=true when acquiring the slot
|
||||
@@ -940,11 +961,51 @@ pub(crate) async fn delete_tenant(
|
||||
}
|
||||
};
|
||||
|
||||
let result = DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant).await;
|
||||
// Do the foreground part of deletion: if it fails, set the tenant broken
|
||||
// and leave it in the tenants map
|
||||
if let Err(e) =
|
||||
delete_tenant_foreground(conf, remote_storage.clone(), &tenant, resume, &slot_guard).await
|
||||
{
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
// Replace our InProgress marker with the Tenant in attached state, after the prepare phase of deletion is done
|
||||
slot_guard.upsert(slot)?;
|
||||
result
|
||||
// Do the actual deletion in the background, while holding the SlotGuard
|
||||
task_mgr::spawn(
|
||||
task_mgr::BACKGROUND_RUNTIME.handle(),
|
||||
TaskKind::MgmtRequest,
|
||||
None, // This task runs without the tenant_id, as it should not be caught by Tenant::shutdown
|
||||
None,
|
||||
"delete_tenant_background",
|
||||
false,
|
||||
async move {
|
||||
match delete_tenant_background(
|
||||
conf,
|
||||
remote_storage,
|
||||
tenant_id,
|
||||
&tenant,
|
||||
resume,
|
||||
&slot_guard,
|
||||
)
|
||||
.instrument(info_span!("delete_tenant_background", %tenant_id))
|
||||
.await
|
||||
{
|
||||
Ok(()) => {
|
||||
info!(%tenant_id, "Tenant deletion complete");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(% tenant_id, "Tenant deletion failed: {e:#}");
|
||||
tenant.set_broken(format!("{e:#}")).await;
|
||||
slot_guard.upsert(TenantSlot::Attached(tenant))?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
@@ -1093,8 +1154,8 @@ pub(crate) async fn load_tenant(
|
||||
resources,
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
None,
|
||||
ctx,
|
||||
)
|
||||
.with_context(|| format!("Failed to schedule tenant processing in path {tenant_path:?}"))?;
|
||||
@@ -1172,15 +1233,17 @@ pub(crate) async fn attach_tenant(
|
||||
// TODO: tenant directory remains on disk if we bail out from here on.
|
||||
// See https://github.com/neondatabase/neon/issues/4233
|
||||
|
||||
let (deletion_upcall_tx, deletion_upcall_rx) = tokio::sync::mpsc::channel::<Arc<Tenant>>(128);
|
||||
|
||||
let attached_tenant = tenant_spawn(
|
||||
conf,
|
||||
tenant_id,
|
||||
&tenant_dir,
|
||||
resources,
|
||||
resources.clone(),
|
||||
AttachedTenantConf::try_from(location_conf)?,
|
||||
None,
|
||||
&TENANTS,
|
||||
SpawnMode::Normal,
|
||||
Some(deletion_upcall_tx),
|
||||
ctx,
|
||||
)?;
|
||||
// TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
|
||||
@@ -1193,6 +1256,8 @@ pub(crate) async fn attach_tenant(
|
||||
)));
|
||||
}
|
||||
|
||||
spawn_handle_upcalls_task(conf, &resources.remote_storage, deletion_upcall_rx);
|
||||
|
||||
tenant_guard.upsert(TenantSlot::Attached(attached_tenant))?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -1260,7 +1325,7 @@ pub enum TenantMapError {
|
||||
/// structure exists, the TenantsMap will contain a [`TenantSlot::InProgress`]
|
||||
/// for this tenant, which acts as a marker for any operations targeting
|
||||
/// this tenant to retry later, or wait for the InProgress state to end.
|
||||
pub struct SlotGuard {
|
||||
pub(crate) struct SlotGuard {
|
||||
tenant_id: TenantId,
|
||||
old_value: Option<TenantSlot>,
|
||||
upserted: bool,
|
||||
|
||||
@@ -410,7 +410,7 @@ impl DeleteTimelineFlow {
|
||||
}
|
||||
|
||||
/// Shortcut to create Timeline in stopping state and spawn deletion task.
|
||||
/// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
|
||||
/// See corresponding parts of [`crate::tenant::timeline::delete::DeleteTimelineFlow`]
|
||||
#[instrument(skip_all, fields(%timeline_id))]
|
||||
pub async fn resume_deletion(
|
||||
tenant: Arc<Tenant>,
|
||||
|
||||
@@ -116,8 +116,6 @@ FAILPOINTS = [
|
||||
"tenant-delete-before-create-local-mark",
|
||||
"tenant-delete-before-background",
|
||||
"tenant-delete-before-polling-ongoing-deletions",
|
||||
"tenant-delete-before-cleanup-remaining-fs-traces",
|
||||
"tenant-delete-before-remove-timelines-dir",
|
||||
"tenant-delete-before-remove-deleted-mark",
|
||||
"tenant-delete-before-remove-tenant-dir",
|
||||
# Some failpoints from timeline deletion
|
||||
@@ -129,7 +127,6 @@ FAILPOINTS = [
|
||||
|
||||
FAILPOINTS_BEFORE_BACKGROUND = [
|
||||
"timeline-delete-before-schedule",
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
"tenant-delete-before-create-local-mark",
|
||||
"tenant-delete-before-background",
|
||||
@@ -243,10 +240,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
|
||||
if check is Check.RETRY_WITH_RESTART:
|
||||
env.pageserver.restart()
|
||||
|
||||
if failpoint in (
|
||||
"tenant-delete-before-shutdown",
|
||||
"tenant-delete-before-create-remote-mark",
|
||||
):
|
||||
if failpoint in ("tenant-delete-before-create-remote-mark",):
|
||||
wait_until_tenant_active(
|
||||
ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user