pageserver: cancellation for remote ops in tenant deletion on shutdown (#6105)

## Problem

Tenant deletion had a couple of TODOs where we were not using proper cancellation tokens, so in-flight remote operations were not aborted during process shutdown.

## Summary of changes

- Refactor enough that deletion/shutdown code has access to the TenantManager's cancellation token (sketched below).
- Use that cancellation token in tenant deletion instead of dummy tokens.
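
For illustration, here is a minimal sketch of the shape of the change, using simplified names rather than the actual pageserver types and module layout: the TenantManager owns a `CancellationToken`, its shutdown path fires that token before shutting the tenants down, and the deletion flow borrows that token instead of constructing a throwaway `CancellationToken::new()`.

```rust
use tokio_util::sync::CancellationToken;

// Sketch only: simplified stand-in for the pageserver's TenantManager.
struct TenantManager {
    // Long-running operations outside a Tenant's lifetime (e.g. tenant
    // deletion) should respect this token.
    cancel: CancellationToken,
}

impl TenantManager {
    fn new() -> Self {
        Self {
            cancel: CancellationToken::new(),
        }
    }

    // Deletion borrows the manager-level token instead of creating a dummy one.
    async fn delete_tenant(&self) {
        delete_tenant_flow(&self.cancel).await;
    }

    // Pageserver shutdown fires the token first, so in-flight remote
    // operations in deletion abort promptly, then shuts down the tenants.
    async fn shutdown(&self) {
        self.cancel.cancel();
        // ... shut down all tenants here ...
    }
}

// Stand-in for the deletion flow: remote operations receive `cancel` so they
// can return early once shutdown begins.
async fn delete_tenant_flow(cancel: &CancellationToken) {
    if cancel.is_cancelled() {
        return;
    }
    // ... create the remote deletion mark, delete remote objects, etc. ...
}
```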
Author: John Spray
Date: 2024-03-15 18:03:49 +00:00
Committed by: GitHub
Parent: 60f30000ef
Commit: 1aa159acca
7 changed files with 51 additions and 39 deletions

@@ -317,6 +317,7 @@ fn start_pageserver(
let http_listener = tcp_listener::bind(http_addr)?;
let pg_addr = &conf.listen_pg_addr;
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
@@ -549,7 +550,7 @@ fn start_pageserver(
let router_state = Arc::new(
http::routes::State::new(
conf,
tenant_manager,
tenant_manager.clone(),
http_auth.clone(),
remote_storage.clone(),
broker_client.clone(),
@@ -693,6 +694,7 @@ fn start_pageserver(
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));

@@ -31,6 +31,7 @@ pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::mgr::TenantManager;
use tracing::info;
/// Current storage format version
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
pub async fn shutdown_pageserver(
tenant_manager: &TenantManager,
deletion_queue: Option<DeletionQueue>,
exit_code: i32,
) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
timed(
tenant::mgr::shutdown_all_tenants(),
tenant_manager.shutdown(),
"shutdown all tenants",
Duration::from_secs(5),
)

@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
use utils::id::TimelineId;
use crate::shutdown_pageserver;
//
// There are four runtimes:
//
@@ -453,7 +451,7 @@ async fn task_finish(
}
if shutdown_process {
shutdown_pageserver(None, 1).await;
std::process::exit(1);
}
}

@@ -296,6 +296,7 @@ impl DeleteTenantFlow {
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
@@ -303,7 +304,9 @@ impl DeleteTenantFlow {
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
if let Err(e) =
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
{
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
@@ -322,6 +325,7 @@ impl DeleteTenantFlow {
conf: &'static PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
guard.mark_in_progress()?;
@@ -335,15 +339,9 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -546,8 +544,7 @@ impl DeleteTenantFlow {
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
&task_mgr::shutdown_token(),
)
.await?;
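
The hunks above only show the token being threaded into `create_remote_delete_mark`; how a remote operation consumes it is outside this diff. As a rough, hypothetical illustration (not the actual `remote_storage` API), an operation that accepts a `CancellationToken` typically races its work against cancellation so it returns promptly once `TenantManager::shutdown` fires the token:

```rust
use tokio_util::sync::CancellationToken;

// Hypothetical helper, for illustration only: race the remote upload against
// the manager-level cancellation token fired during pageserver shutdown.
async fn put_delete_mark(cancel: &CancellationToken) -> anyhow::Result<()> {
    tokio::select! {
        // The actual upload to remote storage (placeholder body below).
        res = do_put() => res,
        // Shutdown began: stop waiting on the remote operation.
        _ = cancel.cancelled() => anyhow::bail!("cancelled while uploading the delete mark"),
    }
}

async fn do_put() -> anyhow::Result<()> {
    // ... PUT the tenant deletion mark object ...
    Ok(())
}
```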

@@ -102,7 +102,7 @@ pub(crate) enum TenantsMap {
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
/// New tenants can be added using [`tenant_map_acquire_slot`].
Open(BTreeMap<TenantShardId, TenantSlot>),
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
/// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
/// Existing tenants are still accessible, but no new tenants can be created.
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
}
@@ -261,6 +261,12 @@ pub struct TenantManager {
// See https://github.com/neondatabase/neon/issues/5796
tenants: &'static std::sync::RwLock<TenantsMap>,
resources: TenantSharedResources,
// Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
// This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
// when the tenant detaches.
cancel: CancellationToken,
}
fn emergency_generations(
@@ -620,6 +626,7 @@ pub async fn init_tenant_mgr(
conf,
tenants: &TENANTS,
resources,
cancel: CancellationToken::new(),
})
}
@@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn(
Ok(tenant)
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() now returns `None`.
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown_all_tenants() {
shutdown_all_tenants0(&TENANTS).await
}
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
let mut join_set = JoinSet::new();
@@ -1428,6 +1420,7 @@ impl TenantManager {
self.resources.remote_storage.clone(),
&TENANTS,
tenant,
&self.cancel,
)
.await;
@@ -1817,6 +1810,23 @@ impl TenantManager {
Ok(())
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() now returns `None`.
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
shutdown_all_tenants0(self.tenants).await
}
}
#[derive(Debug, thiserror::Error)]

@@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# We may leave some upload tasks in the queue. They're likely deletes.
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
# So by ignoring these instead of waiting for empty upload queue
@@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach(
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',

@@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
[
f".*{timeline_id}.*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# This happens when we fail before scheduling background operation.
# Timeline is left in stopping state and retry tries to stop it again.
".*Ignoring new state, equal to the existing one: Stopping",
@@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
".*failpoint: timeline-delete-before-rm",
".*Ignoring new state, equal to the existing one: Stopping",
# this happens, because the stuck timeline is visible to shutdown
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
]
)
@@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
# Polling after attach may fail with this