pageserver: cancellation for remote ops in tenant deletion on shutdown (#6105)

## Problem

Tenant deletion had a couple of TODOs where we were not using proper cancellation tokens, so in-flight remote operations were not aborted during process shutdown.

## Summary of changes

- Refactor enough that deletion/shutdown code has access to the TenantManager's cancellation token (sketched below).
- Use that cancellation token in tenant deletion instead of dummy tokens.
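
For illustration, here is a minimal sketch of the shape of the change, using simplified names rather than the actual pageserver types and module layout: the TenantManager owns a `CancellationToken`, its shutdown path fires that token before shutting the tenants down, and the deletion flow borrows that token instead of constructing a throwaway `CancellationToken::new()`.

```rust
use tokio_util::sync::CancellationToken;

// Sketch only: simplified stand-in for the pageserver's TenantManager.
struct TenantManager {
    // Long-running operations outside a Tenant's lifetime (e.g. tenant
    // deletion) should respect this token.
    cancel: CancellationToken,
}

impl TenantManager {
    fn new() -> Self {
        Self {
            cancel: CancellationToken::new(),
        }
    }

    // Deletion borrows the manager-level token instead of creating a dummy one.
    async fn delete_tenant(&self) {
        delete_tenant_flow(&self.cancel).await;
    }

    // Pageserver shutdown fires the token first, so in-flight remote
    // operations in deletion abort promptly, then shuts down the tenants.
    async fn shutdown(&self) {
        self.cancel.cancel();
        // ... shut down all tenants here ...
    }
}

// Stand-in for the deletion flow: remote operations receive `cancel` so they
// can return early once shutdown begins.
async fn delete_tenant_flow(cancel: &CancellationToken) {
    if cancel.is_cancelled() {
        return;
    }
    // ... create the remote deletion mark, delete remote objects, etc. ...
}
```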
Author: John Spray
Date: 2024-03-15 18:03:49 +00:00
Committed by: GitHub
Parent: 60f30000ef
Commit: 1aa159acca
7 changed files with 51 additions and 39 deletions

@@ -317,6 +317,7 @@ fn start_pageserver(
let http_listener = tcp_listener::bind(http_addr)?;
let pg_addr = &conf.listen_pg_addr;
info!("Starting pageserver pg protocol handler on {pg_addr}");
let pageserver_listener = tcp_listener::bind(pg_addr)?;
@@ -549,7 +550,7 @@ fn start_pageserver(
let router_state = Arc::new(
http::routes::State::new(
conf,
tenant_manager,
tenant_manager.clone(),
http_auth.clone(),
remote_storage.clone(),
broker_client.clone(),
@@ -693,6 +694,7 @@ fn start_pageserver(
let bg_remote_storage = remote_storage.clone();
let bg_deletion_queue = deletion_queue.clone();
BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(
&tenant_manager,
bg_remote_storage.map(|_| bg_deletion_queue),
0,
));

@@ -31,6 +31,7 @@ pub mod walredo;
use crate::task_mgr::TaskKind;
use camino::Utf8Path;
use deletion_queue::DeletionQueue;
use tenant::mgr::TenantManager;
use tracing::info;
/// Current storage format version
@@ -53,7 +54,11 @@ static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
pub use crate::metrics::preinitialize_metrics;
#[tracing::instrument(skip_all, fields(%exit_code))]
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
pub async fn shutdown_pageserver(
tenant_manager: &TenantManager,
deletion_queue: Option<DeletionQueue>,
exit_code: i32,
) {
use std::time::Duration;
// Shut down the libpq endpoint task. This prevents new connections from
// being accepted.
@@ -67,7 +72,7 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
// Shut down all the tenants. This flushes everything to disk and kills
// the checkpoint and GC tasks.
timed(
tenant::mgr::shutdown_all_tenants(),
tenant_manager.shutdown(),
"shutdown all tenants",
Duration::from_secs(5),
)

@@ -50,8 +50,6 @@ use once_cell::sync::Lazy;
use utils::id::TimelineId;
use crate::shutdown_pageserver;
//
// There are four runtimes:
//
@@ -453,7 +451,7 @@ async fn task_finish(
}
if shutdown_process {
shutdown_pageserver(None, 1).await;
std::process::exit(1);
}
}

@@ -296,6 +296,7 @@ impl DeleteTenantFlow {
remote_storage: Option<GenericRemoteStorage>,
tenants: &'static std::sync::RwLock<TenantsMap>,
tenant: Arc<Tenant>,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
span::debug_assert_current_span_has_tenant_id();
@@ -303,7 +304,9 @@ impl DeleteTenantFlow {
let mut guard = Self::prepare(&tenant).await?;
if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
if let Err(e) =
Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant, cancel).await
{
tenant.set_broken(format!("{e:#}")).await;
return Err(e);
}
@@ -322,6 +325,7 @@ impl DeleteTenantFlow {
conf: &'static PageServerConf,
remote_storage: Option<&GenericRemoteStorage>,
tenant: &Tenant,
cancel: &CancellationToken,
) -> Result<(), DeleteTenantError> {
guard.mark_in_progress()?;
@@ -335,15 +339,9 @@ impl DeleteTenantFlow {
// Though sounds scary, different mark name?
// Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
if let Some(remote_storage) = &remote_storage {
create_remote_delete_mark(
conf,
remote_storage,
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
)
.await
.context("remote_mark")?
create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id, cancel)
.await
.context("remote_mark")?
}
fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
@@ -546,8 +544,7 @@ impl DeleteTenantFlow {
conf,
remote_storage.as_ref(),
&tenant.tenant_shard_id,
// Can't use tenant.cancel, it's already shut down. TODO: wire in an appropriate token
&CancellationToken::new(),
&task_mgr::shutdown_token(),
)
.await?;
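
The hunks above only show the token being threaded into `create_remote_delete_mark`; how a remote operation consumes it is outside this diff. As a rough, hypothetical illustration (not the actual `remote_storage` API), an operation that accepts a `CancellationToken` typically races its work against cancellation so it returns promptly once `TenantManager::shutdown` fires the token:

```rust
use tokio_util::sync::CancellationToken;

// Hypothetical helper, for illustration only: race the remote upload against
// the manager-level cancellation token fired during pageserver shutdown.
async fn put_delete_mark(cancel: &CancellationToken) -> anyhow::Result<()> {
    tokio::select! {
        // The actual upload to remote storage (placeholder body below).
        res = do_put() => res,
        // Shutdown began: stop waiting on the remote operation.
        _ = cancel.cancelled() => anyhow::bail!("cancelled while uploading the delete mark"),
    }
}

async fn do_put() -> anyhow::Result<()> {
    // ... PUT the tenant deletion mark object ...
    Ok(())
}
```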

@@ -102,7 +102,7 @@ pub(crate) enum TenantsMap {
/// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
/// New tenants can be added using [`tenant_map_acquire_slot`].
Open(BTreeMap<TenantShardId, TenantSlot>),
/// The pageserver has entered shutdown mode via [`shutdown_all_tenants`].
/// The pageserver has entered shutdown mode via [`TenantManager::shutdown`].
/// Existing tenants are still accessible, but no new tenants can be created.
ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
}
@@ -261,6 +261,12 @@ pub struct TenantManager {
// See https://github.com/neondatabase/neon/issues/5796
tenants: &'static std::sync::RwLock<TenantsMap>,
resources: TenantSharedResources,
// Long-running operations that happen outside of a [`Tenant`] lifetime should respect this token.
// This is for edge cases like tenant deletion. In normal cases (within a Tenant lifetime),
// tenants have their own cancellation tokens, which we fire individually in [`Self::shutdown`], or
// when the tenant detaches.
cancel: CancellationToken,
}
fn emergency_generations(
@@ -620,6 +626,7 @@ pub async fn init_tenant_mgr(
conf,
tenants: &TENANTS,
resources,
cancel: CancellationToken::new(),
})
}
@@ -680,21 +687,6 @@ pub(crate) fn tenant_spawn(
Ok(tenant)
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() now returns `None`.
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown_all_tenants() {
shutdown_all_tenants0(&TENANTS).await
}
async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
let mut join_set = JoinSet::new();
@@ -1428,6 +1420,7 @@ impl TenantManager {
self.resources.remote_storage.clone(),
&TENANTS,
tenant,
&self.cancel,
)
.await;
@@ -1817,6 +1810,23 @@ impl TenantManager {
Ok(())
}
///
/// Shut down all tenants. This runs as part of pageserver shutdown.
///
/// NB: We leave the tenants in the map, so that they remain accessible through
/// the management API until we shut it down. If we removed the shut-down tenants
/// from the tenants map, the management API would return 404 for these tenants,
/// because TenantsMap::get() now returns `None`.
/// That could be easily misinterpreted by control plane, the consumer of the
/// management API. For example, it could attach the tenant on a different pageserver.
/// We would then be in split-brain once this pageserver restarts.
#[instrument(skip_all)]
pub(crate) async fn shutdown(&self) {
self.cancel.cancel();
shutdown_all_tenants0(self.tenants).await
}
}
#[derive(Debug, thiserror::Error)]

@@ -184,7 +184,7 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# We may leave some upload tasks in the queue. They're likely deletes.
# For uploads we explicitly wait with `last_flush_lsn_upload` below.
# So by ignoring these instead of waiting for empty upload queue
@@ -327,7 +327,7 @@ def test_tenant_delete_is_resumed_on_attach(
# From deletion polling
f".*NotFound: tenant {env.initial_tenant}.*",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
'.*shutdown_pageserver{exit_code=0}: stopping left-over name="remote upload".*',

@@ -204,7 +204,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
[
f".*{timeline_id}.*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# This happens when we fail before scheduling background operation.
# Timeline is left in stopping state and retry tries to stop it again.
".*Ignoring new state, equal to the existing one: Stopping",
@@ -398,7 +398,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
".*failpoint: timeline-delete-before-rm",
".*Ignoring new state, equal to the existing one: Stopping",
# this happens, because the stuck timeline is visible to shutdown
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
]
)
@@ -809,7 +809,7 @@ def test_timeline_delete_resumed_on_attach(
# allow errors caused by failpoints
f".*failpoint: {failpoint}",
# It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
".*shutdown_all_tenants:shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
".*shutdown.*tenant_id.*shutdown.*timeline_id.*: failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
# error from http response is also logged
".*InternalServerError\\(Tenant is marked as deleted on remote storage.*",
# Polling after attach may fail with this