pageserver: add InProgress tenant map state, use a sync lock for the map (#5367)

## Problem Follows on from #5299 - We didn't have a generic way to protect a tenant undergoing changes: `Tenant` had states, but for our arbitrary transitions between secondary/attached, we need a general way to say "reserve this tenant ID, and don't allow any other ops on it, but don't try and report it as being in any particular state". - The TenantsMap structure was behind an async RwLock, but it was never correct to hold it across await points: that would block any other changes for all tenants. ## Summary of changes - Add the `TenantSlot::InProgress` value. This means: - Incoming administrative operations on the tenant should retry later - Anything trying to read the live state of the tenant (e.g. a page service reader) should retry later or block. - Store TenantsMap in `std::sync::RwLock` - Provide an extended `get_active_tenant_with_timeout` for page_service to use, which will wait on InProgress slots as well as non-active tenants. Closes: https://github.com/neondatabase/neon/issues/5378 --------- Co-authored-by: Christian Schwarz <christian@neon.tech>
2026-01-09 06:22:57 +00:00 · 2023-11-06 14:03:22 +00:00
parent e6470ee92e
commit 85cd97af61
17 changed files with 1142 additions and 589 deletions
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -77,6 +77,9 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

+/// async timeout helper
+pub mod timeout;
+
 pub mod sync;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -0,0 +1,37 @@
+use std::time::Duration;
+
+use tokio_util::sync::CancellationToken;
+
+pub enum TimeoutCancellableError {
+    Timeout,
+    Cancelled,
+}
+
+/// Wrap [`tokio::time::timeout`] with a CancellationToken.
+///
+/// This wrapper is appropriate for any long running operation in a task
+/// that ought to respect a CancellationToken (which means most tasks).
+///
+/// The only time you should use a bare tokio::timeout is when the future `F`
+/// itself respects a CancellationToken: otherwise, always use this wrapper
+/// with your CancellationToken to ensure that your task does not hold up
+/// graceful shutdown.
+pub async fn timeout_cancellable<F>(
+    duration: Duration,
+    cancel: &CancellationToken,
+    future: F,
+) -> Result<F::Output, TimeoutCancellableError>
+where
+    F: std::future::Future,
+{
+    tokio::select!(
+        r = tokio::time::timeout(duration, future) => {
+            r.map_err(|_| TimeoutCancellableError::Timeout)
+
+        },
+        _ = cancel.cancelled() => {
+            Err(TimeoutCancellableError::Cancelled)
+
+        }
+    )
+}
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -202,7 +202,6 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
-                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -545,7 +545,7 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -35,7 +35,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
+    TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -146,28 +147,59 @@ impl From<PageReconstructError> for ApiError {
 impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
-            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{tmie}").into())
-            }
-            TenantMapInsertError::TenantAlreadyExists(id, state) => {
-                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
-            }
-            TenantMapInsertError::TenantExistsSecondary(id) => {
-                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
-            }
+            TenantMapInsertError::SlotError(e) => e.into(),
+            TenantMapInsertError::SlotUpsertError(e) => e.into(),
            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }

+impl From<TenantSlotError> for ApiError {
+    fn from(e: TenantSlotError) -> ApiError {
+        use TenantSlotError::*;
+        match e {
+            NotFound(tenant_id) => {
+                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
+            }
+            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
+            InProgress => {
+                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
+            }
+            MapState(e) => e.into(),
+        }
+    }
+}
+
+impl From<TenantSlotUpsertError> for ApiError {
+    fn from(e: TenantSlotUpsertError) -> ApiError {
+        use TenantSlotUpsertError::*;
+        match e {
+            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
+            MapState(e) => e.into(),
+        }
+    }
+}
+
+impl From<TenantMapError> for ApiError {
+    fn from(e: TenantMapError) -> ApiError {
+        use TenantMapError::*;
+        match e {
+            StillInitializing | ShuttingDown => {
+                ApiError::ResourceUnavailable(format!("{e}").into())
+            }
+        }
+    }
+}
+
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
-            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
-            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
+            TenantStateError::SlotError(e) => e.into(),
+            TenantStateError::SlotUpsertError(e) => e.into(),
+            TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
        }
    }
 }
@@ -188,6 +220,7 @@ impl From<GetTenantError> for ApiError {
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
+            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
        }
    }
 }
@@ -242,6 +275,9 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            Get(g) => ApiError::from(g),
            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
+            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
+            SlotError(e) => e.into(),
+            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
        }
@@ -368,7 +404,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -418,7 +454,7 @@ async fn timeline_list_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -457,7 +493,7 @@ async fn timeline_detail_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        let tenant = mgr::get_tenant(tenant_id, true)?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -713,7 +749,7 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false).await?;
+        let tenant = mgr::get_tenant(tenant_id, false)?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -776,7 +812,7 @@ async fn tenant_size_handler(
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true).await?;
+    let tenant = mgr::get_tenant(tenant_id, true)?;

    // this can be long operation
    let inputs = tenant
@@ -1033,7 +1069,7 @@ async fn get_tenant_config_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false).await?;
+    let tenant = mgr::get_tenant(tenant_id, false)?;

    let response = HashMap::from([
        (
@@ -1092,7 +1128,7 @@ async fn put_tenant_location_config_handler(
            .await
        {
            match e {
-                TenantStateError::NotFound(_) => {
+                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
                    // This API is idempotent: a NotFound on a detach is fine.
                }
                _ => return Err(e.into()),
@@ -1130,7 +1166,6 @@ async fn handle_tenant_break(
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
-        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1435,7 +1470,7 @@ async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true).await?;
+    let tenant = mgr::get_tenant(tenant_id, true)?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -962,6 +962,32 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
    .expect("failed to define a metric")
 });

+pub(crate) struct TenantManagerMetrics {
+    pub(crate) tenant_slots: UIntGauge,
+    pub(crate) tenant_slot_writes: IntCounter,
+    pub(crate) unexpected_errors: IntCounter,
+}
+
+pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
+    TenantManagerMetrics {
+    tenant_slots: register_uint_gauge!(
+        "pageserver_tenant_manager_slots",
+        "How many slots currently exist, including all attached, secondary and in-progress operations",
+    )
+    .expect("failed to define a metric"),
+    tenant_slot_writes: register_int_counter!(
+        "pageserver_tenant_manager_slot_writes",
+        "Writes to a tenant slot, including all of create/attach/detach/delete"
+    )
+    .expect("failed to define a metric"),
+    unexpected_errors: register_int_counter!(
+        "pageserver_tenant_manager_unexpected_errors_total",
+        "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
+    )
+    .expect("failed to define a metric"),
+}
+});
+
 pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
@@ -1884,6 +1910,9 @@ pub fn preinitialize_metrics() {
    // Deletion queue stats
    Lazy::force(&DELETION_QUEUE);

+    // Tenant manager stats
+    Lazy::force(&TENANT_MANAGER);
+
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,7 +14,6 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
-use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -55,16 +54,20 @@ use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
-use crate::tenant;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
-use crate::tenant::mgr::GetTenantError;
-use crate::tenant::{Tenant, Timeline};
+use crate::tenant::mgr::get_active_tenant_with_timeout;
+use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::Timeline;
 use crate::trace::Tracer;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

+// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// is not yet in state [`TenantState::Active`].
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -378,7 +381,12 @@ impl PageServerHandler {
        debug_assert_current_span_has_tenant_and_timeline_id();

        // Make request tracer if needed
-        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
+        let tenant = mgr::get_active_tenant_with_timeout(
+            tenant_id,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await?;
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path = tenant
@@ -529,7 +537,12 @@ impl PageServerHandler {

        // Create empty timeline
        info!("creating new timeline");
-        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
+        let tenant = get_active_tenant_with_timeout(
+            tenant_id,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await?;
        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .await?;
@@ -587,7 +600,9 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(QueryError::Other(
@@ -795,7 +810,9 @@ impl PageServerHandler {
        let started = std::time::Instant::now();

        // check that the timeline exists
-        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -894,6 +911,25 @@ impl PageServerHandler {
            .expect("claims presence already checked");
        check_permission(claims, tenant_id)
    }
+
+    /// Shorthand for getting a reference to a Timeline of an Active tenant.
+    async fn get_active_tenant_timeline(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+        let tenant = get_active_tenant_with_timeout(
+            tenant_id,
+            ACTIVE_TENANT_TIMEOUT,
+            &task_mgr::shutdown_token(),
+        )
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+        let timeline = tenant
+            .get_timeline(timeline_id, true)
+            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
+        Ok(timeline)
+    }
 }

 #[async_trait::async_trait]
@@ -1051,7 +1087,9 @@ where
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
-            let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
+            let timeline = self
+                .get_active_tenant_timeline(tenant_id, timeline_id)
+                .await?;

            let end_of_timeline = timeline.get_last_record_rlsn();

@@ -1235,7 +1273,12 @@ where

            self.check_permission(Some(tenant_id))?;

-            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
+            let tenant = get_active_tenant_with_timeout(
+                tenant_id,
+                ACTIVE_TENANT_TIMEOUT,
+                &task_mgr::shutdown_token(),
+            )
+            .await?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1281,67 +1324,13 @@ where
    }
 }

-#[derive(thiserror::Error, Debug)]
-enum GetActiveTenantError {
-    #[error(
-        "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
-    )]
-    WaitForActiveTimeout {
-        latest_state: TenantState,
-        wait_time: Duration,
-    },
-    #[error(transparent)]
-    NotFound(GetTenantError),
-    #[error(transparent)]
-    WaitTenantActive(tenant::WaitToBecomeActiveError),
-}
-
 impl From<GetActiveTenantError> for QueryError {
    fn from(e: GetActiveTenantError) -> Self {
        match e {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
-            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
-        }
-    }
-}
-
-/// Get active tenant.
-///
-/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
-/// ensures that queries don't fail immediately after pageserver startup, because
-/// all tenants are still loading.
-async fn get_active_tenant_with_timeout(
-    tenant_id: TenantId,
-    _ctx: &RequestContext, /* require get a context to support cancellation in the future */
-) -> Result<Arc<Tenant>, GetActiveTenantError> {
-    let tenant = match mgr::get_tenant(tenant_id, false).await {
-        Ok(tenant) => tenant,
-        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
-        Err(GetTenantError::NotActive(_)) => {
-            unreachable!("we're calling get_tenant with active_only=false")
-        }
-        Err(GetTenantError::Broken(_)) => {
-            unreachable!("we're calling get_tenant with active_only=false")
-        }
-    };
-    let wait_time = Duration::from_secs(30);
-    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
-        Ok(Ok(())) => Ok(tenant),
-        // no .context(), the error message is good enough and some tests depend on it
-        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
-        Err(_) => {
-            let latest_state = tenant.current_state();
-            if latest_state == TenantState::Active {
-                Ok(tenant)
-            } else {
-                Err(GetActiveTenantError::WaitForActiveTimeout {
-                    latest_state,
-                    wait_time,
-                })
-            }
+            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
 }
@@ -1362,18 +1351,3 @@ impl From<GetActiveTimelineError> for QueryError {
        }
    }
 }
-
-/// Shorthand for getting a reference to a Timeline of an Active tenant.
-async fn get_active_tenant_timeline(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    ctx: &RequestContext,
-) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
-        .await
-        .map_err(GetActiveTimelineError::Tenant)?;
-    let timeline = tenant
-        .get_timeline(timeline_id, true)
-        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
-    Ok(timeline)
-}
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -55,6 +55,8 @@ use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
+use self::mgr::GetActiveTenantError;
+use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
@@ -263,6 +265,12 @@ pub struct Tenant {
    pub(crate) gate: Gate,
 }

+impl std::fmt::Debug for Tenant {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{} ({})", self.tenant_id, self.current_state())
+    }
+}
+
 pub(crate) enum WalRedoManager {
    Prod(PostgresRedoManager),
    #[cfg(test)]
@@ -368,34 +376,6 @@ impl Debug for SetStoppingError {
    }
 }

-#[derive(Debug, thiserror::Error)]
-pub(crate) enum WaitToBecomeActiveError {
-    WillNotBecomeActive {
-        tenant_id: TenantId,
-        state: TenantState,
-    },
-    TenantDropped {
-        tenant_id: TenantId,
-    },
-}
-
-impl std::fmt::Display for WaitToBecomeActiveError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
-                write!(
-                    f,
-                    "Tenant {} will not become active. Current state: {:?}",
-                    tenant_id, state
-                )
-            }
-            WaitToBecomeActiveError::TenantDropped { tenant_id } => {
-                write!(f, "Tenant {tenant_id} will not become active (dropped)")
-            }
-        }
-    }
-}
-
 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
    #[error("a timeline with the given ID already exists")]
@@ -537,7 +517,7 @@ impl Tenant {
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
        init_order: Option<InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -1850,6 +1830,7 @@ impl Tenant {
            }
            Err(SetStoppingError::AlreadyStopping(other)) => {
                // give caller the option to wait for this this shutdown
+                info!("Tenant::shutdown: AlreadyStopping");
                return Err(other);
            }
        };
@@ -2048,7 +2029,7 @@ impl Tenant {
        self.state.subscribe()
    }

-    pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
+    pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
        let mut receiver = self.state.subscribe();
        loop {
            let current_state = receiver.borrow_and_update().clone();
@@ -2056,11 +2037,9 @@ impl Tenant {
                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                    // in these states, there's a chance that we can reach ::Active
                    receiver.changed().await.map_err(
-                        |_e: tokio::sync::watch::error::RecvError| {
-                            WaitToBecomeActiveError::TenantDropped {
-                                tenant_id: self.tenant_id,
-                            }
-                        },
+                        |_e: tokio::sync::watch::error::RecvError|
+                            // Tenant existed but was dropped: report it as non-existent
+                            GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id))
                    )?;
                }
                TenantState::Active { .. } => {
@@ -2068,10 +2047,7 @@ impl Tenant {
                }
                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                    // There's no chance the tenant can transition back into ::Active
-                    return Err(WaitToBecomeActiveError::WillNotBecomeActive {
-                        tenant_id: self.tenant_id,
-                        state: current_state,
-                    });
+                    return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
                }
            }
        }
@@ -2137,6 +2113,9 @@ where
 }

 impl Tenant {
+    pub fn get_tenant_id(&self) -> TenantId {
+        self.tenant_id
+    }
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
        self.tenant_conf.read().unwrap().tenant_conf
    }
@@ -4266,11 +4245,7 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness
-            .try_load_local(&ctx)
-            .await
-            .err()
-            .expect("should fail");
+        let err = harness.try_load_local(&ctx).await.expect_err("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -21,7 +21,7 @@ use crate::{
 };

 use super::{
-    mgr::{GetTenantError, TenantsMap},
+    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
@@ -33,12 +33,21 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

+    #[error("Tenant not attached")]
+    NotAttached,
+
    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

    #[error("Tenant deletion is already in progress")]
    AlreadyInProgress,

+    #[error("Tenant map slot error {0}")]
+    SlotError(#[from] TenantSlotError),
+
+    #[error("Tenant map slot upsert error {0}")]
+    SlotUpsertError(#[from] TenantSlotUpsertError),
+
    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

@@ -273,12 +282,12 @@ impl DeleteTenantFlow {
    pub(crate) async fn run(
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenant: Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

-        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
+        let mut guard = Self::prepare(&tenant).await?;

        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
            tenant.set_broken(format!("{e:#}")).await;
@@ -378,7 +387,7 @@ impl DeleteTenantFlow {
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
@@ -405,15 +414,8 @@ impl DeleteTenantFlow {
    }

    async fn prepare(
-        tenants: &tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
-    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
-        let m = tenants.read().await;
-
-        let tenant = m
-            .get(&tenant_id)
-            .ok_or(GetTenantError::NotFound(tenant_id))?;
-
+        tenant: &Arc<Tenant>,
+    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
        // so at least for now allow deletions only for active tenants. TODO recheck
        // Broken and Stopping is needed for retries.
@@ -447,14 +449,14 @@ impl DeleteTenantFlow {
            )));
        }

-        Ok((Arc::clone(tenant), guard))
+        Ok(guard)
    }

    fn schedule_background(
        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
        let tenant_id = tenant.tenant_id;
@@ -487,7 +489,7 @@ impl DeleteTenantFlow {
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: &Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -535,10 +537,18 @@ impl DeleteTenantFlow {
            .await
            .context("cleanup_remaining_fs_traces")?;

-        let mut locked = tenants.write().await;
-        if locked.remove(&tenant.tenant_id).is_none() {
-            warn!("Tenant got removed from tenants map during deletion");
-        };
+        {
+            let mut locked = tenants.write().unwrap();
+            if locked.remove(&tenant.tenant_id).is_none() {
+                warn!("Tenant got removed from tenants map during deletion");
+            };
+
+            // FIXME: we should not be modifying this from outside of mgr.rs.
+            // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
+            crate::metrics::TENANT_MANAGER
+                .tenant_slots
+                .set(locked.len() as u64);
+        }

        *guard = Self::Finished;

--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -341,20 +341,7 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        //
-        // It is critical we are responsive to cancellation here. Otherwise, we deadlock with
-        // tenant deletion (holds TENANTS in read mode) any other task that attempts to
-        // acquire TENANTS in write mode before we here call get_tenant.
-        // See https://github.com/neondatabase/neon/issues/5284.
-        let res = tokio::select! {
-            _ = cancel.cancelled() => {
-                return ControlFlow::Break(());
-            }
-            res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
-                res
-            }
-        };
-        let tenant = match res {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -626,6 +626,8 @@ class NeonEnvBuilder:
                sk.stop(immediate=True)

            for pageserver in self.env.pageservers:
+                pageserver.assert_no_metric_errors()
+
                pageserver.stop(immediate=True)

            if self.env.attachment_service is not None:
@@ -1784,6 +1786,21 @@ class NeonPageserver(PgProtocol):

        assert not errors

+    def assert_no_metric_errors(self):
+        """
+        Certain metrics should _always_ be zero: they track conditions that indicate a bug.
+        """
+        if not self.running:
+            log.info(f"Skipping metrics check on pageserver {self.id}, it is not running")
+            return
+
+        for metric in [
+            "pageserver_tenant_manager_unexpected_errors_total",
+            "pageserver_deletion_queue_unexpected_errors_total",
+        ]:
+            value = self.http_client().get_metric_value(metric)
+            assert value == 0, f"Nonzero {metric} == {value}"
+
    def log_contains(self, pattern: str) -> Optional[str]:
        """Check that the pageserver log contains a line that matches the given regex"""
        logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,6 +134,9 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.pageserver_stop(env.pageserver.id)
    env.neon_cli.safekeeper_stop()

+    # Keep NeonEnv state up to date, it usually owns starting/stopping services
+    env.pageserver.running = False
+
    # Default start
    res = env.neon_cli.raw_cli(["start"])
    res.check_returncode()
@@ -155,6 +158,10 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID)
    env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1)

+    # Keep NeonEnv state up to date, it usually owns starting/stopping services
+    env.pageservers[0].running = False
+    env.pageservers[1].running = False
+
    # Addressing a nonexistent ID throws
    with pytest.raises(RuntimeError):
        env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100)
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -20,6 +20,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    endpoint = env.endpoints.create_start("main")
    pageserver_http = env.pageserver.http_client()

+    assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+
    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

@@ -52,6 +54,9 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    env.pageserver.stop()
    env.pageserver.start()

+    # We reloaded our tenant
+    assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+
    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)

--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -63,6 +63,9 @@ def test_tenant_delete_smoke(
        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
    )

+    # Default tenant and the one we created
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
+
    # create two timelines one being the parent of another
    parent = None
    for timeline in ["first", "second"]:
@@ -88,7 +91,9 @@ def test_tenant_delete_smoke(

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()
@@ -104,6 +109,9 @@ def test_tenant_delete_smoke(
            ),
        )

+    # Deletion updates the tenant count: the one default tenant remains
+    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
+

 class Check(enum.Enum):
    RETRY_WITHOUT_RESTART = enum.auto()
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -26,6 +26,16 @@ from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample

+# In tests that overlap endpoint activity with tenant attach/detach, there are
+# a variety of warnings that the page service may emit when it cannot acquire
+# an active tenant to serve a request
+PERMIT_PAGE_SERVICE_ERRORS = [
+    ".*page_service.*Tenant .* not found",
+    ".*page_service.*Tenant .* is not active",
+    ".*page_service.*cancelled",
+    ".*page_service.*will not become active.*",
+]
+

 def do_gc_target(
    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
@@ -60,12 +70,7 @@ def test_tenant_reattach(
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        with endpoint.cursor() as cur:
@@ -235,10 +240,7 @@ def test_tenant_reattach_while_busy(

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)

@@ -259,7 +261,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

-    env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # first check for non existing tenant
    tenant_id = TenantId.generate()
@@ -271,19 +273,9 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):

    assert excinfo.value.status_code == 404

-    # the error will be printed to the log too
-    env.pageserver.allowed_errors.append(".*NotFound: tenant *")
-
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
-
    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()

@@ -345,12 +337,7 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
    # create a new tenant
    tenant_id, _ = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -401,12 +388,7 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
    # create a new tenant
    tenant_id, _ = env.neon_cli.create_tenant()

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -453,12 +435,7 @@ def test_detach_while_attaching(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
@@ -593,12 +570,7 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    data_id = 1
    data_secret = "very secret secret"
@@ -649,12 +621,7 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):

    tenant_id = env.initial_tenant

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
    with pytest.raises(
@@ -693,12 +660,7 @@ def test_ignore_while_attaching(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    # Attempts to connect from compute to pageserver while the tenant is
-    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
-    env.pageserver.allowed_errors.append(
-        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
-    )
+    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)

    data_id = 1
    data_secret = "very secret secret"