Update epic-template.md

Use tasklists instead of a plain list of checkboxes in the Epic template to improve the discoverability of parent issues.
2026-05-20 14:40:37 +00:00 · 2023-11-06 13:12:34 +01:00
88 changed files with 2977 additions and 4422 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,11 +22,5 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

-[final-excludes]
-# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-# from depending on workspace-hack because most of the dependencies are not used.
-workspace-members = ["vm_monitor"]
-
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,9 +17,9 @@ assignees: ''
 ## Implementation ideas


+```[tasklist]
 ## Tasks
- [ ]
-
+```

 ## Other related tasks and Epics
 - 
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -723,7 +723,6 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -170,12 +170,6 @@ dependencies = [
 "backtrace",
 ]

-[[package]]
-name = "arc-swap"
-version = "1.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
-
 [[package]]
 name = "archery"
 version = "0.5.0"
@@ -5957,7 +5951,6 @@ name = "utils"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "arc-swap",
 "async-trait",
 "bincode",
 "byteorder",
@@ -6055,6 +6048,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
+ "workspace_hack",
 ]

 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,7 +36,6 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 azure_core = "0.16"
 azure_identity = "0.16"
--- a/5
+++ b/5
@@ -27,7 +27,6 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
-ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -79,9 +78,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/4
+++ b/4
@@ -72,10 +72,6 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
-	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
-		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
-		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
-		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -710,12 +710,8 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self) -> Result<()> {
-        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
-        Command::new(pgctl_bin)
-            .args(["reload", "-D", &self.pgdata])
-            .output()
-            .expect("cannot run pg_ctl process");
+    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
+        client.simple_query("SELECT pg_reload_conf()")?;
        Ok(())
    }

@@ -728,9 +724,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
-        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
+        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
+//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
+//!
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY:
+    // SAFETY
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,10 +1,11 @@
-//! Local control plane.
-//!
-//! Can start, configure and stop postgres instances running as a local processes.
-//!
-//! Intended to be used in integration tests and in CLI tools for
-//! local installations.
-#![deny(clippy::undocumented_unsafe_blocks)]
+//
+// Local control plane.
+//
+// Can start, configure and stop postgres instances running as a local processes.
+//
+// Intended to be used in integration tests and in CLI tools for
+// local installations.
+//

 pub mod attachment_service;
 mod background_process;
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
+//!
 //! Shared code for consumption metics collection
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
+//!
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,7 +2,6 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
-#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,8 +2,6 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -730,17 +728,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                trace!("got query {query_string:?}");
                if let Err(e) = handler.process_query(self, query_string).await {
-                    match e {
-                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
-                        e => {
-                            log_query_error(query_string, &e);
-                            let short_error = short_error(&e);
-                            self.write_message_noflush(&BeMessage::ErrorResponse(
-                                &short_error,
-                                Some(e.pg_error_code()),
-                            ))?;
-                        }
-                    }
+                    log_query_error(query_string, &e);
+                    let short_error = short_error(&e);
+                    self.write_message_noflush(&BeMessage::ErrorResponse(
+                        &short_error,
+                        Some(e.pg_error_code()),
+                    ))?;
                }
                self.write_message_noflush(&BeMessage::ReadyForQuery)?;
            }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,7 +8,6 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -21,7 +20,6 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
-                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,7 +1,6 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
-#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -1,18 +1,21 @@
 //! Azure Blob Storage wrapper

-use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::sync::Arc;
-use std::{borrow::Cow, io::Cursor};
+use std::{borrow::Cow, collections::HashMap, io::Cursor};

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
 use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::Header;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
 use azure_storage_blobs::prelude::ClientBuilder;
-use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use azure_storage_blobs::{
+    blob::operations::GetBlobBuilder,
+    prelude::{BlobClient, ContainerClient},
+};
 use futures_util::StreamExt;
 use http_types::StatusCode;
 use tokio::io::AsyncRead;
@@ -109,19 +112,16 @@ impl AzureBlobStorage {

    async fn download_for_builder(
        &self,
+        metadata: StorageMetadata,
        builder: GetBlobBuilder,
    ) -> Result<Download, DownloadError> {
        let mut response = builder.into_stream();

-        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
        let mut buf = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
-            if let Some(blob_meta) = part.blob.metadata {
-                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
-            }
            let data = part
                .data
                .collect()
@@ -131,9 +131,28 @@ impl AzureBlobStorage {
        }
        Ok(Download {
            download_stream: Box::pin(Cursor::new(buf)),
-            metadata: Some(StorageMetadata(metadata)),
+            metadata: Some(metadata),
        })
    }
+    // TODO get rid of this function once we have metadata included in the response
+    // https://github.com/Azure/azure-sdk-for-rust/issues/1439
+    async fn get_metadata(
+        &self,
+        blob_client: &BlobClient,
+    ) -> Result<StorageMetadata, DownloadError> {
+        let builder = blob_client.get_metadata();
+
+        let response = builder.into_future().await.map_err(to_download_error)?;
+        let mut map = HashMap::new();
+
+        for md in response.metadata.iter() {
+            map.insert(
+                md.name().as_str().to_string(),
+                md.value().as_str().to_string(),
+            );
+        }
+        Ok(StorageMetadata(map))
+    }

    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
        self.concurrency_limiter
@@ -250,9 +269,11 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

+        let metadata = self.get_metadata(&blob_client).await?;
+
        let builder = blob_client.get();

-        self.download_for_builder(builder).await
+        self.download_for_builder(metadata, builder).await
    }

    async fn download_byte_range(
@@ -264,6 +285,8 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

+        let metadata = self.get_metadata(&blob_client).await?;
+
        let mut builder = blob_client.get();

        if let Some(end_exclusive) = end_exclusive {
@@ -278,7 +301,7 @@ impl RemoteStorage for AzureBlobStorage {
            builder = builder.range(Range::new(start_inclusive, end_exclusive));
        }

-        self.download_for_builder(builder).await
+        self.download_for_builder(metadata, builder).await
    }

    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
@@ -311,8 +334,4 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(())
    }
-
-    async fn copy_object(&self, src: &RemotePath, dst: &RemotePath) -> anyhow::Result<()> {
-        unimplemented!()
-    }
 }
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,8 +6,6 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
@@ -114,7 +112,7 @@ impl RemotePath {
        self.0.file_name()
    }

-    pub fn join<P: AsRef<Utf8Path>>(&self, segment: P) -> Self {
+    pub fn join(&self, segment: &Utf8Path) -> Self {
        Self(self.0.join(segment))
    }

@@ -215,8 +213,6 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
-
-    async fn copy_object(&self, src: &RemotePath, dst: &RemotePath) -> anyhow::Result<()>;
 }

 pub struct Download {
@@ -379,15 +375,6 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
-
-    pub async fn copy_object(&self, src: &RemotePath, dst: &RemotePath) -> anyhow::Result<()> {
-        match self {
-            Self::LocalFs(s) => s.copy_object(src, dst).await,
-            Self::AwsS3(s) => s.copy_object(src, dst).await,
-            Self::AzureBlob(s) => s.copy_object(src, dst).await,
-            Self::Unreliable(s) => s.copy_object(src, dst).await,
-        }
-    }
 }

 impl GenericRemoteStorage {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -393,27 +393,6 @@ impl RemoteStorage for LocalFs {
        }
        Ok(())
    }
-
-    async fn copy_object(&self, src: &RemotePath, dst: &RemotePath) -> anyhow::Result<()> {
-        let src_path = src.with_base(&self.storage_root);
-        let dst_path = dst.with_base(&self.storage_root);
-
-        // If the destination file already exists, we need to delete it first.
-        if dst_path.exists() {
-            fs::remove_file(&dst_path).await?;
-        }
-
-        // Copy the file.
-        fs::copy(&src_path, &dst_path).await?;
-
-        // Copy the metadata.
-        let metadata_path = storage_metadata_path(&src_path);
-        if metadata_path.exists() {
-            fs::copy(&metadata_path, storage_metadata_path(&dst_path)).await?;
-        }
-
-        Ok(())
-    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -221,8 +221,6 @@ impl S3Bucket {
            )),
        }
    }
-
-
 }

 pin_project_lite::pin_project! {
@@ -517,11 +515,6 @@ impl RemoteStorage for S3Bucket {
        let paths = std::array::from_ref(path);
        self.delete_objects(paths).await
    }
-
-    async fn copy_object(&self, src: &RemotePath, dst: &RemotePath) -> anyhow::Result<()> {
-        unimplemented!()
-    }
-
 }

 /// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`].
@@ -604,6 +597,4 @@ mod tests {
            }
        }
    }
-
-
 }
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -160,8 +160,4 @@ impl RemoteStorage for UnreliableWrapper {
        }
        Ok(())
    }
-
-    async fn copy_object(&self, src: &RemotePath, dst: &RemotePath) -> anyhow::Result<()> {
-        unimplemented!()
-    }
 }
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,6 +1,4 @@
 //! Synthetic size calculation
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,8 +32,6 @@
 //!         .init();
 //! }
 //! ```
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,6 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
-arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,8 +1,7 @@
 // For details about authentication see docs/authentication.md

-use arc_swap::ArcSwap;
 use serde;
-use std::{fs, sync::Arc};
+use std::fs;

 use anyhow::Result;
 use camino::Utf8Path;
@@ -45,88 +44,31 @@ impl Claims {
    }
 }

-pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
-
-impl SwappableJwtAuth {
-    pub fn new(jwt_auth: JwtAuth) -> Self {
-        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
-    }
-    pub fn swap(&self, jwt_auth: JwtAuth) {
-        self.0.swap(Arc::new(jwt_auth));
-    }
-    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        self.0.load().decode(token)
-    }
-}
-
-impl std::fmt::Debug for SwappableJwtAuth {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Swappable({:?})", self.0.load())
-    }
-}
-
 pub struct JwtAuth {
-    decoding_keys: Vec<DecodingKey>,
+    decoding_key: DecodingKey,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
+    pub fn new(decoding_key: DecodingKey) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_keys,
+            decoding_key,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let metadata = key_path.metadata()?;
-        let decoding_keys = if metadata.is_dir() {
-            let mut keys = Vec::new();
-            for entry in fs::read_dir(key_path)? {
-                let path = entry?.path();
-                if !path.is_file() {
-                    // Ignore directories (don't recurse)
-                    continue;
-                }
-                let public_key = fs::read(path)?;
-                keys.push(DecodingKey::from_ed_pem(&public_key)?);
-            }
-            keys
-        } else if metadata.is_file() {
-            let public_key = fs::read(key_path)?;
-            vec![DecodingKey::from_ed_pem(&public_key)?]
-        } else {
-            anyhow::bail!("path is neither a directory or a file")
-        };
-        if decoding_keys.is_empty() {
-            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
-        }
-        Ok(Self::new(decoding_keys))
+        let public_key = fs::read(key_path)?;
+        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
    }

-    /// Attempt to decode the token with the internal decoding keys.
-    ///
-    /// The function tries the stored decoding keys in succession,
-    /// and returns the first yielding a successful result.
-    /// If there is no working decoding key, it returns the last error.
    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        let mut res = None;
-        for decoding_key in &self.decoding_keys {
-            res = Some(decode(token, decoding_key, &self.validation));
-            if let Some(Ok(res)) = res {
-                return Ok(res);
-            }
-        }
-        if let Some(res) = res {
-            res.map_err(anyhow::Error::new)
-        } else {
-            anyhow::bail!("no JWT decoding keys configured")
-        }
+        Ok(decode(token, &self.decoding_key, &self.validation)?)
    }
 }

@@ -187,7 +129,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]);
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
        assert_eq!(claims_from_token, expected_claims);

@@ -204,7 +146,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;

        // decode it back
-        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]);
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
        let decoded = auth.decode(&encoded)?;

        assert_eq!(decoded.claims, claims);
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{Claims, SwappableJwtAuth};
+use crate::auth::{Claims, JwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -120,8 +120,6 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
-
-        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,5 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
-#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

@@ -78,9 +77,6 @@ pub mod completion;
 /// Reporting utilities
 pub mod error;

-/// async timeout helper
-pub mod timeout;
-
 pub mod sync;

 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,7 +1,6 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) -> ! {
-    // SAFETY: exiting is safe, the ffi is not safe
+pub fn exit_now(code: u8) {
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync.rs
+++ b/libs/utils/src/sync.rs
@@ -1,3 +1 @@
 pub mod heavier_once_cell;
-
-pub mod gate;
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -1,158 +0,0 @@
-use std::{sync::Arc, time::Duration};
-
-/// Gates are a concurrency helper, primarily used for implementing safe shutdown.
-///
-/// Users of a resource call `enter()` to acquire a GateGuard, and the owner of
-/// the resource calls `close()` when they want to ensure that all holders of guards
-/// have released them, and that no future guards will be issued.
-pub struct Gate {
-    /// Each caller of enter() takes one unit from the semaphore. In close(), we
-    /// take all the units to ensure all GateGuards are destroyed.
-    sem: Arc<tokio::sync::Semaphore>,
-
-    /// For observability only: a name that will be used to log warnings if a particular
-    /// gate is holding up shutdown
-    name: String,
-}
-
-/// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
-/// not complete.
-#[derive(Debug)]
-pub struct GateGuard(tokio::sync::OwnedSemaphorePermit);
-
-/// Observability helper: every `warn_period`, emit a log warning that we're still waiting on this gate
-async fn warn_if_stuck<Fut: std::future::Future>(
-    fut: Fut,
-    name: &str,
-    warn_period: std::time::Duration,
-) -> <Fut as std::future::Future>::Output {
-    let started = std::time::Instant::now();
-
-    let mut fut = std::pin::pin!(fut);
-
-    loop {
-        match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => return ret,
-            Err(_) => {
-                tracing::warn!(
-                    gate = name,
-                    elapsed_ms = started.elapsed().as_millis(),
-                    "still waiting, taking longer than expected..."
-                );
-            }
-        }
-    }
-}
-
-#[derive(Debug)]
-pub enum GateError {
-    GateClosed,
-}
-
-impl Gate {
-    const MAX_UNITS: u32 = u32::MAX;
-
-    pub fn new(name: String) -> Self {
-        Self {
-            sem: Arc::new(tokio::sync::Semaphore::new(Self::MAX_UNITS as usize)),
-            name,
-        }
-    }
-
-    /// Acquire a guard that will prevent close() calls from completing. If close()
-    /// was already called, this will return an error which should be interpreted
-    /// as "shutting down".
-    ///
-    /// This function would typically be used from e.g. request handlers. While holding
-    /// the guard returned from this function, it is important to respect a CancellationToken
-    /// to avoid blocking close() indefinitely: typically types that contain a Gate will
-    /// also contain a CancellationToken.
-    pub fn enter(&self) -> Result<GateGuard, GateError> {
-        self.sem
-            .clone()
-            .try_acquire_owned()
-            .map(GateGuard)
-            .map_err(|_| GateError::GateClosed)
-    }
-
-    /// Types with a shutdown() method and a gate should call this method at the
-    /// end of shutdown, to ensure that all GateGuard holders are done.
-    ///
-    /// This will wait for all guards to be destroyed.  For this to complete promptly, it is
-    /// important that the holders of such guards are respecting a CancellationToken which has
-    /// been cancelled before entering this function.
-    pub async fn close(&self) {
-        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
-    }
-
-    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
-    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
-    /// the CancellationToken on such types is analogous to "Did shutdown start?"
-    pub fn close_complete(&self) -> bool {
-        self.sem.is_closed()
-    }
-
-    async fn do_close(&self) {
-        tracing::debug!(gate = self.name, "Closing Gate...");
-        match self.sem.acquire_many(Self::MAX_UNITS).await {
-            Ok(_units) => {
-                // While holding all units, close the semaphore.  All subsequent calls to enter() will fail.
-                self.sem.close();
-            }
-            Err(_) => {
-                // Semaphore closed: we are the only function that can do this, so it indicates a double-call.
-                // This is legal.  Timeline::shutdown for example is not protected from being called more than
-                // once.
-                tracing::debug!(gate = self.name, "Double close")
-            }
-        }
-        tracing::debug!(gate = self.name, "Closed Gate.")
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use futures::FutureExt;
-
-    use super::*;
-
-    #[tokio::test]
-    async fn test_idle_gate() {
-        // Having taken no gates, we should not be blocked in close
-        let gate = Gate::new("test".to_string());
-        gate.close().await;
-
-        // If a guard is dropped before entering, close should not be blocked
-        let gate = Gate::new("test".to_string());
-        let guard = gate.enter().unwrap();
-        drop(guard);
-        gate.close().await;
-
-        // Entering a closed guard fails
-        gate.enter().expect_err("enter should fail after close");
-    }
-
-    #[tokio::test]
-    async fn test_busy_gate() {
-        let gate = Gate::new("test".to_string());
-
-        let guard = gate.enter().unwrap();
-
-        let mut close_fut = std::pin::pin!(gate.close());
-
-        // Close should be blocked
-        assert!(close_fut.as_mut().now_or_never().is_none());
-
-        // Attempting to enter() should fail, even though close isn't done yet.
-        gate.enter()
-            .expect_err("enter should fail after entering close");
-
-        drop(guard);
-
-        // Guard is gone, close should finish
-        assert!(close_fut.as_mut().now_or_never().is_some());
-
-        // Attempting to enter() is still forbidden
-        gate.enter().expect_err("enter should fail finishing close");
-    }
-}
--- a/libs/utils/src/timeout.rs
+++ b/libs/utils/src/timeout.rs
@@ -1,37 +0,0 @@
-use std::time::Duration;
-
-use tokio_util::sync::CancellationToken;
-
-pub enum TimeoutCancellableError {
-    Timeout,
-    Cancelled,
-}
-
-/// Wrap [`tokio::time::timeout`] with a CancellationToken.
-///
-/// This wrapper is appropriate for any long running operation in a task
-/// that ought to respect a CancellationToken (which means most tasks).
-///
-/// The only time you should use a bare tokio::timeout is when the future `F`
-/// itself respects a CancellationToken: otherwise, always use this wrapper
-/// with your CancellationToken to ensure that your task does not hold up
-/// graceful shutdown.
-pub async fn timeout_cancellable<F>(
-    duration: Duration,
-    cancel: &CancellationToken,
-    future: F,
-) -> Result<F::Output, TimeoutCancellableError>
-where
-    F: std::future::Future,
-{
-    tokio::select!(
-        r = tokio::time::timeout(duration, future) => {
-            r.map_err(|_| TimeoutCancellableError::Timeout)
-
-        },
-        _ = cancel.cancelled() => {
-            Err(TimeoutCancellableError::Cancelled)
-
-        }
-    )
-}
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,12 +19,13 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio = { workspace = true, features = ["rt-multi-thread"] }
+tokio.workspace = true
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,11 +34,8 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::{JwtAuth, SwappableJwtAuth},
-    logging, project_build_tag, project_git_version,
-    sentry_init::init_sentry,
-    signals::Signal,
-    tcp_listener,
+    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
+    signals::Signal, tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -324,12 +321,13 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and exists
+        // unwrap is ok because check is performed when creating config, so path is set and file exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
-
-        let jwt_auth = JwtAuth::from_key_path(key_path)?;
-        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));
+        info!(
+            "Loading public key for verifying JWT tokens from {:#?}",
+            key_path
+        );
+        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -412,7 +410,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -422,7 +420,6 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
-    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -551,7 +548,6 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
-                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
+    /// Path to a file containing the public key for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -266,7 +266,7 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
+            if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await {
                // TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
                // We can put in some prioritization for consumption metrics.
                // Same for the loop that fetches computed metrics.
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -202,6 +202,7 @@ pub(super) async fn collect_all_metrics(
            None
        } else {
            crate::tenant::mgr::get_tenant(id, true)
+                .await
                .ok()
                .map(|tenant| (id, tenant))
        }
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,24 +55,21 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+        fail::fail_point!("deletion-queue-before-execute", |_| {
+            info!("Skipping execution, failpoint set");
+            metrics::DELETION_QUEUE
+                .remote_errors
+                .with_label_values(&["failpoint"])
+                .inc();
+            Err(anyhow::anyhow!("failpoint hit"))
+        });
+
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async {
-                fail::fail_point!("deletion-queue-before-execute", |_| {
-                    info!("Skipping execution, failpoint set");
-
-                    metrics::DELETION_QUEUE
-                        .remote_errors
-                        .with_label_values(&["failpoint"])
-                        .inc();
-                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
-                });
-
-                self.remote_storage.delete_objects(&self.accumulator).await
-            },
+            || async { self.remote_storage.delete_objects(&self.accumulator).await },
            |_| false,
            3,
            10,
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -403,7 +403,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                    return (evicted_bytes, evictions_failed);
                };

-                let results = timeline.evict_layers(&batch).await;
+                let results = timeline.evict_layers(&batch, &cancel).await;

                match results {
                    Ok(results) => {
@@ -545,7 +545,7 @@ async fn collect_eviction_candidates(
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
+        let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
@@ -554,11 +554,6 @@ async fn collect_eviction_candidates(
            }
        };

-        if tenant.cancel.is_cancelled() {
-            info!(%tenant_id, "Skipping tenant for eviction, it is shutting down");
-            continue;
-        }
-
        // collect layers from all timelines in this tenant
        //
        // If one of the timelines becomes `!is_active()` during the iteration,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,31 +52,6 @@ paths:
              schema:
                type: object

-  /v1/reload_auth_validation_keys:
-    post:
-      description: Reloads the JWT public keys from their pre-configured location on disk.
-      responses:
-        "200":
-          description: The reload completed successfully.
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error (also hits if no keys were found)
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -20,7 +20,6 @@ use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::auth::JwtAuth;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -36,8 +35,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
-    TenantSlotError, TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -46,7 +44,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
-    auth::SwappableJwtAuth,
+    auth::JwtAuth,
    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -64,8 +62,7 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
@@ -76,8 +73,7 @@ pub struct State {
 impl State {
    pub fn new(
        conf: &'static PageServerConf,
-        tenant_manager: Arc<TenantManager>,
-        auth: Option<Arc<SwappableJwtAuth>>,
+        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -89,7 +85,6 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
-            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
@@ -151,59 +146,28 @@ impl From<PageReconstructError> for ApiError {
 impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
-            TenantMapInsertError::SlotError(e) => e.into(),
-            TenantMapInsertError::SlotUpsertError(e) => e.into(),
+            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
+                ApiError::ResourceUnavailable(format!("{tmie}").into())
+            }
+            TenantMapInsertError::TenantAlreadyExists(id, state) => {
+                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
+            }
+            TenantMapInsertError::TenantExistsSecondary(id) => {
+                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
+            }
            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }

-impl From<TenantSlotError> for ApiError {
-    fn from(e: TenantSlotError) -> ApiError {
-        use TenantSlotError::*;
-        match e {
-            NotFound(tenant_id) => {
-                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
-            }
-            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
-            InProgress => {
-                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
-            }
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantSlotUpsertError> for ApiError {
-    fn from(e: TenantSlotUpsertError) -> ApiError {
-        use TenantSlotUpsertError::*;
-        match e {
-            InternalError(e) => ApiError::InternalServerError(anyhow::anyhow!("{e}")),
-            MapState(e) => e.into(),
-        }
-    }
-}
-
-impl From<TenantMapError> for ApiError {
-    fn from(e: TenantMapError) -> ApiError {
-        use TenantMapError::*;
-        match e {
-            StillInitializing | ShuttingDown => {
-                ApiError::ResourceUnavailable(format!("{e}").into())
-            }
-        }
-    }
-}
-
 impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
+            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
            TenantStateError::IsStopping(_) => {
                ApiError::ResourceUnavailable("Tenant is stopping".into())
            }
-            TenantStateError::SlotError(e) => e.into(),
-            TenantStateError::SlotUpsertError(e) => e.into(),
-            TenantStateError::Other(e) => ApiError::InternalServerError(anyhow!(e)),
+            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
 }
@@ -224,7 +188,6 @@ impl From<GetTenantError> for ApiError {
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
-            GetTenantError::MapState(e) => ApiError::ResourceUnavailable(format!("{e}").into()),
        }
    }
 }
@@ -279,9 +242,6 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
            Get(g) => ApiError::from(g),
            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
            Timeline(t) => ApiError::from(t),
-            NotAttached => ApiError::NotFound(anyhow::anyhow!("Tenant is not attached").into()),
-            SlotError(e) => e.into(),
-            SlotUpsertError(e) => e.into(),
            Other(o) => ApiError::InternalServerError(o),
            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
        }
@@ -393,32 +353,6 @@ async fn status_handler(
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

-async fn reload_auth_validation_keys_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-    let config = get_config(&request);
-    let state = get_state(&request);
-    let Some(shared_auth) = &state.auth else {
-        return json_response(StatusCode::BAD_REQUEST, ());
-    };
-    // unwrap is ok because check is performed when creating config, so path is set and exists
-    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
-    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
-
-    match JwtAuth::from_key_path(key_path) {
-        Ok(new_auth) => {
-            shared_auth.swap(new_auth);
-            json_response(StatusCode::OK, ())
-        }
-        Err(e) => {
-            warn!("Error reloading public keys from {key_path:?}: {e:}");
-            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
-        }
-    }
-}
-
 async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -434,7 +368,7 @@ async fn timeline_create_handler(
    let state = get_state(&request);

    async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        match tenant.create_timeline(
            new_timeline_id,
            request_data.ancestor_timeline_id.map(TimelineId::from),
@@ -462,9 +396,6 @@ async fn timeline_create_handler(
            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
            }
-            Err(tenant::CreateTimelineError::ShuttingDown) => {
-                json_response(StatusCode::SERVICE_UNAVAILABLE,HttpErrorBody::from_msg("tenant shutting down".to_string()))
-            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -484,7 +415,7 @@ async fn timeline_list_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -523,7 +454,7 @@ async fn timeline_detail_handler(
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let timeline_info = async {
-        let tenant = mgr::get_tenant(tenant_id, true)?;
+        let tenant = mgr::get_tenant(tenant_id, true).await?;

        let timeline = tenant
            .get_timeline(timeline_id, false)
@@ -779,7 +710,7 @@ async fn tenant_status(
    check_permission(&request, Some(tenant_id))?;

    let tenant_info = async {
-        let tenant = mgr::get_tenant(tenant_id, false)?;
+        let tenant = mgr::get_tenant(tenant_id, false).await?;

        // Calculate total physical size of all timelines
        let mut current_physical_size = 0;
@@ -842,7 +773,7 @@ async fn tenant_size_handler(
    let headers = request.headers();

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;

    // this can be long operation
    let inputs = tenant
@@ -1099,7 +1030,7 @@ async fn get_tenant_config_handler(
    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
    check_permission(&request, Some(tenant_id))?;

-    let tenant = mgr::get_tenant(tenant_id, false)?;
+    let tenant = mgr::get_tenant(tenant_id, false).await?;

    let response = HashMap::from([
        (
@@ -1158,7 +1089,7 @@ async fn put_tenant_location_config_handler(
            .await
        {
            match e {
-                TenantStateError::SlotError(TenantSlotError::NotFound(_)) => {
+                TenantStateError::NotFound(_) => {
                    // This API is idempotent: a NotFound on a detach is fine.
                }
                _ => return Err(e.into()),
@@ -1170,14 +1101,20 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    state
-        .tenant_manager
-        .upsert_location(tenant_id, location_conf, &ctx)
-        .await
-        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-        // principle we might have hit something like concurrent API calls to the same tenant,
-        // which is not a 400 but a 409.
-        .map_err(ApiError::BadRequest)?;
+    mgr::upsert_location(
+        state.conf,
+        tenant_id,
+        location_conf,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .await
+    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+    // principle we might have hit something like concurrent API calls to the same tenant,
+    // which is not a 400 but a 409.
+    .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1190,6 +1127,7 @@ async fn handle_tenant_break(
    let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;

    let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
+        .await
        .map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;

    tenant.set_broken("broken from test".to_owned()).await;
@@ -1494,7 +1432,7 @@ async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
 ) -> Result<Arc<Timeline>, ApiError> {
-    let tenant = mgr::get_tenant(tenant_id, true)?;
+    let tenant = mgr::get_tenant(tenant_id, true).await?;
    tenant
        .get_timeline(timeline_id, true)
        .map_err(|e| ApiError::NotFound(e.into()))
@@ -1719,7 +1657,7 @@ where
 pub fn make_router(
    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1748,9 +1686,6 @@ pub fn make_router(
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
-        .post("/v1/reload_auth_validation_keys", |r| {
-            api_handler(r, reload_auth_validation_keys_handler)
-        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
-
 mod auth;
 pub mod basebackup;
 pub mod config;
@@ -63,6 +61,14 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    )
    .await;

+    // Shut down any page service tasks.
+    timed(
+        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
+        "shutdown PageRequestHandlers",
+        Duration::from_secs(1),
+    )
+    .await;
+
    // Shut down all the tenants. This flushes everything to disk and kills
    // the checkpoint and GC tasks.
    timed(
@@ -72,15 +78,6 @@ pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_cod
    )
    .await;

-    // Shut down any page service tasks: any in-progress work for particular timelines or tenants
-    // should already have been canclled via mgr::shutdown_all_tenants
-    timed(
-        task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None),
-        "shutdown PageRequestHandlers",
-        Duration::from_secs(1),
-    )
-    .await;
-
    // Best effort to persist any outstanding deletions, to avoid leaking objects
    if let Some(mut deletion_queue) = deletion_queue {
        deletion_queue.shutdown(Duration::from_secs(5)).await;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -962,32 +962,6 @@ static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy
    .expect("failed to define a metric")
 });

-pub(crate) struct TenantManagerMetrics {
-    pub(crate) tenant_slots: UIntGauge,
-    pub(crate) tenant_slot_writes: IntCounter,
-    pub(crate) unexpected_errors: IntCounter,
-}
-
-pub(crate) static TENANT_MANAGER: Lazy<TenantManagerMetrics> = Lazy::new(|| {
-    TenantManagerMetrics {
-    tenant_slots: register_uint_gauge!(
-        "pageserver_tenant_manager_slots",
-        "How many slots currently exist, including all attached, secondary and in-progress operations",
-    )
-    .expect("failed to define a metric"),
-    tenant_slot_writes: register_int_counter!(
-        "pageserver_tenant_manager_slot_writes",
-        "Writes to a tenant slot, including all of create/attach/detach/delete"
-    )
-    .expect("failed to define a metric"),
-    unexpected_errors: register_int_counter!(
-        "pageserver_tenant_manager_unexpected_errors_total",
-        "Number of unexpected conditions encountered: nonzero value indicates a non-fatal bug."
-    )
-    .expect("failed to define a metric"),
-}
-});
-
 pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
@@ -1910,9 +1884,6 @@ pub fn preinitialize_metrics() {
    // Deletion queue stats
    Lazy::force(&DELETION_QUEUE);

-    // Tenant manager stats
-    Lazy::force(&TENANT_MANAGER);
-
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -40,7 +40,7 @@ use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
-    auth::{Claims, Scope, SwappableJwtAuth},
+    auth::{Claims, JwtAuth, Scope},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    simple_rcu::RcuReadGuard,
@@ -55,20 +55,16 @@ use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
-use crate::tenant::mgr::get_active_tenant_with_timeout;
-use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::Timeline;
+use crate::tenant::mgr::GetTenantError;
+use crate::tenant::{Tenant, Timeline};
 use crate::trace::Tracer;

 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
-// is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
-
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -122,7 +118,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
@@ -190,7 +186,7 @@ pub async fn libpq_listener_main(
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
@@ -227,7 +223,13 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(
+        conf,
+        broker_client,
+        auth,
+        connection_ctx,
+        task_mgr::shutdown_token(),
+    );
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -253,7 +255,7 @@ async fn page_service_conn_main(
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    claims: Option<Claims>,

    /// The context created for the lifetime of the connection
@@ -261,14 +263,19 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
+
+    /// A token that should fire when the tenant transitions from
+    /// attached state, or when the pageserver is shutting down.
+    cancel: CancellationToken,
 }

 impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
-        auth: Option<Arc<SwappableJwtAuth>>,
+        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
+        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -276,6 +283,7 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            cancel,
        }
    }

@@ -283,11 +291,7 @@ impl PageServerHandler {
    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
    /// in the flush.
-    async fn flush_cancellable<IO>(
-        &self,
-        pgb: &mut PostgresBackend<IO>,
-        cancel: &CancellationToken,
-    ) -> Result<(), QueryError>
+    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
@@ -295,7 +299,7 @@ impl PageServerHandler {
            flush_r = pgb.flush() => {
                Ok(flush_r?)
            },
-            _ = cancel.cancelled() => {
+            _ = self.cancel.cancelled() => {
                Err(QueryError::Shutdown)
            }
        )
@@ -304,7 +308,6 @@ impl PageServerHandler {
    fn copyin_stream<'a, IO>(
        &'a self,
        pgb: &'a mut PostgresBackend<IO>,
-        cancel: &'a CancellationToken,
    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
    where
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
@@ -314,7 +317,7 @@ impl PageServerHandler {
                let msg = tokio::select! {
                    biased;

-                    _ = cancel.cancelled() => {
+                    _ = self.cancel.cancelled() => {
                        // We were requested to shut down.
                        let msg = "pageserver is shutting down";
                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
@@ -354,7 +357,7 @@ impl PageServerHandler {
                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
                        // error can't happen here, ErrorResponse serialization should be always ok
                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                        self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
                    }
                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
@@ -381,13 +384,12 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        // NOTE: pagerequests handler exits when connection is closed,
+        //       so there is no need to reset the association
+        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
+
        // Make request tracer if needed
-        let tenant = mgr::get_active_tenant_with_timeout(
-            tenant_id,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path = tenant
@@ -403,14 +405,9 @@ impl PageServerHandler {
            .get_timeline(timeline_id, true)
            .map_err(|e| anyhow::anyhow!(e))?;

-        // Avoid starting new requests if the timeline has already started shutting down,
-        // and block timeline shutdown until this request is complete, or drops out due
-        // to cancellation.
-        let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        self.flush_cancellable(pgb).await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -418,7 +415,7 @@ impl PageServerHandler {
            let msg = tokio::select! {
                biased;

-                _ = timeline.cancel.cancelled() => {
+                _ = self.cancel.cancelled() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
                    return Err(QueryError::Shutdown)
@@ -493,20 +490,9 @@ impl PageServerHandler {
                }
            };

-            if let Err(e) = &response {
-                if timeline.cancel.is_cancelled() {
-                    // If we fail to fulfil a request during shutdown, which may be _because_ of
-                    // shutdown, then do not send the error to the client.  Instead just drop the
-                    // connection.
-                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
-                    return Err(QueryError::Shutdown);
-                }
-            }
-
            let response = response.unwrap_or_else(|e| {
                // print the all details to the log with {:#}, but for the client the
-                // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                // here includes cancellation which is not an error.
+                // error message is enough
                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
                PagestreamBeMessage::Error(PagestreamErrorResponse {
                    message: e.to_string(),
@@ -514,7 +500,7 @@ impl PageServerHandler {
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb, &timeline.cancel).await?;
+            self.flush_cancellable(pgb).await?;
        }
        Ok(())
    }
@@ -536,14 +522,10 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));
        // Create empty timeline
        info!("creating new timeline");
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await?;
+        let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
        let timeline = tenant
            .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx)
            .await?;
@@ -561,9 +543,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &tenant.cancel).await?;
+        self.flush_cancellable(pgb).await?;

-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel)));
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -600,10 +582,9 @@ impl PageServerHandler {
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
    {
        debug_assert_current_span_has_tenant_and_timeline_id();
+        task_mgr::associate_with(Some(tenant_id), Some(timeline_id));

-        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
-            .await?;
+        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
            return Err(QueryError::Other(
@@ -617,8 +598,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
-        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel)));
+        self.flush_cancellable(pgb).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -811,9 +792,7 @@ impl PageServerHandler {
        let started = std::time::Instant::now();

        // check that the timeline exists
-        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
-            .await?;
+        let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
@@ -828,7 +807,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        self.flush_cancellable(pgb).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -880,7 +859,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
+        self.flush_cancellable(pgb).await?;

        let basebackup_after = started
            .elapsed()
@@ -912,25 +891,6 @@ impl PageServerHandler {
            .expect("claims presence already checked");
        check_permission(claims, tenant_id)
    }
-
-    /// Shorthand for getting a reference to a Timeline of an Active tenant.
-    async fn get_active_tenant_timeline(
-        &self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
-        let tenant = get_active_tenant_with_timeout(
-            tenant_id,
-            ACTIVE_TENANT_TIMEOUT,
-            &task_mgr::shutdown_token(),
-        )
-        .await
-        .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
-        Ok(timeline)
-    }
 }

 #[async_trait::async_trait]
@@ -1088,9 +1048,7 @@ where
                .record("timeline_id", field::display(timeline_id));

            self.check_permission(Some(tenant_id))?;
-            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id)
-                .await?;
+            let timeline = get_active_tenant_timeline(tenant_id, timeline_id, &ctx).await?;

            let end_of_timeline = timeline.get_last_record_rlsn();

@@ -1274,12 +1232,7 @@ where

            self.check_permission(Some(tenant_id))?;

-            let tenant = get_active_tenant_with_timeout(
-                tenant_id,
-                ACTIVE_TENANT_TIMEOUT,
-                &task_mgr::shutdown_token(),
-            )
-            .await?;
+            let tenant = get_active_tenant_with_timeout(tenant_id, &ctx).await?;
            pgb.write_message_noflush(&BeMessage::RowDescription(&[
                RowDescriptor::int8_col(b"checkpoint_distance"),
                RowDescriptor::int8_col(b"checkpoint_timeout"),
@@ -1325,16 +1278,67 @@ where
    }
 }

+#[derive(thiserror::Error, Debug)]
+enum GetActiveTenantError {
+    #[error(
+        "Timed out waiting {wait_time:?} for tenant active state. Latest state: {latest_state:?}"
+    )]
+    WaitForActiveTimeout {
+        latest_state: TenantState,
+        wait_time: Duration,
+    },
+    #[error(transparent)]
+    NotFound(GetTenantError),
+    #[error(transparent)]
+    WaitTenantActive(tenant::WaitToBecomeActiveError),
+}
+
 impl From<GetActiveTenantError> for QueryError {
    fn from(e: GetActiveTenantError) -> Self {
        match e {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
-                QueryError::Shutdown
+            GetActiveTenantError::WaitTenantActive(e) => QueryError::Other(anyhow::Error::new(e)),
+            GetActiveTenantError::NotFound(e) => QueryError::Other(anyhow::Error::new(e)),
+        }
+    }
+}
+
+/// Get active tenant.
+///
+/// If the tenant is Loading, waits for it to become Active, for up to 30 s. That
+/// ensures that queries don't fail immediately after pageserver startup, because
+/// all tenants are still loading.
+async fn get_active_tenant_with_timeout(
+    tenant_id: TenantId,
+    _ctx: &RequestContext, /* unused for now; taking a context lets us support cancellation in the future */
+) -> Result<Arc<Tenant>, GetActiveTenantError> {
+    let tenant = match mgr::get_tenant(tenant_id, false).await {
+        Ok(tenant) => tenant,
+        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
+        Err(GetTenantError::NotActive(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+        Err(GetTenantError::Broken(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+    };
+    let wait_time = Duration::from_secs(30);
+    match tokio::time::timeout(wait_time, tenant.wait_to_become_active()).await {
+        Ok(Ok(())) => Ok(tenant),
+        // no .context(), the error message is good enough and some tests depend on it
+        Ok(Err(e)) => Err(GetActiveTenantError::WaitTenantActive(e)),
+        Err(_) => {
+            let latest_state = tenant.current_state();
+            if latest_state == TenantState::Active {
+                Ok(tenant)
+            } else {
+                Err(GetActiveTenantError::WaitForActiveTimeout {
+                    latest_state,
+                    wait_time,
+                })
            }
-            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
 }
@@ -1355,3 +1359,18 @@ impl From<GetActiveTimelineError> for QueryError {
        }
    }
 }
+
+/// Shorthand for getting a reference to a Timeline of an Active tenant.
+async fn get_active_tenant_timeline(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ctx: &RequestContext,
+) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    let tenant = get_active_tenant_with_timeout(tenant_id, ctx)
+        .await
+        .map_err(GetActiveTimelineError::Tenant)?;
+    let timeline = tenant
+        .get_timeline(timeline_id, true)
+        .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
+    Ok(timeline)
+}
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -44,17 +44,6 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

-impl From<PageReconstructError> for CalculateLogicalSizeError {
-    fn from(pre: PageReconstructError) -> Self {
-        match pre {
-            PageReconstructError::AncestorStopping(_) | PageReconstructError::Cancelled => {
-                Self::Cancelled
-            }
-            _ => Self::Other(pre.into()),
-        }
-    }
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum RelationError {
    #[error("Relation Already Exists")]
@@ -584,17 +573,24 @@ impl Timeline {
        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

        // Fetch list of database dirs and iterate them
-        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
+        let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
        let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
+            for rel in self
+                .list_rels(*spcnode, *dbnode, lsn, ctx)
+                .await
+                .context("list rels")?
+            {
                if cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
                let relsize_key = rel_size_to_key(rel);
-                let mut buf = self.get(relsize_key, lsn, ctx).await?;
+                let mut buf = self
+                    .get(relsize_key, lsn, ctx)
+                    .await
+                    .with_context(|| format!("read relation size of {rel:?}"))?;
                let relsize = buf.get_u32_le();

                total_size += relsize as u64;
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -299,6 +299,10 @@ pub enum TaskKind {

 #[derive(Default)]
 struct MutableTaskState {
+    /// Tenant and timeline that this task is associated with.
+    tenant_id: Option<TenantId>,
+    timeline_id: Option<TimelineId>,
+
    /// Handle for waiting for the task to exit. It can be None, if the
    /// the task has already exited.
    join_handle: Option<JoinHandle<()>>,
@@ -315,11 +319,6 @@ struct PageServerTask {
    // To request task shutdown, just cancel this token.
    cancel: CancellationToken,

-    /// Tasks may optionally be launched for a particular tenant/timeline, enabling
-    /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_id: Option<TenantId>,
-    timeline_id: Option<TimelineId>,
-
    mutable: Mutex<MutableTaskState>,
 }

@@ -345,9 +344,11 @@ where
        kind,
        name: name.to_string(),
        cancel: cancel.clone(),
-        tenant_id,
-        timeline_id,
-        mutable: Mutex::new(MutableTaskState { join_handle: None }),
+        mutable: Mutex::new(MutableTaskState {
+            tenant_id,
+            timeline_id,
+            join_handle: None,
+        }),
    });

    TASKS.lock().unwrap().insert(task_id, Arc::clone(&task));
@@ -417,6 +418,8 @@ async fn task_finish(

    let mut shutdown_process = false;
    {
+        let task_mut = task.mutable.lock().unwrap();
+
        match result {
            Ok(Ok(())) => {
                debug!("Task '{}' exited normally", task_name);
@@ -425,13 +428,13 @@ async fn task_finish(
                if shutdown_process_on_error {
                    error!(
                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                }
            }
@@ -439,13 +442,13 @@ async fn task_finish(
                if shutdown_process_on_error {
                    error!(
                        "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                    shutdown_process = true;
                } else {
                    error!(
                        "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                        task_name, task.tenant_id, task.timeline_id, err
+                        task_name, task_mut.tenant_id, task_mut.timeline_id, err
                    );
                }
            }
@@ -457,6 +460,17 @@ async fn task_finish(
    }
 }

+/// Sets the tenant and timeline associated with the current task; expected to be called from within that task.
+pub fn associate_with(tenant_id: Option<TenantId>, timeline_id: Option<TimelineId>) {
+    CURRENT_TASK.with(|ct| {
+        let mut task_mut = ct.mutable.lock().unwrap();
+        task_mut.tenant_id = tenant_id;
+        task_mut.timeline_id = timeline_id;
+    });
+}
+
+// Is there a task running that matches the criteria
+
 /// Signal and wait for tasks to shut down.
 ///
 ///
@@ -479,16 +493,17 @@ pub async fn shutdown_tasks(
    {
        let tasks = TASKS.lock().unwrap();
        for task in tasks.values() {
+            let task_mut = task.mutable.lock().unwrap();
            if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_id.is_none() || task.tenant_id == tenant_id)
-                && (timeline_id.is_none() || task.timeline_id == timeline_id)
+                && (tenant_id.is_none() || task_mut.tenant_id == tenant_id)
+                && (timeline_id.is_none() || task_mut.timeline_id == timeline_id)
            {
                task.cancel.cancel();
                victim_tasks.push((
                    Arc::clone(task),
                    task.kind,
-                    task.tenant_id,
-                    task.timeline_id,
+                    task_mut.tenant_id,
+                    task_mut.timeline_id,
                ));
            }
        }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -26,7 +26,6 @@ use tracing::*;
 use utils::completion;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::fs_ext;
-use utils::sync::gate::Gate;

 use std::cmp::min;
 use std::collections::hash_map::Entry;
@@ -55,8 +54,6 @@ use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
-use self::mgr::GetActiveTenantError;
-use self::mgr::GetTenantError;
 use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
@@ -255,20 +252,6 @@ pub struct Tenant {
    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,

    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
-
-    // Cancellation token fires when we have entered shutdown().  This is a parent of
-    // Timelines' cancellation token.
-    pub(crate) cancel: CancellationToken,
-
-    // Users of the Tenant such as the page service must take this Gate to avoid
-    // trying to use a Tenant which is shutting down.
-    pub(crate) gate: Gate,
-}
-
-impl std::fmt::Debug for Tenant {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{} ({})", self.tenant_id, self.current_state())
-    }
 }

 pub(crate) enum WalRedoManager {
@@ -376,6 +359,34 @@ impl Debug for SetStoppingError {
    }
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum WaitToBecomeActiveError {
+    WillNotBecomeActive {
+        tenant_id: TenantId,
+        state: TenantState,
+    },
+    TenantDropped {
+        tenant_id: TenantId,
+    },
+}
+
+impl std::fmt::Display for WaitToBecomeActiveError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            WaitToBecomeActiveError::WillNotBecomeActive { tenant_id, state } => {
+                write!(
+                    f,
+                    "Tenant {} will not become active. Current state: {:?}",
+                    tenant_id, state
+                )
+            }
+            WaitToBecomeActiveError::TenantDropped { tenant_id } => {
+                write!(f, "Tenant {tenant_id} will not become active (dropped)")
+            }
+        }
+    }
+}
+
 #[derive(thiserror::Error, Debug)]
 pub enum CreateTimelineError {
    #[error("a timeline with the given ID already exists")]
@@ -384,8 +395,6 @@ pub enum CreateTimelineError {
    AncestorLsn(anyhow::Error),
    #[error("ancestor timeline is not active")]
    AncestorNotActive,
-    #[error("tenant shutting down")]
-    ShuttingDown,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -517,7 +526,7 @@ impl Tenant {
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
        init_order: Option<InitializationOrder>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
@@ -1515,11 +1524,6 @@ impl Tenant {
            )));
        }

-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| CreateTimelineError::ShuttingDown)?;
-
        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
            debug!("timeline {new_timeline_id} already exists");

@@ -1804,7 +1808,6 @@ impl Tenant {
        freeze_and_flush: bool,
    ) -> Result<(), completion::Barrier> {
        span::debug_assert_current_span_has_tenant_id();
-
        // Set tenant (and its timlines) to Stoppping state.
        //
        // Since we can only transition into Stopping state after activation is complete,
@@ -1830,7 +1833,6 @@ impl Tenant {
            }
            Err(SetStoppingError::AlreadyStopping(other)) => {
                // give caller the option to wait for this this shutdown
-                info!("Tenant::shutdown: AlreadyStopping");
                return Err(other);
            }
        };
@@ -1844,7 +1846,6 @@ impl Tenant {
                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
            })
        };
-        tracing::info!("Waiting for timelines...");
        while let Some(res) = js.join_next().await {
            match res {
                Ok(()) => {}
@@ -1854,21 +1855,12 @@ impl Tenant {
            }
        }

-        // We cancel the Tenant's cancellation token _after_ the timelines have all shut down.  This permits
-        // them to continue to do work during their shutdown methods, e.g. flushing data.
-        tracing::debug!("Cancelling CancellationToken");
-        self.cancel.cancel();
-
        // shutdown all tenant and timeline tasks: gc, compaction, page service
        // No new tasks will be started for this tenant because it's in `Stopping` state.
        //
        // this will additionally shutdown and await all timeline tasks.
-        tracing::debug!("Waiting for tasks...");
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), None).await;

-        // Wait for any in-flight operations to complete
-        self.gate.close().await;
-
        Ok(())
    }

@@ -2029,7 +2021,7 @@ impl Tenant {
        self.state.subscribe()
    }

-    pub(crate) async fn wait_to_become_active(&self) -> Result<(), GetActiveTenantError> {
+    pub(crate) async fn wait_to_become_active(&self) -> Result<(), WaitToBecomeActiveError> {
        let mut receiver = self.state.subscribe();
        loop {
            let current_state = receiver.borrow_and_update().clone();
@@ -2037,9 +2029,11 @@ impl Tenant {
                TenantState::Loading | TenantState::Attaching | TenantState::Activating(_) => {
                    // in these states, there's a chance that we can reach ::Active
                    receiver.changed().await.map_err(
-                        |_e: tokio::sync::watch::error::RecvError|
-                            // Tenant existed but was dropped: report it as non-existent
-                            GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_id))
+                        |_e: tokio::sync::watch::error::RecvError| {
+                            WaitToBecomeActiveError::TenantDropped {
+                                tenant_id: self.tenant_id,
+                            }
+                        },
                    )?;
                }
                TenantState::Active { .. } => {
@@ -2047,7 +2041,10 @@ impl Tenant {
                }
                TenantState::Broken { .. } | TenantState::Stopping { .. } => {
                    // There's no chance the tenant can transition back into ::Active
-                    return Err(GetActiveTenantError::WillNotBecomeActive(current_state));
+                    return Err(WaitToBecomeActiveError::WillNotBecomeActive {
+                        tenant_id: self.tenant_id,
+                        state: current_state,
+                    });
                }
            }
        }
@@ -2113,9 +2110,6 @@ where
 }

 impl Tenant {
-    pub fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
        self.tenant_conf.read().unwrap().tenant_conf
    }
@@ -2273,7 +2267,6 @@ impl Tenant {
            initial_logical_size_can_start.cloned(),
            initial_logical_size_attempt.cloned().flatten(),
            state,
-            self.cancel.child_token(),
        );

        Ok(timeline)
@@ -2363,8 +2356,6 @@ impl Tenant {
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
-            cancel: CancellationToken::default(),
-            gate: Gate::new(format!("Tenant<{tenant_id}>")),
        }
    }

@@ -3701,7 +3692,7 @@ mod tests {
    use tokio_util::sync::CancellationToken;

    static TEST_KEY: Lazy<Key> =
-        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
+        Lazy::new(|| Key::from_slice(&hex!("112222222233333333444444445500000001")));

    #[tokio::test]
    async fn test_basic() -> anyhow::Result<()> {
@@ -3797,9 +3788,9 @@ mod tests {
        let writer = tline.writer().await;

        #[allow(non_snake_case)]
-        let TEST_KEY_A: Key = Key::from_hex("110000000033333333444444445500000001").unwrap();
+        let TEST_KEY_A: Key = Key::from_hex("112222222233333333444444445500000001").unwrap();
        #[allow(non_snake_case)]
-        let TEST_KEY_B: Key = Key::from_hex("110000000033333333444444445500000002").unwrap();
+        let TEST_KEY_B: Key = Key::from_hex("112222222233333333444444445500000002").unwrap();

        // Insert a value on the timeline
        writer
@@ -4245,7 +4236,11 @@ mod tests {
        metadata_bytes[8] ^= 1;
        std::fs::write(metadata_path, metadata_bytes)?;

-        let err = harness.try_load_local(&ctx).await.expect_err("should fail");
+        let err = harness
+            .try_load_local(&ctx)
+            .await
+            .err()
+            .expect("should fail");
        // get all the stack with all .context, not only the last one
        let message = format!("{err:#}");
        let expected = "failed to load metadata";
@@ -4379,7 +4374,7 @@ mod tests {

        let mut keyspace = KeySpaceAccum::new();

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
        let mut blknum = 0;
        for _ in 0..50 {
            for _ in 0..10000 {
@@ -4425,7 +4420,7 @@ mod tests {

        const NUM_KEYS: usize = 1000;

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();

        let mut keyspace = KeySpaceAccum::new();

@@ -4506,7 +4501,7 @@ mod tests {

        const NUM_KEYS: usize = 1000;

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();

        let mut keyspace = KeySpaceAccum::new();

@@ -4597,7 +4592,7 @@ mod tests {
        const NUM_KEYS: usize = 100;
        const NUM_TLINES: usize = 50;

-        let mut test_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let mut test_key = Key::from_hex("012222222233333333444444445500000000").unwrap();
        // Track page mutation lsns across different timelines.
        let mut updated = [[Lsn(0); NUM_KEYS]; NUM_TLINES];

--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -21,7 +21,7 @@ use crate::{
 };

 use super::{
-    mgr::{GetTenantError, TenantSlotError, TenantSlotUpsertError, TenantsMap},
+    mgr::{GetTenantError, TenantsMap},
    remote_timeline_client::{FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
    span,
    timeline::delete::DeleteTimelineFlow,
@@ -33,21 +33,12 @@ pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

-    #[error("Tenant not attached")]
-    NotAttached,
-
    #[error("Invalid state {0}. Expected Active or Broken")]
    InvalidState(TenantState),

    #[error("Tenant deletion is already in progress")]
    AlreadyInProgress,

-    #[error("Tenant map slot error {0}")]
-    SlotError(#[from] TenantSlotError),
-
-    #[error("Tenant map slot upsert error {0}")]
-    SlotUpsertError(#[from] TenantSlotUpsertError),
-
    #[error("Timeline {0}")]
    Timeline(#[from] DeleteTimelineError),

@@ -282,12 +273,12 @@ impl DeleteTenantFlow {
    pub(crate) async fn run(
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

-        let mut guard = Self::prepare(&tenant).await?;
+        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;

        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
            tenant.set_broken(format!("{e:#}")).await;
@@ -387,7 +378,7 @@ impl DeleteTenantFlow {
        guard: DeletionGuard,
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
@@ -414,8 +405,15 @@ impl DeleteTenantFlow {
    }

    async fn prepare(
-        tenant: &Arc<Tenant>,
-    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
+        tenants: &tokio::sync::RwLock<TenantsMap>,
+        tenant_id: TenantId,
+    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
+        let m = tenants.read().await;
+
+        let tenant = m
+            .get(&tenant_id)
+            .ok_or(GetTenantError::NotFound(tenant_id))?;
+
        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
        // so at least for now allow deletions only for active tenants. TODO recheck
        // Broken and Stopping is needed for retries.
@@ -449,14 +447,14 @@ impl DeleteTenantFlow {
            )));
        }

-        Ok(guard)
+        Ok((Arc::clone(tenant), guard))
    }

    fn schedule_background(
        guard: OwnedMutexGuard<Self>,
        conf: &'static PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
        let tenant_id = tenant.tenant_id;
@@ -489,7 +487,7 @@ impl DeleteTenantFlow {
        mut guard: OwnedMutexGuard<Self>,
        conf: &PageServerConf,
        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static std::sync::RwLock<TenantsMap>,
+        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        tenant: &Arc<Tenant>,
    ) -> Result<(), DeleteTenantError> {
        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
@@ -537,18 +535,10 @@ impl DeleteTenantFlow {
            .await
            .context("cleanup_remaining_fs_traces")?;

-        {
-            let mut locked = tenants.write().unwrap();
-            if locked.remove(&tenant.tenant_id).is_none() {
-                warn!("Tenant got removed from tenants map during deletion");
-            };
-
-            // FIXME: we should not be modifying this from outside of mgr.rs.
-            // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
-            crate::metrics::TENANT_MANAGER
-                .tenant_slots
-                .set(locked.len() as u64);
-        }
+        let mut locked = tenants.write().await;
+        if locked.remove(&tenant.tenant_id).is_none() {
+            warn!("Tenant got removed from tenants map during deletion");
+        };

        *guard = Self::Finished;

--- a/pageserver/src/tenant/disk_btree_test_data.rs
+++ b/pageserver/src/tenant/disk_btree_test_data.rs
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -406,12 +406,10 @@ async fn fill_logical_sizes(
                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Err(error)))) => {
-                if !matches!(error, CalculateLogicalSizeError::Cancelled) {
-                    warn!(
-                        timeline_id=%timeline.timeline_id,
-                        "failed to calculate logical size at {lsn}: {error:#}"
-                    );
-                }
+                warn!(
+                    timeline_id=%timeline.timeline_id,
+                    "failed to calculate logical size at {lsn}: {error:#}"
+                );
                have_any_error = true;
            }
            Ok(Ok(TimelineAtLsnSizeResult(timeline, lsn, Ok(size)))) => {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -345,19 +345,14 @@ impl InMemoryLayer {

        let cursor = inner.file.block_cursor();

-        // Sort the keys because delta layer writer expects them sorted.
-        //
-        // NOTE: this sort can take up significant time if the layer has millions of
-        //       keys. To speed up all the comparisons we convert the key to i128 and
-        //       keep the value as a reference.
-        let mut keys: Vec<_> = inner.index.iter().map(|(k, m)| (k.to_i128(), m)).collect();
-        keys.sort_unstable_by_key(|k| k.0);
+        let mut keys: Vec<(&Key, &VecMap<Lsn, u64>)> = inner.index.iter().collect();
+        keys.sort_by_key(|k| k.0);

        let ctx = RequestContextBuilder::extend(ctx)
            .page_content_kind(PageContentKind::InMemoryLayer)
            .build();
        for (key, vec_map) in keys.iter() {
-            let key = Key::from_i128(*key);
+            let key = **key;
            // Write all page versions
            for (lsn, pos) in vec_map.as_slice() {
                cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?;
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,7 +23,7 @@ use tokio::{
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::{id::TenantTimelineId, sync::gate::Gate};
+use utils::id::TenantTimelineId;

 use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
@@ -310,13 +310,6 @@ pub struct Timeline {
    /// Load or creation time information about the disk_consistent_lsn and when the loading
    /// happened. Used for consumption metrics.
    pub(crate) loaded_at: (Lsn, SystemTime),
-
-    /// Gate to prevent shutdown completing while I/O is still happening to this timeline's data
-    pub(crate) gate: Gate,
-
-    /// Cancellation token scoped to this timeline: anything doing long-running work relating
-    /// to the timeline should drop out when this token fires.
-    pub(crate) cancel: CancellationToken,
 }

 pub struct WalReceiverInfo {
@@ -793,11 +786,7 @@ impl Timeline {
                // as an empty timeline. Also in unit tests, when we use the timeline
                // as a simple key-value store, ignoring the datadir layout. Log the
                // error but continue.
-                //
-                // Suppress error when it's due to cancellation
-                if !self.cancel.is_cancelled() {
-                    error!("could not compact, repartitioning keyspace failed: {err:?}");
-                }
+                error!("could not compact, repartitioning keyspace failed: {err:?}");
            }
        };

@@ -895,12 +884,7 @@ impl Timeline {
    pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Signal any subscribers to our cancellation token to drop out
-        tracing::debug!("Cancelling CancellationToken");
-        self.cancel.cancel();
-
        // prevent writes to the InMemoryLayer
-        tracing::debug!("Waiting for WalReceiverManager...");
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
            Some(self.tenant_id),
@@ -936,16 +920,6 @@ impl Timeline {
                warn!("failed to await for frozen and flushed uploads: {e:#}");
            }
        }
-
-        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
-        // while doing so.
-        self.last_record_lsn.shutdown();
-
-        tracing::debug!("Waiting for tasks...");
-        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;
-
-        // Finally wait until any gate-holders are complete
-        self.gate.close().await;
    }

    pub fn set_state(&self, new_state: TimelineState) {
@@ -1074,11 +1048,6 @@ impl Timeline {
    /// Like [`evict_layer_batch`](Self::evict_layer_batch), but for just one layer.
    /// Additional case `Ok(None)` covers the case where the layer could not be found by its `layer_file_name`.
    pub async fn evict_layer(&self, layer_file_name: &str) -> anyhow::Result<Option<bool>> {
-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| anyhow::anyhow!("Shutting down"))?;
-
        let Some(local_layer) = self.find_layer(layer_file_name).await else {
            return Ok(None);
        };
@@ -1094,8 +1063,9 @@ impl Timeline {
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;

+        let cancel = CancellationToken::new();
        let results = self
-            .evict_layer_batch(remote_client, &[local_layer])
+            .evict_layer_batch(remote_client, &[local_layer], &cancel)
            .await?;
        assert_eq!(results.len(), 1);
        let result: Option<Result<(), EvictionError>> = results.into_iter().next().unwrap();
@@ -1110,18 +1080,15 @@ impl Timeline {
    pub(crate) async fn evict_layers(
        &self,
        layers_to_evict: &[Layer],
+        cancel: &CancellationToken,
    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
-        let _gate = self
-            .gate
-            .enter()
-            .map_err(|_| anyhow::anyhow!("Shutting down"))?;
-
        let remote_client = self
            .remote_client
            .as_ref()
            .context("timeline must have RemoteTimelineClient")?;

-        self.evict_layer_batch(remote_client, layers_to_evict).await
+        self.evict_layer_batch(remote_client, layers_to_evict, cancel)
+            .await
    }

    /// Evict multiple layers at once, continuing through errors.
@@ -1142,6 +1109,7 @@ impl Timeline {
        &self,
        remote_client: &Arc<RemoteTimelineClient>,
        layers_to_evict: &[Layer],
+        cancel: &CancellationToken,
    ) -> anyhow::Result<Vec<Option<Result<(), EvictionError>>>> {
        // ensure that the layers have finished uploading
        // (don't hold the layer_removal_cs while we do it, we're not removing anything yet)
@@ -1189,7 +1157,7 @@ impl Timeline {
        };

        tokio::select! {
-            _ = self.cancel.cancelled() => {},
+            _ = cancel.cancelled() => {},
            _ = join => {}
        }

@@ -1299,7 +1267,6 @@ impl Timeline {
        initial_logical_size_can_start: Option<completion::Barrier>,
        initial_logical_size_attempt: Option<completion::Completion>,
        state: TimelineState,
-        cancel: CancellationToken,
    ) -> Arc<Self> {
        let disk_consistent_lsn = metadata.disk_consistent_lsn();
        let (state, _) = watch::channel(state);
@@ -1400,8 +1367,6 @@ impl Timeline {

                initial_logical_size_can_start,
                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
-                cancel,
-                gate: Gate::new(format!("Timeline<{tenant_id}/{timeline_id}>")),
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -2065,10 +2030,6 @@ impl Timeline {
        let mut cont_lsn = Lsn(request_lsn.0 + 1);

        'outer: loop {
-            if self.cancel.is_cancelled() {
-                return Err(PageReconstructError::Cancelled);
-            }
-
            // The function should have updated 'state'
            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
            match result {
@@ -4405,10 +4366,25 @@ mod tests {
            .expect("should had been resident")
            .drop_eviction_guard();

+        let cancel = tokio_util::sync::CancellationToken::new();
        let batch = [layer];

-        let first = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
-        let second = async { timeline.evict_layer_batch(&rc, &batch).await.unwrap() };
+        let first = {
+            let cancel = cancel.child_token();
+            async {
+                let cancel = cancel;
+                timeline
+                    .evict_layer_batch(&rc, &batch, &cancel)
+                    .await
+                    .unwrap()
+            }
+        };
+        let second = async {
+            timeline
+                .evict_layer_batch(&rc, &batch, &cancel)
+                .await
+                .unwrap()
+        };

        let (first, second) = tokio::join!(first, second);

--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -17,7 +17,6 @@ use crate::{
    deletion_queue::DeletionQueueClient,
    task_mgr::{self, TaskKind},
    tenant::{
-        debug_assert_current_span_has_tenant_and_timeline_id,
        metadata::TimelineMetadata,
        remote_timeline_client::{
            self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
@@ -31,11 +30,6 @@ use super::{Timeline, TimelineResources};

 /// Now that the Timeline is in Stopping state, request all the related tasks to shut down.
 async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-    // Notify any timeline work to drop out of loops/requests
-    tracing::debug!("Cancelling CancellationToken");
-    timeline.cancel.cancel();
-
    // Stop the walreceiver first.
    debug!("waiting for wal receiver to shutdown");
    let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() };
@@ -80,11 +74,6 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
            "failpoint: timeline-delete-before-index-deleted-at"
        ))?
    });
-
-    tracing::debug!("Waiting for gate...");
-    timeline.gate.close().await;
-    tracing::debug!("Shutdown complete");
-
    Ok(())
 }

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -277,7 +277,10 @@ impl Timeline {
            Some(c) => c,
        };

-        let results = match self.evict_layer_batch(remote_client, &candidates).await {
+        let results = match self
+            .evict_layer_batch(remote_client, &candidates, cancel)
+            .await
+        {
            Err(pre_err) => {
                stats.errors += candidates.len();
                error!("could not do any evictions: {pre_err:#}");
@@ -341,7 +344,20 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
+        //
+        // It is critical that we are responsive to cancellation here. Otherwise, we deadlock with
+        // tenant deletion (which holds TENANTS in read mode) and any other task that attempts to
+        // acquire TENANTS in write mode before we call get_tenant here.
+        // See https://github.com/neondatabase/neon/issues/5284.
+        let res = tokio::select! {
+            _ = cancel.cancelled() => {
+                return ControlFlow::Break(());
+            }
+            res = crate::tenant::mgr::get_tenant(self.tenant_id, true) => {
+                res
+            }
+        };
+        let tenant = match res {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -426,7 +426,7 @@ impl ConnectionManagerState {
                    timeline,
                    new_sk.wal_source_connconf,
                    events_sender,
-                    cancellation.clone(),
+                    cancellation,
                    connect_timeout,
                    ctx,
                    node_id,
@@ -447,14 +447,7 @@ impl ConnectionManagerState {
                            }
                            WalReceiverError::Other(e) => {
                                // give out an error to have task_mgr give it a really verbose logging
-                                if cancellation.is_cancelled() {
-                                    // Ideally we would learn about this via some path other than Other, but
-                                    // that requires refactoring all the intermediate layers of ingest code
-                                    // that only emit anyhow::Error
-                                    Ok(())
-                                } else {
-                                    Err(e).context("walreceiver connection handling failure")
-                                }
+                                Err(e).context("walreceiver connection handling failure")
                            }
                        }
                    }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -596,21 +596,21 @@ trait CloseFileDescriptors: CommandExt {

 impl<C: CommandExt> CloseFileDescriptors for C {
    fn close_fds(&mut self) -> &mut Command {
-        // SAFETY: Code executed inside pre_exec should have async-signal-safety,
-        // which means it should be safe to execute inside a signal handler.
-        // The precise meaning depends on platform. See `man signal-safety`
-        // for the linux definition.
-        //
-        // The set_fds_cloexec_threadsafe function is documented to be
-        // async-signal-safe.
-        //
-        // Aside from this function, the rest of the code is re-entrant and
-        // doesn't make any syscalls. We're just passing constants.
-        //
-        // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
-        // which is not async-signal-safe. Be careful.
        unsafe {
            self.pre_exec(move || {
+                // SAFETY: Code executed inside pre_exec should have async-signal-safety,
+                // which means it should be safe to execute inside a signal handler.
+                // The precise meaning depends on platform. See `man signal-safety`
+                // for the linux definition.
+                //
+                // The set_fds_cloexec_threadsafe function is documented to be
+                // async-signal-safe.
+                //
+                // Aside from this function, the rest of the code is re-entrant and
+                // doesn't make any syscalls. We're just passing constants.
+                //
+                // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
+                // which is not async-signal-safe. Be careful.
                close_fds::set_fds_cloexec_threadsafe(3, &[]);
                Ok(())
            })
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,10 +19,7 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
-#include "storage/lwlock.h"
-#include "storage/ipc.h"
 #include "c.h"
-#include "postmaster/interrupt.h"

 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -64,63 +61,23 @@ int			flush_every_n_requests = 8;
 int			n_reconnect_attempts = 0;
 int			max_reconnect_attempts = 60;

-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
-
-typedef struct
-{
-    LWLockId lock;
-    pg_atomic_uint64 update_counter;
-    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
-} PagestoreShmemState;
-
-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook = NULL;
-static void walproposer_shmem_request(void);
-#endif
-static shmem_startup_hook_type prev_shmem_startup_hook;
-static PagestoreShmemState *pagestore_shared;
-static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
-
 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);

-static bool
-CheckPageserverConnstring(char **newval, void **extra, GucSource source)
-{
-    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
-}
+
+static pqsigfunc	 prev_signal_handler;

 static void
-AssignPageserverConnstring(const char *newval, void *extra)
+pageserver_sighup_handler(SIGNAL_ARGS)
 {
-    if(!pagestore_shared)
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-    LWLockRelease(pagestore_shared->lock);
-}
-
-static bool
-CheckConnstringUpdated()
-{
-    if(!pagestore_shared)
-        return false;
-    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
-}
-
-static void
-ReloadConnstring()
-{
-    if(!pagestore_shared)
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-    LWLockRelease(pagestore_shared->lock);
+	if (prev_signal_handler)
+	{
+        	prev_signal_handler(postgres_signal_arg);
+	}
+	neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
+	pageserver_disconnect();
 }

 static bool
@@ -134,11 +91,6 @@ pageserver_connect(int elevel)

 	Assert(!connected);

-        if(CheckConnstringUpdated())
-        {
-            ReloadConnstring();
-        }
-
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -158,7 +110,7 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = page_server_connstring;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -302,12 +254,6 @@ pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;

-        if(CheckConnstringUpdated())
-        {
-            pageserver_disconnect();
-            ReloadConnstring();
-        }
-
 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
@@ -328,7 +274,6 @@ pageserver_send(NeonRequest * request)
 	{
 		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
-			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
 			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
@@ -446,8 +391,7 @@ pageserver_flush(void)
 	return true;
 }

-page_server_api api =
-{
+page_server_api api = {
 	.send = pageserver_send,
 	.flush = pageserver_flush,
 	.receive = pageserver_receive
@@ -461,72 +405,12 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }

-static Size
-PagestoreShmemSize(void)
-{
-    return sizeof(PagestoreShmemState);
-}
-
-static bool
-PagestoreShmemInit(void)
-{
-    bool found;
-    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
-    pagestore_shared = ShmemInitStruct("libpagestore shared state",
-                                       PagestoreShmemSize(),
-                                       &found);
-    if(!found)
-    {
-        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
-        AssignPageserverConnstring(page_server_connstring, NULL);
-    }
-    LWLockRelease(AddinShmemInitLock);
-    return found;
-}
-
-static void
-pagestore_shmem_startup_hook(void)
-{
-    if(prev_shmem_startup_hook)
-        prev_shmem_startup_hook();
-
-    PagestoreShmemInit();
-}
-
-static void
-pagestore_shmem_request(void)
-{
-#if PG_VERSION_NUM >= 150000
-    if(prev_shmem_request_hook)
-        prev_shmem_request_hook();
-#endif
-
-    RequestAddinShmemSpace(PagestoreShmemSize());
-    RequestNamedLWLockTranche("neon_libpagestore", 1);
-}
-
-static void
-pagestore_prepare_shmem(void)
-{
-#if PG_VERSION_NUM >= 150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = pagestore_shmem_request;
-#else
-        pagestore_shmem_request();
-#endif
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = pagestore_shmem_startup_hook;
-}
-
 /*
 * Module initialization function
 */
 void
 pg_init_libpagestore(void)
 {
-        pagestore_prepare_shmem();
-
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
@@ -534,7 +418,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
-							   CheckPageserverConnstring, AssignPageserverConnstring, NULL);
+							   NULL, NULL, NULL);

 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
@@ -615,5 +499,7 @@ pg_init_libpagestore(void)
 		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}

+        prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
+
 	lfc_init();
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,8 +1,6 @@
 //! User credentials used in authentication.

-use crate::{
-    auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
-};
+use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use std::collections::HashSet;
@@ -40,8 +38,6 @@ pub struct ClientCredentials<'a> {
    pub user: &'a str,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<String>,
-
-    pub cache_key: String,
 }

 impl ClientCredentials<'_> {
@@ -57,7 +53,6 @@ impl<'a> ClientCredentials<'a> {
        ClientCredentials {
            user: "",
            project: None,
-            cache_key: "".to_string(),
        }
    }

@@ -125,17 +120,7 @@ impl<'a> ClientCredentials<'a> {

        info!(user, project = project.as_deref(), "credentials");

-        let cache_key = format!(
-            "{}{}",
-            project.as_deref().unwrap_or(""),
-            neon_options(params).unwrap_or("".to_string())
-        );
-
-        Ok(Self {
-            user,
-            project,
-            cache_key,
-        })
+        Ok(Self { user, project })
    }
 }

@@ -191,7 +176,6 @@ mod tests {
        let creds = ClientCredentials::parse(&options, sni, common_names)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));
-        assert_eq!(creds.cache_key, "foo");

        Ok(())
    }
@@ -319,23 +303,4 @@ mod tests {
            _ => panic!("bad error: {err:?}"),
        }
    }
-
-    #[test]
-    fn parse_neon_options() -> anyhow::Result<()> {
-        let options = StartupMessageParams::new([
-            ("user", "john_doe"),
-            ("options", "neon_lsn:0/2 neon_endpoint_type:read_write"),
-        ]);
-
-        let sni = Some("project.localhost");
-        let common_names = Some(["localhost".into()].into());
-        let creds = ClientCredentials::parse(&options, sni, common_names)?;
-        assert_eq!(creds.project.as_deref(), Some("project"));
-        assert_eq!(
-            creds.cache_key,
-            "projectneon_endpoint_type:read_write neon_lsn:0/2"
-        );
-
-        Ok(())
-    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -3,7 +3,6 @@ use crate::{
    cancellation::CancelClosure,
    console::errors::WakeComputeError,
    error::{io_error, UserFacingError},
-    proxy::is_neon_param,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -279,7 +278,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
-        .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
+        .filter(|opt| parse_endpoint_param(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

@@ -314,11 +313,5 @@ mod tests {

        let params = StartupMessageParams::new([("options", "project = foo")]);
        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
-
-        let params = StartupMessageParams::new([(
-            "options",
-            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
-        )]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
    }
 }
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -178,7 +178,6 @@ pub struct ConsoleReqExtra<'a> {
    pub session_id: uuid::Uuid,
    /// Name of client application, if set.
    pub application_name: Option<&'a str>,
-    pub options: Option<&'a str>,
 }

 /// Auth secret which is managed by the cloud.
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -99,7 +99,6 @@ impl Api {
                .query(&[
                    ("application_name", extra.application_name),
                    ("project", Some(project)),
-                    ("options", extra.options),
                ])
                .build()?;

@@ -152,7 +151,7 @@ impl super::Api for Api {
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key: &str = &creds.cache_key;
+        let key = creds.project().expect("impossible");

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
-
 use std::convert::Infallible;

 use anyhow::{bail, Context};
--- a/proxy/src/parse.rs
+++ b/proxy/src/parse.rs
@@ -3,9 +3,10 @@
 use std::ffi::CStr;

 pub fn split_cstr(bytes: &[u8]) -> Option<(&CStr, &[u8])> {
-    let cstr = CStr::from_bytes_until_nul(bytes).ok()?;
-    let (_, other) = bytes.split_at(cstr.to_bytes_with_nul().len());
-    Some((cstr, other))
+    let pos = bytes.iter().position(|&x| x == 0)?;
+    let (cstr, other) = bytes.split_at(pos + 1);
+    // SAFETY: `pos` is the first NUL, so `cstr` ends with a NUL and has no interior NULs
+    Some((unsafe { CStr::from_bytes_with_nul_unchecked(cstr) }, other))
 }

 /// See <https://doc.rust-lang.org/std/primitive.slice.html#method.split_array_ref>.
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -15,12 +15,10 @@ use crate::{
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use itertools::Itertools;
 use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use prometheus::{register_histogram_vec, HistogramVec};
-use regex::Regex;
 use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
 use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
@@ -883,12 +881,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            allow_self_signed_compute,
        } = self;

-        let console_options = neon_options(params);
-
        let extra = console::ConsoleReqExtra {
            session_id, // aka this connection's id
            application_name: params.get("application_name"),
-            options: console_options.as_deref(),
        };

        let mut latency_timer = LatencyTimer::new(mode.protocol_label());
@@ -950,27 +945,3 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        proxy_pass(stream, node.stream, &aux).await
    }
 }
-
-pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
-    #[allow(unstable_name_collisions)]
-    let options: String = params
-        .options_raw()?
-        .filter(|opt| is_neon_param(opt))
-        .sorted() // we sort it to use as cache key
-        .intersperse(" ") // TODO: use impl from std once it's stabilized
-        .collect();
-
-    // Don't even bother with empty options.
-    if options.is_empty() {
-        return None;
-    }
-
-    Some(options)
-}
-
-pub fn is_neon_param(bytes: &str) -> bool {
-    static RE: OnceCell<Regex> = OnceCell::new();
-    RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
-
-    RE.get().unwrap().is_match(bytes)
-}
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -440,7 +440,6 @@ fn helper_create_connect_info(
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some("TEST"),
-        options: None,
    };
    let creds = auth::BackendType::Test(mechanism);
    (cache, extra, creds)
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -22,10 +22,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
    auth, console,
-    proxy::{
-        neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
-        NUM_DB_CONNECTIONS_OPENED_COUNTER,
-    },
+    proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};
@@ -44,7 +41,6 @@ pub struct ConnInfo {
    pub dbname: String,
    pub hostname: String,
    pub password: String,
-    pub options: Option<String>,
 }

 impl ConnInfo {
@@ -405,25 +401,26 @@ async fn connect_to_compute(
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

-    let params = StartupMessageParams::new([
+    let credential_params = StartupMessageParams::new([
        ("user", &conn_info.username),
        ("database", &conn_info.dbname),
        ("application_name", APP_NAME),
-        ("options", conn_info.options.as_deref().unwrap_or("")),
    ]);

    let creds = config
        .auth_backend
        .as_ref()
-        .map(|_| auth::ClientCredentials::parse(&params, Some(&conn_info.hostname), common_names))
+        .map(|_| {
+            auth::ClientCredentials::parse(
+                &credential_params,
+                Some(&conn_info.hostname),
+                common_names,
+            )
+        })
        .transpose()?;
-
-    let console_options = neon_options(&params);
-
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
-        options: console_options.as_deref(),
    };

    let node_info = creds
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -174,23 +174,11 @@ fn get_conn_info(
        }
    }

-    let pairs = connection_url.query_pairs();
-
-    let mut options = Option::None;
-
-    for (key, value) in pairs {
-        if key == "options" {
-            options = Some(value.to_string());
-            break;
-        }
-    }
-
    Ok(ConnInfo {
        username: username.to_owned(),
        dbname: dbname.to_owned(),
        hostname: hostname.to_owned(),
        password: password.to_owned(),
-        options,
    })
 }

--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
-use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
+use utils::auth::{JwtAuth, Scope};
 use utils::{
    id::NodeId,
    logging::{self, LogFormat},
@@ -251,9 +251,10 @@ async fn main() -> anyhow::Result<()> {
            None
        }
        Some(path) => {
-            info!("loading http auth JWT key(s) from {path}");
-            let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?;
-            Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
+            info!("loading http auth JWT key from {path}");
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
        }
    };

--- a/safekeeper/src/http/openapi_spec.yaml
+++ b/safekeeper/src/http/openapi_spec.yaml
@@ -86,41 +86,6 @@ paths:
        default:
          $ref: "#/components/responses/GenericError"

-  /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-      - name: source_timeline_id
-        in: path
-        required: true
-        schema:
-          type: string
-          format: hex
-
-    post:
-      tags:
-      - "Timeline"
-      summary: Register new timeline as copy of existing timeline
-      description: ""
-      operationId: v1CopyTenantTimeline
-      requestBody:
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/TimelineCopyRequest"
-      responses:
-        "201":
-          description: Timeline created
-          # TODO: return timeline info?
-        "403":
-          $ref: "#/components/responses/ForbiddenError"
-        default:
-          $ref: "#/components/responses/GenericError"
-

  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
@@ -245,18 +210,6 @@ components:
            type: integer
            minimum: 0

-    TimelineCopyRequest:
-      type: object
-      required:
-        - target_timeline_id
-        - until_lsn
-      properties:
-        target_timeline_id:
-          type: string
-          format: hex
-        until_lsn:
-          type: string
-
    SkTimelineInfo:
      type: object
      required:
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 use utils::{
-    auth::SwappableJwtAuth,
+    auth::JwtAuth,
    http::{
        endpoint::{self, auth_middleware, check_permission_with},
        error::ApiError,
@@ -428,11 +428,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            if ALLOWLIST_ROUTES.contains(request.uri()) {
                None
            } else {
-                // Option<Arc<SwappableJwtAuth>> is always provided as data below, hence unwrap().
-                request
-                    .data::<Option<Arc<SwappableJwtAuth>>>()
-                    .unwrap()
-                    .as_deref()
+                // Option<Arc<JwtAuth>> is always provided as data below, hence unwrap().
+                request.data::<Option<Arc<JwtAuth>>>().unwrap().as_deref()
            }
        }))
    }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -1,4 +1,3 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
 use camino::Utf8PathBuf;
 use once_cell::sync::Lazy;
 use remote_storage::RemoteStorageConfig;
@@ -7,10 +6,7 @@ use tokio::runtime::Runtime;
 use std::time::Duration;
 use storage_broker::Uri;

-use utils::{
-    auth::SwappableJwtAuth,
-    id::{NodeId, TenantId, TenantTimelineId},
-};
+use utils::id::{NodeId, TenantId, TenantTimelineId};

 mod auth;
 pub mod broker;
@@ -73,7 +69,7 @@ pub struct SafeKeeperConf {
    pub wal_backup_enabled: bool,
    pub pg_auth: Option<Arc<JwtAuth>>,
    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
-    pub http_auth: Option<Arc<SwappableJwtAuth>>,
+    pub http_auth: Option<Arc<JwtAuth>>,
    pub current_thread_runtime: bool,
 }

--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -111,7 +111,7 @@ impl WalReceivers {
            .count()
    }

-    /// Unregister walreceiver.
+    /// Unregister walreceiver.
    fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
@@ -138,8 +138,8 @@ pub enum WalReceiverStatus {
    Streaming,
 }

-/// Scope guard to access slot in WalReceivers registry and unregister from
-/// it in Drop.
+/// Scope guard to access slot in WalReceivers registry and unregister from it
+/// in Drop.
 pub struct WalReceiverGuard {
    id: WalReceiverId,
    walreceivers: Arc<WalReceivers>,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -361,6 +361,7 @@ class PgProtocol:

@dataclass
 class AuthKeys:
+    pub: str
    priv: str

    def generate_token(self, *, scope: str, **token_data: str) -> str:
@@ -625,8 +626,6 @@ class NeonEnvBuilder:
                sk.stop(immediate=True)

            for pageserver in self.env.pageservers:
-                pageserver.assert_no_metric_errors()
-
                pageserver.stop(immediate=True)

            if self.env.attachment_service is not None:
@@ -876,31 +875,9 @@ class NeonEnv:

    @cached_property
    def auth_keys(self) -> AuthKeys:
+        pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text()
        priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
-        return AuthKeys(priv=priv)
-
-    def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path):
-        # compare generate_auth_keys() in local_env.rs
-        subprocess.run(
-            ["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path],
-            cwd=self.repo_dir,
-            check=True,
-        )
-
-        subprocess.run(
-            [
-                "openssl",
-                "pkey",
-                "-in",
-                privkey_path,
-                "-pubout",
-                "-out",
-                pubkey_path,
-            ],
-            cwd=self.repo_dir,
-            check=True,
-        )
-        del self.auth_keys
+        return AuthKeys(pub=pub, priv=priv)

    def generate_endpoint_id(self) -> str:
        """
@@ -1807,21 +1784,6 @@ class NeonPageserver(PgProtocol):

        assert not errors

-    def assert_no_metric_errors(self):
-        """
-        Certain metrics should _always_ be zero: they track conditions that indicate a bug.
-        """
-        if not self.running:
-            log.info(f"Skipping metrics check on pageserver {self.id}, it is not running")
-            return
-
-        for metric in [
-            "pageserver_tenant_manager_unexpected_errors_total",
-            "pageserver_deletion_queue_unexpected_errors_total",
-        ]:
-            value = self.http_client().get_metric_value(metric)
-            assert value == 0, f"Nonzero {metric} == {value}"
-
    def log_contains(self, pattern: str) -> Optional[str]:
        """Check that the pageserver log contains a line that matches the given regex"""
        logfile = open(os.path.join(self.workdir, "pageserver.log"), "r")
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -189,10 +189,6 @@ class PageserverHttpClient(requests.Session):
        assert res_json is None
        return res_json

-    def reload_auth_validation_keys(self):
-        res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
-        self.verbose_error(res)
-
    def tenant_list(self) -> List[Dict[Any, Any]]:
        res = self.get(f"http://localhost:{self.port}/v1/tenant")
        self.verbose_error(res)
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -1,35 +1,12 @@
-import os
 from contextlib import closing
-from pathlib import Path

 import psycopg2
 import pytest
-from fixtures.neon_fixtures import (
-    NeonEnv,
-    NeonEnvBuilder,
-    PgProtocol,
-)
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.types import TenantId, TimelineId


-def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
-    http_client.timeline_create(
-        pg_version=env.pg_version,
-        tenant_id=env.initial_tenant,
-        new_timeline_id=TimelineId.generate(),
-        ancestor_timeline_id=env.initial_timeline,
-    )
-
-
-def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient):
-    with pytest.raises(
-        PageserverApiException,
-        match="Unauthorized: malformed jwt token",
-    ):
-        assert_client_authorized(env, http_client)
-
-
 def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()
@@ -50,16 +27,30 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    ps.safe_psql("set FOO", password=pageserver_token)

    # tenant can create branches
-    assert_client_authorized(env, tenant_http_client)
-
+    tenant_http_client.timeline_create(
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
+    )
    # console can create branches for tenant
-    assert_client_authorized(env, pageserver_http_client)
+    pageserver_http_client.timeline_create(
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
+    )

    # fail to create branch using token with different tenant_id
    with pytest.raises(
        PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
    ):
-        assert_client_authorized(env, invalid_tenant_http_client)
+        invalid_tenant_http_client.timeline_create(
+            pg_version=env.pg_version,
+            tenant_id=env.initial_tenant,
+            new_timeline_id=TimelineId.generate(),
+            ancestor_timeline_id=env.initial_timeline,
+        )

    # create tenant using management token
    pageserver_http_client.tenant_create(TenantId.generate())
@@ -91,94 +82,6 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
            assert cur.fetchone() == (5000050000,)


-def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.auth_enabled = True
-    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
-
-    pageserver_token_old = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
-
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # This test is to ensure that the pageserver supports multiple keys.
-    # The neon_local tool generates one key pair at a hardcoded path by default.
-    # As a preparation for our test, move the public key of the key pair into a
-    # directory at the same location as the hardcoded path by:
-    # 1. moving the the file at `configured_pub_key_path` to a temporary location
-    # 2. creating a new directory at `configured_pub_key_path`
-    # 3. moving the file from the temporary location into the newly created directory
-    configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem"
-    os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file")
-    os.mkdir(configured_pub_key_path)
-    os.rename(
-        Path(env.repo_dir) / "auth_public_key.pem.file",
-        configured_pub_key_path / "auth_public_key_old.pem",
-    )
-
-    # Add a new key pair
-    # This invalidates env.auth_keys and makes them be regenerated
-    env.regenerate_keys_at(
-        Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem")
-    )
-
-    # Reload the keys on the pageserver side
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # We can continue doing things using the old token
-    assert_client_authorized(env, pageserver_http_client_old)
-
-    pageserver_token_new = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
-
-    # The new token also works
-    assert_client_authorized(env, pageserver_http_client_new)
-
-    # Remove the old token and reload
-    os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem")
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # Reloading fails now with the old token, but the new token still works
-    assert_client_not_authorized(env, pageserver_http_client_old)
-    assert_client_authorized(env, pageserver_http_client_new)
-
-
-def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.auth_enabled = True
-    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
-
-    pageserver_token_old = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
-
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # Regenerate the keys
-    env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem"))
-
-    # Reload the keys on the pageserver side
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # Next attempt fails as we use the old auth token
-    with pytest.raises(
-        PageserverApiException,
-        match="Unauthorized: malformed jwt token",
-    ):
-        pageserver_http_client_old.reload_auth_validation_keys()
-
-    # same goes for attempts trying to create a timeline
-    assert_client_not_authorized(env, pageserver_http_client_old)
-
-    pageserver_token_new = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
-
-    # timeline creation works with the new token
-    assert_client_authorized(env, pageserver_http_client_new)
-
-    # reloading also works with the new token
-    pageserver_http_client_new.reload_auth_validation_keys()
-
-
@pytest.mark.parametrize("auth_enabled", [False, True])
 def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -1,13 +1,9 @@
-import asyncio
-
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.remote_storage import RemoteStorageKind


 def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
-    num_connections = 3
-
    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
@@ -20,24 +16,15 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    alt_pageserver_id = env.pageservers[1].id
    env.pageservers[1].tenant_attach(env.initial_tenant)

-    pg_conns = [endpoint.connect() for i in range(num_connections)]
-    curs = [pg_conn.cursor() for pg_conn in pg_conns]
-
-    def execute(statement: str):
-        for cur in curs:
-            cur.execute(statement)
-
-    def fetchone():
-        results = [cur.fetchone() for cur in curs]
-        assert all(result == results[0] for result in results)
-        return results[0]
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
-    curs[0].execute("CREATE TABLE foo (t text)")
-    curs[0].execute(
+    cur.execute("CREATE TABLE foo (t text)")
+    cur.execute(
        """
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
@@ -46,25 +33,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    )

    # Verify that the table is larger than shared_buffers
-    curs[0].execute(
+    cur.execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
-    row = curs[0].fetchone()
+    row = cur.fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])

-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
+    cur.execute("SELECT count(*) FROM foo")
+    assert cur.fetchone() == (100000,)

    endpoint.reconfigure(pageserver_id=alt_pageserver_id)

    # Verify that the neon.pageserver_connstring GUC is set to the correct thing
-    execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
-    connstring = fetchone()
+    cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
+    connstring = cur.fetchone()
    assert connstring is not None
    expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
    assert expected_connstring == expected_connstring
@@ -73,45 +60,5 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
        0
    ].stop()  # Stop the old pageserver just to make sure we're reading from the new one

-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
-
-    # Try failing back, and this time we will stop the current pageserver before reconfiguring
-    # the endpoint.  Whereas the previous reconfiguration was like a healthy migration, this
-    # is more like what happens in an unexpected  pageserver failure.
-    env.pageservers[0].start()
-    env.pageservers[1].stop()
-
-    endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
-
-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
-
-    env.pageservers[0].stop()
-    env.pageservers[1].start()
-
-    # Test a (former) bug where a child process spins without updating its connection string
-    # by executing a query separately. This query will hang until we issue the reconfigure.
-    async def reconfigure_async():
-        await asyncio.sleep(
-            1
-        )  # Sleep for 1 second just to make sure we actually started our count(*) query
-        endpoint.reconfigure(pageserver_id=env.pageservers[1].id)
-
-    def execute_count():
-        execute("SELECT count(*) FROM FOO")
-
-    async def execute_and_reconfigure():
-        task_exec = asyncio.to_thread(execute_count)
-        task_reconfig = asyncio.create_task(reconfigure_async())
-        await asyncio.gather(
-            task_exec,
-            task_reconfig,
-        )
-
-    asyncio.run(execute_and_reconfigure())
-    assert fetchone() == (100000,)
-
-    # One final check that nothing hangs
-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
+    cur.execute("SELECT count(*) FROM foo")
+    assert cur.fetchone() == (100000,)
--- a/test_runner/regress/test_neon_cli.py
+++ b/test_runner/regress/test_neon_cli.py
@@ -134,9 +134,6 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.pageserver_stop(env.pageserver.id)
    env.neon_cli.safekeeper_stop()

-    # Keep NeonEnv state up to date, it usually owns starting/stopping services
-    env.pageserver.running = False
-
    # Default start
    res = env.neon_cli.raw_cli(["start"])
    res.check_returncode()
@@ -158,10 +155,6 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder):
    env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID)
    env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 1)

-    # Keep NeonEnv state up to date, it usually owns starting/stopping services
-    env.pageservers[0].running = False
-    env.pageservers[1].running = False
-
    # Addressing a nonexistent ID throws
    with pytest.raises(RuntimeError):
        env.neon_cli.pageserver_stop(env.BASE_PAGESERVER_ID + 100)
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -366,17 +366,11 @@ def test_deletion_queue_recovery(
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

    if validate_before == ValidateBefore.VALIDATE:
-        # At this point, one or more DeletionLists have been written.  We have set a failpoint
-        # to prevent them successfully executing, but we want to see them get validated.
-        #
-        # We await _some_ validations instead of _all_ validations, because our execution failpoint
-        # will prevent validation proceeding for any but the first DeletionList.  Usually the workload
-        # just generates one, but if it generates two due to timing, then we must not expect that the
-        # second one will be validated.
-        def assert_some_validations():
-            assert get_deletion_queue_validated(ps_http) > 0

-        wait_until(20, 1, assert_some_validations)
+        def assert_validation_complete():
+            assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
+
+        wait_until(20, 1, assert_validation_complete)

        # The validatated keys statistic advances before the header is written, so we
        # also wait to see the header hit the disk: this seems paranoid but the race
@@ -386,11 +380,6 @@ def test_deletion_queue_recovery(

        wait_until(20, 1, assert_header_written)

-        # If we will lose attachment, then our expectation on restart is that only the ones
-        # we already validated will execute.  Act like only those were present in the queue.
-        if keep_attachment == KeepAttachment.LOSE:
-            before_restart_depth = get_deletion_queue_validated(ps_http)
-
    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
    env.pageserver.stop(immediate=True)

@@ -413,13 +402,11 @@ def test_deletion_queue_recovery(
    ps_http.deletion_queue_flush(execute=True)
    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))

-    if keep_attachment == KeepAttachment.KEEP:
+    if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
        # - If we kept the attachment, then our pre-restart deletions should execute
        #   because on re-attach they were from the immediately preceding generation
-        assert get_deletion_queue_executed(ps_http) == before_restart_depth
-    elif validate_before == ValidateBefore.VALIDATE:
-        # - If we validated before restart, then we should execute however many keys were
-        #   validated before restart.
+        # - If we validated before restart, then the deletions should execute because the
+        #   deletion queue header records a validated deletion list sequence number.
        assert get_deletion_queue_executed(ps_http) == before_restart_depth
    else:
        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -20,8 +20,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    endpoint = env.endpoints.create_start("main")
    pageserver_http = env.pageserver.http_client()

-    assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
-
    pg_conn = endpoint.connect()
    cur = pg_conn.cursor()

@@ -54,9 +52,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
    env.pageserver.stop()
    env.pageserver.start()

-    # We reloaded our tenant
-    assert pageserver_http.get_metric_value("pageserver_tenant_manager_slots") == 1
-
    cur.execute("SELECT count(*) FROM foo")
    assert cur.fetchone() == (100000,)

--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -63,9 +63,6 @@ def test_tenant_delete_smoke(
        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
    )

-    # Default tenant and the one we created
-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
-
    # create two timelines one being the parent of another
    parent = None
    for timeline in ["first", "second"]:
@@ -91,9 +88,7 @@ def test_tenant_delete_smoke(

    iterations = poll_for_remote_storage_iterations(remote_storage_kind)

-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 2
    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1

    tenant_path = env.pageserver.tenant_dir(tenant_id)
    assert not tenant_path.exists()
@@ -109,9 +104,6 @@ def test_tenant_delete_smoke(
            ),
        )

-    # Deletion updates the tenant count: the one default tenant remains
-    assert ps_http.get_metric_value("pageserver_tenant_manager_slots") == 1
-

 class Check(enum.Enum):
    RETRY_WITHOUT_RESTART = enum.auto()
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -26,16 +26,6 @@ from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
 from prometheus_client.samples import Sample

-# In tests that overlap endpoint activity with tenant attach/detach, there are
-# a variety of warnings that the page service may emit when it cannot acquire
-# an active tenant to serve a request
-PERMIT_PAGE_SERVICE_ERRORS = [
-    ".*page_service.*Tenant .* not found",
-    ".*page_service.*Tenant .* is not active",
-    ".*page_service.*cancelled",
-    ".*page_service.*will not become active.*",
-]
-

 def do_gc_target(
    pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId
@@ -70,7 +60,12 @@ def test_tenant_reattach(
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produces these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        with endpoint.cursor() as cur:
@@ -240,7 +235,10 @@ def test_tenant_reattach_while_busy(

    # Attempts to connect from compute to pageserver while the tenant is
    # temporarily detached produces these errors in the pageserver log.
-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)

@@ -261,7 +259,7 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    env.pageserver.allowed_errors.append(".*NotFound: Tenant .*")

    # first check for non existing tenant
    tenant_id = TenantId.generate()
@@ -273,9 +271,19 @@ def test_tenant_detach_smoke(neon_env_builder: NeonEnvBuilder):

    assert excinfo.value.status_code == 404

+    # the error will be printed to the log too
+    env.pageserver.allowed_errors.append(".*NotFound: tenant *")
+
    # create new nenant
    tenant_id, timeline_id = env.neon_cli.create_tenant()

+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produces these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )
+
    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()

@@ -337,7 +345,12 @@ def test_tenant_detach_ignored_tenant(neon_simple_env: NeonEnv):
    # create a new tenant
    tenant_id, _ = env.neon_cli.create_tenant()

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produce these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -388,7 +401,12 @@ def test_tenant_detach_regular_tenant(neon_simple_env: NeonEnv):
    # create a new tenant
    tenant_id, _ = env.neon_cli.create_tenant()

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produce these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    # assert tenant exists on disk
    assert env.pageserver.tenant_dir(tenant_id).exists()
@@ -435,7 +453,12 @@ def test_detach_while_attaching(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produce these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
@@ -570,7 +593,12 @@ def test_ignored_tenant_download_missing_layers(neon_env_builder: NeonEnvBuilder
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produce these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    data_id = 1
    data_secret = "very secret secret"
@@ -621,7 +649,12 @@ def test_load_attach_negatives(neon_env_builder: NeonEnvBuilder):

    tenant_id = env.initial_tenant

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produce these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    env.pageserver.allowed_errors.append(".*tenant .*? already exists, state:.*")
    with pytest.raises(
@@ -660,7 +693,12 @@ def test_ignore_while_attaching(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

-    env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS)
+    # Attempts to connect from compute to pageserver while the tenant is
+    # temporarily detached produce these errors in the pageserver log.
+    env.pageserver.allowed_errors.append(f".*Tenant {tenant_id} not found.*")
+    env.pageserver.allowed_errors.append(
+        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
+    )

    data_id = 1
    data_secret = "very secret secret"