Preallocate vectors

2026-02-14 16:10:37 +00:00 · 2023-11-07 00:31:45 -05:00
90 changed files with 661 additions and 1665 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,11 +22,5 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

-[final-excludes]
-# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
-# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
-# from depending on workspace-hack because most of the dependencies are not used.
-workspace-members = ["vm_monitor"]
-
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,9 +17,8 @@ assignees: ''
 ## Implementation ideas


-```[tasklist]
-### Tasks
-```
+## Tasks
+- [ ]


 ## Other related tasks and Epics
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -723,7 +723,6 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
-                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -170,12 +170,6 @@ dependencies = [
 "backtrace",
 ]

-[[package]]
-name = "arc-swap"
-version = "1.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
-
 [[package]]
 name = "archery"
 version = "0.5.0"
@@ -4064,7 +4058,6 @@ dependencies = [
 "aws-config",
 "aws-credential-types",
 "aws-sdk-s3",
- "aws-smithy-async",
 "aws-smithy-http",
 "aws-types",
 "azure_core",
@@ -5958,7 +5951,6 @@ name = "utils"
 version = "0.1.0"
 dependencies = [
 "anyhow",
- "arc-swap",
 "async-trait",
 "bincode",
 "byteorder",
@@ -6056,6 +6048,7 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
+ "workspace_hack",
 ]

 [[package]]
@@ -6483,7 +6476,6 @@ dependencies = [
 "clap",
 "clap_builder",
 "crossbeam-utils",
- "dashmap",
 "either",
 "fail",
 "futures",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,7 +36,6 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
-arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 azure_core = "0.16"
 azure_identity = "0.16"
@@ -48,7 +47,6 @@ async-trait = "0.1"
 aws-config = { version = "0.56", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.29"
 aws-smithy-http = "0.56"
-aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
 aws-credential-types = "0.56"
 aws-types = "0.56"
 axum = { version = "0.6.20", features = ["ws"] }
@@ -67,7 +65,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = { version = "5.5.0", features = ["raw-api"] }
+dashmap = "5.5.0"
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
--- a/5
+++ b/5
@@ -27,7 +27,6 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
-ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -79,9 +78,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/4
+++ b/4
@@ -72,10 +72,6 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
-	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
-		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
-		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
-		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -710,12 +710,8 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self) -> Result<()> {
-        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
-        Command::new(pgctl_bin)
-            .args(["reload", "-D", &self.pgdata])
-            .output()
-            .expect("cannot run pg_ctl process");
+    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
+        client.simple_query("SELECT pg_reload_conf()")?;
        Ok(())
    }

@@ -728,9 +724,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
-        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
+        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -78,7 +78,7 @@ use regex::Regex;
 use remote_storage::*;
 use serde_json;
 use std::io::Read;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::Path;
 use std::str;
 use tar::Archive;
@@ -281,6 +281,8 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
        max_keys_per_list_response: None,
    };
    let config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
+        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
        storage: RemoteStorageKind::AwsS3(config),
    };
    GenericRemoteStorage::from_config(&config)
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
+//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
+//!
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY:
+    // SAFETY
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,10 +1,11 @@
-//! Local control plane.
-//!
-//! Can start, configure and stop postgres instances running as a local processes.
-//!
-//! Intended to be used in integration tests and in CLI tools for
-//! local installations.
-#![deny(clippy::undocumented_unsafe_blocks)]
+//
+// Local control plane.
+//
+// Can start, configure and stop postgres instances running as a local processes.
+//
+// Intended to be used in integration tests and in CLI tools for
+// local installations.
+//

 pub mod attachment_service;
 mod background_process;
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
+//!
 //! Shared code for consumption metics collection
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
+//!
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,7 +2,6 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
-#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,8 +2,6 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -17,7 +15,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace, warn};
+use tracing::{debug, error, info, trace};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
@@ -35,11 +33,6 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
-    /// Authentication failure
-    #[error("Unauthorized: {0}")]
-    Unauthorized(std::borrow::Cow<'static, str>),
-    #[error("Simulated Connection Error")]
-    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -54,9 +47,8 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
+            Self::Disconnected(_) => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -616,7 +608,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &short_error(&e),
+                            &e.to_string(),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -738,9 +730,6 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                if let Err(e) = handler.process_query(self, query_string).await {
                    match e {
                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
-                        QueryError::SimulatedConnectionError => {
-                            return Err(QueryError::SimulatedConnectionError)
-                        }
                        e => {
                            log_query_error(query_string, &e);
                            let short_error = short_error(&e);
@@ -975,8 +964,6 @@ pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
-        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
-        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -993,15 +980,9 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
-        QueryError::SimulatedConnectionError => {
-            error!("query handler for query '{query}' failed due to a simulated connection error")
-        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
-        QueryError::Unauthorized(e) => {
-            warn!("query handler for '{query}' failed with authentication error: {e}");
-        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,7 +8,6 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -21,7 +20,6 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
-                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,7 +1,6 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
-#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,7 +8,6 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
-aws-smithy-async.workspace = true
 aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,15 +6,19 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};
+use std::{
+    collections::HashMap,
+    fmt::Debug,
+    num::{NonZeroU32, NonZeroUsize},
+    pin::Pin,
+    sync::Arc,
+};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -30,6 +34,12 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

+/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
+/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
+/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
+/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
+pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
+pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -431,6 +441,10 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
+    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
+    pub max_concurrent_syncs: NonZeroUsize,
+    /// Max allowed errors before the sync task is considered failed and evicted.
+    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -526,6 +540,18 @@ impl RemoteStorageConfig {

        let use_azure = container_name.is_some() && container_region.is_some();

+        let max_concurrent_syncs = NonZeroUsize::new(
+            parse_optional_integer("max_concurrent_syncs", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
+        )
+        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
+
+        let max_sync_errors = NonZeroU32::new(
+            parse_optional_integer("max_sync_errors", toml)?
+                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
+        )
+        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
+
        let default_concurrency_limit = if use_azure {
            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
        } else {
@@ -607,7 +633,11 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig { storage }))
+        Ok(Some(RemoteStorageConfig {
+            max_concurrent_syncs,
+            max_sync_errors,
+            storage,
+        }))
    }
 }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,27 +4,23 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::{borrow::Cow, sync::Arc};
+use std::borrow::Cow;

 use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider,
-    meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig,
-    retry::{RetryConfigBuilder, RetryMode},
-    web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::{AsyncSleep, Config, Region, SharedAsyncSleep},
+    config::{Config, Region},
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
-use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
@@ -87,23 +83,10 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

-        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
-        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
-
-        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
-        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
-        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
-        let mut retry_config = RetryConfigBuilder::new();
-        retry_config
-            .set_max_attempts(Some(1))
-            .set_mode(Some(RetryMode::Adaptive));
-
        let mut config_builder = Config::builder()
            .region(region)
            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider)
-            .sleep_impl(SharedAsyncSleep::from(sleep_impl))
-            .retry_config(retry_config.build());
+            .credentials_provider(credentials_provider);

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -469,6 +469,8 @@ fn create_azure_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
            container_region: remote_storage_azure_region,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::NonZeroUsize;
+use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -396,6 +396,8 @@ fn create_s3_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
+        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
+        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,6 +1,4 @@
 //! Synthetic size calculation
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,8 +32,6 @@
 //!         .init();
 //! }
 //! ```
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,7 +5,6 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
-arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,8 +1,7 @@
 // For details about authentication see docs/authentication.md

-use arc_swap::ArcSwap;
 use serde;
-use std::{borrow::Cow, fmt::Display, fs, sync::Arc};
+use std::fs;

 use anyhow::Result;
 use camino::Utf8Path;
@@ -11,7 +10,7 @@ use jsonwebtoken::{
 };
 use serde::{Deserialize, Serialize};

-use crate::{http::error::ApiError, id::TenantId};
+use crate::id::TenantId;

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -45,106 +44,31 @@ impl Claims {
    }
 }

-pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
-
-impl SwappableJwtAuth {
-    pub fn new(jwt_auth: JwtAuth) -> Self {
-        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
-    }
-    pub fn swap(&self, jwt_auth: JwtAuth) {
-        self.0.swap(Arc::new(jwt_auth));
-    }
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
-        self.0.load().decode(token)
-    }
-}
-
-impl std::fmt::Debug for SwappableJwtAuth {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Swappable({:?})", self.0.load())
-    }
-}
-
-#[derive(Clone, PartialEq, Eq, Hash, Debug)]
-pub struct AuthError(pub Cow<'static, str>);
-
-impl Display for AuthError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0)
-    }
-}
-
-impl From<AuthError> for ApiError {
-    fn from(_value: AuthError) -> Self {
-        // Don't pass on the value of the AuthError as a precautionary measure.
-        // Being intentionally vague in public error communication hurts debugability
-        // but it is more secure.
-        ApiError::Forbidden("JWT authentication error".to_string())
-    }
-}
-
 pub struct JwtAuth {
-    decoding_keys: Vec<DecodingKey>,
+    decoding_key: DecodingKey,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
+    pub fn new(decoding_key: DecodingKey) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_keys,
+            decoding_key,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let metadata = key_path.metadata()?;
-        let decoding_keys = if metadata.is_dir() {
-            let mut keys = Vec::new();
-            for entry in fs::read_dir(key_path)? {
-                let path = entry?.path();
-                if !path.is_file() {
-                    // Ignore directories (don't recurse)
-                    continue;
-                }
-                let public_key = fs::read(path)?;
-                keys.push(DecodingKey::from_ed_pem(&public_key)?);
-            }
-            keys
-        } else if metadata.is_file() {
-            let public_key = fs::read(key_path)?;
-            vec![DecodingKey::from_ed_pem(&public_key)?]
-        } else {
-            anyhow::bail!("path is neither a directory or a file")
-        };
-        if decoding_keys.is_empty() {
-            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
-        }
-        Ok(Self::new(decoding_keys))
+        let public_key = fs::read(key_path)?;
+        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
    }

-    /// Attempt to decode the token with the internal decoding keys.
-    ///
-    /// The function tries the stored decoding keys in succession,
-    /// and returns the first yielding a successful result.
-    /// If there is no working decoding key, it returns the last error.
-    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
-        let mut res = None;
-        for decoding_key in &self.decoding_keys {
-            res = Some(decode(token, decoding_key, &self.validation));
-            if let Some(Ok(res)) = res {
-                return Ok(res);
-            }
-        }
-        if let Some(res) = res {
-            res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
-        } else {
-            Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
-        }
+    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
+        Ok(decode(token, &self.decoding_key, &self.validation)?)
    }
 }

@@ -184,9 +108,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 "#;

    #[test]
-    fn test_decode() {
+    fn test_decode() -> Result<(), anyhow::Error> {
        let expected_claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
            scope: Scope::Tenant,
        };

@@ -205,24 +129,28 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
-        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
        assert_eq!(claims_from_token, expected_claims);
+
+        Ok(())
    }

    #[test]
-    fn test_encode() {
+    fn test_encode() -> Result<(), anyhow::Error> {
        let claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;

        // decode it back
-        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
-        let decoded = auth.decode(&encoded).unwrap();
+        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let decoded = auth.decode(&encoded)?;

        assert_eq!(decoded.claims, claims);
+
+        Ok(())
    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{AuthError, Claims, SwappableJwtAuth};
+use crate::auth::{Claims, JwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
@@ -400,11 +400,9 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth.decode(token).map_err(|err| {
-                        warn!("Authentication error: {err}");
-                        // Rely on From<AuthError> for ApiError impl
-                        err
-                    })?;
+                    let data = auth
+                        .decode(token)
+                        .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
                    req.set_context(data.claims);
                }
                None => {
@@ -452,11 +450,12 @@ where

 pub fn check_permission_with(
    req: &Request<Body>,
-    check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
+    check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
 ) -> Result<(), ApiError> {
    match req.context::<Claims>() {
-        Some(claims) => Ok(check_permission(&claims)
-            .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
+        Some(claims) => {
+            Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
+        }
        None => Ok(()), // claims is None because auth is disabled
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info, warn};
+use tracing::{error, info};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -118,9 +118,6 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors

    match api_error {
-        ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
-            warn!("Error processing HTTP request: {api_error:#}")
-        }
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -120,8 +120,6 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
-
-        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,6 +1,5 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
-#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -125,9 +125,6 @@ where
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

-            // Block any future waiters from starting
-            internal.shutdown = true;
-
            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,7 +1,6 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) -> ! {
-    // SAFETY: exiting is safe, the ffi is not safe
+pub fn exit_now(code: u8) {
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -85,13 +85,6 @@ impl Gate {
        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
    }

-    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
-    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
-    /// the CancellationToken on such types is analogous to "Did shutdown start?"
-    pub fn close_complete(&self) -> bool {
-        self.sem.is_closed()
-    }
-
    async fn do_close(&self) {
        tracing::debug!(gate = self.name, "Closing Gate...");
        match self.sem.acquire_many(Self::MAX_UNITS).await {
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,12 +19,13 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio = { workspace = true, features = ["rt-multi-thread"] }
+tokio.workspace = true
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -188,7 +188,6 @@ extern "C" fn recovery_download(
    }
 }

-#[allow(clippy::unnecessary_cast)]
 extern "C" fn wal_read(
    sk: *mut Safekeeper,
    buf: *mut ::std::os::raw::c_char,
@@ -422,7 +421,6 @@ impl std::fmt::Display for Level {
 }

 /// Take ownership of `Vec<u8>` from StringInfoData.
-#[allow(clippy::unnecessary_cast)]
 pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
    if pg.data.is_null() {
        return None;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -186,7 +186,7 @@ impl Wrapper {
            .unwrap()
            .into_bytes_with_nul();
        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -1,21 +1,22 @@
-use utils::auth::{AuthError, Claims, Scope};
+use anyhow::{bail, Result};
+use utils::auth::{Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => Err(AuthError(
-            "Attempt to access management api with tenant scope. Permission denied".into(),
-        )),
+        (Scope::Tenant, None) => {
+            bail!("Attempt to access management api with tenant scope. Permission denied")
+        }
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
+                bail!("Tenant id mismatch. Permission denied")
            }
            Ok(())
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => Err(AuthError(
-            "SafekeeperData scope makes no sense for Pageserver".into(),
-        )),
+        (Scope::SafekeeperData, _) => {
+            bail!("SafekeeperData scope makes no sense for Pageserver")
+        }
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,11 +34,8 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::{JwtAuth, SwappableJwtAuth},
-    logging, project_build_tag, project_git_version,
-    sentry_init::init_sentry,
-    signals::Signal,
-    tcp_listener,
+    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
+    signals::Signal, tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -324,12 +321,13 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and exists
+        // unwrap is ok because check is performed when creating config, so path is set and file exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
-
-        let jwt_auth = JwtAuth::from_key_path(key_path)?;
-        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));
+        info!(
+            "Loading public key for verifying JWT tokens from {:#?}",
+            key_path
+        );
+        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -412,7 +410,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -422,7 +420,6 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
-    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -551,7 +548,6 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
-                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
+    /// Path to a file containing public key for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

@@ -1314,6 +1314,12 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
+                    max_concurrent_syncs: NonZeroUsize::new(
+                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
+                    )
+                        .unwrap(),
+                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
+                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1374,6 +1380,8 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
+                    max_concurrent_syncs,
+                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -893,6 +893,14 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
+            max_concurrent_syncs: std::num::NonZeroUsize::new(
+                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+            )
+            .unwrap(),
+            max_sync_errors: std::num::NonZeroU32::new(
+                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+            )
+            .unwrap(),
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,24 +55,21 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
+        fail::fail_point!("deletion-queue-before-execute", |_| {
+            info!("Skipping execution, failpoint set");
+            metrics::DELETION_QUEUE
+                .remote_errors
+                .with_label_values(&["failpoint"])
+                .inc();
+            Err(anyhow::anyhow!("failpoint hit"))
+        });
+
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async {
-                fail::fail_point!("deletion-queue-before-execute", |_| {
-                    info!("Skipping execution, failpoint set");
-
-                    metrics::DELETION_QUEUE
-                        .remote_errors
-                        .with_label_values(&["failpoint"])
-                        .inc();
-                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
-                });
-
-                self.remote_storage.delete_objects(&self.accumulator).await
-            },
+            || async { self.remote_storage.delete_objects(&self.accumulator).await },
            |_| false,
            3,
            10,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,31 +52,6 @@ paths:
              schema:
                type: object

-  /v1/reload_auth_validation_keys:
-    post:
-      description: Reloads the JWT public keys from their pre-configured location on disk.
-      responses:
-        "200":
-          description: The reload completed successfully.
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error (also hits if no keys were found)
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -352,8 +327,7 @@ paths:
          in: query
          required: true
          schema:
-            type: string
-            format: hex
+            type: integer
          description: A LSN to get the timestamp
      responses:
        "200":
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -20,7 +20,6 @@ use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::auth::JwtAuth;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -36,8 +35,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
-    TenantSlotError, TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
+    TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -46,7 +45,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
-    auth::SwappableJwtAuth,
+    auth::JwtAuth,
    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -64,8 +63,7 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    tenant_manager: Arc<TenantManager>,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
@@ -76,8 +74,7 @@ pub struct State {
 impl State {
    pub fn new(
        conf: &'static PageServerConf,
-        tenant_manager: Arc<TenantManager>,
-        auth: Option<Arc<SwappableJwtAuth>>,
+        auth: Option<Arc<JwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -89,7 +86,6 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
-            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
@@ -303,7 +299,11 @@ async fn build_timeline_info(
        // we're executing this function, we will outlive the timeline on-disk state.
        info.current_logical_size_non_incremental = Some(
            timeline
-                .get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
+                .get_current_logical_size_non_incremental(
+                    info.last_record_lsn,
+                    CancellationToken::new(),
+                    ctx,
+                )
                .await?,
        );
    }
@@ -389,32 +389,6 @@ async fn status_handler(
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

-async fn reload_auth_validation_keys_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    check_permission(&request, None)?;
-    let config = get_config(&request);
-    let state = get_state(&request);
-    let Some(shared_auth) = &state.auth else {
-        return json_response(StatusCode::BAD_REQUEST, ());
-    };
-    // unwrap is ok because check is performed when creating config, so path is set and exists
-    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
-    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
-
-    match JwtAuth::from_key_path(key_path) {
-        Ok(new_auth) => {
-            shared_auth.swap(new_auth);
-            json_response(StatusCode::OK, ())
-        }
-        Err(e) => {
-            warn!("Error reloading public keys from {key_path:?}: {e:}");
-            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
-        }
-    }
-}
-
 async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -1166,14 +1140,20 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    state
-        .tenant_manager
-        .upsert_location(tenant_id, location_conf, &ctx)
-        .await
-        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-        // principle we might have hit something like concurrent API calls to the same tenant,
-        // which is not a 400 but a 409.
-        .map_err(ApiError::BadRequest)?;
+    mgr::upsert_location(
+        state.conf,
+        tenant_id,
+        location_conf,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .await
+    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+    // principle we might have hit something like concurrent API calls to the same tenant,
+    // which is not a 400 but a 409.
+    .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1667,8 +1647,6 @@ where
        );

        match handle.await {
-            // TODO: never actually return Err from here, always Ok(...) so that we can log
-            // spanned errors. Call api_error_handler instead and return appropriate Body.
            Ok(result) => result,
            Err(e) => {
                // The handler task panicked. We have a global panic handler that logs the
@@ -1717,7 +1695,7 @@ where
 pub fn make_router(
    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1746,9 +1724,6 @@ pub fn make_router(
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
-        .post("/v1/reload_auth_validation_keys", |r| {
-            api_handler(r, reload_auth_validation_keys_handler)
-        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
-
 mod auth;
 pub mod basebackup;
 pub mod config;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1225,6 +1225,15 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_wal_redo_wait_seconds",
+        "Time spent waiting for access to the Postgres WAL redo process",
+        redo_histogram_time_buckets!(),
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
@@ -1919,6 +1928,7 @@ pub fn preinitialize_metrics() {
        &READ_NUM_FS_LAYERS,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
+        &WAL_REDO_WAIT_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
        &WAL_REDO_BYTES_HISTOGRAM,
    ]
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,7 +14,6 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
-use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -40,7 +39,7 @@ use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
-    auth::{Claims, Scope, SwappableJwtAuth},
+    auth::{Claims, JwtAuth, Scope},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    simple_rcu::RcuReadGuard,
@@ -122,7 +121,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
@@ -190,7 +189,7 @@ pub async fn libpq_listener_main(
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
@@ -218,27 +217,9 @@ async fn page_service_conn_main(
    // no write timeout is used, because the kernel is assumed to error writes after some time.
    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);

-    let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
-    let socket_timeout_ms = (|| {
-        fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
-            // Exponential distribution for simulating
-            // poor network conditions, expect about avg_timeout_ms to be around 15
-            // in tests
-            if let Some(avg_timeout_ms) = avg_timeout_ms {
-                let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
-                let u = rand::random::<f32>();
-                ((1.0 - u).ln() / (-avg)) as u64
-            } else {
-                default_timeout_ms
-            }
-        });
-        default_timeout_ms
-    })();
-
-    // A timeout here does not mean the client died, it can happen if it's just idle for
-    // a while: we will tear down this PageServerHandler and instantiate a new one if/when
-    // they reconnect.
-    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
+    // timeout should be lower, but trying out multiple days for
+    // <https://github.com/neondatabase/neon/issues/4205>
+    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
    let socket = std::pin::pin!(socket);

    // XXX: pgbackend.run() should take the connection_ctx,
@@ -271,7 +252,7 @@ async fn page_service_conn_main(
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<SwappableJwtAuth>>,
+    auth: Option<Arc<JwtAuth>>,
    claims: Option<Claims>,

    /// The context created for the lifetime of the connection
@@ -285,7 +266,7 @@ impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
-        auth: Option<Arc<SwappableJwtAuth>>,
+        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
@@ -512,11 +493,7 @@ impl PageServerHandler {
            };

            if let Err(e) = &response {
-                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                // because wait_lsn etc will drop out
-                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                // is_canceled(): [`Timeline::shutdown`]` has entered
-                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
+                if timeline.cancel.is_cancelled() {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
@@ -920,7 +897,7 @@ impl PageServerHandler {

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
@@ -932,7 +909,7 @@ impl PageServerHandler {
            .claims
            .as_ref()
            .expect("claims presence already checked");
-        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
+        check_permission(claims, tenant_id)
    }

    /// Shorthand for getting a reference to a Timeline of an Active tenant.
@@ -971,17 +948,16 @@ where
            .auth
            .as_ref()
            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
-            .map_err(|e| QueryError::Unauthorized(e.0))?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;

        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
-            return Err(QueryError::Unauthorized(
-                "jwt token scope is Tenant, but tenant id is missing".into(),
-            ));
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "jwt token scope is Tenant, but tenant id is missing"
+            )));
        }

-        debug!(
-            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
+        info!(
+            "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

@@ -1003,13 +979,9 @@ where
        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
-        fail::fail_point!("simulated-bad-compute-connection", |_| {
-            info!("Hit failpoint for bad connection");
-            Err(QueryError::SimulatedConnectionError)
-        });
-
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
+
        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1358,9 +1330,6 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
-                QueryError::Shutdown
-            }
            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,6 +21,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -577,6 +578,7 @@ impl Timeline {
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -587,8 +589,12 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
-                if self.cancel.is_cancelled() {
+            for rel in self
+                .list_rels(*spcnode, *dbnode, lsn, ctx)
+                .await
+                .context("list rels")?
+            {
+                if cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
                let relsize_key = rel_size_to_key(rel);
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1841,13 +1841,7 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let span = Span::current();
-                js.spawn(async move {
-                    if freeze_and_flush {
-                        timeline.flush_and_shutdown().instrument(span).await
-                    } else {
-                        timeline.shutdown().instrument(span).await
-                    }
-                });
+                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
            })
        };
        tracing::info!("Waiting for timelines...");
@@ -3534,6 +3528,10 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
+                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
+                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
+                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
+                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -4733,7 +4731,7 @@ mod tests {
            // Keeps uninit mark in place
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown()
+                .shutdown(false)
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -327,7 +327,7 @@ mod tests {
                let mut sz: u16 = rng.gen();
                // Make 50% of the arrays small
                if rng.gen() {
-                    sz &= 63;
+                    sz |= 63;
                }
                random_array(sz.into())
            })
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -573,10 +573,10 @@ impl<const L: usize> BuildNode<L> {
        BuildNode {
            num_children: 0,
            level,
-            prefix: Vec::new(),
+            prefix: Vec::with_capacity(16),
            suffix_len: 0,
-            keys: Vec::new(),
-            values: Vec::new(),
+            keys: Vec::with_capacity(5024),
+            values: Vec::with_capacity(3140),
            size: NODE_HDR_SIZE,
        }
    }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -200,22 +200,6 @@ async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
    Ok(())
 }

-/// The TenantManager is responsible for storing and mutating the collection of all tenants
-/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
-/// lives inside the TenantManager.
-///
-/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
-/// the same tenant twice concurrently, or trying to configure the same tenant into secondary
-/// and attached modes concurrently.
-pub struct TenantManager {
-    conf: &'static PageServerConf,
-    // TODO: currently this is a &'static pointing to TENANTs.  When we finish refactoring
-    // out of that static variable, the TenantManager can own this.
-    // See https://github.com/neondatabase/neon/issues/5796
-    tenants: &'static std::sync::RwLock<TenantsMap>,
-    resources: TenantSharedResources,
-}
-
 fn emergency_generations(
    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
 ) -> HashMap<TenantId, Generation> {
@@ -382,7 +366,7 @@ pub async fn init_tenant_mgr(
    resources: TenantSharedResources,
    init_order: InitializationOrder,
    cancel: CancellationToken,
-) -> anyhow::Result<TenantManager> {
+) -> anyhow::Result<()> {
    let mut tenants = HashMap::new();

    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
@@ -484,12 +468,7 @@ pub async fn init_tenant_mgr(
    assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
    METRICS.tenant_slots.set(tenants.len() as u64);
    *tenants_map = TenantsMap::Open(tenants);
-
-    Ok(TenantManager {
-        conf,
-        tenants: &TENANTS,
-        resources,
-    })
+    Ok(())
 }

 /// Wrapper for Tenant::spawn that checks invariants before running, and inserts
@@ -566,10 +545,8 @@ pub(crate) async fn shutdown_all_tenants() {
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    use utils::completion;

-    let mut join_set = JoinSet::new();
-
-    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
-    let (total_in_progress, total_attached) = {
+    // Atomically, 1. extract the list of tenants to shut down and 2. prevent creation of new tenants.
+    let (in_progress_ops, tenants_to_shut_down) = {
        let mut m = tenants.write().unwrap();
        match &mut *m {
            TenantsMap::Initializing => {
@@ -579,67 +556,78 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
            }
            TenantsMap::Open(tenants) => {
                let mut shutdown_state = HashMap::new();
-                let mut total_in_progress = 0;
-                let mut total_attached = 0;
+                let mut in_progress_ops = Vec::new();
+                let mut tenants_to_shut_down = Vec::new();

-                for (tenant_id, v) in tenants.drain() {
+                for (k, v) in tenants.drain() {
                    match v {
                        TenantSlot::Attached(t) => {
-                            shutdown_state.insert(tenant_id, TenantSlot::Attached(t.clone()));
-                            join_set.spawn(
-                                async move {
-                                    let freeze_and_flush = true;
-
-                                    let res = {
-                                        let (_guard, shutdown_progress) = completion::channel();
-                                        t.shutdown(shutdown_progress, freeze_and_flush).await
-                                    };
-
-                                    if let Err(other_progress) = res {
-                                        // join the another shutdown in progress
-                                        other_progress.wait().await;
-                                    }
-
-                                    // we cannot afford per tenant logging here, because if s3 is degraded, we are
-                                    // going to log too many lines
-                                    debug!("tenant successfully stopped");
-                                }
-                                .instrument(info_span!("shutdown", %tenant_id)),
-                            );
-
-                            total_attached += 1;
+                            tenants_to_shut_down.push(t.clone());
+                            shutdown_state.insert(k, TenantSlot::Attached(t));
                        }
                        TenantSlot::Secondary => {
-                            shutdown_state.insert(tenant_id, TenantSlot::Secondary);
+                            shutdown_state.insert(k, TenantSlot::Secondary);
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
                            // wait for their notifications to fire in this function.
-                            join_set.spawn(async move {
-                                notify.wait().await;
-                            });
-
-                            total_in_progress += 1;
+                            in_progress_ops.push(notify);
                        }
                    }
                }
                *m = TenantsMap::ShuttingDown(shutdown_state);
-                (total_in_progress, total_attached)
+                (in_progress_ops, tenants_to_shut_down)
            }
            TenantsMap::ShuttingDown(_) => {
+                // TODO: it is possible that detach and shutdown happen at the same time. as a
+                // result, during shutdown we do not wait for detach.
                error!("already shutting down, this function isn't supposed to be called more than once");
                return;
            }
        }
    };

-    let started_at = std::time::Instant::now();
-
    info!(
        "Waiting for {} InProgress tenants and {} Attached tenants to shut down",
-        total_in_progress, total_attached
+        in_progress_ops.len(),
+        tenants_to_shut_down.len()
    );

+    for barrier in in_progress_ops {
+        barrier.wait().await;
+    }
+
+    info!(
+        "InProgress tenants shut down, waiting for {} Attached tenants to shut down",
+        tenants_to_shut_down.len()
+    );
+    let started_at = std::time::Instant::now();
+    let mut join_set = JoinSet::new();
+    for tenant in tenants_to_shut_down {
+        let tenant_id = tenant.get_tenant_id();
+        join_set.spawn(
+            async move {
+                let freeze_and_flush = true;
+
+                let res = {
+                    let (_guard, shutdown_progress) = completion::channel();
+                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
+                };
+
+                if let Err(other_progress) = res {
+                    // join the another shutdown in progress
+                    other_progress.wait().await;
+                }
+
+                // we cannot afford per tenant logging here, because if s3 is degraded, we are
+                // going to log too many lines
+
+                debug!("tenant successfully stopped");
+            }
+            .instrument(info_span!("shutdown", %tenant_id)),
+        );
+    }
+
    let total = join_set.len();
    let mut panicked = 0;
    let mut buffering = true;
@@ -652,7 +640,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                match joined {
                    Ok(()) => {}
                    Err(join_error) if join_error.is_cancelled() => {
-                        unreachable!("we are not cancelling any of the tasks");
+                        unreachable!("we are not cancelling any of the futures");
                    }
                    Err(join_error) if join_error.is_panic() => {
                        // cannot really do anything, as this panic is likely a bug
@@ -754,134 +742,139 @@ pub(crate) async fn set_new_tenant_config(
    Ok(())
 }

-impl TenantManager {
-    #[instrument(skip_all, fields(%tenant_id))]
-    pub(crate) async fn upsert_location(
-        &self,
-        tenant_id: TenantId,
-        new_location_config: LocationConf,
-        ctx: &RequestContext,
-    ) -> Result<(), anyhow::Error> {
-        info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
+#[instrument(skip_all, fields(%tenant_id))]
+pub(crate) async fn upsert_location(
+    conf: &'static PageServerConf,
+    tenant_id: TenantId,
+    new_location_config: LocationConf,
+    broker_client: storage_broker::BrokerClientChannel,
+    remote_storage: Option<GenericRemoteStorage>,
+    deletion_queue_client: DeletionQueueClient,
+    ctx: &RequestContext,
+) -> Result<(), anyhow::Error> {
+    info!("configuring tenant location {tenant_id} to state {new_location_config:?}");

-        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
-        // then we do not need to set the slot to InProgress, we can just call into the
-        // existng tenant.
-        {
-            let locked = self.tenants.read().unwrap();
-            let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
-            match (&new_location_config.mode, peek_slot) {
-                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
-                    if attach_conf.generation == tenant.generation {
-                        // A transition from Attached to Attached in the same generation, we may
-                        // take our fast path and just provide the updated configuration
-                        // to the tenant.
-                        tenant.set_new_location_config(AttachedTenantConf::try_from(
-                            new_location_config,
-                        )?);
+    // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
+    // then we do not need to set the slot to InProgress, we can just call into the
+    // existng tenant.
+    {
+        let locked = TENANTS.read().unwrap();
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
+        match (&new_location_config.mode, peek_slot) {
+            (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
+                if attach_conf.generation == tenant.generation {
+                    // A transition from Attached to Attached in the same generation, we may
+                    // take our fast path and just provide the updated configuration
+                    // to the tenant.
+                    tenant.set_new_location_config(AttachedTenantConf::try_from(
+                        new_location_config,
+                    )?);

-                        // Persist the new config in the background, to avoid holding up any
-                        // locks while we do so.
-                        // TODO
+                    // Persist the new config in the background, to avoid holding up any
+                    // locks while we do so.
+                    // TODO

-                        return Ok(());
-                    } else {
-                        // Different generations, fall through to general case
-                    }
-                }
-                _ => {
-                    // Not an Attached->Attached transition, fall through to general case
+                    return Ok(());
+                } else {
+                    // Different generations, fall through to general case
                }
            }
+            _ => {
+                // Not an Attached->Attached transition, fall through to general case
+            }
        }
+    }

-        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
-        // InProgress value to the slot while we make whatever changes are required.  The state for
-        // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
-        // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
-        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;
+    // General case for upserts to TenantsMap, excluding the case above: we will substitute an
+    // InProgress value to the slot while we make whatever changes are required.  The state for
+    // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
+    // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
+    // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
+    let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;

-        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
-            // The case where we keep a Tenant alive was covered above in the special case
-            // for Attached->Attached transitions in the same generation.  By this point,
-            // if we see an attached tenant we know it will be discarded and should be
-            // shut down.
-            let (_guard, progress) = utils::completion::channel();
+    if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
+        // The case where we keep a Tenant alive was covered above in the special case
+        // for Attached->Attached transitions in the same generation.  By this point,
+        // if we see an attached tenant we know it will be discarded and should be
+        // shut down.
+        let (_guard, progress) = utils::completion::channel();

-            match tenant.get_attach_mode() {
-                AttachmentMode::Single | AttachmentMode::Multi => {
-                    // Before we leave our state as the presumed holder of the latest generation,
-                    // flush any outstanding deletions to reduce the risk of leaking objects.
-                    self.resources.deletion_queue_client.flush_advisory()
-                }
-                AttachmentMode::Stale => {
-                    // If we're stale there's not point trying to flush deletions
-                }
-            };
-
-            info!("Shutting down attached tenant");
-            match tenant.shutdown(progress, false).await {
-                Ok(()) => {}
-                Err(barrier) => {
-                    info!("Shutdown already in progress, waiting for it to complete");
-                    barrier.wait().await;
-                }
+        match tenant.get_attach_mode() {
+            AttachmentMode::Single | AttachmentMode::Multi => {
+                // Before we leave our state as the presumed holder of the latest generation,
+                // flush any outstanding deletions to reduce the risk of leaking objects.
+                deletion_queue_client.flush_advisory()
            }
-            slot_guard.drop_old_value().expect("We just shut it down");
-        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_id);
-
-        let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(_) => {
-                let tenant_path = self.conf.tenant_path(&tenant_id);
-                // Directory doesn't need to be fsync'd because if we crash it can
-                // safely be recreated next time this tenant location is configured.
-                unsafe_create_dir_all(&tenant_path)
-                    .await
-                    .with_context(|| format!("Creating {tenant_path}"))?;
-
-                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
-
-                TenantSlot::Secondary
-            }
-            LocationMode::Attached(_attach_config) => {
-                let timelines_path = self.conf.timelines_path(&tenant_id);
-
-                // Directory doesn't need to be fsync'd because we do not depend on
-                // it to exist after crashes: it may be recreated when tenant is
-                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                unsafe_create_dir_all(&timelines_path)
-                    .await
-                    .with_context(|| format!("Creating {timelines_path}"))?;
-
-                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
-                    .await
-                    .map_err(SetNewTenantConfigError::Persist)?;
-
-                let tenant = tenant_spawn(
-                    self.conf,
-                    tenant_id,
-                    &tenant_path,
-                    self.resources.clone(),
-                    AttachedTenantConf::try_from(new_location_config)?,
-                    None,
-                    self.tenants,
-                    SpawnMode::Normal,
-                    ctx,
-                )?;
-
-                TenantSlot::Attached(tenant)
+            AttachmentMode::Stale => {
+                // If we're stale there's not point trying to flush deletions
            }
        };

-        slot_guard.upsert(new_slot)?;
-
-        Ok(())
+        info!("Shutting down attached tenant");
+        match tenant.shutdown(progress, false).await {
+            Ok(()) => {}
+            Err(barrier) => {
+                info!("Shutdown already in progress, waiting for it to complete");
+                barrier.wait().await;
+            }
+        }
+        slot_guard.drop_old_value().expect("We just shut it down");
    }
+
+    let tenant_path = conf.tenant_path(&tenant_id);
+
+    let new_slot = match &new_location_config.mode {
+        LocationMode::Secondary(_) => {
+            let tenant_path = conf.tenant_path(&tenant_id);
+            // Directory doesn't need to be fsync'd because if we crash it can
+            // safely be recreated next time this tenant location is configured.
+            unsafe_create_dir_all(&tenant_path)
+                .await
+                .with_context(|| format!("Creating {tenant_path}"))?;
+
+            Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
+                .await
+                .map_err(SetNewTenantConfigError::Persist)?;
+
+            TenantSlot::Secondary
+        }
+        LocationMode::Attached(_attach_config) => {
+            let timelines_path = conf.timelines_path(&tenant_id);
+
+            // Directory doesn't need to be fsync'd because we do not depend on
+            // it to exist after crashes: it may be recreated when tenant is
+            // re-attached, see https://github.com/neondatabase/neon/issues/5550
+            unsafe_create_dir_all(&timelines_path)
+                .await
+                .with_context(|| format!("Creating {timelines_path}"))?;
+
+            Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
+                .await
+                .map_err(SetNewTenantConfigError::Persist)?;
+
+            let tenant = tenant_spawn(
+                conf,
+                tenant_id,
+                &tenant_path,
+                TenantSharedResources {
+                    broker_client,
+                    remote_storage,
+                    deletion_queue_client,
+                },
+                AttachedTenantConf::try_from(new_location_config)?,
+                None,
+                &TENANTS,
+                SpawnMode::Normal,
+                ctx,
+            )?;
+
+            TenantSlot::Attached(tenant)
+        }
+    };
+
+    slot_guard.upsert(new_slot)?;
+
+    Ok(())
 }

 #[derive(Debug, thiserror::Error)]
@@ -1437,6 +1430,9 @@ pub struct SlotGuard {
    _completion: utils::completion::Completion,
 }

+unsafe impl Send for SlotGuard {}
+unsafe impl Sync for SlotGuard {}
+
 impl SlotGuard {
    fn new(
        tenant_id: TenantId,
@@ -1543,7 +1539,14 @@ impl SlotGuard {
    /// is responsible for protecting
    fn old_value_is_shutdown(&self) -> bool {
        match self.old_value.as_ref() {
-            Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
+            Some(TenantSlot::Attached(tenant)) => {
+                // TODO: PR #5711 will add a gate that enables properly checking that
+                // shutdown completed.
+                matches!(
+                    tenant.current_state(),
+                    TenantState::Stopping { .. } | TenantState::Broken { .. }
+                )
+            }
            Some(TenantSlot::Secondary) => {
                // TODO: when adding secondary mode tenants, this will check for shutdown
                // in the same way that we do for `Tenant` above
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -6,6 +6,7 @@ use std::sync::Arc;
 use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
+use tokio_util::sync::CancellationToken;

 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
@@ -349,6 +350,10 @@ async fn fill_logical_sizes(
    // our advantage with `?` error handling.
    let mut joinset = tokio::task::JoinSet::new();

+    let cancel = tokio_util::sync::CancellationToken::new();
+    // be sure to cancel all spawned tasks if we are dropped
+    let _dg = cancel.clone().drop_guard();
+
    // For each point that would benefit from having a logical size available,
    // spawn a Task to fetch it, unless we have it cached already.
    for seg in segments.iter() {
@@ -366,8 +371,15 @@ async fn fill_logical_sizes(
                let parallel_size_calcs = Arc::clone(limit);
                let ctx = ctx.attached_child();
                joinset.spawn(
-                    calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
-                        .in_current_span(),
+                    calculate_logical_size(
+                        parallel_size_calcs,
+                        timeline,
+                        lsn,
+                        cause,
+                        ctx,
+                        cancel.child_token(),
+                    )
+                    .in_current_span(),
                );
            }
            e.insert(cached_size);
@@ -475,13 +487,14 @@ async fn calculate_logical_size(
    lsn: utils::lsn::Lsn,
    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
+    cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
        .await
        .expect("global semaphore should not had been closed");

    let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
        .await?;
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -251,7 +251,6 @@ impl Layer {

        layer
            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
-            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
            .await
    }

@@ -1212,10 +1211,8 @@ impl DownloadedLayer {
            // this will be a permanent failure
            .context("load layer");

-            if let Err(e) = res.as_ref() {
+            if res.is_err() {
                LAYER_IMPL_METRICS.inc_permanent_loading_failures();
-                // TODO(#5815): we are not logging all errors, so temporarily log them here as well
-                tracing::error!("layer loading failed permanently: {e:#}");
            }
            res
        };
@@ -1294,7 +1291,6 @@ impl ResidentLayer {
    }

    /// Loads all keys stored in the layer. Returns key, lsn and value size.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -36,6 +36,7 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
+use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
@@ -49,7 +50,6 @@ use crate::tenant::{
    metadata::{save_metadata, TimelineMetadata},
    par_fsync,
 };
-use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
@@ -247,7 +247,7 @@ pub struct Timeline {
    /// the flush finishes. You can use that to wait for the flush to finish.
    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
-    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,
+    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -374,19 +374,6 @@ pub enum PageReconstructError {
    WalRedo(anyhow::Error),
 }

-#[derive(thiserror::Error, Debug)]
-enum FlushLayerError {
-    /// Timeline cancellation token was cancelled
-    #[error("timeline shutting down")]
-    Cancelled,
-
-    #[error(transparent)]
-    PageReconstructError(#[from] PageReconstructError),
-
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
 impl std::fmt::Debug for PageReconstructError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
@@ -904,16 +891,15 @@ impl Timeline {
        self.launch_eviction_task(background_jobs_can_start);
    }

-    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
-    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
-    ///
-    /// While we are flushing, we continue to accept read I/O.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
-    pub(crate) async fn flush_and_shutdown(&self) {
+    pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
-        // trying to flush
+        // Signal any subscribers to our cancellation token to drop out
+        tracing::debug!("Cancelling CancellationToken");
+        self.cancel.cancel();
+
+        // prevent writes to the InMemoryLayer
        tracing::debug!("Waiting for WalReceiverManager...");
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
@@ -922,70 +908,40 @@ impl Timeline {
        )
        .await;

-        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
-        self.last_record_lsn.shutdown();
-
        // now all writers to InMemory layer are gone, do the final flush if requested
-        match self.freeze_and_flush().await {
-            Ok(_) => {
-                // drain the upload queue
-                if let Some(client) = self.remote_client.as_ref() {
-                    // if we did not wait for completion here, it might be our shutdown process
-                    // didn't wait for remote uploads to complete at all, as new tasks can forever
-                    // be spawned.
-                    //
-                    // what is problematic is the shutting down of RemoteTimelineClient, because
-                    // obviously it does not make sense to stop while we wait for it, but what
-                    // about corner cases like s3 suddenly hanging up?
-                    if let Err(e) = client.wait_completion().await {
-                        // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                        // we have some extra WAL replay to do next time the timeline starts.
-                        warn!("failed to flush to remote storage: {e:#}");
-                    }
+        if freeze_and_flush {
+            match self.freeze_and_flush().await {
+                Ok(()) => {}
+                Err(e) => {
+                    warn!("failed to freeze and flush: {e:#}");
+                    return; // TODO: should probably drain remote timeline client anyways?
                }
            }
-            Err(e) => {
-                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
-                // we have some extra WAL replay to do next time the timeline starts.
-                warn!("failed to freeze and flush: {e:#}");
+
+            // drain the upload queue
+            let res = if let Some(client) = self.remote_client.as_ref() {
+                // if we did not wait for completion here, it might be our shutdown process
+                // didn't wait for remote uploads to complete at all, as new tasks can forever
+                // be spawned.
+                //
+                // what is problematic is the shutting down of RemoteTimelineClient, because
+                // obviously it does not make sense to stop while we wait for it, but what
+                // about corner cases like s3 suddenly hanging up?
+                client.wait_completion().await
+            } else {
+                Ok(())
+            };
+
+            if let Err(e) = res {
+                warn!("failed to await for frozen and flushed uploads: {e:#}");
            }
        }

-        self.shutdown().await;
-    }
-
-    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
-    /// the graceful [`Timeline::flush_and_shutdown`] function.
-    pub(crate) async fn shutdown(&self) {
-        // Signal any subscribers to our cancellation token to drop out
-        tracing::debug!("Cancelling CancellationToken");
-        self.cancel.cancel();
-
        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
        // while doing so.
        self.last_record_lsn.shutdown();

-        // Shut down the layer flush task before the remote client, as one depends on the other
-        task_mgr::shutdown_tasks(
-            Some(TaskKind::LayerFlushTask),
-            Some(self.tenant_id),
-            Some(self.timeline_id),
-        )
-        .await;
-
-        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
-        // case our caller wants to use that for a deletion
-        if let Some(remote_client) = self.remote_client.as_ref() {
-            match remote_client.stop() {
-                Ok(()) => {}
-                Err(StopError::QueueUninitialized) => {
-                    // Shutting down during initialization is legal
-                }
-            }
-        }
-
        tracing::debug!("Waiting for tasks...");
-
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;

        // Finally wait until any gate-holders are complete
@@ -1029,12 +985,7 @@ impl Timeline {
            reason,
            backtrace: backtrace_str,
        };
-        self.set_state(broken_state);
-
-        // Although the Broken state is not equivalent to shutdown() (shutdown will be called
-        // later when this tenant is detach or the process shuts down), firing the cancellation token
-        // here avoids the need for other tasks to watch for the Broken state explicitly.
-        self.cancel.cancel();
+        self.set_state(broken_state)
    }

    pub fn current_state(&self) -> TimelineState {
@@ -1790,8 +1741,12 @@ impl Timeline {
                // delay will be terminated by a timeout regardless.
                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };

+                // no extra cancellation here, because nothing really waits for this to complete compared
+                // to spawn_ondemand_logical_size_calculation.
+                let cancel = CancellationToken::new();
+
                let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                    .await
                {
                    Ok(s) => s,
@@ -1860,6 +1815,7 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: RequestContext,
+        cancel: CancellationToken,
    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
        let (sender, receiver) = oneshot::channel();
        let self_clone = Arc::clone(self);
@@ -1880,7 +1836,7 @@ impl Timeline {
            false,
            async move {
                let res = self_clone
-                    .logical_size_calculation_task(lsn, cause, &ctx)
+                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
                    .await;
                let _ = sender.send(res).ok();
                Ok(()) // Receiver is responsible for handling errors
@@ -1896,28 +1852,58 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
+        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
        span::debug_assert_current_span_has_tenant_and_timeline_id();

-        let _guard = self.gate.enter();
-
+        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);

        let mut calculation = pin!(async {
+            let cancel = cancel.child_token();
            let ctx = ctx.attached_child();
            self_calculation
-                .calculate_logical_size(lsn, cause, &ctx)
+                .calculate_logical_size(lsn, cause, cancel, &ctx)
                .await
        });
+        let timeline_state_cancellation = async {
+            loop {
+                match timeline_state_updates.changed().await {
+                    Ok(()) => {
+                        let new_state = timeline_state_updates.borrow().clone();
+                        match new_state {
+                            // we're running this job for active timelines only
+                            TimelineState::Active => continue,
+                            TimelineState::Broken { .. }
+                            | TimelineState::Stopping
+                            | TimelineState::Loading => {
+                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
+                            }
+                        }
+                    }
+                    Err(_sender_dropped_error) => {
+                        // can't happen, the sender is not dropped as long as the Timeline exists
+                        break "aborted because state watch was dropped".to_string();
+                    }
+                }
+            }
+        };
+
+        let taskmgr_shutdown_cancellation = async {
+            task_mgr::shutdown_watcher().await;
+            "aborted because task_mgr shutdown requested".to_string()
+        };

        tokio::select! {
            res = &mut calculation => { res }
-            _ = self.cancel.cancelled() => {
-                debug!("cancelling logical size calculation for timeline shutdown");
+            reason = timeline_state_cancellation => {
+                debug!(reason = reason, "cancelling calculation");
+                cancel.cancel();
                calculation.await
            }
-            _ = task_mgr::shutdown_watcher() => {
-                debug!("cancelling logical size calculation for task shutdown");
+            reason = taskmgr_shutdown_cancellation => {
+                debug!(reason = reason, "cancelling calculation");
+                cancel.cancel();
                calculation.await
            }
        }
@@ -1931,6 +1917,7 @@ impl Timeline {
        &self,
        up_to_lsn: Lsn,
        cause: LogicalSizeCalculationCause,
+        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        info!(
@@ -1973,7 +1960,7 @@ impl Timeline {
        };
        let timer = storage_time_metrics.start_timer();
        let logical_size = self
-            .get_current_logical_size_non_incremental(up_to_lsn, ctx)
+            .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
            .await?;
        debug!("calculated logical size: {logical_size}");
        timer.stop_and_record();
@@ -2386,10 +2373,6 @@ impl Timeline {
        info!("started flush loop");
        loop {
            tokio::select! {
-                _ = self.cancel.cancelled() => {
-                    info!("shutting down layer flush task");
-                    break;
-                },
                _ = task_mgr::shutdown_watcher() => {
                    info!("shutting down layer flush task");
                    break;
@@ -2401,14 +2384,6 @@ impl Timeline {
            let timer = self.metrics.flush_time_histo.start_timer();
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
-                if self.cancel.is_cancelled() {
-                    info!("dropping out of flush loop for timeline shutdown");
-                    // Note: we do not bother transmitting into [`layer_flush_done_tx`], because
-                    // anyone waiting on that will respect self.cancel as well: they will stop
-                    // waiting at the same time we as drop out of this loop.
-                    return;
-                }
-
                let layer_to_flush = {
                    let guard = self.layers.read().await;
                    guard.layer_map().frozen_layers.front().cloned()
@@ -2417,18 +2392,9 @@ impl Timeline {
                let Some(layer_to_flush) = layer_to_flush else {
                    break Ok(());
                };
-                match self.flush_frozen_layer(layer_to_flush, ctx).await {
-                    Ok(()) => {}
-                    Err(FlushLayerError::Cancelled) => {
-                        info!("dropping out of flush loop for timeline shutdown");
-                        return;
-                    }
-                    err @ Err(
-                        FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
-                    ) => {
-                        error!("could not flush frozen layer: {err:?}");
-                        break err;
-                    }
+                if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
+                    error!("could not flush frozen layer: {err:?}");
+                    break Err(err);
                }
            };
            // Notify any listeners that we're done
@@ -2477,17 +2443,7 @@ impl Timeline {
                }
            }
            trace!("waiting for flush to complete");
-            tokio::select! {
-                rx_e = rx.changed() => {
-                    rx_e?;
-                },
-                // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
-                // the notification from [`flush_loop`] that it completed.
-                _ = self.cancel.cancelled() => {
-                    tracing::info!("Cancelled layer flush due on timeline shutdown");
-                    return Ok(())
-                }
-            };
+            rx.changed().await?;
            trace!("done")
        }
    }
@@ -2502,7 +2458,7 @@ impl Timeline {
        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
-    ) -> Result<(), FlushLayerError> {
+    ) -> anyhow::Result<()> {
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -2527,11 +2483,6 @@ impl Timeline {
                let (partitioning, _lsn) = self
                    .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
                    .await?;
-
-                if self.cancel.is_cancelled() {
-                    return Err(FlushLayerError::Cancelled);
-                }
-
                // For image layers, we add them immediately into the layer map.
                (
                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
@@ -2563,10 +2514,6 @@ impl Timeline {
                )
            };

-        if self.cancel.is_cancelled() {
-            return Err(FlushLayerError::Cancelled);
-        }
-
        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();

@@ -2576,10 +2523,6 @@ impl Timeline {
        let metadata = {
            let mut guard = self.layers.write().await;

-            if self.cancel.is_cancelled() {
-                return Err(FlushLayerError::Cancelled);
-            }
-
            guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);

            if disk_consistent_lsn != old_disk_consistent_lsn {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -326,7 +326,8 @@ impl Timeline {
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_timeline_cached_layer_accesses(ctx).await;
+                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
+                    .await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now())
            }
        }
@@ -366,12 +367,21 @@ impl Timeline {

    /// Recompute the values which would cause on-demand downloads during restart.
    #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
+    async fn imitate_timeline_cached_layer_accesses(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) {
        let lsn = self.get_last_record_lsn();

        // imitiate on-restart initial logical size
        let size = self
-            .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
+            .calculate_logical_size(
+                lsn,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
+                cancel.clone(),
+                ctx,
+            )
            .instrument(info_span!("calculate_logical_size"))
            .await;

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -44,6 +44,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
 use crate::config::PageServerConf;
 use crate::metrics::{
    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
+    WAL_REDO_WAIT_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
@@ -206,8 +207,11 @@ impl PostgresRedoManager {
    ) -> anyhow::Result<Bytes> {
        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
        const MAX_RETRY_ATTEMPTS: u32 = 1;
+        let start_time = Instant::now();
        let mut n_attempts = 0u32;
        loop {
+            let lock_time = Instant::now();
+
            // launch the WAL redo process on first use
            let proc: Arc<WalRedoProcess> = {
                let proc_guard = self.redo_process.read().unwrap();
@@ -232,7 +236,7 @@ impl PostgresRedoManager {
                }
            };

-            let started_at = std::time::Instant::now();
+            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());

            // Relational WAL records are applied using wal-redo-postgres
            let buf_tag = BufferTag { rel, blknum };
@@ -240,7 +244,8 @@ impl PostgresRedoManager {
                .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
                .context("apply_wal_records");

-            let duration = started_at.elapsed();
+            let end_time = Instant::now();
+            let duration = end_time.duration_since(lock_time);

            let len = records.len();
            let nbytes = records.iter().fold(0, |acumulator, record| {
@@ -591,21 +596,21 @@ trait CloseFileDescriptors: CommandExt {

 impl<C: CommandExt> CloseFileDescriptors for C {
    fn close_fds(&mut self) -> &mut Command {
-        // SAFETY: Code executed inside pre_exec should have async-signal-safety,
-        // which means it should be safe to execute inside a signal handler.
-        // The precise meaning depends on platform. See `man signal-safety`
-        // for the linux definition.
-        //
-        // The set_fds_cloexec_threadsafe function is documented to be
-        // async-signal-safe.
-        //
-        // Aside from this function, the rest of the code is re-entrant and
-        // doesn't make any syscalls. We're just passing constants.
-        //
-        // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
-        // which is not async-signal-safe. Be careful.
        unsafe {
            self.pre_exec(move || {
+                // SAFETY: Code executed inside pre_exec should have async-signal-safety,
+                // which means it should be safe to execute inside a signal handler.
+                // The precise meaning depends on platform. See `man signal-safety`
+                // for the linux definition.
+                //
+                // The set_fds_cloexec_threadsafe function is documented to be
+                // async-signal-safe.
+                //
+                // Aside from this function, the rest of the code is re-entrant and
+                // doesn't make any syscalls. We're just passing constants.
+                //
+                // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
+                // which is not async-signal-safe. Be careful.
                close_fds::set_fds_cloexec_threadsafe(3, &[]);
                Ok(())
            })
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,10 +19,7 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
-#include "storage/lwlock.h"
-#include "storage/ipc.h"
 #include "c.h"
-#include "postmaster/interrupt.h"

 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -64,63 +61,23 @@ int			flush_every_n_requests = 8;
 int			n_reconnect_attempts = 0;
 int			max_reconnect_attempts = 60;

-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
-
-typedef struct
-{
-    LWLockId lock;
-    pg_atomic_uint64 update_counter;
-    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
-} PagestoreShmemState;
-
-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook = NULL;
-static void walproposer_shmem_request(void);
-#endif
-static shmem_startup_hook_type prev_shmem_startup_hook;
-static PagestoreShmemState *pagestore_shared;
-static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
-
 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);

-static bool
-CheckPageserverConnstring(char **newval, void **extra, GucSource source)
-{
-    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
-}
+
+static pqsigfunc	 prev_signal_handler;

 static void
-AssignPageserverConnstring(const char *newval, void *extra)
+pageserver_sighup_handler(SIGNAL_ARGS)
 {
-    if(!pagestore_shared)
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-    LWLockRelease(pagestore_shared->lock);
-}
-
-static bool
-CheckConnstringUpdated()
-{
-    if(!pagestore_shared)
-        return false;
-    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
-}
-
-static void
-ReloadConnstring()
-{
-    if(!pagestore_shared)
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-    LWLockRelease(pagestore_shared->lock);
+	if (prev_signal_handler)
+	{
+        	prev_signal_handler(postgres_signal_arg);
+	}
+	neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
+	pageserver_disconnect();
 }

 static bool
@@ -134,11 +91,6 @@ pageserver_connect(int elevel)

 	Assert(!connected);

-        if(CheckConnstringUpdated())
-        {
-            ReloadConnstring();
-        }
-
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -158,7 +110,7 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = page_server_connstring;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -302,12 +254,6 @@ pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;

-        if(CheckConnstringUpdated())
-        {
-            pageserver_disconnect();
-            ReloadConnstring();
-        }
-
 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
@@ -328,7 +274,6 @@ pageserver_send(NeonRequest * request)
 	{
 		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
-			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
 			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
@@ -446,8 +391,7 @@ pageserver_flush(void)
 	return true;
 }

-page_server_api api =
-{
+page_server_api api = {
 	.send = pageserver_send,
 	.flush = pageserver_flush,
 	.receive = pageserver_receive
@@ -461,72 +405,12 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }

-static Size
-PagestoreShmemSize(void)
-{
-    return sizeof(PagestoreShmemState);
-}
-
-static bool
-PagestoreShmemInit(void)
-{
-    bool found;
-    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
-    pagestore_shared = ShmemInitStruct("libpagestore shared state",
-                                       PagestoreShmemSize(),
-                                       &found);
-    if(!found)
-    {
-        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
-        AssignPageserverConnstring(page_server_connstring, NULL);
-    }
-    LWLockRelease(AddinShmemInitLock);
-    return found;
-}
-
-static void
-pagestore_shmem_startup_hook(void)
-{
-    if(prev_shmem_startup_hook)
-        prev_shmem_startup_hook();
-
-    PagestoreShmemInit();
-}
-
-static void
-pagestore_shmem_request(void)
-{
-#if PG_VERSION_NUM >= 150000
-    if(prev_shmem_request_hook)
-        prev_shmem_request_hook();
-#endif
-
-    RequestAddinShmemSpace(PagestoreShmemSize());
-    RequestNamedLWLockTranche("neon_libpagestore", 1);
-}
-
-static void
-pagestore_prepare_shmem(void)
-{
-#if PG_VERSION_NUM >= 150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = pagestore_shmem_request;
-#else
-        pagestore_shmem_request();
-#endif
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = pagestore_shmem_startup_hook;
-}
-
 /*
 * Module initialization function
 */
 void
 pg_init_libpagestore(void)
 {
-        pagestore_prepare_shmem();
-
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
@@ -534,7 +418,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
-							   CheckPageserverConnstring, AssignPageserverConnstring, NULL);
+							   NULL, NULL, NULL);

 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
@@ -615,5 +499,7 @@ pg_init_libpagestore(void)
 		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}

+        prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
+
 	lfc_init();
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,8 +1,6 @@
 //! User credentials used in authentication.

-use crate::{
-    auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
-};
+use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use std::collections::HashSet;
@@ -40,8 +38,6 @@ pub struct ClientCredentials<'a> {
    pub user: &'a str,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<String>,
-
-    pub cache_key: String,
 }

 impl ClientCredentials<'_> {
@@ -57,7 +53,6 @@ impl<'a> ClientCredentials<'a> {
        ClientCredentials {
            user: "",
            project: None,
-            cache_key: "".to_string(),
        }
    }

@@ -125,17 +120,7 @@ impl<'a> ClientCredentials<'a> {

        info!(user, project = project.as_deref(), "credentials");

-        let cache_key = format!(
-            "{}{}",
-            project.as_deref().unwrap_or(""),
-            neon_options(params).unwrap_or("".to_string())
-        );
-
-        Ok(Self {
-            user,
-            project,
-            cache_key,
-        })
+        Ok(Self { user, project })
    }
 }

@@ -191,7 +176,6 @@ mod tests {
        let creds = ClientCredentials::parse(&options, sni, common_names)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));
-        assert_eq!(creds.cache_key, "foo");

        Ok(())
    }
@@ -319,23 +303,4 @@ mod tests {
            _ => panic!("bad error: {err:?}"),
        }
    }
-
-    #[test]
-    fn parse_neon_options() -> anyhow::Result<()> {
-        let options = StartupMessageParams::new([
-            ("user", "john_doe"),
-            ("options", "neon_lsn:0/2 neon_endpoint_type:read_write"),
-        ]);
-
-        let sni = Some("project.localhost");
-        let common_names = Some(["localhost".into()].into());
-        let creds = ClientCredentials::parse(&options, sni, common_names)?;
-        assert_eq!(creds.project.as_deref(), Some("project"));
-        assert_eq!(
-            creds.cache_key,
-            "projectneon_endpoint_type:read_write neon_lsn:0/2"
-        );
-
-        Ok(())
-    }
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -80,9 +80,6 @@ struct ProxyCliArgs {
    /// cache for `wake_compute` api method (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
    wake_compute_cache: String,
-    /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
-    #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
-    wake_compute_lock: String,
    /// Allow self-signed certificates for compute nodes (for testing)
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    allow_self_signed_compute: bool,
@@ -223,23 +220,10 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
                node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
            }));

-            let config::WakeComputeLockOptions {
-                shards,
-                permits,
-                epoch,
-                timeout,
-            } = args.wake_compute_lock.parse()?;
-            info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
-            let locks = Box::leak(Box::new(
-                console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
-                    .unwrap(),
-            ));
-            tokio::spawn(locks.garbage_collect_worker(epoch));
-
            let url = args.auth_endpoint.parse()?;
            let endpoint = http::Endpoint::new(url, http::new_client());

-            let api = console::provider::neon::Api::new(endpoint, caches, locks);
+            let api = console::provider::neon::Api::new(endpoint, caches);
            auth::BackendType::Console(Cow::Owned(api), ())
        }
        AuthBackend::Postgres => {
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -3,7 +3,6 @@ use crate::{
    cancellation::CancelClosure,
    console::errors::WakeComputeError,
    error::{io_error, UserFacingError},
-    proxy::is_neon_param,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -279,7 +278,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
-        .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
+        .filter(|opt| parse_endpoint_param(opt).is_none())
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

@@ -314,11 +313,5 @@ mod tests {

        let params = StartupMessageParams::new([("options", "project = foo")]);
        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
-
-        let params = StartupMessageParams::new([(
-            "options",
-            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
-        )]);
-        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
    }
 }
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -264,79 +264,6 @@ impl FromStr for CacheOptions {
    }
 }

-/// Helper for cmdline cache options parsing.
-pub struct WakeComputeLockOptions {
-    /// The number of shards the lock map should have
-    pub shards: usize,
-    /// The number of allowed concurrent requests for each endpoitn
-    pub permits: usize,
-    /// Garbage collection epoch
-    pub epoch: Duration,
-    /// Lock timeout
-    pub timeout: Duration,
-}
-
-impl WakeComputeLockOptions {
-    /// Default options for [`crate::console::provider::ApiLocks`].
-    pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
-
-    // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
-
-    /// Parse lock options passed via cmdline.
-    /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
-    fn parse(options: &str) -> anyhow::Result<Self> {
-        let mut shards = None;
-        let mut permits = None;
-        let mut epoch = None;
-        let mut timeout = None;
-
-        for option in options.split(',') {
-            let (key, value) = option
-                .split_once('=')
-                .with_context(|| format!("bad key-value pair: {option}"))?;
-
-            match key {
-                "shards" => shards = Some(value.parse()?),
-                "permits" => permits = Some(value.parse()?),
-                "epoch" => epoch = Some(humantime::parse_duration(value)?),
-                "timeout" => timeout = Some(humantime::parse_duration(value)?),
-                unknown => bail!("unknown key: {unknown}"),
-            }
-        }
-
-        // these dont matter if lock is disabled
-        if let Some(0) = permits {
-            timeout = Some(Duration::default());
-            epoch = Some(Duration::default());
-            shards = Some(2);
-        }
-
-        let out = Self {
-            shards: shards.context("missing `shards`")?,
-            permits: permits.context("missing `permits`")?,
-            epoch: epoch.context("missing `epoch`")?,
-            timeout: timeout.context("missing `timeout`")?,
-        };
-
-        ensure!(out.shards > 1, "shard count must be > 1");
-        ensure!(
-            out.shards.is_power_of_two(),
-            "shard count must be a power of two"
-        );
-
-        Ok(out)
-    }
-}
-
-impl FromStr for WakeComputeLockOptions {
-    type Err = anyhow::Error;
-
-    fn from_str(options: &str) -> Result<Self, Self::Err> {
-        let error = || format!("failed to parse cache lock options '{options}'");
-        Self::parse(options).with_context(error)
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -361,42 +288,4 @@ mod tests {

        Ok(())
    }
-
-    #[test]
-    fn test_parse_lock_options() -> anyhow::Result<()> {
-        let WakeComputeLockOptions {
-            epoch,
-            permits,
-            shards,
-            timeout,
-        } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
-        assert_eq!(epoch, Duration::from_secs(10 * 60));
-        assert_eq!(timeout, Duration::from_secs(1));
-        assert_eq!(shards, 32);
-        assert_eq!(permits, 4);
-
-        let WakeComputeLockOptions {
-            epoch,
-            permits,
-            shards,
-            timeout,
-        } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
-        assert_eq!(epoch, Duration::from_secs(60));
-        assert_eq!(timeout, Duration::from_millis(100));
-        assert_eq!(shards, 16);
-        assert_eq!(permits, 8);
-
-        let WakeComputeLockOptions {
-            epoch,
-            permits,
-            shards,
-            timeout,
-        } = "permits=0".parse()?;
-        assert_eq!(epoch, Duration::ZERO);
-        assert_eq!(timeout, Duration::ZERO);
-        assert_eq!(shards, 2);
-        assert_eq!(permits, 0);
-
-        Ok(())
-    }
 }
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -13,10 +13,5 @@ pub mod caches {
    pub use super::provider::{ApiCaches, NodeInfoCache};
 }

-/// Various cache-related types.
-pub mod locks {
-    pub use super::provider::ApiLocks;
-}
-
 /// Console's management API.
 pub mod mgmt;
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -8,13 +8,7 @@ use crate::{
    compute, scram,
 };
 use async_trait::async_trait;
-use dashmap::DashMap;
-use std::{sync::Arc, time::Duration};
-use tokio::{
-    sync::{OwnedSemaphorePermit, Semaphore},
-    time::Instant,
-};
-use tracing::info;
+use std::sync::Arc;

 pub mod errors {
    use crate::{
@@ -155,9 +149,6 @@ pub mod errors {

        #[error(transparent)]
        ApiError(ApiError),
-
-        #[error("Timeout waiting to acquire wake compute lock")]
-        TimeoutError,
    }

    // This allows more useful interactions than `#[from]`.
@@ -167,17 +158,6 @@ pub mod errors {
        }
    }

-    impl From<tokio::sync::AcquireError> for WakeComputeError {
-        fn from(_: tokio::sync::AcquireError) -> Self {
-            WakeComputeError::TimeoutError
-        }
-    }
-    impl From<tokio::time::error::Elapsed> for WakeComputeError {
-        fn from(_: tokio::time::error::Elapsed) -> Self {
-            WakeComputeError::TimeoutError
-        }
-    }
-
    impl UserFacingError for WakeComputeError {
        fn to_string_client(&self) -> String {
            use WakeComputeError::*;
@@ -187,8 +167,6 @@ pub mod errors {
                BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
                // However, API might return a meaningful error.
                ApiError(e) => e.to_string_client(),
-
-                TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
            }
        }
    }
@@ -200,7 +178,6 @@ pub struct ConsoleReqExtra<'a> {
    pub session_id: uuid::Uuid,
    /// Name of client application, if set.
    pub application_name: Option<&'a str>,
-    pub options: Option<&'a str>,
 }

 /// Auth secret which is managed by the cloud.
@@ -255,145 +232,3 @@ pub struct ApiCaches {
    /// Cache for the `wake_compute` API method.
    pub node_info: NodeInfoCache,
 }
-
-/// Various caches for [`console`](super).
-pub struct ApiLocks {
-    name: &'static str,
-    node_locks: DashMap<Arc<str>, Arc<Semaphore>>,
-    permits: usize,
-    timeout: Duration,
-    registered: prometheus::IntCounter,
-    unregistered: prometheus::IntCounter,
-    reclamation_lag: prometheus::Histogram,
-    lock_acquire_lag: prometheus::Histogram,
-}
-
-impl ApiLocks {
-    pub fn new(
-        name: &'static str,
-        permits: usize,
-        shards: usize,
-        timeout: Duration,
-    ) -> prometheus::Result<Self> {
-        let registered = prometheus::IntCounter::with_opts(
-            prometheus::Opts::new(
-                "semaphores_registered",
-                "Number of semaphores registered in this api lock",
-            )
-            .namespace(name),
-        )?;
-        prometheus::register(Box::new(registered.clone()))?;
-        let unregistered = prometheus::IntCounter::with_opts(
-            prometheus::Opts::new(
-                "semaphores_unregistered",
-                "Number of semaphores unregistered in this api lock",
-            )
-            .namespace(name),
-        )?;
-        prometheus::register(Box::new(unregistered.clone()))?;
-        let reclamation_lag = prometheus::Histogram::with_opts(
-            prometheus::HistogramOpts::new(
-                "reclamation_lag_seconds",
-                "Time it takes to reclaim unused semaphores in the api lock",
-            )
-            .namespace(name)
-            // 1us -> 65ms
-            // benchmarks on my mac indicate it's usually in the range of 256us and 512us
-            .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
-        )?;
-        prometheus::register(Box::new(reclamation_lag.clone()))?;
-        let lock_acquire_lag = prometheus::Histogram::with_opts(
-            prometheus::HistogramOpts::new(
-                "semaphore_acquire_seconds",
-                "Time it takes to reclaim unused semaphores in the api lock",
-            )
-            .namespace(name)
-            // 0.1ms -> 6s
-            .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
-        )?;
-        prometheus::register(Box::new(lock_acquire_lag.clone()))?;
-
-        Ok(Self {
-            name,
-            node_locks: DashMap::with_shard_amount(shards),
-            permits,
-            timeout,
-            lock_acquire_lag,
-            registered,
-            unregistered,
-            reclamation_lag,
-        })
-    }
-
-    pub async fn get_wake_compute_permit(
-        &self,
-        key: &Arc<str>,
-    ) -> Result<WakeComputePermit, errors::WakeComputeError> {
-        if self.permits == 0 {
-            return Ok(WakeComputePermit { permit: None });
-        }
-        let now = Instant::now();
-        let semaphore = {
-            // get fast path
-            if let Some(semaphore) = self.node_locks.get(key) {
-                semaphore.clone()
-            } else {
-                self.node_locks
-                    .entry(key.clone())
-                    .or_insert_with(|| {
-                        self.registered.inc();
-                        Arc::new(Semaphore::new(self.permits))
-                    })
-                    .clone()
-            }
-        };
-        let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
-
-        self.lock_acquire_lag
-            .observe((Instant::now() - now).as_secs_f64());
-
-        Ok(WakeComputePermit {
-            permit: Some(permit??),
-        })
-    }
-
-    pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) {
-        if self.permits == 0 {
-            return;
-        }
-
-        let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32);
-        loop {
-            for (i, shard) in self.node_locks.shards().iter().enumerate() {
-                interval.tick().await;
-                // temporary lock a single shard and then clear any semaphores that aren't currently checked out
-                // race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
-                // therefore releasing it is safe from race conditions
-                info!(
-                    name = self.name,
-                    shard = i,
-                    "performing epoch reclamation on api lock"
-                );
-                let mut lock = shard.write();
-                let timer = self.reclamation_lag.start_timer();
-                let count = lock
-                    .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
-                    .count();
-                drop(lock);
-                self.unregistered.inc_by(count as u64);
-                timer.observe_duration()
-            }
-        }
-    }
-}
-
-pub struct WakeComputePermit {
-    // None if the lock is disabled
-    permit: Option<OwnedSemaphorePermit>,
-}
-
-impl WakeComputePermit {
-    pub fn should_check_cache(&self) -> bool {
-        self.permit.is_some()
-    }
-}
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -3,12 +3,12 @@
 use super::{
    super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
-    ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
+    ApiCaches, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use std::{net::SocketAddr, sync::Arc};
+use std::net::SocketAddr;
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -17,17 +17,12 @@ use tracing::{error, info, info_span, warn, Instrument};
 pub struct Api {
    endpoint: http::Endpoint,
    caches: &'static ApiCaches,
-    locks: &'static ApiLocks,
    jwt: String,
 }

 impl Api {
    /// Construct an API object containing the auth parameters.
-    pub fn new(
-        endpoint: http::Endpoint,
-        caches: &'static ApiCaches,
-        locks: &'static ApiLocks,
-    ) -> Self {
+    pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self {
        let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
            Ok(v) => v,
            Err(_) => "".to_string(),
@@ -35,7 +30,6 @@ impl Api {
        Self {
            endpoint,
            caches,
-            locks,
            jwt,
        }
    }
@@ -105,7 +99,6 @@ impl Api {
                .query(&[
                    ("application_name", extra.application_name),
                    ("project", Some(project)),
-                    ("options", extra.options),
                ])
                .build()?;

@@ -158,7 +151,7 @@ impl super::Api for Api {
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key: &str = &creds.cache_key;
+        let key = creds.project().expect("impossible");

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);
@@ -169,22 +162,9 @@ impl super::Api for Api {
            return Ok(cached);
        }

-        let key: Arc<str> = key.into();
-
-        let permit = self.locks.get_wake_compute_permit(&key).await?;
-
-        // after getting back a permit - it's possible the cache was filled
-        // double check
-        if permit.should_check_cache() {
-            if let Some(cached) = self.caches.node_info.get(&key) {
-                info!(key = &*key, "found cached compute node info");
-                return Ok(cached);
-            }
-        }
-
        let node = self.do_wake_compute(extra, creds).await?;
-        let (_, cached) = self.caches.node_info.insert(key.clone(), node);
-        info!(key = &*key, "created a cache entry for compute node info");
+        let (_, cached) = self.caches.node_info.insert(key.into(), node);
+        info!(key = key, "created a cache entry for compute node info");

        Ok(cached)
    }
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
-
 use std::convert::Infallible;

 use anyhow::{bail, Context};
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -15,12 +15,10 @@ use crate::{
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use itertools::Itertools;
 use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use prometheus::{register_histogram_vec, HistogramVec};
-use regex::Regex;
 use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
 use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
@@ -570,7 +568,6 @@ fn report_error(e: &WakeComputeError, retry: bool) {
            "api_console_other_server_error"
        }
        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
-        WakeComputeError::TimeoutError => "timeout_error",
    };
    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
 }
@@ -884,12 +881,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            allow_self_signed_compute,
        } = self;

-        let console_options = neon_options(params);
-
        let extra = console::ConsoleReqExtra {
            session_id, // aka this connection's id
            application_name: params.get("application_name"),
-            options: console_options.as_deref(),
        };

        let mut latency_timer = LatencyTimer::new(mode.protocol_label());
@@ -951,27 +945,3 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        proxy_pass(stream, node.stream, &aux).await
    }
 }
-
-pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
-    #[allow(unstable_name_collisions)]
-    let options: String = params
-        .options_raw()?
-        .filter(|opt| is_neon_param(opt))
-        .sorted() // we sort it to use as cache key
-        .intersperse(" ") // TODO: use impl from std once it's stabilized
-        .collect();
-
-    // Don't even bother with empty options.
-    if options.is_empty() {
-        return None;
-    }
-
-    Some(options)
-}
-
-pub fn is_neon_param(bytes: &str) -> bool {
-    static RE: OnceCell<Regex> = OnceCell::new();
-    RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
-
-    RE.get().unwrap().is_match(bytes)
-}
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -440,7 +440,6 @@ fn helper_create_connect_info(
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some("TEST"),
-        options: None,
    };
    let creds = auth::BackendType::Test(mechanism);
    (cache, extra, creds)
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -22,10 +22,7 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
    auth, console,
-    proxy::{
-        neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
-        NUM_DB_CONNECTIONS_OPENED_COUNTER,
-    },
+    proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};
@@ -44,7 +41,6 @@ pub struct ConnInfo {
    pub dbname: String,
    pub hostname: String,
    pub password: String,
-    pub options: Option<String>,
 }

 impl ConnInfo {
@@ -405,25 +401,26 @@ async fn connect_to_compute(
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

-    let params = StartupMessageParams::new([
+    let credential_params = StartupMessageParams::new([
        ("user", &conn_info.username),
        ("database", &conn_info.dbname),
        ("application_name", APP_NAME),
-        ("options", conn_info.options.as_deref().unwrap_or("")),
    ]);

    let creds = config
        .auth_backend
        .as_ref()
-        .map(|_| auth::ClientCredentials::parse(&params, Some(&conn_info.hostname), common_names))
+        .map(|_| {
+            auth::ClientCredentials::parse(
+                &credential_params,
+                Some(&conn_info.hostname),
+                common_names,
+            )
+        })
        .transpose()?;
-
-    let console_options = neon_options(&params);
-
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
-        options: console_options.as_deref(),
    };

    let node_info = creds
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -174,23 +174,11 @@ fn get_conn_info(
        }
    }

-    let pairs = connection_url.query_pairs();
-
-    let mut options = Option::None;
-
-    for (key, value) in pairs {
-        if key == "options" {
-            options = Some(value.to_string());
-            break;
-        }
-    }
-
    Ok(ConnInfo {
        username: username.to_owned(),
        dbname: dbname.to_owned(),
        hostname: hostname.to_owned(),
        password: password.to_owned(),
-        options,
    })
 }

--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -1,5 +1,3 @@
-#![deny(unsafe_code)]
-#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -1,20 +1,19 @@
-use utils::auth::{AuthError, Claims, Scope};
+use anyhow::{bail, Result};
+use utils::auth::{Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => Err(AuthError(
-            "Attempt to access management api with tenant scope. Permission denied".into(),
-        )),
+        (Scope::Tenant, None) => {
+            bail!("Attempt to access management api with tenant scope. Permission denied")
+        }
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
+                bail!("Tenant id mismatch. Permission denied")
            }
            Ok(())
        }
-        (Scope::PageServerApi, _) => Err(AuthError(
-            "PageServerApi scope makes no sense for Safekeeper".into(),
-        )),
+        (Scope::PageServerApi, _) => bail!("PageServerApi scope makes no sense for Safekeeper"),
        (Scope::SafekeeperData, _) => Ok(()),
    }
 }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
-use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
+use utils::auth::{JwtAuth, Scope};
 use utils::{
    id::NodeId,
    logging::{self, LogFormat},
@@ -251,9 +251,10 @@ async fn main() -> anyhow::Result<()> {
            None
        }
        Some(path) => {
-            info!("loading http auth JWT key(s) from {path}");
-            let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?;
-            Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
+            info!("loading http auth JWT key from {path}");
+            Some(Arc::new(
+                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
+            ))
        }
    };

--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -6,7 +6,7 @@ use std::str::FromStr;
 use std::str::{self};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{debug, info, info_span, Instrument};
+use tracing::{info, info_span, Instrument};

 use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
@@ -165,27 +165,26 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
            .auth
            .as_ref()
            .expect("auth_type is configured but .auth of handler is missing");
-        let data = auth
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
-            .map_err(|e| QueryError::Unauthorized(e.0))?;
+        let data =
+            auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;

        // The handler might be configured to allow only tenant scope tokens.
        if matches!(allowed_auth_scope, Scope::Tenant)
            && !matches!(data.claims.scope, Scope::Tenant)
        {
-            return Err(QueryError::Unauthorized(
-                "passed JWT token is for full access, but only tenant scope is allowed".into(),
-            ));
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "passed JWT token is for full access, but only tenant scope is allowed"
+            )));
        }

        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
-            return Err(QueryError::Unauthorized(
-                "jwt token scope is Tenant, but tenant id is missing".into(),
-            ));
+            return Err(QueryError::Other(anyhow::anyhow!(
+                "jwt token scope is Tenant, but tenant id is missing"
+            )));
        }

-        debug!(
-            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
+        info!(
+            "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

@@ -264,7 +263,7 @@ impl SafekeeperPostgresHandler {

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
@@ -276,7 +275,7 @@ impl SafekeeperPostgresHandler {
            .claims
            .as_ref()
            .expect("claims presence already checked");
-        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
+        check_permission(claims, tenant_id)
    }

    async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 use utils::{
-    auth::SwappableJwtAuth,
+    auth::JwtAuth,
    http::{
        endpoint::{self, auth_middleware, check_permission_with},
        error::ApiError,
@@ -428,11 +428,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            if ALLOWLIST_ROUTES.contains(request.uri()) {
                None
            } else {
-                // Option<Arc<SwappableJwtAuth>> is always provided as data below, hence unwrap().
-                request
-                    .data::<Option<Arc<SwappableJwtAuth>>>()
-                    .unwrap()
-                    .as_deref()
+                // Option<Arc<JwtAuth>> is always provided as data below, hence unwrap().
+                request.data::<Option<Arc<JwtAuth>>>().unwrap().as_deref()
            }
        }))
    }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -1,4 +1,3 @@
-#![deny(clippy::undocumented_unsafe_blocks)]
 use camino::Utf8PathBuf;
 use once_cell::sync::Lazy;
 use remote_storage::RemoteStorageConfig;
@@ -7,10 +6,7 @@ use tokio::runtime::Runtime;
 use std::time::Duration;
 use storage_broker::Uri;

-use utils::{
-    auth::SwappableJwtAuth,
-    id::{NodeId, TenantId, TenantTimelineId},
-};
+use utils::id::{NodeId, TenantId, TenantTimelineId};

 mod auth;
 pub mod broker;
@@ -73,7 +69,7 @@ pub struct SafeKeeperConf {
    pub wal_backup_enabled: bool,
    pub pg_auth: Option<Arc<JwtAuth>>,
    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
-    pub http_auth: Option<Arc<SwappableJwtAuth>>,
+    pub http_auth: Option<Arc<JwtAuth>>,
    pub current_thread_runtime: bool,
 }

--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -111,7 +111,7 @@ impl WalReceivers {
            .count()
    }

-    /// Unregister walreceiver.
+    /// Unregister walsender.
    fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
@@ -138,8 +138,8 @@ pub enum WalReceiverStatus {
    Streaming,
 }

-/// Scope guard to access slot in WalReceivers registry and unregister from
-/// it in Drop.
+/// Scope guard to access slot in WalSenders registry and unregister from it in
+/// Drop.
 pub struct WalReceiverGuard {
    id: WalReceiverId,
    walreceivers: Arc<WalReceivers>,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -361,6 +361,7 @@ class PgProtocol:

@dataclass
 class AuthKeys:
+    pub: str
    priv: str

    def generate_token(self, *, scope: str, **token_data: str) -> str:
@@ -876,31 +877,9 @@ class NeonEnv:

    @cached_property
    def auth_keys(self) -> AuthKeys:
+        pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text()
        priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
-        return AuthKeys(priv=priv)
-
-    def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path):
-        # compare generate_auth_keys() in local_env.rs
-        subprocess.run(
-            ["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path],
-            cwd=self.repo_dir,
-            check=True,
-        )
-
-        subprocess.run(
-            [
-                "openssl",
-                "pkey",
-                "-in",
-                privkey_path,
-                "-pubout",
-                "-out",
-                pubkey_path,
-            ],
-            cwd=self.repo_dir,
-            check=True,
-        )
-        del self.auth_keys
+        return AuthKeys(pub=pub, priv=priv)

    def generate_endpoint_id(self) -> str:
        """
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -189,10 +189,6 @@ class PageserverHttpClient(requests.Session):
        assert res_json is None
        return res_json

-    def reload_auth_validation_keys(self):
-        res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
-        self.verbose_error(res)
-
    def tenant_list(self) -> List[Dict[Any, Any]]:
        res = self.get(f"http://localhost:{self.port}/v1/tenant")
        self.verbose_error(res)
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -1,35 +1,12 @@
-import os
 from contextlib import closing
-from pathlib import Path

 import psycopg2
 import pytest
-from fixtures.neon_fixtures import (
-    NeonEnv,
-    NeonEnvBuilder,
-    PgProtocol,
-)
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.types import TenantId, TimelineId


-def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
-    http_client.timeline_create(
-        pg_version=env.pg_version,
-        tenant_id=env.initial_tenant,
-        new_timeline_id=TimelineId.generate(),
-        ancestor_timeline_id=env.initial_timeline,
-    )
-
-
-def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient):
-    with pytest.raises(
-        PageserverApiException,
-        match="Forbidden: JWT authentication error",
-    ):
-        assert_client_authorized(env, http_client)
-
-
 def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()
@@ -50,14 +27,30 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    ps.safe_psql("set FOO", password=pageserver_token)

    # tenant can create branches
-    assert_client_authorized(env, tenant_http_client)
-
+    tenant_http_client.timeline_create(
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
+    )
    # console can create branches for tenant
-    assert_client_authorized(env, pageserver_http_client)
+    pageserver_http_client.timeline_create(
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
+    )

    # fail to create branch using token with different tenant_id
-    with pytest.raises(PageserverApiException, match="Forbidden: JWT authentication error"):
-        assert_client_authorized(env, invalid_tenant_http_client)
+    with pytest.raises(
+        PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
+    ):
+        invalid_tenant_http_client.timeline_create(
+            pg_version=env.pg_version,
+            tenant_id=env.initial_tenant,
+            new_timeline_id=TimelineId.generate(),
+            ancestor_timeline_id=env.initial_timeline,
+        )

    # create tenant using management token
    pageserver_http_client.tenant_create(TenantId.generate())
@@ -65,7 +58,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    # fail to create tenant using tenant token
    with pytest.raises(
        PageserverApiException,
-        match="Forbidden: JWT authentication error",
+        match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
    ):
        tenant_http_client.tenant_create(TenantId.generate())

@@ -89,96 +82,6 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
            assert cur.fetchone() == (5000050000,)


-def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.auth_enabled = True
-    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
-    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
-
-    pageserver_token_old = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
-
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # This test is to ensure that the pageserver supports multiple keys.
-    # The neon_local tool generates one key pair at a hardcoded path by default.
-    # As a preparation for our test, move the public key of the key pair into a
-    # directory at the same location as the hardcoded path by:
-    # 1. moving the the file at `configured_pub_key_path` to a temporary location
-    # 2. creating a new directory at `configured_pub_key_path`
-    # 3. moving the file from the temporary location into the newly created directory
-    configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem"
-    os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file")
-    os.mkdir(configured_pub_key_path)
-    os.rename(
-        Path(env.repo_dir) / "auth_public_key.pem.file",
-        configured_pub_key_path / "auth_public_key_old.pem",
-    )
-
-    # Add a new key pair
-    # This invalidates env.auth_keys and makes them be regenerated
-    env.regenerate_keys_at(
-        Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem")
-    )
-
-    # Reload the keys on the pageserver side
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # We can continue doing things using the old token
-    assert_client_authorized(env, pageserver_http_client_old)
-
-    pageserver_token_new = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
-
-    # The new token also works
-    assert_client_authorized(env, pageserver_http_client_new)
-
-    # Remove the old token and reload
-    os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem")
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # Reloading fails now with the old token, but the new token still works
-    assert_client_not_authorized(env, pageserver_http_client_old)
-    assert_client_authorized(env, pageserver_http_client_new)
-
-
-def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
-    neon_env_builder.auth_enabled = True
-    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
-    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
-
-    pageserver_token_old = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
-
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # Regenerate the keys
-    env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem"))
-
-    # Reload the keys on the pageserver side
-    pageserver_http_client_old.reload_auth_validation_keys()
-
-    # Next attempt fails as we use the old auth token
-    with pytest.raises(
-        PageserverApiException,
-        match="Forbidden: JWT authentication error",
-    ):
-        pageserver_http_client_old.reload_auth_validation_keys()
-
-    # same goes for attempts trying to create a timeline
-    assert_client_not_authorized(env, pageserver_http_client_old)
-
-    pageserver_token_new = env.auth_keys.generate_pageserver_token()
-    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
-
-    # timeline creation works with the new token
-    assert_client_authorized(env, pageserver_http_client_new)
-
-    # reloading also works with the new token
-    pageserver_http_client_new.reload_auth_validation_keys()
-
-
@pytest.mark.parametrize("auth_enabled", [False, True])
 def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
--- a/test_runner/regress/test_bad_connection.py
+++ b/test_runner/regress/test_bad_connection.py
@@ -1,60 +0,0 @@
-import random
-import time
-
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonEnvBuilder
-
-
-def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
-    env.pageserver.allowed_errors.append(".*simulated connection error.*")
-
-    pageserver_http = env.pageserver.http_client()
-    env.neon_cli.create_branch("test_compute_pageserver_connection_stress")
-    endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")
-
-    # Enable failpoint after starting everything else up so that loading initial
-    # basebackup doesn't fail
-    pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
-
-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
-
-    # Create table, and insert some rows. Make it big enough that it doesn't fit in
-    # shared_buffers, otherwise the SELECT after restart will just return answer
-    # from shared_buffers without hitting the page server, which defeats the point
-    # of this test.
-    cur.execute("CREATE TABLE foo (t text)")
-    cur.execute(
-        """
-        INSERT INTO foo
-            SELECT 'long string to consume some space' || g
-            FROM generate_series(1, 100000) g
-        """
-    )
-
-    # Verify that the table is larger than shared_buffers
-    cur.execute(
-        """
-        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
-        from pg_settings where name = 'shared_buffers'
-        """
-    )
-    row = cur.fetchone()
-    assert row is not None
-    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
-    assert int(row[0]) < int(row[1])
-
-    cur.execute("SELECT count(*) FROM foo")
-    assert cur.fetchone() == (100000,)
-
-    end_time = time.time() + 30
-    times_executed = 0
-    while time.time() < end_time:
-        if random.random() < 0.5:
-            cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')")
-        else:
-            cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
-            cur.fetchall()
-        times_executed += 1
-    log.info(f"Workload executed {times_executed} times")
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -26,7 +26,6 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
            ".*load failed.*load local timeline.*",
-            ".*layer loading failed permanently: load layer: .*",
        ]
    )

--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -1,13 +1,9 @@
-import asyncio
-
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.remote_storage import RemoteStorageKind


 def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
-    num_connections = 3
-
    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
@@ -20,24 +16,15 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    alt_pageserver_id = env.pageservers[1].id
    env.pageservers[1].tenant_attach(env.initial_tenant)

-    pg_conns = [endpoint.connect() for i in range(num_connections)]
-    curs = [pg_conn.cursor() for pg_conn in pg_conns]
-
-    def execute(statement: str):
-        for cur in curs:
-            cur.execute(statement)
-
-    def fetchone():
-        results = [cur.fetchone() for cur in curs]
-        assert all(result == results[0] for result in results)
-        return results[0]
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
-    curs[0].execute("CREATE TABLE foo (t text)")
-    curs[0].execute(
+    cur.execute("CREATE TABLE foo (t text)")
+    cur.execute(
        """
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
@@ -46,25 +33,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    )

    # Verify that the table is larger than shared_buffers
-    curs[0].execute(
+    cur.execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
-    row = curs[0].fetchone()
+    row = cur.fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])

-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
+    cur.execute("SELECT count(*) FROM foo")
+    assert cur.fetchone() == (100000,)

    endpoint.reconfigure(pageserver_id=alt_pageserver_id)

    # Verify that the neon.pageserver_connstring GUC is set to the correct thing
-    execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
-    connstring = fetchone()
+    cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
+    connstring = cur.fetchone()
    assert connstring is not None
    expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
    assert expected_connstring == expected_connstring
@@ -73,45 +60,5 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
        0
    ].stop()  # Stop the old pageserver just to make sure we're reading from the new one

-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
-
-    # Try failing back, and this time we will stop the current pageserver before reconfiguring
-    # the endpoint.  Whereas the previous reconfiguration was like a healthy migration, this
-    # is more like what happens in an unexpected  pageserver failure.
-    env.pageservers[0].start()
-    env.pageservers[1].stop()
-
-    endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
-
-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
-
-    env.pageservers[0].stop()
-    env.pageservers[1].start()
-
-    # Test a (former) bug where a child process spins without updating its connection string
-    # by executing a query separately. This query will hang until we issue the reconfigure.
-    async def reconfigure_async():
-        await asyncio.sleep(
-            1
-        )  # Sleep for 1 second just to make sure we actually started our count(*) query
-        endpoint.reconfigure(pageserver_id=env.pageservers[1].id)
-
-    def execute_count():
-        execute("SELECT count(*) FROM FOO")
-
-    async def execute_and_reconfigure():
-        task_exec = asyncio.to_thread(execute_count)
-        task_reconfig = asyncio.create_task(reconfigure_async())
-        await asyncio.gather(
-            task_exec,
-            task_reconfig,
-        )
-
-    asyncio.run(execute_and_reconfigure())
-    assert fetchone() == (100000,)
-
-    # One final check that nothing hangs
-    execute("SELECT count(*) FROM foo")
-    assert fetchone() == (100000,)
+    cur.execute("SELECT count(*) FROM foo")
+    assert cur.fetchone() == (100000,)
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -366,17 +366,11 @@ def test_deletion_queue_recovery(
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

    if validate_before == ValidateBefore.VALIDATE:
-        # At this point, one or more DeletionLists have been written.  We have set a failpoint
-        # to prevent them successfully executing, but we want to see them get validated.
-        #
-        # We await _some_ validations instead of _all_ validations, because our execution failpoint
-        # will prevent validation proceeding for any but the first DeletionList.  Usually the workload
-        # just generates one, but if it generates two due to timing, then we must not expect that the
-        # second one will be validated.
-        def assert_some_validations():
-            assert get_deletion_queue_validated(ps_http) > 0

-        wait_until(20, 1, assert_some_validations)
+        def assert_validation_complete():
+            assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
+
+        wait_until(20, 1, assert_validation_complete)

        # The validatated keys statistic advances before the header is written, so we
        # also wait to see the header hit the disk: this seems paranoid but the race
@@ -386,11 +380,6 @@ def test_deletion_queue_recovery(

        wait_until(20, 1, assert_header_written)

-        # If we will lose attachment, then our expectation on restart is that only the ones
-        # we already validated will execute.  Act like only those were present in the queue.
-        if keep_attachment == KeepAttachment.LOSE:
-            before_restart_depth = get_deletion_queue_validated(ps_http)
-
    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
    env.pageserver.stop(immediate=True)

@@ -413,13 +402,11 @@ def test_deletion_queue_recovery(
    ps_http.deletion_queue_flush(execute=True)
    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))

-    if keep_attachment == KeepAttachment.KEEP:
+    if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
        # - If we kept the attachment, then our pre-restart deletions should execute
        #   because on re-attach they were from the immediately preceding generation
-        assert get_deletion_queue_executed(ps_http) == before_restart_depth
-    elif validate_before == ValidateBefore.VALIDATE:
-        # - If we validated before restart, then we should execute however many keys were
-        #   validated before restart.
+        # - If we validated before restart, then the deletions should execute because the
+        #   deletion queue header records a validated deletion list sequence number.
        assert get_deletion_queue_executed(ps_http) == before_restart_depth
    else:
        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -17,6 +17,10 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
    n_restarts = 10
    scale = 10

+    # Pageserver currently logs requests on non-active tenants at error level
+    # https://github.com/neondatabase/neon/issues/5784
+    env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*")
+
    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -25,7 +25,6 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
-dashmap = { version = "5", default-features = false, features = ["raw-api"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures = { version = "0.3" }