Merge remote-tracking branch 'origin/main' into arpad/lsn_by_ts

2026-01-16 09:52:54 +00:00 · 2023-11-11 02:24:51 +01:00
parent 2caa795139 74d150ba45
commit 96b3171799
93 changed files with 2097 additions and 1251 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,5 +22,11 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

+[final-excludes]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarly in separate repo neondatabase/autoscaling and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]
+
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/ISSUE_TEMPLATE/epic-template.md
+++ b/.github/ISSUE_TEMPLATE/epic-template.md
@@ -17,8 +17,9 @@ assignees: ''
 ## Implementation ideas


-## Tasks
- [ ]
+```[tasklist]
+### Tasks
+```


 ## Other related tasks and Epics
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -1,5 +1,7 @@
 self-hosted-runner:
  labels:
+    - arm64
+    - dev
    - gen3
    - large
    - small
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -172,10 +172,10 @@ jobs:
      # https://github.com/EmbarkStudios/cargo-deny
      - name: Check rust licenses/bans/advisories/sources
        if: ${{ !cancelled() }}
-        run: cargo deny check
+        run: cargo deny check --hide-inclusion-graph

  build-neon:
-    needs: [ check-permissions ]
+    needs: [ check-permissions, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -187,6 +187,7 @@ jobs:
    env:
      BUILD_TYPE: ${{ matrix.build_type }}
      GIT_VERSION: ${{ github.event.pull_request.head.sha || github.sha }}
+      BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

    steps:
      - name: Fix git ownership
@@ -723,6 +724,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
--- a/.github/workflows/neon_extra_builds.yml
+++ b/.github/workflows/neon_extra_builds.yml
@@ -21,7 +21,10 @@ env:

 jobs:
  check-macos-build:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-macos')  ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    timeout-minutes: 90
    runs-on: macos-latest

@@ -112,8 +115,182 @@ jobs:
      - name: Check that no warnings are produced
        run: ./run_clippy.sh

+  check-linux-arm-build:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    env:
+      # Use release build only, to have less debug info around
+      # Hence keeping target/ (and general cache size) smaller
+      BUILD_TYPE: release
+      CARGO_FEATURES: --features testing
+      CARGO_FLAGS: --locked --release
+      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
+      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Fix git ownership
+        run: |
+          # Workaround for `fatal: detected dubious ownership in repository at ...`
+          #
+          # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+          #   Ref https://github.com/actions/checkout/issues/785
+          #
+          git config --global --add safe.directory ${{ github.workspace }}
+          git config --global --add safe.directory ${GITHUB_WORKSPACE}
+
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      - name: Set pg 14 revision for caching
+        id: pg_v14_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v14) >> $GITHUB_OUTPUT
+
+      - name: Set pg 15 revision for caching
+        id: pg_v15_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v15) >> $GITHUB_OUTPUT
+
+      - name: Set pg 16 revision for caching
+        id: pg_v16_rev
+        run: echo pg_rev=$(git rev-parse HEAD:vendor/postgres-v16) >> $GITHUB_OUTPUT
+
+      - name: Set env variables
+        run: |
+          echo "CARGO_HOME=${GITHUB_WORKSPACE}/.cargo" >> $GITHUB_ENV
+
+      - name: Cache postgres v14 build
+        id: cache_pg_14
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v14
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v15 build
+        id: cache_pg_15
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v15
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Cache postgres v16 build
+        id: cache_pg_16
+        uses: actions/cache@v3
+        with:
+          path: pg_install/v16
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
+
+      - name: Build postgres v14
+        if: steps.cache_pg_14.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v14 -j$(nproc)
+
+      - name: Build postgres v15
+        if: steps.cache_pg_15.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v15 -j$(nproc)
+
+      - name: Build postgres v16
+        if: steps.cache_pg_16.outputs.cache-hit != 'true'
+        run: mold -run make postgres-v16 -j$(nproc)
+
+      - name: Build neon extensions
+        run: mold -run make neon-pg-ext -j$(nproc)
+
+      - name: Build walproposer-lib
+        run: mold -run make walproposer-lib -j$(nproc)
+
+      - name: Run cargo build
+        run: |
+          mold -run cargo build $CARGO_FLAGS $CARGO_FEATURES --bins --tests
+
+      - name: Run cargo test
+        run: |
+          cargo test $CARGO_FLAGS $CARGO_FEATURES
+
+          # Run separate tests for real S3
+          export ENABLE_REAL_S3_REMOTE_STORAGE=nonempty
+          export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
+          export REMOTE_STORAGE_S3_REGION=eu-central-1
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
+
+          # Run separate tests for real Azure Blob Storage
+          # XXX: replace region with `eu-central-1`-like region
+          export ENABLE_REAL_AZURE_REMOTE_STORAGE=y
+          export AZURE_STORAGE_ACCOUNT="${{ secrets.AZURE_STORAGE_ACCOUNT_DEV }}"
+          export AZURE_STORAGE_ACCESS_KEY="${{ secrets.AZURE_STORAGE_ACCESS_KEY_DEV }}"
+          export REMOTE_STORAGE_AZURE_CONTAINER="${{ vars.REMOTE_STORAGE_AZURE_CONTAINER }}"
+          export REMOTE_STORAGE_AZURE_REGION="${{ vars.REMOTE_STORAGE_AZURE_REGION }}"
+          # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
+          cargo test $CARGO_FLAGS --package remote_storage --test test_real_azure
+
+  check-codestyle-rust-arm:
+    timeout-minutes: 90
+    runs-on: [ self-hosted, dev, arm64 ]
+
+    container:
+      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+          fetch-depth: 1
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      - run: |
+          CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")"
+          if [ "$CLIPPY_COMMON_ARGS" = "" ]; then
+            echo "No clippy args found in .neon_clippy_args"
+            exit 1
+          fi
+          echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV
+      - name: Run cargo clippy (debug)
+        run: cargo hack --feature-powerset clippy $CLIPPY_COMMON_ARGS
+      - name: Run cargo clippy (release)
+        run: cargo hack --feature-powerset clippy --release $CLIPPY_COMMON_ARGS
+
+      - name: Check documentation generation
+        run: cargo doc --workspace --no-deps --document-private-items
+        env:
+            RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links"
+
+      # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run
+      - name: Check formatting
+        if: ${{ !cancelled() }}
+        run: cargo fmt --all -- --check
+
+      # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci
+      - name: Check rust dependencies
+        if: ${{ !cancelled() }}
+        run: |
+          cargo hakari generate --diff  # workspace-hack Cargo.toml is up-to-date
+          cargo hakari manage-deps --dry-run  # all workspace crates depend on workspace-hack
+
+      # https://github.com/EmbarkStudios/cargo-deny
+      - name: Check rust licenses/bans/advisories/sources
+        if: ${{ !cancelled() }}
+        run: cargo deny check
+
  gather-rust-build-stats:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats')
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-stats') ||
+      contains(github.event.pull_request.labels.*.name, 'run-extra-build-*') ||
+      github.ref_name == 'main'
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,6 +36,7 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
+arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 azure_core = "0.16"
 azure_identity = "0.16"
@@ -47,6 +48,7 @@ async-trait = "0.1"
 aws-config = { version = "0.56", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "0.29"
 aws-smithy-http = "0.56"
+aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
 aws-credential-types = "0.56"
 aws-types = "0.56"
 axum = { version = "0.6.20", features = ["ws"] }
@@ -65,7 +67,7 @@ comfy-table = "6.1"
 const_format = "0.2"
 crc32c = "0.6"
 crossbeam-utils = "0.8.5"
-dashmap = "5.5.0"
+dashmap = { version = "5.5.0", features = ["raw-api"] }
 either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
@@ -81,7 +83,7 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
-http-types = "2"
+http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
--- a/5
+++ b/5
@@ -27,6 +27,7 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
+ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/4
+++ b/4
@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
+	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
+		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
+		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
+		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -710,8 +710,12 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
-        client.simple_query("SELECT pg_reload_conf()")?;
+    fn pg_reload_conf(&self) -> Result<()> {
+        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        Command::new(pgctl_bin)
+            .args(["reload", "-D", &self.pgdata])
+            .output()
+            .expect("cannot run pg_ctl process");
        Ok(())
    }

@@ -724,9 +728,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -78,7 +78,7 @@ use regex::Regex;
 use remote_storage::*;
 use serde_json;
 use std::io::Read;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::path::Path;
 use std::str;
 use tar::Archive;
@@ -281,8 +281,6 @@ pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRem
        max_keys_per_list_response: None,
    };
    let config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"),
-        max_sync_errors: NonZeroU32::new(100).expect("100 != 0"),
        storage: RemoteStorageKind::AwsS3(config),
    };
    GenericRemoteStorage::from_config(&config)
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
-//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY
+    // SAFETY:
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,11 +1,10 @@
-//
-// Local control plane.
-//
-// Can start, configure and stop postgres instances running as a local processes.
-//
-// Intended to be used in integration tests and in CLI tools for
-// local installations.
-//
+//! Local control plane.
+//!
+//! Can start, configure and stop postgres instances running as a local processes.
+//!
+//! Intended to be used in integration tests and in CLI tools for
+//! local installations.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod attachment_service;
 mod background_process;
--- a/deny.toml
+++ b/deny.toml
@@ -74,10 +74,30 @@ highlight = "all"
 workspace-default-features = "allow"
 external-default-features = "allow"
 allow = []
-deny = []
+
 skip = []
 skip-tree = []

+[[bans.deny]]
+# we use tokio, the same rationale applies for async-{io,waker,global-executor,executor,channel,lock}, smol
+# if you find yourself here while adding a dependency, try "default-features = false", ask around on #rust
+name = "async-std"
+
+[[bans.deny]]
+name = "async-io"
+
+[[bans.deny]]
+name = "async-waker"
+
+[[bans.deny]]
+name = "async-global-executor"
+
+[[bans.deny]]
+name = "async-executor"
+
+[[bans.deny]]
+name = "smol"
+
 # This section is considered when running `cargo deny check sources`.
 # More documentation about the 'sources' section can be found here:
 # https://embarkstudios.github.io/cargo-deny/checks/sources/cfg.html
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
-//!
 //! Shared code for consumption metics collection
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,6 +2,7 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
+#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,6 +2,8 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
@@ -15,7 +17,7 @@ use std::{fmt, io};
 use std::{future::Future, str::FromStr};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_rustls::TlsAcceptor;
-use tracing::{debug, error, info, trace};
+use tracing::{debug, error, info, trace, warn};

 use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter};
 use pq_proto::{
@@ -33,6 +35,11 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
+    /// Authentication failure
+    #[error("Unauthorized: {0}")]
+    Unauthorized(std::borrow::Cow<'static, str>),
+    #[error("Simulated Connection Error")]
+    SimulatedConnectionError,
    /// Some other error
    #[error(transparent)]
    Other(#[from] anyhow::Error),
@@ -47,8 +54,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
+            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -608,7 +616,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {

                    if let Err(e) = handler.check_auth_jwt(self, jwt_response) {
                        self.write_message_noflush(&BeMessage::ErrorResponse(
-                            &e.to_string(),
+                            &short_error(&e),
                            Some(e.pg_error_code()),
                        ))?;
                        return Err(e);
@@ -730,6 +738,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                if let Err(e) = handler.process_query(self, query_string).await {
                    match e {
                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        QueryError::SimulatedConnectionError => {
+                            return Err(QueryError::SimulatedConnectionError)
+                        }
                        e => {
                            log_query_error(query_string, &e);
                            let short_error = short_error(&e);
@@ -964,6 +975,8 @@ pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
+        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
    }
 }
@@ -980,9 +993,15 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::Disconnected(other_connection_error) => {
            error!("query handler for '{query}' failed with connection error: {other_connection_error:?}")
        }
+        QueryError::SimulatedConnectionError => {
+            error!("query handler for query '{query}' failed due to a simulated connection error")
+        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
+        QueryError::Unauthorized(e) => {
+            warn!("query handler for '{query}' failed with authentication error: {e}");
+        }
        QueryError::Other(e) => {
            error!("query handler for '{query}' failed: {e:?}");
        }
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
+                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,6 +1,7 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true
 anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
+aws-smithy-async.workspace = true
 aws-smithy-http.workspace = true
 aws-types.workspace = true
 aws-config.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,19 +6,15 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;

-use std::{
-    collections::HashMap,
-    fmt::Debug,
-    num::{NonZeroU32, NonZeroUsize},
-    pin::Pin,
-    sync::Arc,
-};
+use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::Arc};

 use anyhow::{bail, Context};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -34,12 +30,6 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

-/// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
-/// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
-/// during start (where local and remote timelines are compared and initial sync tasks are scheduled) and timeline attach.
-/// Both cases may trigger timeline download, that might download a lot of layers. This concurrency is limited by the clients internally, if needed.
-pub const DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS: usize = 50;
-pub const DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS: u32 = 10;
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -441,10 +431,6 @@ pub struct StorageMetadata(HashMap<String, String>);
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
-    /// Max allowed number of concurrent sync operations between the API user and the remote storage.
-    pub max_concurrent_syncs: NonZeroUsize,
-    /// Max allowed errors before the sync task is considered failed and evicted.
-    pub max_sync_errors: NonZeroU32,
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
 }
@@ -540,18 +526,6 @@ impl RemoteStorageConfig {

        let use_azure = container_name.is_some() && container_region.is_some();

-        let max_concurrent_syncs = NonZeroUsize::new(
-            parse_optional_integer("max_concurrent_syncs", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS),
-        )
-        .context("Failed to parse 'max_concurrent_syncs' as a positive integer")?;
-
-        let max_sync_errors = NonZeroU32::new(
-            parse_optional_integer("max_sync_errors", toml)?
-                .unwrap_or(DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS),
-        )
-        .context("Failed to parse 'max_sync_errors' as a positive integer")?;
-
        let default_concurrency_limit = if use_azure {
            DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT
        } else {
@@ -633,11 +607,7 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig {
-            max_concurrent_syncs,
-            max_sync_errors,
-            storage,
-        }))
+        Ok(Some(RemoteStorageConfig { storage }))
    }
 }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,23 +4,27 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::borrow::Cow;
+use std::{borrow::Cow, sync::Arc};

 use anyhow::Context;
 use aws_config::{
    environment::credentials::EnvironmentVariableCredentialsProvider,
-    imds::credentials::ImdsCredentialsProvider, meta::credentials::CredentialsProviderChain,
-    provider_config::ProviderConfig, web_identity_token::WebIdentityTokenCredentialsProvider,
+    imds::credentials::ImdsCredentialsProvider,
+    meta::credentials::CredentialsProviderChain,
+    provider_config::ProviderConfig,
+    retry::{RetryConfigBuilder, RetryMode},
+    web_identity_token::WebIdentityTokenCredentialsProvider,
 };
 use aws_credential_types::cache::CredentialsCache;
 use aws_sdk_s3::{
-    config::{Config, Region},
+    config::{AsyncSleep, Config, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
+use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_http::body::SdkBody;
 use hyper::Body;
 use scopeguard::ScopeGuard;
@@ -83,10 +87,23 @@ impl S3Bucket {
            .or_else("imds", ImdsCredentialsProvider::builder().build())
        };

+        // AWS SDK requires us to specify how the RetryConfig should sleep when it wants to back off
+        let sleep_impl: Arc<dyn AsyncSleep> = Arc::new(TokioSleep::new());
+
+        // We do our own retries (see [`backoff::retry`]).  However, for the AWS SDK to enable rate limiting in response to throttling
+        // responses (e.g. 429 on too many ListObjectsv2 requests), we must provide a retry config.  We set it to use at most one
+        // attempt, and enable 'Adaptive' mode, which causes rate limiting to be enabled.
+        let mut retry_config = RetryConfigBuilder::new();
+        retry_config
+            .set_max_attempts(Some(1))
+            .set_mode(Some(RetryMode::Adaptive));
+
        let mut config_builder = Config::builder()
            .region(region)
            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider);
+            .credentials_provider(credentials_provider)
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl))
+            .retry_config(retry_config.build());

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -469,8 +469,6 @@ fn create_azure_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AzureContainer(AzureConfig {
            container_name: remote_storage_azure_container,
            container_region: remote_storage_azure_region,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::env;
-use std::num::{NonZeroU32, NonZeroUsize};
+use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::path::PathBuf;
 use std::sync::Arc;
@@ -396,8 +396,6 @@ fn create_s3_client(
    let random = rand::thread_rng().gen::<u32>();

    let remote_storage_config = RemoteStorageConfig {
-        max_concurrent_syncs: NonZeroUsize::new(100).unwrap(),
-        max_sync_errors: NonZeroU32::new(5).unwrap(),
        storage: RemoteStorageKind::AwsS3(S3Config {
            bucket_name: remote_storage_s3_bucket,
            bucket_region: remote_storage_s3_region,
--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,4 +1,6 @@
 //! Synthetic size calculation
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,6 +32,8 @@
 //!         .init();
 //! }
 //! ```
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,8 @@
 // For details about authentication see docs/authentication.md

+use arc_swap::ArcSwap;
 use serde;
-use std::fs;
+use std::{borrow::Cow, fmt::Display, fs, sync::Arc};

 use anyhow::Result;
 use camino::Utf8Path;
@@ -10,7 +11,7 @@ use jsonwebtoken::{
 };
 use serde::{Deserialize, Serialize};

-use crate::id::TenantId;
+use crate::{http::error::ApiError, id::TenantId};

 /// Algorithm to use. We require EdDSA.
 const STORAGE_TOKEN_ALGORITHM: Algorithm = Algorithm::EdDSA;
@@ -44,31 +45,106 @@ impl Claims {
    }
 }

+pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
+
+impl SwappableJwtAuth {
+    pub fn new(jwt_auth: JwtAuth) -> Self {
+        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
+    }
+    pub fn swap(&self, jwt_auth: JwtAuth) {
+        self.0.swap(Arc::new(jwt_auth));
+    }
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        self.0.load().decode(token)
+    }
+}
+
+impl std::fmt::Debug for SwappableJwtAuth {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Swappable({:?})", self.0.load())
+    }
+}
+
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub struct AuthError(pub Cow<'static, str>);
+
+impl Display for AuthError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
+impl From<AuthError> for ApiError {
+    fn from(_value: AuthError) -> Self {
+        // Don't pass on the value of the AuthError as a precautionary measure.
+        // Being intentionally vague in public error communication hurts debugability
+        // but it is more secure.
+        ApiError::Forbidden("JWT authentication error".to_string())
+    }
+}
+
 pub struct JwtAuth {
-    decoding_key: DecodingKey,
+    decoding_keys: Vec<DecodingKey>,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_key: DecodingKey) -> Self {
+    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_key,
+            decoding_keys,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
+        let metadata = key_path.metadata()?;
+        let decoding_keys = if metadata.is_dir() {
+            let mut keys = Vec::new();
+            for entry in fs::read_dir(key_path)? {
+                let path = entry?.path();
+                if !path.is_file() {
+                    // Ignore directories (don't recurse)
+                    continue;
+                }
+                let public_key = fs::read(path)?;
+                keys.push(DecodingKey::from_ed_pem(&public_key)?);
+            }
+            keys
+        } else if metadata.is_file() {
+            let public_key = fs::read(key_path)?;
+            vec![DecodingKey::from_ed_pem(&public_key)?]
+        } else {
+            anyhow::bail!("path is neither a directory or a file")
+        };
+        if decoding_keys.is_empty() {
+            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
+        }
+        Ok(Self::new(decoding_keys))
    }

-    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        Ok(decode(token, &self.decoding_key, &self.validation)?)
+    /// Attempt to decode the token with the internal decoding keys.
+    ///
+    /// The function tries the stored decoding keys in succession,
+    /// and returns the first yielding a successful result.
+    /// If there is no working decoding key, it returns the last error.
+    pub fn decode(&self, token: &str) -> std::result::Result<TokenData<Claims>, AuthError> {
+        let mut res = None;
+        for decoding_key in &self.decoding_keys {
+            res = Some(decode(token, decoding_key, &self.validation));
+            if let Some(Ok(res)) = res {
+                return Ok(res);
+            }
+        }
+        if let Some(res) = res {
+            res.map_err(|e| AuthError(Cow::Owned(e.to_string())))
+        } else {
+            Err(AuthError(Cow::Borrowed("no JWT decoding keys configured")))
+        }
    }
 }

@@ -108,9 +184,9 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
 "#;

    #[test]
-    fn test_decode() -> Result<(), anyhow::Error> {
+    fn test_decode() {
        let expected_claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

@@ -129,28 +205,24 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let claims_from_token = auth.decode(encoded_eddsa).unwrap().claims;
        assert_eq!(claims_from_token, expected_claims);
-
-        Ok(())
    }

    #[test]
-    fn test_encode() -> Result<(), anyhow::Error> {
+    fn test_encode() {
        let claims = Claims {
-            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081")?),
+            tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
            scope: Scope::Tenant,
        };

-        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;
+        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519).unwrap();

        // decode it back
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
-        let decoded = auth.decode(&encoded)?;
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519).unwrap()]);
+        let decoded = auth.decode(&encoded).unwrap();

        assert_eq!(decoded.claims, claims);
-
-        Ok(())
    }
 }
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{Claims, JwtAuth};
+use crate::auth::{AuthError, Claims, SwappableJwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
@@ -400,9 +400,11 @@ pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
                    })?;
                    let token = parse_token(header_value)?;

-                    let data = auth
-                        .decode(token)
-                        .map_err(|_| ApiError::Unauthorized("malformed jwt token".to_string()))?;
+                    let data = auth.decode(token).map_err(|err| {
+                        warn!("Authentication error: {err}");
+                        // Rely on From<AuthError> for ApiError impl
+                        err
+                    })?;
                    req.set_context(data.claims);
                }
                None => {
@@ -450,12 +452,11 @@ where

 pub fn check_permission_with(
    req: &Request<Body>,
-    check_permission: impl Fn(&Claims) -> Result<(), anyhow::Error>,
+    check_permission: impl Fn(&Claims) -> Result<(), AuthError>,
 ) -> Result<(), ApiError> {
    match req.context::<Claims>() {
-        Some(claims) => {
-            Ok(check_permission(&claims).map_err(|err| ApiError::Forbidden(err.to_string()))?)
-        }
+        Some(claims) => Ok(check_permission(&claims)
+            .map_err(|_err| ApiError::Forbidden("JWT authentication error".to_string()))?),
        None => Ok(()), // claims is None because auth is disabled
    }
 }
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::{error, info};
+use tracing::{error, info, warn};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -118,6 +118,9 @@ pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors

    match api_error {
+        ApiError::Forbidden(_) | ApiError::Unauthorized(_) => {
+            warn!("Error processing HTTP request: {api_error:#}")
+        }
        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -120,6 +120,8 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
+
+        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,5 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -125,6 +125,9 @@ where
            // Wake everyone with an error.
            let mut internal = self.internal.lock().unwrap();

+            // Block any future waiters from starting
+            internal.shutdown = true;
+
            // This will steal the entire waiters map.
            // When we drop it all waiters will be woken.
            mem::take(&mut internal.waiters)
--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,6 +1,7 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) {
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: exiting is safe, the ffi is not safe
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -85,6 +85,13 @@ impl Gate {
        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
    }

+    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
+    /// is usually analoguous for "Did shutdown finish?" for types that include a Gate, whereas checking
+    /// the CancellationToken on such types is analogous to "Did shutdown start?"
+    pub fn close_complete(&self) -> bool {
+        self.sem.is_closed()
+    }
+
    async fn do_close(&self) {
        tracing::debug!(gate = self.name, "Closing Gate...");
        match self.sem.acquire_many(Self::MAX_UNITS).await {
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,13 +19,12 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -188,6 +188,7 @@ extern "C" fn recovery_download(
    }
 }

+#[allow(clippy::unnecessary_cast)]
 extern "C" fn wal_read(
    sk: *mut Safekeeper,
    buf: *mut ::std::os::raw::c_char,
@@ -421,6 +422,7 @@ impl std::fmt::Display for Level {
 }

 /// Take ownership of `Vec<u8>` from StringInfoData.
+#[allow(clippy::unnecessary_cast)]
 pub(crate) fn take_vec_u8(pg: &mut StringInfoData) -> Option<Vec<u8>> {
    if pg.data.is_null() {
        return None;
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -186,7 +186,7 @@ impl Wrapper {
            .unwrap()
            .into_bytes_with_nul();
        assert!(safekeepers_list_vec.len() == safekeepers_list_vec.capacity());
-        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut i8;
+        let safekeepers_list = safekeepers_list_vec.as_mut_ptr() as *mut std::ffi::c_char;

        let callback_data = Box::into_raw(Box::new(api)) as *mut ::std::os::raw::c_void;

--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -1,22 +1,21 @@
-use anyhow::{bail, Result};
-use utils::auth::{Claims, Scope};
+use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => {
-            bail!("Attempt to access management api with tenant scope. Permission denied")
-        }
+        (Scope::Tenant, None) => Err(AuthError(
+            "Attempt to access management api with tenant scope. Permission denied".into(),
+        )),
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                bail!("Tenant id mismatch. Permission denied")
+                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
            }
            Ok(())
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => {
-            bail!("SafekeeperData scope makes no sense for Pageserver")
-        }
+        (Scope::SafekeeperData, _) => Err(AuthError(
+            "SafekeeperData scope makes no sense for Pageserver".into(),
+        )),
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,8 +34,11 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
-    signals::Signal, tcp_listener,
+    auth::{JwtAuth, SwappableJwtAuth},
+    logging, project_build_tag, project_git_version,
+    sentry_init::init_sentry,
+    signals::Signal,
+    tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -321,13 +324,12 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        // unwrap is ok because check is performed when creating config, so path is set and exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!(
-            "Loading public key for verifying JWT tokens from {:#?}",
-            key_path
-        );
-        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
+        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
+
+        let jwt_auth = JwtAuth::from_key_path(key_path)?;
+        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -410,7 +412,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -420,6 +422,7 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
+    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -548,6 +551,7 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file containing public key for verifying JWT tokens.
+    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

@@ -1314,12 +1314,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs: NonZeroUsize::new(
-                        remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS
-                    )
-                        .unwrap(),
-                    max_sync_errors: NonZeroU32::new(remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS)
-                        .unwrap(),
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
@@ -1380,8 +1374,6 @@ broker_endpoint = '{broker_endpoint}'
            assert_eq!(
                parsed_remote_storage_config,
                RemoteStorageConfig {
-                    max_concurrent_syncs,
-                    max_sync_errors,
                    storage: RemoteStorageKind::AwsS3(S3Config {
                        bucket_name: bucket_name.clone(),
                        bucket_region: bucket_region.clone(),
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -893,14 +893,6 @@ mod test {
        std::fs::create_dir_all(remote_fs_dir)?;
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
-            max_concurrent_syncs: std::num::NonZeroUsize::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-            )
-            .unwrap(),
-            max_sync_errors: std::num::NonZeroU32::new(
-                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-            )
-            .unwrap(),
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,21 +55,24 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
-        fail::fail_point!("deletion-queue-before-execute", |_| {
-            info!("Skipping execution, failpoint set");
-            metrics::DELETION_QUEUE
-                .remote_errors
-                .with_label_values(&["failpoint"])
-                .inc();
-            Err(anyhow::anyhow!("failpoint hit"))
-        });
-
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            || async {
+                fail::fail_point!("deletion-queue-before-execute", |_| {
+                    info!("Skipping execution, failpoint set");
+
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
+                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
+                });
+
+                self.remote_storage.delete_objects(&self.accumulator).await
+            },
            |_| false,
            3,
            10,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,6 +52,31 @@ paths:
              schema:
                type: object

+  /v1/reload_auth_validation_keys:
+    post:
+      description: Reloads the JWT public keys from their pre-configured location on disk.
+      responses:
+        "200":
+          description: The reload completed successfully.
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error (also hits if no keys were found)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
@@ -327,7 +352,8 @@ paths:
          in: query
          required: true
          schema:
-            type: integer
+            type: string
+            format: hex
          description: A LSN to get the timestamp
      responses:
        "200":
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -20,6 +20,7 @@ use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::auth::JwtAuth;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -35,8 +36,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
-    TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -45,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
-    auth::JwtAuth,
+    auth::SwappableJwtAuth,
    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -63,7 +64,8 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    auth: Option<Arc<JwtAuth>>,
+    tenant_manager: Arc<TenantManager>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
@@ -74,7 +76,8 @@ pub struct State {
 impl State {
    pub fn new(
        conf: &'static PageServerConf,
-        auth: Option<Arc<JwtAuth>>,
+        tenant_manager: Arc<TenantManager>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -86,6 +89,7 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
+            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
@@ -299,11 +303,7 @@ async fn build_timeline_info(
        // we're executing this function, we will outlive the timeline on-disk state.
        info.current_logical_size_non_incremental = Some(
            timeline
-                .get_current_logical_size_non_incremental(
-                    info.last_record_lsn,
-                    CancellationToken::new(),
-                    ctx,
-                )
+                .get_current_logical_size_non_incremental(info.last_record_lsn, ctx)
                .await?,
        );
    }
@@ -389,6 +389,32 @@ async fn status_handler(
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

+async fn reload_auth_validation_keys_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let config = get_config(&request);
+    let state = get_state(&request);
+    let Some(shared_auth) = &state.auth else {
+        return json_response(StatusCode::BAD_REQUEST, ());
+    };
+    // unwrap is ok because check is performed when creating config, so path is set and exists
+    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
+    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
+
+    match JwtAuth::from_key_path(key_path) {
+        Ok(new_auth) => {
+            shared_auth.swap(new_auth);
+            json_response(StatusCode::OK, ())
+        }
+        Err(e) => {
+            warn!("Error reloading public keys from {key_path:?}: {e:}");
+            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
+        }
+    }
+}
+
 async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -1140,20 +1166,14 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    mgr::upsert_location(
-        state.conf,
-        tenant_id,
-        location_conf,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
-        &ctx,
-    )
-    .await
-    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-    // principle we might have hit something like concurrent API calls to the same tenant,
-    // which is not a 400 but a 409.
-    .map_err(ApiError::BadRequest)?;
+    state
+        .tenant_manager
+        .upsert_location(tenant_id, location_conf, &ctx)
+        .await
+        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+        // principle we might have hit something like concurrent API calls to the same tenant,
+        // which is not a 400 but a 409.
+        .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1458,7 +1478,7 @@ async fn timeline_collect_keyspace(
        let keys = timeline
            .collect_keyspace(at_lsn, &ctx)
            .await
-            .map_err(ApiError::InternalServerError)?;
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;

        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
    }
@@ -1647,6 +1667,8 @@ where
        );

        match handle.await {
+            // TODO: never actually return Err from here, always Ok(...) so that we can log
+            // spanned errors. Call api_error_handler instead and return appropriate Body.
            Ok(result) => result,
            Err(e) => {
                // The handler task panicked. We have a global panic handler that logs the
@@ -1695,7 +1717,7 @@ where
 pub fn make_router(
    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1724,6 +1746,9 @@ pub fn make_router(
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
+        .post("/v1/reload_auth_validation_keys", |r| {
+            api_handler(r, reload_auth_validation_keys_handler)
+        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
+
 mod auth;
 pub mod basebackup;
 pub mod config;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1225,15 +1225,6 @@ pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static WAL_REDO_WAIT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_wal_redo_wait_seconds",
-        "Time spent waiting for access to the Postgres WAL redo process",
-        redo_histogram_time_buckets!(),
-    )
-    .expect("failed to define a metric")
-});
-
 pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wal_redo_records_histogram",
@@ -1261,6 +1252,46 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

+pub(crate) struct WalRedoProcessCounters {
+    pub(crate) started: IntCounter,
+    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+}
+
+#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
+pub(crate) enum WalRedoKillCause {
+    WalRedoProcessDrop,
+    NoLeakChildDrop,
+    Startup,
+}
+
+impl Default for WalRedoProcessCounters {
+    fn default() -> Self {
+        let started = register_int_counter!(
+            "pageserver_wal_redo_process_started_total",
+            "Number of WAL redo processes started",
+        )
+        .unwrap();
+
+        let killed = register_int_counter_vec!(
+            "pageserver_wal_redo_process_stopped_total",
+            "Number of WAL redo processes stopped",
+            &["cause"],
+        )
+        .unwrap();
+        Self {
+            started,
+            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
+                let cause = <WalRedoKillCause as enum_map::Enum>::from_usize(i);
+                let cause_str: &'static str = cause.into();
+                killed.with_label_values(&[cause_str])
+            })),
+        }
+    }
+}
+
+pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
+    Lazy::new(WalRedoProcessCounters::default);
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
@@ -1928,7 +1959,6 @@ pub fn preinitialize_metrics() {
        &READ_NUM_FS_LAYERS,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
-        &WAL_REDO_WAIT_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
        &WAL_REDO_BYTES_HISTOGRAM,
    ]
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,6 +14,7 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -39,7 +40,7 @@ use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
-    auth::{Claims, JwtAuth, Scope},
+    auth::{Claims, Scope, SwappableJwtAuth},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    simple_rcu::RcuReadGuard,
@@ -121,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
@@ -189,7 +190,7 @@ pub async fn libpq_listener_main(
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
@@ -217,9 +218,27 @@ async fn page_service_conn_main(
    // no write timeout is used, because the kernel is assumed to error writes after some time.
    let mut socket = tokio_io_timeout::TimeoutReader::new(socket);

-    // timeout should be lower, but trying out multiple days for
-    // <https://github.com/neondatabase/neon/issues/4205>
-    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
+    let default_timeout_ms = 10 * 60 * 1000; // 10 minutes by default
+    let socket_timeout_ms = (|| {
+        fail::fail_point!("simulated-bad-compute-connection", |avg_timeout_ms| {
+            // Exponential distribution for simulating
+            // poor network conditions, expect about avg_timeout_ms to be around 15
+            // in tests
+            if let Some(avg_timeout_ms) = avg_timeout_ms {
+                let avg = avg_timeout_ms.parse::<i64>().unwrap() as f32;
+                let u = rand::random::<f32>();
+                ((1.0 - u).ln() / (-avg)) as u64
+            } else {
+                default_timeout_ms
+            }
+        });
+        default_timeout_ms
+    })();
+
+    // A timeout here does not mean the client died, it can happen if it's just idle for
+    // a while: we will tear down this PageServerHandler and instantiate a new one if/when
+    // they reconnect.
+    socket.set_timeout(Some(std::time::Duration::from_millis(socket_timeout_ms)));
    let socket = std::pin::pin!(socket);

    // XXX: pgbackend.run() should take the connection_ctx,
@@ -252,7 +271,7 @@ async fn page_service_conn_main(
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

    /// The context created for the lifetime of the connection
@@ -266,7 +285,7 @@ impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
-        auth: Option<Arc<JwtAuth>>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
@@ -493,7 +512,11 @@ impl PageServerHandler {
            };

            if let Err(e) = &response {
-                if timeline.cancel.is_cancelled() {
+                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                // because wait_lsn etc will drop out
+                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                // is_canceled(): [`Timeline::shutdown`]` has entered
+                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
@@ -897,7 +920,7 @@ impl PageServerHandler {

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
@@ -909,7 +932,7 @@ impl PageServerHandler {
            .claims
            .as_ref()
            .expect("claims presence already checked");
-        check_permission(claims, tenant_id)
+        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
    }

    /// Shorthand for getting a reference to a Timeline of an Active tenant.
@@ -948,16 +971,17 @@ where
            .auth
            .as_ref()
            .unwrap()
-            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
+            .map_err(|e| QueryError::Unauthorized(e.0))?;

        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
-            return Err(QueryError::Other(anyhow::anyhow!(
-                "jwt token scope is Tenant, but tenant id is missing"
-            )));
+            return Err(QueryError::Unauthorized(
+                "jwt token scope is Tenant, but tenant id is missing".into(),
+            ));
        }

-        info!(
-            "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
+        debug!(
+            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

@@ -979,9 +1003,13 @@ where
        pgb: &mut PostgresBackend<IO>,
        query_string: &str,
    ) -> Result<(), QueryError> {
+        fail::fail_point!("simulated-bad-compute-connection", |_| {
+            info!("Hit failpoint for bad connection");
+            Err(QueryError::SimulatedConnectionError)
+        });
+
        let ctx = self.connection_ctx.attached_child();
        debug!("process query {query_string:?}");
-
        if query_string.starts_with("pagestream ") {
            let (_, params_raw) = query_string.split_at("pagestream ".len());
            let params = params_raw.split(' ').collect::<Vec<_>>();
@@ -1330,6 +1358,9 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
+            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+                QueryError::Shutdown
+            }
            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,8 +21,8 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
-use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
+use utils::bin_ser::DeserializeError;
 use utils::exp_counter::ExpCounter;
 use utils::{bin_ser::BeSer, lsn::Lsn};

@@ -34,18 +34,30 @@ pub enum LsnForTimestamp {
    /// Found commits both before and after the given timestamp
    Present(Lsn),

+    /// Found no commits after the given timestamp, this means
+    /// that the newest data in the branch is older than the given
+    /// timestamp.
+    ///
    /// All commits <= LSN happened before the given timestamp
    Future(Lsn),

+    /// The queried timestamp is past our horizon we look back at (PITR)
+    ///
    /// All commits > LSN happened after the given timestamp,
-    /// but any commits older than the returned LSN
-    /// might have happened before or after the given timestamp.
-    /// We don't know because no data before the given lsn is available
+    /// but any commits < LSN might have happened before or after
+    /// the given timestamp. We don't know because no data before
+    /// the given lsn is available.
    Past(Lsn),

    /// We have found no commit with a timestamp,
    /// so we can't return anything meaningful.
-    /// The associated LSN is a lower bound value.
+    ///
+    /// The associated LSN is the lower bound value we can safely
+    /// create branches on, but no statement is made if it is
+    /// older or newer than the timestamp.
+    ///
+    /// This variant can e.g. be returned right after a
+    /// cluster import.
    NoData(Lsn),
 }

@@ -57,6 +69,25 @@ pub enum CalculateLogicalSizeError {
    Other(#[from] anyhow::Error),
 }

+#[derive(Debug, thiserror::Error)]
+pub(crate) enum CollectKeySpaceError {
+    #[error(transparent)]
+    Decode(#[from] DeserializeError),
+    #[error(transparent)]
+    PageRead(PageReconstructError),
+    #[error("cancelled")]
+    Cancelled,
+}
+
+impl From<PageReconstructError> for CollectKeySpaceError {
+    fn from(err: PageReconstructError) -> Self {
+        match err {
+            PageReconstructError::Cancelled => Self::Cancelled,
+            err => Self::PageRead(err),
+        }
+    }
+}
+
 impl From<PageReconstructError> for CalculateLogicalSizeError {
    fn from(pre: PageReconstructError) -> Self {
        match pre {
@@ -407,6 +438,11 @@ impl Timeline {
        }
        // If `found_smaller == true`, `low` is the LSN of the last commit record
        // before or at `search_timestamp`
+        //
+        // FIXME: it would be better to get the LSN of the previous commit.
+        // Otherwise, if you restore to the returned LSN, the database will
+        // include physical changes from later commits that will be marked
+        // as aborted, and will need to be vacuumed away.
        let commit_lsn = Lsn(low * 8);
        match (found_smaller, found_larger) {
            (false, false) => {
@@ -419,7 +455,7 @@ impl Timeline {
                Ok(LsnForTimestamp::Past(min_lsn))
            }
            (true, false) => {
-                // Only found a commit with timestamp smaller than the request.
+                // Only found commits with timestamps smaller than the request.
                // It's still a valid case for branch creation, return it.
                // And `update_gc_info()` ignores LSN for a `LsnForTimestamp::Future`
                // case, anyway.
@@ -599,7 +635,6 @@ impl Timeline {
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
-        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -610,12 +645,8 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self
-                .list_rels(*spcnode, *dbnode, lsn, ctx)
-                .await
-                .context("list rels")?
-            {
-                if cancel.is_cancelled() {
+            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
+                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
                let relsize_key = rel_size_to_key(rel);
@@ -632,11 +663,11 @@ impl Timeline {
    /// Get a KeySpace that covers all the Keys that are in use at the given LSN.
    /// Anything that's not listed maybe removed from the underlying storage (from
    /// that LSN forwards).
-    pub async fn collect_keyspace(
+    pub(crate) async fn collect_keyspace(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
-    ) -> anyhow::Result<KeySpace> {
+    ) -> Result<KeySpace, CollectKeySpaceError> {
        // Iterate through key ranges, greedily packing them into partitions
        let mut result = KeySpaceAccum::new();

@@ -645,7 +676,7 @@ impl Timeline {

        // Fetch list of database dirs and iterate them
        let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
-        let dbdir = DbDirectory::des(&buf).context("deserialization failure")?;
+        let dbdir = DbDirectory::des(&buf)?;

        let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect();
        dbs.sort_unstable();
@@ -678,7 +709,7 @@ impl Timeline {
            let slrudir_key = slru_dir_to_key(kind);
            result.add_key(slrudir_key);
            let buf = self.get(slrudir_key, lsn, ctx).await?;
-            let dir = SlruSegmentDirectory::des(&buf).context("deserialization failure")?;
+            let dir = SlruSegmentDirectory::des(&buf)?;
            let mut segments: Vec<u32> = dir.segments.iter().cloned().collect();
            segments.sort_unstable();
            for segno in segments {
@@ -696,7 +727,7 @@ impl Timeline {
        // Then pg_twophase
        result.add_key(TWOPHASEDIR_KEY);
        let buf = self.get(TWOPHASEDIR_KEY, lsn, ctx).await?;
-        let twophase_dir = TwoPhaseDirectory::des(&buf).context("deserialization failure")?;
+        let twophase_dir = TwoPhaseDirectory::des(&buf)?;
        let mut xids: Vec<TransactionId> = twophase_dir.xids.iter().cloned().collect();
        xids.sort_unstable();
        for xid in xids {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1841,7 +1841,13 @@ impl Tenant {
            timelines.values().for_each(|timeline| {
                let timeline = Arc::clone(timeline);
                let span = Span::current();
-                js.spawn(async move { timeline.shutdown(freeze_and_flush).instrument(span).await });
+                js.spawn(async move {
+                    if freeze_and_flush {
+                        timeline.flush_and_shutdown().instrument(span).await
+                    } else {
+                        timeline.shutdown().instrument(span).await
+                    }
+                });
            })
        };
        tracing::info!("Waiting for timelines...");
@@ -3528,10 +3534,6 @@ pub(crate) mod harness {
            let remote_fs_dir = conf.workdir.join("localfs");
            std::fs::create_dir_all(&remote_fs_dir).unwrap();
            let config = RemoteStorageConfig {
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
-                max_concurrent_syncs: std::num::NonZeroUsize::new(2_000_000).unwrap(),
-                // TODO: why not remote_storage::DEFAULT_REMOTE_STORAGE_MAX_SYNC_ERRORS,
-                max_sync_errors: std::num::NonZeroU32::new(3_000_000).unwrap(),
                storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
            };
            let remote_storage = GenericRemoteStorage::from_config(&config).unwrap();
@@ -4731,7 +4733,7 @@ mod tests {
            // Keeps uninit mark in place
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
-                .shutdown(false)
+                .shutdown()
                .instrument(info_span!("test_shutdown", tenant_id=%raw_tline.tenant_id))
                .await;
            std::mem::forget(tline);
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -327,7 +327,7 @@ mod tests {
                let mut sz: u16 = rng.gen();
                // Make 50% of the arrays small
                if rng.gen() {
-                    sz |= 63;
+                    sz &= 63;
                }
                random_array(sz.into())
            })
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -200,6 +200,22 @@ async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
    Ok(())
 }

+/// The TenantManager is responsible for storing and mutating the collection of all tenants
+/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
+/// lives inside the TenantManager.
+///
+/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
+/// the same tenant twice concurrently, or trying to configure the same tenant into secondary
+/// and attached modes concurrently.
+pub struct TenantManager {
+    conf: &'static PageServerConf,
+    // TODO: currently this is a &'static pointing to TENANTs.  When we finish refactoring
+    // out of that static variable, the TenantManager can own this.
+    // See https://github.com/neondatabase/neon/issues/5796
+    tenants: &'static std::sync::RwLock<TenantsMap>,
+    resources: TenantSharedResources,
+}
+
 fn emergency_generations(
    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
 ) -> HashMap<TenantId, Generation> {
@@ -366,7 +382,7 @@ pub async fn init_tenant_mgr(
    resources: TenantSharedResources,
    init_order: InitializationOrder,
    cancel: CancellationToken,
-) -> anyhow::Result<()> {
+) -> anyhow::Result<TenantManager> {
    let mut tenants = HashMap::new();

    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
@@ -468,7 +484,12 @@ pub async fn init_tenant_mgr(
    assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
    METRICS.tenant_slots.set(tenants.len() as u64);
    *tenants_map = TenantsMap::Open(tenants);
-    Ok(())
+
+    Ok(TenantManager {
+        conf,
+        tenants: &TENANTS,
+        resources,
+    })
 }

 /// Wrapper for Tenant::spawn that checks invariants before running, and inserts
@@ -545,8 +566,10 @@ pub(crate) async fn shutdown_all_tenants() {
 async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    use utils::completion;

-    // Atomically, 1. extract the list of tenants to shut down and 2. prevent creation of new tenants.
-    let (in_progress_ops, tenants_to_shut_down) = {
+    let mut join_set = JoinSet::new();
+
+    // Atomically, 1. create the shutdown tasks and 2. prevent creation of new tenants.
+    let (total_in_progress, total_attached) = {
        let mut m = tenants.write().unwrap();
        match &mut *m {
            TenantsMap::Initializing => {
@@ -556,78 +579,67 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
            }
            TenantsMap::Open(tenants) => {
                let mut shutdown_state = HashMap::new();
-                let mut in_progress_ops = Vec::new();
-                let mut tenants_to_shut_down = Vec::new();
+                let mut total_in_progress = 0;
+                let mut total_attached = 0;

-                for (k, v) in tenants.drain() {
+                for (tenant_id, v) in tenants.drain() {
                    match v {
                        TenantSlot::Attached(t) => {
-                            tenants_to_shut_down.push(t.clone());
-                            shutdown_state.insert(k, TenantSlot::Attached(t));
+                            shutdown_state.insert(tenant_id, TenantSlot::Attached(t.clone()));
+                            join_set.spawn(
+                                async move {
+                                    let freeze_and_flush = true;
+
+                                    let res = {
+                                        let (_guard, shutdown_progress) = completion::channel();
+                                        t.shutdown(shutdown_progress, freeze_and_flush).await
+                                    };
+
+                                    if let Err(other_progress) = res {
+                                        // join the another shutdown in progress
+                                        other_progress.wait().await;
+                                    }
+
+                                    // we cannot afford per tenant logging here, because if s3 is degraded, we are
+                                    // going to log too many lines
+                                    debug!("tenant successfully stopped");
+                                }
+                                .instrument(info_span!("shutdown", %tenant_id)),
+                            );
+
+                            total_attached += 1;
                        }
                        TenantSlot::Secondary => {
-                            shutdown_state.insert(k, TenantSlot::Secondary);
+                            shutdown_state.insert(tenant_id, TenantSlot::Secondary);
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
                            // wait for their notifications to fire in this function.
-                            in_progress_ops.push(notify);
+                            join_set.spawn(async move {
+                                notify.wait().await;
+                            });
+
+                            total_in_progress += 1;
                        }
                    }
                }
                *m = TenantsMap::ShuttingDown(shutdown_state);
-                (in_progress_ops, tenants_to_shut_down)
+                (total_in_progress, total_attached)
            }
            TenantsMap::ShuttingDown(_) => {
-                // TODO: it is possible that detach and shutdown happen at the same time. as a
-                // result, during shutdown we do not wait for detach.
                error!("already shutting down, this function isn't supposed to be called more than once");
                return;
            }
        }
    };

+    let started_at = std::time::Instant::now();
+
    info!(
        "Waiting for {} InProgress tenants and {} Attached tenants to shut down",
-        in_progress_ops.len(),
-        tenants_to_shut_down.len()
+        total_in_progress, total_attached
    );

-    for barrier in in_progress_ops {
-        barrier.wait().await;
-    }
-
-    info!(
-        "InProgress tenants shut down, waiting for {} Attached tenants to shut down",
-        tenants_to_shut_down.len()
-    );
-    let started_at = std::time::Instant::now();
-    let mut join_set = JoinSet::new();
-    for tenant in tenants_to_shut_down {
-        let tenant_id = tenant.get_tenant_id();
-        join_set.spawn(
-            async move {
-                let freeze_and_flush = true;
-
-                let res = {
-                    let (_guard, shutdown_progress) = completion::channel();
-                    tenant.shutdown(shutdown_progress, freeze_and_flush).await
-                };
-
-                if let Err(other_progress) = res {
-                    // join the another shutdown in progress
-                    other_progress.wait().await;
-                }
-
-                // we cannot afford per tenant logging here, because if s3 is degraded, we are
-                // going to log too many lines
-
-                debug!("tenant successfully stopped");
-            }
-            .instrument(info_span!("shutdown", %tenant_id)),
-        );
-    }
-
    let total = join_set.len();
    let mut panicked = 0;
    let mut buffering = true;
@@ -640,7 +652,7 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
                match joined {
                    Ok(()) => {}
                    Err(join_error) if join_error.is_cancelled() => {
-                        unreachable!("we are not cancelling any of the futures");
+                        unreachable!("we are not cancelling any of the tasks");
                    }
                    Err(join_error) if join_error.is_panic() => {
                        // cannot really do anything, as this panic is likely a bug
@@ -742,139 +754,134 @@ pub(crate) async fn set_new_tenant_config(
    Ok(())
 }

-#[instrument(skip_all, fields(%tenant_id))]
-pub(crate) async fn upsert_location(
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    new_location_config: LocationConf,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
-    deletion_queue_client: DeletionQueueClient,
-    ctx: &RequestContext,
-) -> Result<(), anyhow::Error> {
-    info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
+impl TenantManager {
+    #[instrument(skip_all, fields(%tenant_id))]
+    pub(crate) async fn upsert_location(
+        &self,
+        tenant_id: TenantId,
+        new_location_config: LocationConf,
+        ctx: &RequestContext,
+    ) -> Result<(), anyhow::Error> {
+        info!("configuring tenant location {tenant_id} to state {new_location_config:?}");

-    // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
-    // then we do not need to set the slot to InProgress, we can just call into the
-    // existng tenant.
-    {
-        let locked = TENANTS.read().unwrap();
-        let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
-        match (&new_location_config.mode, peek_slot) {
-            (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
-                if attach_conf.generation == tenant.generation {
-                    // A transition from Attached to Attached in the same generation, we may
-                    // take our fast path and just provide the updated configuration
-                    // to the tenant.
-                    tenant.set_new_location_config(AttachedTenantConf::try_from(
-                        new_location_config,
-                    )?);
+        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
+        // then we do not need to set the slot to InProgress, we can just call into the
+        // existng tenant.
+        {
+            let locked = self.tenants.read().unwrap();
+            let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
+            match (&new_location_config.mode, peek_slot) {
+                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
+                    if attach_conf.generation == tenant.generation {
+                        // A transition from Attached to Attached in the same generation, we may
+                        // take our fast path and just provide the updated configuration
+                        // to the tenant.
+                        tenant.set_new_location_config(AttachedTenantConf::try_from(
+                            new_location_config,
+                        )?);

-                    // Persist the new config in the background, to avoid holding up any
-                    // locks while we do so.
-                    // TODO
+                        // Persist the new config in the background, to avoid holding up any
+                        // locks while we do so.
+                        // TODO

-                    return Ok(());
-                } else {
-                    // Different generations, fall through to general case
+                        return Ok(());
+                    } else {
+                        // Different generations, fall through to general case
+                    }
+                }
+                _ => {
+                    // Not an Attached->Attached transition, fall through to general case
                }
            }
-            _ => {
-                // Not an Attached->Attached transition, fall through to general case
-            }
        }
-    }

-    // General case for upserts to TenantsMap, excluding the case above: we will substitute an
-    // InProgress value to the slot while we make whatever changes are required.  The state for
-    // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
-    // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
-    // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
-    let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;
+        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
+        // InProgress value to the slot while we make whatever changes are required.  The state for
+        // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
+        // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
+        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;

-    if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
-        // The case where we keep a Tenant alive was covered above in the special case
-        // for Attached->Attached transitions in the same generation.  By this point,
-        // if we see an attached tenant we know it will be discarded and should be
-        // shut down.
-        let (_guard, progress) = utils::completion::channel();
+        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
+            // The case where we keep a Tenant alive was covered above in the special case
+            // for Attached->Attached transitions in the same generation.  By this point,
+            // if we see an attached tenant we know it will be discarded and should be
+            // shut down.
+            let (_guard, progress) = utils::completion::channel();

-        match tenant.get_attach_mode() {
-            AttachmentMode::Single | AttachmentMode::Multi => {
-                // Before we leave our state as the presumed holder of the latest generation,
-                // flush any outstanding deletions to reduce the risk of leaking objects.
-                deletion_queue_client.flush_advisory()
+            match tenant.get_attach_mode() {
+                AttachmentMode::Single | AttachmentMode::Multi => {
+                    // Before we leave our state as the presumed holder of the latest generation,
+                    // flush any outstanding deletions to reduce the risk of leaking objects.
+                    self.resources.deletion_queue_client.flush_advisory()
+                }
+                AttachmentMode::Stale => {
+                    // If we're stale there's not point trying to flush deletions
+                }
+            };
+
+            info!("Shutting down attached tenant");
+            match tenant.shutdown(progress, false).await {
+                Ok(()) => {}
+                Err(barrier) => {
+                    info!("Shutdown already in progress, waiting for it to complete");
+                    barrier.wait().await;
+                }
            }
-            AttachmentMode::Stale => {
-                // If we're stale there's not point trying to flush deletions
+            slot_guard.drop_old_value().expect("We just shut it down");
+        }
+
+        let tenant_path = self.conf.tenant_path(&tenant_id);
+
+        let new_slot = match &new_location_config.mode {
+            LocationMode::Secondary(_) => {
+                let tenant_path = self.conf.tenant_path(&tenant_id);
+                // Directory doesn't need to be fsync'd because if we crash it can
+                // safely be recreated next time this tenant location is configured.
+                unsafe_create_dir_all(&tenant_path)
+                    .await
+                    .with_context(|| format!("Creating {tenant_path}"))?;
+
+                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                TenantSlot::Secondary
+            }
+            LocationMode::Attached(_attach_config) => {
+                let timelines_path = self.conf.timelines_path(&tenant_id);
+
+                // Directory doesn't need to be fsync'd because we do not depend on
+                // it to exist after crashes: it may be recreated when tenant is
+                // re-attached, see https://github.com/neondatabase/neon/issues/5550
+                unsafe_create_dir_all(&timelines_path)
+                    .await
+                    .with_context(|| format!("Creating {timelines_path}"))?;
+
+                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                let tenant = tenant_spawn(
+                    self.conf,
+                    tenant_id,
+                    &tenant_path,
+                    self.resources.clone(),
+                    AttachedTenantConf::try_from(new_location_config)?,
+                    None,
+                    self.tenants,
+                    SpawnMode::Normal,
+                    ctx,
+                )?;
+
+                TenantSlot::Attached(tenant)
            }
        };

-        info!("Shutting down attached tenant");
-        match tenant.shutdown(progress, false).await {
-            Ok(()) => {}
-            Err(barrier) => {
-                info!("Shutdown already in progress, waiting for it to complete");
-                barrier.wait().await;
-            }
-        }
-        slot_guard.drop_old_value().expect("We just shut it down");
+        slot_guard.upsert(new_slot)?;
+
+        Ok(())
    }
-
-    let tenant_path = conf.tenant_path(&tenant_id);
-
-    let new_slot = match &new_location_config.mode {
-        LocationMode::Secondary(_) => {
-            let tenant_path = conf.tenant_path(&tenant_id);
-            // Directory doesn't need to be fsync'd because if we crash it can
-            // safely be recreated next time this tenant location is configured.
-            unsafe_create_dir_all(&tenant_path)
-                .await
-                .with_context(|| format!("Creating {tenant_path}"))?;
-
-            Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
-                .await
-                .map_err(SetNewTenantConfigError::Persist)?;
-
-            TenantSlot::Secondary
-        }
-        LocationMode::Attached(_attach_config) => {
-            let timelines_path = conf.timelines_path(&tenant_id);
-
-            // Directory doesn't need to be fsync'd because we do not depend on
-            // it to exist after crashes: it may be recreated when tenant is
-            // re-attached, see https://github.com/neondatabase/neon/issues/5550
-            unsafe_create_dir_all(&timelines_path)
-                .await
-                .with_context(|| format!("Creating {timelines_path}"))?;
-
-            Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
-                .await
-                .map_err(SetNewTenantConfigError::Persist)?;
-
-            let tenant = tenant_spawn(
-                conf,
-                tenant_id,
-                &tenant_path,
-                TenantSharedResources {
-                    broker_client,
-                    remote_storage,
-                    deletion_queue_client,
-                },
-                AttachedTenantConf::try_from(new_location_config)?,
-                None,
-                &TENANTS,
-                SpawnMode::Normal,
-                ctx,
-            )?;
-
-            TenantSlot::Attached(tenant)
-        }
-    };
-
-    slot_guard.upsert(new_slot)?;
-
-    Ok(())
 }

 #[derive(Debug, thiserror::Error)]
@@ -1430,9 +1437,6 @@ pub struct SlotGuard {
    _completion: utils::completion::Completion,
 }

-unsafe impl Send for SlotGuard {}
-unsafe impl Sync for SlotGuard {}
-
 impl SlotGuard {
    fn new(
        tenant_id: TenantId,
@@ -1539,14 +1543,7 @@ impl SlotGuard {
    /// is responsible for protecting
    fn old_value_is_shutdown(&self) -> bool {
        match self.old_value.as_ref() {
-            Some(TenantSlot::Attached(tenant)) => {
-                // TODO: PR #5711 will add a gate that enables properly checking that
-                // shutdown completed.
-                matches!(
-                    tenant.current_state(),
-                    TenantState::Stopping { .. } | TenantState::Broken { .. }
-                )
-            }
+            Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
            Some(TenantSlot::Secondary) => {
                // TODO: when adding secondary mode tenants, this will check for shutdown
                // in the same way that we do for `Tenant` above
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -6,7 +6,6 @@ use std::sync::Arc;
 use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
-use tokio_util::sync::CancellationToken;

 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
@@ -350,10 +349,6 @@ async fn fill_logical_sizes(
    // our advantage with `?` error handling.
    let mut joinset = tokio::task::JoinSet::new();

-    let cancel = tokio_util::sync::CancellationToken::new();
-    // be sure to cancel all spawned tasks if we are dropped
-    let _dg = cancel.clone().drop_guard();
-
    // For each point that would benefit from having a logical size available,
    // spawn a Task to fetch it, unless we have it cached already.
    for seg in segments.iter() {
@@ -371,15 +366,8 @@ async fn fill_logical_sizes(
                let parallel_size_calcs = Arc::clone(limit);
                let ctx = ctx.attached_child();
                joinset.spawn(
-                    calculate_logical_size(
-                        parallel_size_calcs,
-                        timeline,
-                        lsn,
-                        cause,
-                        ctx,
-                        cancel.child_token(),
-                    )
-                    .in_current_span(),
+                    calculate_logical_size(parallel_size_calcs, timeline, lsn, cause, ctx)
+                        .in_current_span(),
                );
            }
            e.insert(cached_size);
@@ -487,14 +475,13 @@ async fn calculate_logical_size(
    lsn: utils::lsn::Lsn,
    cause: LogicalSizeCalculationCause,
    ctx: RequestContext,
-    cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
    let _permit = tokio::sync::Semaphore::acquire_owned(limit)
        .await
        .expect("global semaphore should not had been closed");

    let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx)
        .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
        .await?;
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -251,6 +251,7 @@ impl Layer {

        layer
            .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
+            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
            .await
    }

@@ -1211,8 +1212,10 @@ impl DownloadedLayer {
            // this will be a permanent failure
            .context("load layer");

-            if res.is_err() {
+            if let Err(e) = res.as_ref() {
                LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+                // TODO(#5815): we are not logging all errors, so temporarily log them here as well
+                tracing::error!("layer loading failed permanently: {e:#}");
            }
            res
        };
@@ -1291,6 +1294,7 @@ impl ResidentLayer {
    }

    /// Loads all keys stored in the layer. Returns key, lsn and value size.
+    #[tracing::instrument(skip_all, fields(layer=%self))]
    pub(crate) async fn load_keys<'a>(
        &'a self,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -36,7 +36,6 @@ use std::time::{Duration, Instant, SystemTime};
 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
 };
-use crate::deletion_queue::DeletionQueueClient;
 use crate::tenant::storage_layer::delta_layer::DeltaEntry;
 use crate::tenant::storage_layer::{
    AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
@@ -50,6 +49,7 @@ use crate::tenant::{
    metadata::{save_metadata, TimelineMetadata},
    par_fsync,
 };
+use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
@@ -247,7 +247,7 @@ pub struct Timeline {
    /// the flush finishes. You can use that to wait for the flush to finish.
    layer_flush_start_tx: tokio::sync::watch::Sender<u64>,
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
-    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, anyhow::Result<()>)>,
+    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,

    /// Layer removal lock.
    /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks.
@@ -374,6 +374,19 @@ pub enum PageReconstructError {
    WalRedo(anyhow::Error),
 }

+#[derive(thiserror::Error, Debug)]
+enum FlushLayerError {
+    /// Timeline cancellation token was cancelled
+    #[error("timeline shutting down")]
+    Cancelled,
+
+    #[error(transparent)]
+    PageReconstructError(#[from] PageReconstructError),
+
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 impl std::fmt::Debug for PageReconstructError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        match self {
@@ -891,15 +904,16 @@ impl Timeline {
        self.launch_eviction_task(background_jobs_can_start);
    }

+    /// Graceful shutdown, may do a lot of I/O as we flush any open layers to disk and then
+    /// also to remote storage.  This method can easily take multiple seconds for a busy timeline.
+    ///
+    /// While we are flushing, we continue to accept read I/O.
    #[instrument(skip_all, fields(timeline_id=%self.timeline_id))]
-    pub async fn shutdown(self: &Arc<Self>, freeze_and_flush: bool) {
+    pub(crate) async fn flush_and_shutdown(&self) {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Signal any subscribers to our cancellation token to drop out
-        tracing::debug!("Cancelling CancellationToken");
-        self.cancel.cancel();
-
-        // prevent writes to the InMemoryLayer
+        // Stop ingesting data, so that we are not still writing to an InMemoryLayer while
+        // trying to flush
        tracing::debug!("Waiting for WalReceiverManager...");
        task_mgr::shutdown_tasks(
            Some(TaskKind::WalReceiverManager),
@@ -908,40 +922,70 @@ impl Timeline {
        )
        .await;

+        // Since we have shut down WAL ingest, we should not let anyone start waiting for the LSN to advance
+        self.last_record_lsn.shutdown();
+
        // now all writers to InMemory layer are gone, do the final flush if requested
-        if freeze_and_flush {
-            match self.freeze_and_flush().await {
-                Ok(()) => {}
-                Err(e) => {
-                    warn!("failed to freeze and flush: {e:#}");
-                    return; // TODO: should probably drain remote timeline client anyways?
+        match self.freeze_and_flush().await {
+            Ok(_) => {
+                // drain the upload queue
+                if let Some(client) = self.remote_client.as_ref() {
+                    // if we did not wait for completion here, it might be our shutdown process
+                    // didn't wait for remote uploads to complete at all, as new tasks can forever
+                    // be spawned.
+                    //
+                    // what is problematic is the shutting down of RemoteTimelineClient, because
+                    // obviously it does not make sense to stop while we wait for it, but what
+                    // about corner cases like s3 suddenly hanging up?
+                    if let Err(e) = client.wait_completion().await {
+                        // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                        // we have some extra WAL replay to do next time the timeline starts.
+                        warn!("failed to flush to remote storage: {e:#}");
+                    }
                }
            }
-
-            // drain the upload queue
-            let res = if let Some(client) = self.remote_client.as_ref() {
-                // if we did not wait for completion here, it might be our shutdown process
-                // didn't wait for remote uploads to complete at all, as new tasks can forever
-                // be spawned.
-                //
-                // what is problematic is the shutting down of RemoteTimelineClient, because
-                // obviously it does not make sense to stop while we wait for it, but what
-                // about corner cases like s3 suddenly hanging up?
-                client.wait_completion().await
-            } else {
-                Ok(())
-            };
-
-            if let Err(e) = res {
-                warn!("failed to await for frozen and flushed uploads: {e:#}");
+            Err(e) => {
+                // Non-fatal.  Shutdown is infallible.  Failures to flush just mean that
+                // we have some extra WAL replay to do next time the timeline starts.
+                warn!("failed to freeze and flush: {e:#}");
            }
        }

+        self.shutdown().await;
+    }
+
+    /// Shut down immediately, without waiting for any open layers to flush to disk.  This is a subset of
+    /// the graceful [`Timeline::flush_and_shutdown`] function.
+    pub(crate) async fn shutdown(&self) {
+        // Signal any subscribers to our cancellation token to drop out
+        tracing::debug!("Cancelling CancellationToken");
+        self.cancel.cancel();
+
        // Page request handlers might be waiting for LSN to advance: they do not respect Timeline::cancel
        // while doing so.
        self.last_record_lsn.shutdown();

+        // Shut down the layer flush task before the remote client, as one depends on the other
+        task_mgr::shutdown_tasks(
+            Some(TaskKind::LayerFlushTask),
+            Some(self.tenant_id),
+            Some(self.timeline_id),
+        )
+        .await;
+
+        // Shut down remote timeline client: this gracefully moves its metadata into its Stopping state in
+        // case our caller wants to use that for a deletion
+        if let Some(remote_client) = self.remote_client.as_ref() {
+            match remote_client.stop() {
+                Ok(()) => {}
+                Err(StopError::QueueUninitialized) => {
+                    // Shutting down during initialization is legal
+                }
+            }
+        }
+
        tracing::debug!("Waiting for tasks...");
+
        task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(self.timeline_id)).await;

        // Finally wait until any gate-holders are complete
@@ -985,7 +1029,12 @@ impl Timeline {
            reason,
            backtrace: backtrace_str,
        };
-        self.set_state(broken_state)
+        self.set_state(broken_state);
+
+        // Although the Broken state is not equivalent to shutdown() (shutdown will be called
+        // later when this tenant is detach or the process shuts down), firing the cancellation token
+        // here avoids the need for other tasks to watch for the Broken state explicitly.
+        self.cancel.cancel();
    }

    pub fn current_state(&self) -> TimelineState {
@@ -1741,12 +1790,8 @@ impl Timeline {
                // delay will be terminated by a timeout regardless.
                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };

-                // no extra cancellation here, because nothing really waits for this to complete compared
-                // to spawn_ondemand_logical_size_calculation.
-                let cancel = CancellationToken::new();
-
                let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
                    .await
                {
                    Ok(s) => s,
@@ -1815,7 +1860,6 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: RequestContext,
-        cancel: CancellationToken,
    ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
        let (sender, receiver) = oneshot::channel();
        let self_clone = Arc::clone(self);
@@ -1836,7 +1880,7 @@ impl Timeline {
            false,
            async move {
                let res = self_clone
-                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
+                    .logical_size_calculation_task(lsn, cause, &ctx)
                    .await;
                let _ = sender.send(res).ok();
                Ok(()) // Receiver is responsible for handling errors
@@ -1852,58 +1896,28 @@ impl Timeline {
        lsn: Lsn,
        cause: LogicalSizeCalculationCause,
        ctx: &RequestContext,
-        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
        span::debug_assert_current_span_has_tenant_and_timeline_id();

-        let mut timeline_state_updates = self.subscribe_for_state_updates();
+        let _guard = self.gate.enter();
+
        let self_calculation = Arc::clone(self);

        let mut calculation = pin!(async {
-            let cancel = cancel.child_token();
            let ctx = ctx.attached_child();
            self_calculation
-                .calculate_logical_size(lsn, cause, cancel, &ctx)
+                .calculate_logical_size(lsn, cause, &ctx)
                .await
        });
-        let timeline_state_cancellation = async {
-            loop {
-                match timeline_state_updates.changed().await {
-                    Ok(()) => {
-                        let new_state = timeline_state_updates.borrow().clone();
-                        match new_state {
-                            // we're running this job for active timelines only
-                            TimelineState::Active => continue,
-                            TimelineState::Broken { .. }
-                            | TimelineState::Stopping
-                            | TimelineState::Loading => {
-                                break format!("aborted because timeline became inactive (new state: {new_state:?})")
-                            }
-                        }
-                    }
-                    Err(_sender_dropped_error) => {
-                        // can't happen, the sender is not dropped as long as the Timeline exists
-                        break "aborted because state watch was dropped".to_string();
-                    }
-                }
-            }
-        };
-
-        let taskmgr_shutdown_cancellation = async {
-            task_mgr::shutdown_watcher().await;
-            "aborted because task_mgr shutdown requested".to_string()
-        };

        tokio::select! {
            res = &mut calculation => { res }
-            reason = timeline_state_cancellation => {
-                debug!(reason = reason, "cancelling calculation");
-                cancel.cancel();
+            _ = self.cancel.cancelled() => {
+                debug!("cancelling logical size calculation for timeline shutdown");
                calculation.await
            }
-            reason = taskmgr_shutdown_cancellation => {
-                debug!(reason = reason, "cancelling calculation");
-                cancel.cancel();
+            _ = task_mgr::shutdown_watcher() => {
+                debug!("cancelling logical size calculation for task shutdown");
                calculation.await
            }
        }
@@ -1917,7 +1931,6 @@ impl Timeline {
        &self,
        up_to_lsn: Lsn,
        cause: LogicalSizeCalculationCause,
-        cancel: CancellationToken,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
        info!(
@@ -1960,7 +1973,7 @@ impl Timeline {
        };
        let timer = storage_time_metrics.start_timer();
        let logical_size = self
-            .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
+            .get_current_logical_size_non_incremental(up_to_lsn, ctx)
            .await?;
        debug!("calculated logical size: {logical_size}");
        timer.stop_and_record();
@@ -2373,6 +2386,10 @@ impl Timeline {
        info!("started flush loop");
        loop {
            tokio::select! {
+                _ = self.cancel.cancelled() => {
+                    info!("shutting down layer flush task");
+                    break;
+                },
                _ = task_mgr::shutdown_watcher() => {
                    info!("shutting down layer flush task");
                    break;
@@ -2384,6 +2401,14 @@ impl Timeline {
            let timer = self.metrics.flush_time_histo.start_timer();
            let flush_counter = *layer_flush_start_rx.borrow();
            let result = loop {
+                if self.cancel.is_cancelled() {
+                    info!("dropping out of flush loop for timeline shutdown");
+                    // Note: we do not bother transmitting into [`layer_flush_done_tx`], because
+                    // anyone waiting on that will respect self.cancel as well: they will stop
+                    // waiting at the same time we as drop out of this loop.
+                    return;
+                }
+
                let layer_to_flush = {
                    let guard = self.layers.read().await;
                    guard.layer_map().frozen_layers.front().cloned()
@@ -2392,9 +2417,18 @@ impl Timeline {
                let Some(layer_to_flush) = layer_to_flush else {
                    break Ok(());
                };
-                if let Err(err) = self.flush_frozen_layer(layer_to_flush, ctx).await {
-                    error!("could not flush frozen layer: {err:?}");
-                    break Err(err);
+                match self.flush_frozen_layer(layer_to_flush, ctx).await {
+                    Ok(()) => {}
+                    Err(FlushLayerError::Cancelled) => {
+                        info!("dropping out of flush loop for timeline shutdown");
+                        return;
+                    }
+                    err @ Err(
+                        FlushLayerError::Other(_) | FlushLayerError::PageReconstructError(_),
+                    ) => {
+                        error!("could not flush frozen layer: {err:?}");
+                        break err;
+                    }
                }
            };
            // Notify any listeners that we're done
@@ -2443,7 +2477,17 @@ impl Timeline {
                }
            }
            trace!("waiting for flush to complete");
-            rx.changed().await?;
+            tokio::select! {
+                rx_e = rx.changed() => {
+                    rx_e?;
+                },
+                // Cancellation safety: we are not leaving an I/O in-flight for the flush, we're just ignoring
+                // the notification from [`flush_loop`] that it completed.
+                _ = self.cancel.cancelled() => {
+                    tracing::info!("Cancelled layer flush due on timeline shutdown");
+                    return Ok(())
+                }
+            };
            trace!("done")
        }
    }
@@ -2458,7 +2502,7 @@ impl Timeline {
        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), FlushLayerError> {
        // As a special case, when we have just imported an image into the repository,
        // instead of writing out a L0 delta layer, we directly write out image layer
        // files instead. This is possible as long as *all* the data imported into the
@@ -2483,6 +2527,11 @@ impl Timeline {
                let (partitioning, _lsn) = self
                    .repartition(self.initdb_lsn, self.get_compaction_target_size(), ctx)
                    .await?;
+
+                if self.cancel.is_cancelled() {
+                    return Err(FlushLayerError::Cancelled);
+                }
+
                // For image layers, we add them immediately into the layer map.
                (
                    self.create_image_layers(&partitioning, self.initdb_lsn, true, ctx)
@@ -2514,6 +2563,10 @@ impl Timeline {
                )
            };

+        if self.cancel.is_cancelled() {
+            return Err(FlushLayerError::Cancelled);
+        }
+
        let disk_consistent_lsn = Lsn(lsn_range.end.0 - 1);
        let old_disk_consistent_lsn = self.disk_consistent_lsn.load();

@@ -2523,6 +2576,10 @@ impl Timeline {
        let metadata = {
            let mut guard = self.layers.write().await;

+            if self.cancel.is_cancelled() {
+                return Err(FlushLayerError::Cancelled);
+            }
+
            guard.finish_flush_l0_layer(delta_layer_to_add.as_ref(), &frozen_layer, &self.metrics);

            if disk_consistent_lsn != old_disk_consistent_lsn {
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -26,6 +26,7 @@ use tracing::{debug, error, info, info_span, instrument, warn, Instrument};

 use crate::{
    context::{DownloadBehavior, RequestContext},
+    pgdatadir_mapping::CollectKeySpaceError,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
@@ -326,8 +327,7 @@ impl Timeline {
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
-                self.imitate_timeline_cached_layer_accesses(cancel, ctx)
-                    .await;
+                self.imitate_timeline_cached_layer_accesses(ctx).await;
                state.last_layer_access_imitation = Some(tokio::time::Instant::now())
            }
        }
@@ -367,21 +367,12 @@ impl Timeline {

    /// Recompute the values which would cause on-demand downloads during restart.
    #[instrument(skip_all)]
-    async fn imitate_timeline_cached_layer_accesses(
-        &self,
-        cancel: &CancellationToken,
-        ctx: &RequestContext,
-    ) {
+    async fn imitate_timeline_cached_layer_accesses(&self, ctx: &RequestContext) {
        let lsn = self.get_last_record_lsn();

        // imitiate on-restart initial logical size
        let size = self
-            .calculate_logical_size(
-                lsn,
-                LogicalSizeCalculationCause::EvictionTaskImitation,
-                cancel.clone(),
-                ctx,
-            )
+            .calculate_logical_size(lsn, LogicalSizeCalculationCause::EvictionTaskImitation, ctx)
            .instrument(info_span!("calculate_logical_size"))
            .await;

@@ -407,9 +398,16 @@ impl Timeline {
            if size.is_err() {
                // ignore, see above comment
            } else {
-                warn!(
-                    "failed to collect keyspace but succeeded in calculating logical size: {e:#}"
-                );
+                match e {
+                    CollectKeySpaceError::Cancelled => {
+                        // Shutting down, ignore
+                    }
+                    err => {
+                        warn!(
+                            "failed to collect keyspace but succeeded in calculating logical size: {err:#}"
+                        );
+                    }
+                }
            }
        }
    }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -43,8 +43,8 @@ use std::sync::atomic::{AtomicUsize, Ordering};

 use crate::config::PageServerConf;
 use crate::metrics::{
-    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
-    WAL_REDO_WAIT_TIME,
+    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
+    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
@@ -207,11 +207,8 @@ impl PostgresRedoManager {
    ) -> anyhow::Result<Bytes> {
        let (rel, blknum) = key_to_rel_block(key).context("invalid record")?;
        const MAX_RETRY_ATTEMPTS: u32 = 1;
-        let start_time = Instant::now();
        let mut n_attempts = 0u32;
        loop {
-            let lock_time = Instant::now();
-
            // launch the WAL redo process on first use
            let proc: Arc<WalRedoProcess> = {
                let proc_guard = self.redo_process.read().unwrap();
@@ -236,7 +233,7 @@ impl PostgresRedoManager {
                }
            };

-            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());
+            let started_at = std::time::Instant::now();

            // Relational WAL records are applied using wal-redo-postgres
            let buf_tag = BufferTag { rel, blknum };
@@ -244,8 +241,7 @@ impl PostgresRedoManager {
                .apply_wal_records(buf_tag, &base_img, records, wal_redo_timeout)
                .context("apply_wal_records");

-            let end_time = Instant::now();
-            let duration = end_time.duration_since(lock_time);
+            let duration = started_at.elapsed();

            let len = records.len();
            let nbytes = records.iter().fold(0, |acumulator, record| {
@@ -596,21 +592,21 @@ trait CloseFileDescriptors: CommandExt {

 impl<C: CommandExt> CloseFileDescriptors for C {
    fn close_fds(&mut self) -> &mut Command {
+        // SAFETY: Code executed inside pre_exec should have async-signal-safety,
+        // which means it should be safe to execute inside a signal handler.
+        // The precise meaning depends on platform. See `man signal-safety`
+        // for the linux definition.
+        //
+        // The set_fds_cloexec_threadsafe function is documented to be
+        // async-signal-safe.
+        //
+        // Aside from this function, the rest of the code is re-entrant and
+        // doesn't make any syscalls. We're just passing constants.
+        //
+        // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
+        // which is not async-signal-safe. Be careful.
        unsafe {
            self.pre_exec(move || {
-                // SAFETY: Code executed inside pre_exec should have async-signal-safety,
-                // which means it should be safe to execute inside a signal handler.
-                // The precise meaning depends on platform. See `man signal-safety`
-                // for the linux definition.
-                //
-                // The set_fds_cloexec_threadsafe function is documented to be
-                // async-signal-safe.
-                //
-                // Aside from this function, the rest of the code is re-entrant and
-                // doesn't make any syscalls. We're just passing constants.
-                //
-                // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
-                // which is not async-signal-safe. Be careful.
                close_fds::set_fds_cloexec_threadsafe(3, &[]);
                Ok(())
            })
@@ -667,10 +663,10 @@ impl WalRedoProcess {
            .close_fds()
            .spawn_no_leak_child(tenant_id)
            .context("spawn process")?;
-
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
        let mut child = scopeguard::guard(child, |child| {
            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait();
+            child.kill_and_wait(WalRedoKillCause::Startup);
        });

        let stdin = child.stdin.take().unwrap();
@@ -1001,7 +997,7 @@ impl Drop for WalRedoProcess {
        self.child
            .take()
            .expect("we only do this once")
-            .kill_and_wait();
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
        self.stderr_logger_cancel.cancel();
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
@@ -1037,16 +1033,19 @@ impl NoLeakChild {
        })
    }

-    fn kill_and_wait(mut self) {
+    fn kill_and_wait(mut self, cause: WalRedoKillCause) {
        let child = match self.child.take() {
            Some(child) => child,
            None => return,
        };
-        Self::kill_and_wait_impl(child);
+        Self::kill_and_wait_impl(child, cause);
    }

-    #[instrument(skip_all, fields(pid=child.id()))]
-    fn kill_and_wait_impl(mut child: Child) {
+    #[instrument(skip_all, fields(pid=child.id(), ?cause))]
+    fn kill_and_wait_impl(mut child: Child, cause: WalRedoKillCause) {
+        scopeguard::defer! {
+            WAL_REDO_PROCESS_COUNTERS.killed_by_cause[cause].inc();
+        }
        let res = child.kill();
        if let Err(e) = res {
            // This branch is very unlikely because:
@@ -1091,7 +1090,7 @@ impl Drop for NoLeakChild {
                // This thread here is going to outlive of our dropper.
                let span = tracing::info_span!("walredo", %tenant_id);
                let _entered = span.enter();
-                Self::kill_and_wait_impl(child);
+                Self::kill_and_wait_impl(child, WalRedoKillCause::NoLeakChildDrop);
            })
            .await
        });
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,7 +19,10 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
+#include "storage/lwlock.h"
+#include "storage/ipc.h"
 #include "c.h"
+#include "postmaster/interrupt.h"

 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -61,23 +64,63 @@ int			flush_every_n_requests = 8;
 int			n_reconnect_attempts = 0;
 int			max_reconnect_attempts = 60;

+#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+
+typedef struct
+{
+    LWLockId lock;
+    pg_atomic_uint64 update_counter;
+    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+} PagestoreShmemState;
+
+#if PG_VERSION_NUM >= 150000
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+static void walproposer_shmem_request(void);
+#endif
+static shmem_startup_hook_type prev_shmem_startup_hook;
+static PagestoreShmemState *pagestore_shared;
+static uint64 pagestore_local_counter = 0;
+static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+
 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);

-
-static pqsigfunc	 prev_signal_handler;
+static bool
+CheckPageserverConnstring(char **newval, void **extra, GucSource source)
+{
+    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+}

 static void
-pageserver_sighup_handler(SIGNAL_ARGS)
+AssignPageserverConnstring(const char *newval, void *extra)
 {
-	if (prev_signal_handler)
-	{
-        	prev_signal_handler(postgres_signal_arg);
-	}
-	neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
-	pageserver_disconnect();
+    if(!pagestore_shared)
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
+    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
+    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
+    LWLockRelease(pagestore_shared->lock);
+}
+
+static bool
+CheckConnstringUpdated()
+{
+    if(!pagestore_shared)
+        return false;
+    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+}
+
+static void
+ReloadConnstring()
+{
+    if(!pagestore_shared)
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
+    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
+    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
@@ -91,6 +134,11 @@ pageserver_connect(int elevel)

 	Assert(!connected);

+        if(CheckConnstringUpdated())
+        {
+            ReloadConnstring();
+        }
+
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -110,7 +158,7 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = page_server_connstring;
+	values[n] = local_pageserver_connstring;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -254,6 +302,12 @@ pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;

+        if(CheckConnstringUpdated())
+        {
+            pageserver_disconnect();
+            ReloadConnstring();
+        }
+
 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
@@ -274,6 +328,7 @@ pageserver_send(NeonRequest * request)
 	{
 		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
+			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
 			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
@@ -391,7 +446,8 @@ pageserver_flush(void)
 	return true;
 }

-page_server_api api = {
+page_server_api api =
+{
 	.send = pageserver_send,
 	.flush = pageserver_flush,
 	.receive = pageserver_receive
@@ -405,12 +461,72 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }

+static Size
+PagestoreShmemSize(void)
+{
+    return sizeof(PagestoreShmemState);
+}
+
+static bool
+PagestoreShmemInit(void)
+{
+    bool found;
+    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+    pagestore_shared = ShmemInitStruct("libpagestore shared state",
+                                       PagestoreShmemSize(),
+                                       &found);
+    if(!found)
+    {
+        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
+        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
+        AssignPageserverConnstring(page_server_connstring, NULL);
+    }
+    LWLockRelease(AddinShmemInitLock);
+    return found;
+}
+
+static void
+pagestore_shmem_startup_hook(void)
+{
+    if(prev_shmem_startup_hook)
+        prev_shmem_startup_hook();
+
+    PagestoreShmemInit();
+}
+
+static void
+pagestore_shmem_request(void)
+{
+#if PG_VERSION_NUM >= 150000
+    if(prev_shmem_request_hook)
+        prev_shmem_request_hook();
+#endif
+
+    RequestAddinShmemSpace(PagestoreShmemSize());
+    RequestNamedLWLockTranche("neon_libpagestore", 1);
+}
+
+static void
+pagestore_prepare_shmem(void)
+{
+#if PG_VERSION_NUM >= 150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = pagestore_shmem_request;
+#else
+        pagestore_shmem_request();
+#endif
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = pagestore_shmem_startup_hook;
+}
+
 /*
 * Module initialization function
 */
 void
 pg_init_libpagestore(void)
 {
+        pagestore_prepare_shmem();
+
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
@@ -418,7 +534,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
-							   NULL, NULL, NULL);
+							   CheckPageserverConnstring, AssignPageserverConnstring, NULL);

 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
@@ -499,7 +615,5 @@ pg_init_libpagestore(void)
 		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}

-        prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
-
 	lfc_init();
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,6 +1,8 @@
 //! User credentials used in authentication.

-use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
+use crate::{
+    auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
+};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use std::collections::HashSet;
@@ -38,6 +40,8 @@ pub struct ClientCredentials<'a> {
    pub user: &'a str,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<String>,
+
+    pub cache_key: String,
 }

 impl ClientCredentials<'_> {
@@ -53,6 +57,7 @@ impl<'a> ClientCredentials<'a> {
        ClientCredentials {
            user: "",
            project: None,
+            cache_key: "".to_string(),
        }
    }

@@ -120,7 +125,17 @@ impl<'a> ClientCredentials<'a> {

        info!(user, project = project.as_deref(), "credentials");

-        Ok(Self { user, project })
+        let cache_key = format!(
+            "{}{}",
+            project.as_deref().unwrap_or(""),
+            neon_options(params).unwrap_or("".to_string())
+        );
+
+        Ok(Self {
+            user,
+            project,
+            cache_key,
+        })
    }
 }

@@ -176,6 +191,7 @@ mod tests {
        let creds = ClientCredentials::parse(&options, sni, common_names)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));
+        assert_eq!(creds.cache_key, "foo");

        Ok(())
    }
@@ -303,4 +319,23 @@ mod tests {
            _ => panic!("bad error: {err:?}"),
        }
    }
+
+    #[test]
+    fn parse_neon_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "neon_lsn:0/2 neon_endpoint_type:read_write"),
+        ]);
+
+        let sni = Some("project.localhost");
+        let common_names = Some(["localhost".into()].into());
+        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        assert_eq!(creds.project.as_deref(), Some("project"));
+        assert_eq!(
+            creds.cache_key,
+            "projectneon_endpoint_type:read_write neon_lsn:0/2"
+        );
+
+        Ok(())
+    }
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -80,6 +80,9 @@ struct ProxyCliArgs {
    /// cache for `wake_compute` api method (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
    wake_compute_cache: String,
+    /// lock for `wake_compute` api method. example: "shards=32,permits=4,epoch=10m,timeout=1s". (use `permits=0` to disable).
+    #[clap(long, default_value = config::WakeComputeLockOptions::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK)]
+    wake_compute_lock: String,
    /// Allow self-signed certificates for compute nodes (for testing)
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    allow_self_signed_compute: bool,
@@ -220,10 +223,23 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
                node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
            }));

+            let config::WakeComputeLockOptions {
+                shards,
+                permits,
+                epoch,
+                timeout,
+            } = args.wake_compute_lock.parse()?;
+            info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
+            let locks = Box::leak(Box::new(
+                console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout)
+                    .unwrap(),
+            ));
+            tokio::spawn(locks.garbage_collect_worker(epoch));
+
            let url = args.auth_endpoint.parse()?;
            let endpoint = http::Endpoint::new(url, http::new_client());

-            let api = console::provider::neon::Api::new(endpoint, caches);
+            let api = console::provider::neon::Api::new(endpoint, caches, locks);
            auth::BackendType::Console(Cow::Owned(api), ())
        }
        AuthBackend::Postgres => {
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -3,6 +3,7 @@ use crate::{
    cancellation::CancelClosure,
    console::errors::WakeComputeError,
    error::{io_error, UserFacingError},
+    proxy::is_neon_param,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -278,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
-        .filter(|opt| parse_endpoint_param(opt).is_none())
+        .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

@@ -313,5 +314,11 @@ mod tests {

        let params = StartupMessageParams::new([("options", "project = foo")]);
        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+
+        let params = StartupMessageParams::new([(
+            "options",
+            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
+        )]);
+        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
    }
 }
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -264,6 +264,79 @@ impl FromStr for CacheOptions {
    }
 }

+/// Helper for cmdline cache options parsing.
+pub struct WakeComputeLockOptions {
+    /// The number of shards the lock map should have
+    pub shards: usize,
+    /// The number of allowed concurrent requests for each endpoitn
+    pub permits: usize,
+    /// Garbage collection epoch
+    pub epoch: Duration,
+    /// Lock timeout
+    pub timeout: Duration,
+}
+
+impl WakeComputeLockOptions {
+    /// Default options for [`crate::console::provider::ApiLocks`].
+    pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0";
+
+    // pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "shards=32,permits=4,epoch=10m,timeout=1s";
+
+    /// Parse lock options passed via cmdline.
+    /// Example: [`Self::DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK`].
+    fn parse(options: &str) -> anyhow::Result<Self> {
+        let mut shards = None;
+        let mut permits = None;
+        let mut epoch = None;
+        let mut timeout = None;
+
+        for option in options.split(',') {
+            let (key, value) = option
+                .split_once('=')
+                .with_context(|| format!("bad key-value pair: {option}"))?;
+
+            match key {
+                "shards" => shards = Some(value.parse()?),
+                "permits" => permits = Some(value.parse()?),
+                "epoch" => epoch = Some(humantime::parse_duration(value)?),
+                "timeout" => timeout = Some(humantime::parse_duration(value)?),
+                unknown => bail!("unknown key: {unknown}"),
+            }
+        }
+
+        // these dont matter if lock is disabled
+        if let Some(0) = permits {
+            timeout = Some(Duration::default());
+            epoch = Some(Duration::default());
+            shards = Some(2);
+        }
+
+        let out = Self {
+            shards: shards.context("missing `shards`")?,
+            permits: permits.context("missing `permits`")?,
+            epoch: epoch.context("missing `epoch`")?,
+            timeout: timeout.context("missing `timeout`")?,
+        };
+
+        ensure!(out.shards > 1, "shard count must be > 1");
+        ensure!(
+            out.shards.is_power_of_two(),
+            "shard count must be a power of two"
+        );
+
+        Ok(out)
+    }
+}
+
+impl FromStr for WakeComputeLockOptions {
+    type Err = anyhow::Error;
+
+    fn from_str(options: &str) -> Result<Self, Self::Err> {
+        let error = || format!("failed to parse cache lock options '{options}'");
+        Self::parse(options).with_context(error)
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
@@ -288,4 +361,42 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn test_parse_lock_options() -> anyhow::Result<()> {
+        let WakeComputeLockOptions {
+            epoch,
+            permits,
+            shards,
+            timeout,
+        } = "shards=32,permits=4,epoch=10m,timeout=1s".parse()?;
+        assert_eq!(epoch, Duration::from_secs(10 * 60));
+        assert_eq!(timeout, Duration::from_secs(1));
+        assert_eq!(shards, 32);
+        assert_eq!(permits, 4);
+
+        let WakeComputeLockOptions {
+            epoch,
+            permits,
+            shards,
+            timeout,
+        } = "epoch=60s,shards=16,timeout=100ms,permits=8".parse()?;
+        assert_eq!(epoch, Duration::from_secs(60));
+        assert_eq!(timeout, Duration::from_millis(100));
+        assert_eq!(shards, 16);
+        assert_eq!(permits, 8);
+
+        let WakeComputeLockOptions {
+            epoch,
+            permits,
+            shards,
+            timeout,
+        } = "permits=0".parse()?;
+        assert_eq!(epoch, Duration::ZERO);
+        assert_eq!(timeout, Duration::ZERO);
+        assert_eq!(shards, 2);
+        assert_eq!(permits, 0);
+
+        Ok(())
+    }
 }
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -13,5 +13,10 @@ pub mod caches {
    pub use super::provider::{ApiCaches, NodeInfoCache};
 }

+/// Various cache-related types.
+pub mod locks {
+    pub use super::provider::ApiLocks;
+}
+
 /// Console's management API.
 pub mod mgmt;
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -8,7 +8,13 @@ use crate::{
    compute, scram,
 };
 use async_trait::async_trait;
-use std::sync::Arc;
+use dashmap::DashMap;
+use std::{sync::Arc, time::Duration};
+use tokio::{
+    sync::{OwnedSemaphorePermit, Semaphore},
+    time::Instant,
+};
+use tracing::info;

 pub mod errors {
    use crate::{
@@ -149,6 +155,9 @@ pub mod errors {

        #[error(transparent)]
        ApiError(ApiError),
+
+        #[error("Timeout waiting to acquire wake compute lock")]
+        TimeoutError,
    }

    // This allows more useful interactions than `#[from]`.
@@ -158,6 +167,17 @@ pub mod errors {
        }
    }

+    impl From<tokio::sync::AcquireError> for WakeComputeError {
+        fn from(_: tokio::sync::AcquireError) -> Self {
+            WakeComputeError::TimeoutError
+        }
+    }
+    impl From<tokio::time::error::Elapsed> for WakeComputeError {
+        fn from(_: tokio::time::error::Elapsed) -> Self {
+            WakeComputeError::TimeoutError
+        }
+    }
+
    impl UserFacingError for WakeComputeError {
        fn to_string_client(&self) -> String {
            use WakeComputeError::*;
@@ -167,6 +187,8 @@ pub mod errors {
                BadComputeAddress(_) => REQUEST_FAILED.to_owned(),
                // However, API might return a meaningful error.
                ApiError(e) => e.to_string_client(),
+
+                TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
            }
        }
    }
@@ -178,6 +200,7 @@ pub struct ConsoleReqExtra<'a> {
    pub session_id: uuid::Uuid,
    /// Name of client application, if set.
    pub application_name: Option<&'a str>,
+    pub options: Option<&'a str>,
 }

 /// Auth secret which is managed by the cloud.
@@ -232,3 +255,145 @@ pub struct ApiCaches {
    /// Cache for the `wake_compute` API method.
    pub node_info: NodeInfoCache,
 }
+
+/// Various caches for [`console`](super).
+pub struct ApiLocks {
+    name: &'static str,
+    node_locks: DashMap<Arc<str>, Arc<Semaphore>>,
+    permits: usize,
+    timeout: Duration,
+    registered: prometheus::IntCounter,
+    unregistered: prometheus::IntCounter,
+    reclamation_lag: prometheus::Histogram,
+    lock_acquire_lag: prometheus::Histogram,
+}
+
+impl ApiLocks {
+    pub fn new(
+        name: &'static str,
+        permits: usize,
+        shards: usize,
+        timeout: Duration,
+    ) -> prometheus::Result<Self> {
+        let registered = prometheus::IntCounter::with_opts(
+            prometheus::Opts::new(
+                "semaphores_registered",
+                "Number of semaphores registered in this api lock",
+            )
+            .namespace(name),
+        )?;
+        prometheus::register(Box::new(registered.clone()))?;
+        let unregistered = prometheus::IntCounter::with_opts(
+            prometheus::Opts::new(
+                "semaphores_unregistered",
+                "Number of semaphores unregistered in this api lock",
+            )
+            .namespace(name),
+        )?;
+        prometheus::register(Box::new(unregistered.clone()))?;
+        let reclamation_lag = prometheus::Histogram::with_opts(
+            prometheus::HistogramOpts::new(
+                "reclamation_lag_seconds",
+                "Time it takes to reclaim unused semaphores in the api lock",
+            )
+            .namespace(name)
+            // 1us -> 65ms
+            // benchmarks on my mac indicate it's usually in the range of 256us and 512us
+            .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
+        )?;
+        prometheus::register(Box::new(reclamation_lag.clone()))?;
+        let lock_acquire_lag = prometheus::Histogram::with_opts(
+            prometheus::HistogramOpts::new(
+                "semaphore_acquire_seconds",
+                "Time it takes to reclaim unused semaphores in the api lock",
+            )
+            .namespace(name)
+            // 0.1ms -> 6s
+            .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
+        )?;
+        prometheus::register(Box::new(lock_acquire_lag.clone()))?;
+
+        Ok(Self {
+            name,
+            node_locks: DashMap::with_shard_amount(shards),
+            permits,
+            timeout,
+            lock_acquire_lag,
+            registered,
+            unregistered,
+            reclamation_lag,
+        })
+    }
+
+    pub async fn get_wake_compute_permit(
+        &self,
+        key: &Arc<str>,
+    ) -> Result<WakeComputePermit, errors::WakeComputeError> {
+        if self.permits == 0 {
+            return Ok(WakeComputePermit { permit: None });
+        }
+        let now = Instant::now();
+        let semaphore = {
+            // get fast path
+            if let Some(semaphore) = self.node_locks.get(key) {
+                semaphore.clone()
+            } else {
+                self.node_locks
+                    .entry(key.clone())
+                    .or_insert_with(|| {
+                        self.registered.inc();
+                        Arc::new(Semaphore::new(self.permits))
+                    })
+                    .clone()
+            }
+        };
+        let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;
+
+        self.lock_acquire_lag
+            .observe((Instant::now() - now).as_secs_f64());
+
+        Ok(WakeComputePermit {
+            permit: Some(permit??),
+        })
+    }
+
+    pub async fn garbage_collect_worker(&self, epoch: std::time::Duration) {
+        if self.permits == 0 {
+            return;
+        }
+
+        let mut interval = tokio::time::interval(epoch / (self.node_locks.shards().len()) as u32);
+        loop {
+            for (i, shard) in self.node_locks.shards().iter().enumerate() {
+                interval.tick().await;
+                // temporary lock a single shard and then clear any semaphores that aren't currently checked out
+                // race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked
+                // therefore releasing it is safe from race conditions
+                info!(
+                    name = self.name,
+                    shard = i,
+                    "performing epoch reclamation on api lock"
+                );
+                let mut lock = shard.write();
+                let timer = self.reclamation_lag.start_timer();
+                let count = lock
+                    .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
+                    .count();
+                drop(lock);
+                self.unregistered.inc_by(count as u64);
+                timer.observe_duration()
+            }
+        }
+    }
+}
+
+pub struct WakeComputePermit {
+    // None if the lock is disabled
+    permit: Option<OwnedSemaphorePermit>,
+}
+
+impl WakeComputePermit {
+    pub fn should_check_cache(&self) -> bool {
+        self.permit.is_some()
+    }
+}
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -3,12 +3,12 @@
 use super::{
    super::messages::{ConsoleError, GetRoleSecret, WakeCompute},
    errors::{ApiError, GetAuthInfoError, WakeComputeError},
-    ApiCaches, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
+    ApiCaches, ApiLocks, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use std::net::SocketAddr;
+use std::{net::SocketAddr, sync::Arc};
 use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -17,12 +17,17 @@ use tracing::{error, info, info_span, warn, Instrument};
 pub struct Api {
    endpoint: http::Endpoint,
    caches: &'static ApiCaches,
+    locks: &'static ApiLocks,
    jwt: String,
 }

 impl Api {
    /// Construct an API object containing the auth parameters.
-    pub fn new(endpoint: http::Endpoint, caches: &'static ApiCaches) -> Self {
+    pub fn new(
+        endpoint: http::Endpoint,
+        caches: &'static ApiCaches,
+        locks: &'static ApiLocks,
+    ) -> Self {
        let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
            Ok(v) => v,
            Err(_) => "".to_string(),
@@ -30,6 +35,7 @@ impl Api {
        Self {
            endpoint,
            caches,
+            locks,
            jwt,
        }
    }
@@ -99,6 +105,7 @@ impl Api {
                .query(&[
                    ("application_name", extra.application_name),
                    ("project", Some(project)),
+                    ("options", extra.options),
                ])
                .build()?;

@@ -151,7 +158,7 @@ impl super::Api for Api {
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key = creds.project().expect("impossible");
+        let key: &str = &creds.cache_key;

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);
@@ -162,9 +169,22 @@ impl super::Api for Api {
            return Ok(cached);
        }

+        let key: Arc<str> = key.into();
+
+        let permit = self.locks.get_wake_compute_permit(&key).await?;
+
+        // after getting back a permit - it's possible the cache was filled
+        // double check
+        if permit.should_check_cache() {
+            if let Some(cached) = self.caches.node_info.get(&key) {
+                info!(key = &*key, "found cached compute node info");
+                return Ok(cached);
+            }
+        }
+
        let node = self.do_wake_compute(extra, creds).await?;
-        let (_, cached) = self.caches.node_info.insert(key.into(), node);
-        info!(key = key, "created a cache entry for compute node info");
+        let (_, cached) = self.caches.node_info.insert(key.clone(), node);
+        info!(key = &*key, "created a cache entry for compute node info");

        Ok(cached)
    }
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
+
 use std::convert::Infallible;

 use anyhow::{bail, Context};
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -15,10 +15,12 @@ use crate::{
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
+use itertools::Itertools;
 use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
-use once_cell::sync::Lazy;
+use once_cell::sync::{Lazy, OnceCell};
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use prometheus::{register_histogram_vec, HistogramVec};
+use regex::Regex;
 use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
 use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
@@ -568,6 +570,7 @@ fn report_error(e: &WakeComputeError, retry: bool) {
            "api_console_other_server_error"
        }
        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
+        WakeComputeError::TimeoutError => "timeout_error",
    };
    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
 }
@@ -881,9 +884,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            allow_self_signed_compute,
        } = self;

+        let console_options = neon_options(params);
+
        let extra = console::ConsoleReqExtra {
            session_id, // aka this connection's id
            application_name: params.get("application_name"),
+            options: console_options.as_deref(),
        };

        let mut latency_timer = LatencyTimer::new(mode.protocol_label());
@@ -945,3 +951,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        proxy_pass(stream, node.stream, &aux).await
    }
 }
+
+pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
+    #[allow(unstable_name_collisions)]
+    let options: String = params
+        .options_raw()?
+        .filter(|opt| is_neon_param(opt))
+        .sorted() // we sort it to use as cache key
+        .intersperse(" ") // TODO: use impl from std once it's stabilized
+        .collect();
+
+    // Don't even bother with empty options.
+    if options.is_empty() {
+        return None;
+    }
+
+    Some(options)
+}
+
+pub fn is_neon_param(bytes: &str) -> bool {
+    static RE: OnceCell<Regex> = OnceCell::new();
+    RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
+
+    RE.get().unwrap().is_match(bytes)
+}
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -440,6 +440,7 @@ fn helper_create_connect_info(
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some("TEST"),
+        options: None,
    };
    let creds = auth::BackendType::Test(mechanism);
    (cache, extra, creds)
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -22,7 +22,10 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
    auth, console,
-    proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
+    proxy::{
+        neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
+        NUM_DB_CONNECTIONS_OPENED_COUNTER,
+    },
    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};
@@ -41,6 +44,7 @@ pub struct ConnInfo {
    pub dbname: String,
    pub hostname: String,
    pub password: String,
+    pub options: Option<String>,
 }

 impl ConnInfo {
@@ -401,26 +405,25 @@ async fn connect_to_compute(
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

-    let credential_params = StartupMessageParams::new([
+    let params = StartupMessageParams::new([
        ("user", &conn_info.username),
        ("database", &conn_info.dbname),
        ("application_name", APP_NAME),
+        ("options", conn_info.options.as_deref().unwrap_or("")),
    ]);

    let creds = config
        .auth_backend
        .as_ref()
-        .map(|_| {
-            auth::ClientCredentials::parse(
-                &credential_params,
-                Some(&conn_info.hostname),
-                common_names,
-            )
-        })
+        .map(|_| auth::ClientCredentials::parse(&params, Some(&conn_info.hostname), common_names))
        .transpose()?;
+
+    let console_options = neon_options(&params);
+
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
+        options: console_options.as_deref(),
    };

    let node_info = creds
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -174,11 +174,23 @@ fn get_conn_info(
        }
    }

+    let pairs = connection_url.query_pairs();
+
+    let mut options = Option::None;
+
+    for (key, value) in pairs {
+        if key == "options" {
+            options = Some(value.to_string());
+            break;
+        }
+    }
+
    Ok(ConnInfo {
        username: username.to_owned(),
        dbname: dbname.to_owned(),
        hostname: hostname.to_owned(),
        password: password.to_owned(),
+        options,
    })
 }

--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -1,19 +1,20 @@
-use anyhow::{bail, Result};
-use utils::auth::{Claims, Scope};
+use utils::auth::{AuthError, Claims, Scope};
 use utils::id::TenantId;

-pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<()> {
+pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<(), AuthError> {
    match (&claims.scope, tenant_id) {
-        (Scope::Tenant, None) => {
-            bail!("Attempt to access management api with tenant scope. Permission denied")
-        }
+        (Scope::Tenant, None) => Err(AuthError(
+            "Attempt to access management api with tenant scope. Permission denied".into(),
+        )),
        (Scope::Tenant, Some(tenant_id)) => {
            if claims.tenant_id.unwrap() != tenant_id {
-                bail!("Tenant id mismatch. Permission denied")
+                return Err(AuthError("Tenant id mismatch. Permission denied".into()));
            }
            Ok(())
        }
-        (Scope::PageServerApi, _) => bail!("PageServerApi scope makes no sense for Safekeeper"),
+        (Scope::PageServerApi, _) => Err(AuthError(
+            "PageServerApi scope makes no sense for Safekeeper".into(),
+        )),
        (Scope::SafekeeperData, _) => Ok(()),
    }
 }
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
-use utils::auth::{JwtAuth, Scope};
+use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
 use utils::{
    id::NodeId,
    logging::{self, LogFormat},
@@ -251,10 +251,9 @@ async fn main() -> anyhow::Result<()> {
            None
        }
        Some(path) => {
-            info!("loading http auth JWT key from {path}");
-            Some(Arc::new(
-                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
-            ))
+            info!("loading http auth JWT key(s) from {path}");
+            let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?;
+            Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
        }
    };

--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -6,7 +6,7 @@ use std::str::FromStr;
 use std::str::{self};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, info_span, Instrument};
+use tracing::{debug, info, info_span, Instrument};

 use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};
@@ -165,26 +165,27 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
            .auth
            .as_ref()
            .expect("auth_type is configured but .auth of handler is missing");
-        let data =
-            auth.decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)?;
+        let data = auth
+            .decode(str::from_utf8(jwt_response).context("jwt response is not UTF-8")?)
+            .map_err(|e| QueryError::Unauthorized(e.0))?;

        // The handler might be configured to allow only tenant scope tokens.
        if matches!(allowed_auth_scope, Scope::Tenant)
            && !matches!(data.claims.scope, Scope::Tenant)
        {
-            return Err(QueryError::Other(anyhow::anyhow!(
-                "passed JWT token is for full access, but only tenant scope is allowed"
-            )));
+            return Err(QueryError::Unauthorized(
+                "passed JWT token is for full access, but only tenant scope is allowed".into(),
+            ));
        }

        if matches!(data.claims.scope, Scope::Tenant) && data.claims.tenant_id.is_none() {
-            return Err(QueryError::Other(anyhow::anyhow!(
-                "jwt token scope is Tenant, but tenant id is missing"
-            )));
+            return Err(QueryError::Unauthorized(
+                "jwt token scope is Tenant, but tenant id is missing".into(),
+            ));
        }

-        info!(
-            "jwt auth succeeded for scope: {:#?} by tenant id: {:?}",
+        debug!(
+            "jwt scope check succeeded for scope: {:#?} by tenant id: {:?}",
            data.claims.scope, data.claims.tenant_id,
        );

@@ -263,7 +264,7 @@ impl SafekeeperPostgresHandler {

    // when accessing management api supply None as an argument
    // when using to authorize tenant pass corresponding tenant id
-    fn check_permission(&self, tenant_id: Option<TenantId>) -> anyhow::Result<()> {
+    fn check_permission(&self, tenant_id: Option<TenantId>) -> Result<(), QueryError> {
        if self.auth.is_none() {
            // auth is set to Trust, nothing to check so just return ok
            return Ok(());
@@ -275,7 +276,7 @@ impl SafekeeperPostgresHandler {
            .claims
            .as_ref()
            .expect("claims presence already checked");
-        check_permission(claims, tenant_id)
+        check_permission(claims, tenant_id).map_err(|e| QueryError::Unauthorized(e.0))
    }

    async fn handle_timeline_status<IO: AsyncRead + AsyncWrite + Unpin>(
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 use utils::{
-    auth::JwtAuth,
+    auth::SwappableJwtAuth,
    http::{
        endpoint::{self, auth_middleware, check_permission_with},
        error::ApiError,
@@ -428,8 +428,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            if ALLOWLIST_ROUTES.contains(request.uri()) {
                None
            } else {
-                // Option<Arc<JwtAuth>> is always provided as data below, hence unwrap().
-                request.data::<Option<Arc<JwtAuth>>>().unwrap().as_deref()
+                // Option<Arc<SwappableJwtAuth>> is always provided as data below, hence unwrap().
+                request
+                    .data::<Option<Arc<SwappableJwtAuth>>>()
+                    .unwrap()
+                    .as_deref()
            }
        }))
    }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -1,3 +1,4 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
 use camino::Utf8PathBuf;
 use once_cell::sync::Lazy;
 use remote_storage::RemoteStorageConfig;
@@ -6,7 +7,10 @@ use tokio::runtime::Runtime;
 use std::time::Duration;
 use storage_broker::Uri;

-use utils::id::{NodeId, TenantId, TenantTimelineId};
+use utils::{
+    auth::SwappableJwtAuth,
+    id::{NodeId, TenantId, TenantTimelineId},
+};

 mod auth;
 pub mod broker;
@@ -69,7 +73,7 @@ pub struct SafeKeeperConf {
    pub wal_backup_enabled: bool,
    pub pg_auth: Option<Arc<JwtAuth>>,
    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
-    pub http_auth: Option<Arc<JwtAuth>>,
+    pub http_auth: Option<Arc<SwappableJwtAuth>>,
    pub current_thread_runtime: bool,
 }

--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -111,7 +111,7 @@ impl WalReceivers {
            .count()
    }

-    /// Unregister walsender.
+    /// Unregister walreceiver.
    fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
@@ -138,8 +138,8 @@ pub enum WalReceiverStatus {
    Streaming,
 }

-/// Scope guard to access slot in WalSenders registry and unregister from it in
-/// Drop.
+/// Scope guard to access slot in WalReceivers registry and unregister from
+/// it in Drop.
 pub struct WalReceiverGuard {
    id: WalReceiverId,
    walreceivers: Arc<WalReceivers>,
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -361,7 +361,6 @@ class PgProtocol:

@dataclass
 class AuthKeys:
-    pub: str
    priv: str

    def generate_token(self, *, scope: str, **token_data: str) -> str:
@@ -877,9 +876,31 @@ class NeonEnv:

    @cached_property
    def auth_keys(self) -> AuthKeys:
-        pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text()
        priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
-        return AuthKeys(pub=pub, priv=priv)
+        return AuthKeys(priv=priv)
+
+    def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path):
+        # compare generate_auth_keys() in local_env.rs
+        subprocess.run(
+            ["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path],
+            cwd=self.repo_dir,
+            check=True,
+        )
+
+        subprocess.run(
+            [
+                "openssl",
+                "pkey",
+                "-in",
+                privkey_path,
+                "-pubout",
+                "-out",
+                pubkey_path,
+            ],
+            cwd=self.repo_dir,
+            check=True,
+        )
+        del self.auth_keys

    def generate_endpoint_id(self) -> str:
        """
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -189,6 +189,10 @@ class PageserverHttpClient(requests.Session):
        assert res_json is None
        return res_json

+    def reload_auth_validation_keys(self):
+        res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
+        self.verbose_error(res)
+
    def tenant_list(self) -> List[Dict[Any, Any]]:
        res = self.get(f"http://localhost:{self.port}/v1/tenant")
        self.verbose_error(res)
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -1,12 +1,35 @@
+import os
 from contextlib import closing
+from pathlib import Path

 import psycopg2
 import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
-from fixtures.pageserver.http import PageserverApiException
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgProtocol,
+)
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.types import TenantId, TimelineId


+def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
+    http_client.timeline_create(
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
+    )
+
+
+def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient):
+    with pytest.raises(
+        PageserverApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        assert_client_authorized(env, http_client)
+
+
 def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()
@@ -27,30 +50,14 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    ps.safe_psql("set FOO", password=pageserver_token)

    # tenant can create branches
-    tenant_http_client.timeline_create(
-        pg_version=env.pg_version,
-        tenant_id=env.initial_tenant,
-        new_timeline_id=TimelineId.generate(),
-        ancestor_timeline_id=env.initial_timeline,
-    )
+    assert_client_authorized(env, tenant_http_client)
+
    # console can create branches for tenant
-    pageserver_http_client.timeline_create(
-        pg_version=env.pg_version,
-        tenant_id=env.initial_tenant,
-        new_timeline_id=TimelineId.generate(),
-        ancestor_timeline_id=env.initial_timeline,
-    )
+    assert_client_authorized(env, pageserver_http_client)

    # fail to create branch using token with different tenant_id
-    with pytest.raises(
-        PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
-    ):
-        invalid_tenant_http_client.timeline_create(
-            pg_version=env.pg_version,
-            tenant_id=env.initial_tenant,
-            new_timeline_id=TimelineId.generate(),
-            ancestor_timeline_id=env.initial_timeline,
-        )
+    with pytest.raises(PageserverApiException, match="Forbidden: JWT authentication error"):
+        assert_client_authorized(env, invalid_tenant_http_client)

    # create tenant using management token
    pageserver_http_client.tenant_create(TenantId.generate())
@@ -58,7 +65,7 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    # fail to create tenant using tenant token
    with pytest.raises(
        PageserverApiException,
-        match="Forbidden: Attempt to access management api with tenant scope. Permission denied",
+        match="Forbidden: JWT authentication error",
    ):
        tenant_http_client.tenant_create(TenantId.generate())

@@ -82,6 +89,96 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
            assert cur.fetchone() == (5000050000,)


+def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
+    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
+
+    pageserver_token_old = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
+
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # This test is to ensure that the pageserver supports multiple keys.
+    # The neon_local tool generates one key pair at a hardcoded path by default.
+    # As a preparation for our test, move the public key of the key pair into a
+    # directory at the same location as the hardcoded path by:
+    # 1. moving the the file at `configured_pub_key_path` to a temporary location
+    # 2. creating a new directory at `configured_pub_key_path`
+    # 3. moving the file from the temporary location into the newly created directory
+    configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem"
+    os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file")
+    os.mkdir(configured_pub_key_path)
+    os.rename(
+        Path(env.repo_dir) / "auth_public_key.pem.file",
+        configured_pub_key_path / "auth_public_key_old.pem",
+    )
+
+    # Add a new key pair
+    # This invalidates env.auth_keys and makes them be regenerated
+    env.regenerate_keys_at(
+        Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem")
+    )
+
+    # Reload the keys on the pageserver side
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # We can continue doing things using the old token
+    assert_client_authorized(env, pageserver_http_client_old)
+
+    pageserver_token_new = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
+
+    # The new token also works
+    assert_client_authorized(env, pageserver_http_client_new)
+
+    # Remove the old token and reload
+    os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem")
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # Reloading fails now with the old token, but the new token still works
+    assert_client_not_authorized(env, pageserver_http_client_old)
+    assert_client_authorized(env, pageserver_http_client_new)
+
+
+def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.append(".*Authentication error: InvalidSignature.*")
+    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
+
+    pageserver_token_old = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
+
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # Regenerate the keys
+    env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem"))
+
+    # Reload the keys on the pageserver side
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # Next attempt fails as we use the old auth token
+    with pytest.raises(
+        PageserverApiException,
+        match="Forbidden: JWT authentication error",
+    ):
+        pageserver_http_client_old.reload_auth_validation_keys()
+
+    # same goes for attempts trying to create a timeline
+    assert_client_not_authorized(env, pageserver_http_client_old)
+
+    pageserver_token_new = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
+
+    # timeline creation works with the new token
+    assert_client_authorized(env, pageserver_http_client_new)
+
+    # reloading also works with the new token
+    pageserver_http_client_new.reload_auth_validation_keys()
+
+
@pytest.mark.parametrize("auth_enabled", [False, True])
 def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
--- a/test_runner/regress/test_bad_connection.py
+++ b/test_runner/regress/test_bad_connection.py
@@ -0,0 +1,60 @@
+import random
+import time
+
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.append(".*simulated connection error.*")
+
+    pageserver_http = env.pageserver.http_client()
+    env.neon_cli.create_branch("test_compute_pageserver_connection_stress")
+    endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress")
+
+    # Enable failpoint after starting everything else up so that loading initial
+    # basebackup doesn't fail
+    pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)"))
+
+    pg_conn = endpoint.connect()
+    cur = pg_conn.cursor()
+
+    # Create table, and insert some rows. Make it big enough that it doesn't fit in
+    # shared_buffers, otherwise the SELECT after restart will just return answer
+    # from shared_buffers without hitting the page server, which defeats the point
+    # of this test.
+    cur.execute("CREATE TABLE foo (t text)")
+    cur.execute(
+        """
+        INSERT INTO foo
+            SELECT 'long string to consume some space' || g
+            FROM generate_series(1, 100000) g
+        """
+    )
+
+    # Verify that the table is larger than shared_buffers
+    cur.execute(
+        """
+        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
+        from pg_settings where name = 'shared_buffers'
+        """
+    )
+    row = cur.fetchone()
+    assert row is not None
+    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
+    assert int(row[0]) < int(row[1])
+
+    cur.execute("SELECT count(*) FROM foo")
+    assert cur.fetchone() == (100000,)
+
+    end_time = time.time() + 30
+    times_executed = 0
+    while time.time() < end_time:
+        if random.random() < 0.5:
+            cur.execute("INSERT INTO foo VALUES ('stas'), ('heikki')")
+        else:
+            cur.execute("SELECT t FROM foo ORDER BY RANDOM() LIMIT 10")
+            cur.fetchall()
+        times_executed += 1
+    log.info(f"Workload executed {times_executed} times")
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -26,6 +26,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
            ".*will not become active. Current state: Broken.*",
            ".*failed to load metadata.*",
            ".*load failed.*load local timeline.*",
+            ".*layer loading failed permanently: load layer: .*",
        ]
    )

--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -1,9 +1,13 @@
+import asyncio
+
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.remote_storage import RemoteStorageKind


 def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
+    num_connections = 3
+
    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
@@ -16,15 +20,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    alt_pageserver_id = env.pageservers[1].id
    env.pageservers[1].tenant_attach(env.initial_tenant)

-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
+    pg_conns = [endpoint.connect() for i in range(num_connections)]
+    curs = [pg_conn.cursor() for pg_conn in pg_conns]
+
+    def execute(statement: str):
+        for cur in curs:
+            cur.execute(statement)
+
+    def fetchone():
+        results = [cur.fetchone() for cur in curs]
+        assert all(result == results[0] for result in results)
+        return results[0]

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
-    cur.execute("CREATE TABLE foo (t text)")
-    cur.execute(
+    curs[0].execute("CREATE TABLE foo (t text)")
+    curs[0].execute(
        """
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
@@ -33,25 +46,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    )

    # Verify that the table is larger than shared_buffers
-    cur.execute(
+    curs[0].execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
-    row = cur.fetchone()
+    row = curs[0].fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])

-    cur.execute("SELECT count(*) FROM foo")
-    assert cur.fetchone() == (100000,)
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)

    endpoint.reconfigure(pageserver_id=alt_pageserver_id)

    # Verify that the neon.pageserver_connstring GUC is set to the correct thing
-    cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
-    connstring = cur.fetchone()
+    execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
+    connstring = fetchone()
    assert connstring is not None
    expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
    assert expected_connstring == expected_connstring
@@ -60,5 +73,45 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
        0
    ].stop()  # Stop the old pageserver just to make sure we're reading from the new one

-    cur.execute("SELECT count(*) FROM foo")
-    assert cur.fetchone() == (100000,)
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)
+
+    # Try failing back, and this time we will stop the current pageserver before reconfiguring
+    # the endpoint.  Whereas the previous reconfiguration was like a healthy migration, this
+    # is more like what happens in an unexpected  pageserver failure.
+    env.pageservers[0].start()
+    env.pageservers[1].stop()
+
+    endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
+
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)
+
+    env.pageservers[0].stop()
+    env.pageservers[1].start()
+
+    # Test a (former) bug where a child process spins without updating its connection string
+    # by executing a query separately. This query will hang until we issue the reconfigure.
+    async def reconfigure_async():
+        await asyncio.sleep(
+            1
+        )  # Sleep for 1 second just to make sure we actually started our count(*) query
+        endpoint.reconfigure(pageserver_id=env.pageservers[1].id)
+
+    def execute_count():
+        execute("SELECT count(*) FROM FOO")
+
+    async def execute_and_reconfigure():
+        task_exec = asyncio.to_thread(execute_count)
+        task_reconfig = asyncio.create_task(reconfigure_async())
+        await asyncio.gather(
+            task_exec,
+            task_reconfig,
+        )
+
+    asyncio.run(execute_and_reconfigure())
+    assert fetchone() == (100000,)
+
+    # One final check that nothing hangs
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)
--- a/test_runner/regress/test_lsn_mapping.py
+++ b/test_runner/regress/test_lsn_mapping.py
@@ -79,11 +79,27 @@ def test_lsn_mapping_old(neon_env_builder: NeonEnvBuilder):
 def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

-    new_timeline_id = env.neon_cli.create_branch("test_lsn_mapping")
-    endpoint_main = env.endpoints.create_start("test_lsn_mapping")
-    log.info("postgres is running on 'test_lsn_mapping' branch")
+    tenant_id, _ = env.neon_cli.create_tenant(
+        conf={
+            # disable default GC and compaction
+            "gc_period": "1000 m",
+            "compaction_period": "0 s",
+            "gc_horizon": f"{1024 ** 2}",
+            "checkpoint_distance": f"{1024 ** 2}",
+            "compaction_target_size": f"{1024 ** 2}",
+        }
+    )
+
+    timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id)
+    endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id)
+    timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0]
+    log.info("postgres is running on 'main' branch")

    cur = endpoint_main.connect().cursor()
+
+    # Obtain an lsn before all write operations on this branch
+    start_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_lsn()"))
+
    # Create table, and insert rows, each in a separate transaction
    # Disable `synchronous_commit` to make this initialization go faster.
    # XXX: on my laptop this test takes 7s, and setting `synchronous_commit=off`
@@ -106,29 +122,33 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
    cur.execute("INSERT INTO foo VALUES (-1)")

    # Wait until WAL is received by pageserver
-    wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id)
+    last_flush_lsn = wait_for_last_flush_lsn(env, endpoint_main, tenant_id, timeline_id)

    with env.pageserver.http_client() as client:
        # Check edge cases
        # Timestamp is in the future
        probe_timestamp = tbl[-1][1] + timedelta(hours=1)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
        assert result["kind"] in ["present", "future"]
+        # make sure that we return a well advanced lsn here
+        assert Lsn(result["lsn"]) > start_lsn

        # Timestamp is in the unreachable past
        probe_timestamp = tbl[0][1] - timedelta(hours=10)
        result = client.timeline_get_lsn_by_timestamp(
-            env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+            tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
        )
        assert result["kind"] == "past"
+        # make sure that we return the minimum lsn here at the start of the range
+        assert Lsn(result["lsn"]) < start_lsn

        # Probe a bunch of timestamps in the valid range
        for i in range(1, len(tbl), 100):
            probe_timestamp = tbl[i][1]
            result = client.timeline_get_lsn_by_timestamp(
-                env.initial_tenant, new_timeline_id, f"{probe_timestamp.isoformat()}Z", 2
+                tenant_id, timeline_id, f"{probe_timestamp.isoformat()}Z", 2
            )
            assert result["kind"] not in ["past", "nodata"]
            lsn = result["lsn"]
@@ -136,12 +156,29 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder):
            # Launch a new read-only node at that LSN, and check that only the rows
            # that were supposed to be committed at that point in time are visible.
            endpoint_here = env.endpoints.create_start(
-                branch_name="test_lsn_mapping", endpoint_id="ep-lsn_mapping_read", lsn=lsn
+                branch_name="test_lsn_mapping",
+                endpoint_id="ep-lsn_mapping_read",
+                lsn=lsn,
+                tenant_id=tenant_id,
            )
            assert endpoint_here.safe_psql("SELECT max(x) FROM foo")[0][0] == i

            endpoint_here.stop_and_destroy()

+        # Do the "past" check again at a new branch to ensure that we don't return something before the branch cutoff
+        timeline_id_child = env.neon_cli.create_branch(
+            "test_lsn_mapping_child", tenant_id=tenant_id, ancestor_branch_name="test_lsn_mapping"
+        )
+
+        # Timestamp is in the unreachable past
+        probe_timestamp = tbl[0][1] - timedelta(hours=10)
+        result = client.timeline_get_lsn_by_timestamp(
+            tenant_id, timeline_id_child, f"{probe_timestamp.isoformat()}Z", 2
+        )
+        assert result["kind"] == "past"
+        # make sure that we return the minimum lsn here at the start of the range
+        assert Lsn(result["lsn"]) >= last_flush_lsn
+

 # Test pageserver get_timestamp_of_lsn API
 def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder):
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -366,11 +366,17 @@ def test_deletion_queue_recovery(
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

    if validate_before == ValidateBefore.VALIDATE:
+        # At this point, one or more DeletionLists have been written.  We have set a failpoint
+        # to prevent them successfully executing, but we want to see them get validated.
+        #
+        # We await _some_ validations instead of _all_ validations, because our execution failpoint
+        # will prevent validation proceeding for any but the first DeletionList.  Usually the workload
+        # just generates one, but if it generates two due to timing, then we must not expect that the
+        # second one will be validated.
+        def assert_some_validations():
+            assert get_deletion_queue_validated(ps_http) > 0

-        def assert_validation_complete():
-            assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
-
-        wait_until(20, 1, assert_validation_complete)
+        wait_until(20, 1, assert_some_validations)

        # The validatated keys statistic advances before the header is written, so we
        # also wait to see the header hit the disk: this seems paranoid but the race
@@ -380,6 +386,11 @@ def test_deletion_queue_recovery(

        wait_until(20, 1, assert_header_written)

+        # If we will lose attachment, then our expectation on restart is that only the ones
+        # we already validated will execute.  Act like only those were present in the queue.
+        if keep_attachment == KeepAttachment.LOSE:
+            before_restart_depth = get_deletion_queue_validated(ps_http)
+
    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
    env.pageserver.stop(immediate=True)

@@ -402,11 +413,13 @@ def test_deletion_queue_recovery(
    ps_http.deletion_queue_flush(execute=True)
    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))

-    if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
+    if keep_attachment == KeepAttachment.KEEP:
        # - If we kept the attachment, then our pre-restart deletions should execute
        #   because on re-attach they were from the immediately preceding generation
-        # - If we validated before restart, then the deletions should execute because the
-        #   deletion queue header records a validated deletion list sequence number.
+        assert get_deletion_queue_executed(ps_http) == before_restart_depth
+    elif validate_before == ValidateBefore.VALIDATE:
+        # - If we validated before restart, then we should execute however many keys were
+        #   validated before restart.
        assert get_deletion_queue_executed(ps_http) == before_restart_depth
    else:
        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -17,10 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
    n_restarts = 10
    scale = 10

-    # Pageserver currently logs requests on non-active tenants at error level
-    # https://github.com/neondatabase/neon/issues/5784
-    env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*")
-
    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -25,6 +25,7 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd
 clap = { version = "4", features = ["derive", "string"] }
 clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] }
 crossbeam-utils = { version = "0.8" }
+dashmap = { version = "5", default-features = false, features = ["raw-api"] }
 either = { version = "1" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 futures = { version = "0.3" }
@@ -38,7 +39,7 @@ hex = { version = "0.4", features = ["serde"] }
 hyper = { version = "0.14", features = ["full"] }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
-log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
+log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 num-bigint = { version = "0.4" }
@@ -55,7 +56,6 @@ scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
 smallvec = { version = "1", default-features = false, features = ["write"] }
-standback = { version = "0.2", default-features = false, features = ["std"] }
 time = { version = "0.3", features = ["local-offset", "macros", "serde-well-known"] }
 tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] }
 tokio-rustls = { version = "0.24" }
@@ -76,14 +76,13 @@ cc = { version = "1", default-features = false, features = ["parallel"] }
 either = { version = "1" }
 itertools = { version = "0.10" }
 libc = { version = "0.2", features = ["extra_traits"] }
-log = { version = "0.4", default-features = false, features = ["kv_unstable", "std"] }
+log = { version = "0.4", default-features = false, features = ["std"] }
 memchr = { version = "2" }
 nom = { version = "7" }
 prost = { version = "0.11" }
 regex = { version = "1" }
 regex-syntax = { version = "0.7" }
 serde = { version = "1", features = ["alloc", "derive"] }
-standback = { version = "0.2", default-features = false, features = ["std"] }
 syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] }
 syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "full", "visit", "visit-mut"] }
 time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] }