Implement a benchmark for Tokio TCP handling to find the perf bottleneck in the no-op libpq benchmark

Implement a standalone no-op server.
To use it with getpage_bench_libpq, run it on a different port than the pageserver libpq listener and override the connstring of getpage_bench_libpq to point to the noop_server.
2026-05-22 07:30:37 +00:00 · 2023-11-08 16:34:57 +00:00 · 2023-11-08 16:34:57 +00:00 · 2023-11-08 16:34:56 +00:00 · 2023-11-08 16:34:56 +00:00 · 2023-11-08 16:34:56 +00:00
76 changed files with 2300 additions and 335 deletions
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -22,5 +22,11 @@ platforms = [
    # "x86_64-pc-windows-msvc",
 ]

+[final-excludes]
+# vm_monitor benefits from the same Cargo.lock as the rest of our artifacts, but
+# it is built primarily in a separate repo, neondatabase/autoscaling, and thus is excluded
+# from depending on workspace-hack because most of the dependencies are not used.
+workspace-members = ["vm_monitor"]
+
 # Write out exact versions rather than a semver range. (Defaults to false.)
 # exact-versions = true
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -723,6 +723,7 @@ jobs:
                           --cache-repo 369495373322.dkr.ecr.eu-central-1.amazonaws.com/cache
                           --context .
                           --build-arg GIT_VERSION=${{ github.event.pull_request.head.sha || github.sha }}
+                           --build-arg BUILD_TAG=${{ needs.tag.outputs.build-tag }}
                           --build-arg REPOSITORY=369495373322.dkr.ecr.eu-central-1.amazonaws.com
                           --destination 369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:${{needs.tag.outputs.build-tag}}
                           --destination neondatabase/neon:${{needs.tag.outputs.build-tag}}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -170,6 +170,12 @@ dependencies = [
 "backtrace",
 ]

+[[package]]
+name = "arc-swap"
+version = "1.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
+
 [[package]]
 name = "archery"
 version = "0.5.0"
@@ -2921,6 +2927,16 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "nu-ansi-term"
+version = "0.46.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
+dependencies = [
+ "overload",
+ "winapi",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -3187,6 +3203,12 @@ version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a"

+[[package]]
+name = "overload"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
+
 [[package]]
 name = "pagectl"
 version = "0.1.0"
@@ -3272,10 +3294,12 @@ dependencies = [
 "tokio",
 "tokio-io-timeout",
 "tokio-postgres",
+ "tokio-stream",
 "tokio-tar",
 "tokio-util",
 "toml_edit",
 "tracing",
+ "tracing-subscriber",
 "url",
 "utils",
 "walkdir",
@@ -3550,7 +3574,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#ef8559b5f60f5c1d2b0184a62f49035600824518"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -3563,7 +3587,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#ef8559b5f60f5c1d2b0184a62f49035600824518"
 dependencies = [
 "native-tls",
 "tokio",
@@ -3574,7 +3598,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#ef8559b5f60f5c1d2b0184a62f49035600824518"
 dependencies = [
 "base64 0.20.0",
 "byteorder",
@@ -3592,7 +3616,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#ef8559b5f60f5c1d2b0184a62f49035600824518"
 dependencies = [
 "bytes",
 "fallible-iterator",
@@ -5408,7 +5432,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=ce7260db5998fe27167da42503905a12e7ad9048#ce7260db5998fe27167da42503905a12e7ad9048"
+source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#ef8559b5f60f5c1d2b0184a62f49035600824518"
 dependencies = [
 "async-trait",
 "byteorder",
@@ -5765,6 +5789,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
 dependencies = [
 "matchers",
+ "nu-ansi-term",
 "once_cell",
 "regex",
 "serde",
@@ -5951,6 +5976,7 @@ name = "utils"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "arc-swap",
 "async-trait",
 "bincode",
 "byteorder",
@@ -6048,7 +6074,6 @@ dependencies = [
 "tokio-util",
 "tracing",
 "tracing-subscriber",
- "workspace_hack",
 ]

 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -36,6 +36,7 @@ license = "Apache-2.0"
 ## All dependency versions, used in the project
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
+arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
 azure_core = "0.16"
 azure_identity = "0.16"
@@ -162,11 +163,11 @@ env_logger = "0.10"
 log = "0.4"

 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
@@ -203,7 +204,7 @@ tonic-build = "0.9"

 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="ce7260db5998fe27167da42503905a12e7ad9048" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

 ################# Binary contents sections

--- a/5
+++ b/5
@@ -27,6 +27,7 @@ RUN set -e \
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
 ARG GIT_VERSION=local
+ARG BUILD_TAG

 # Enable https://github.com/paritytech/cachepot to cache Rust crates' compilation results in Docker builds.
 # Set up cachepot to use an AWS S3 bucket for cache results, to reuse it between `docker build` invocations.
@@ -78,9 +79,9 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/pg_sni_router
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pageserver          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/pagectl             /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper          /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker         /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local               /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin

 COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/
 COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/
--- a/4
+++ b/4
@@ -72,6 +72,10 @@ neon: postgres-headers walproposer-lib
 #
 $(POSTGRES_INSTALL_DIR)/build/%/config.status:
 	+@echo "Configuring Postgres $* build"
+	@test -s $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure || { \
+		echo "\nPostgres submodule not found in $(ROOT_PROJECT_DIR)/vendor/postgres-$*/, execute "; \
+		echo "'git submodule update --init --recursive --depth 2 --progress .' in project root.\n"; \
+		exit 1; }
 	mkdir -p $(POSTGRES_INSTALL_DIR)/build/$*
 	(cd $(POSTGRES_INSTALL_DIR)/build/$* && \
 	env PATH="$(EXTRA_PATH_OVERRIDES):$$PATH" $(ROOT_PROJECT_DIR)/vendor/postgres-$*/configure \
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -710,8 +710,12 @@ impl ComputeNode {
    // `pg_ctl` for start / stop, so this just seems much easier to do as we already
    // have opened connection to Postgres and superuser access.
    #[instrument(skip_all)]
-    fn pg_reload_conf(&self, client: &mut Client) -> Result<()> {
-        client.simple_query("SELECT pg_reload_conf()")?;
+    fn pg_reload_conf(&self) -> Result<()> {
+        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        Command::new(pgctl_bin)
+            .args(["reload", "-D", &self.pgdata])
+            .output()
+            .expect("cannot run pg_ctl process");
        Ok(())
    }

@@ -724,9 +728,9 @@ impl ComputeNode {
        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
-        self.pg_reload_conf(&mut client)?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        // Disable DDL forwarding because control plane already knows about these roles/databases.
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -1,7 +1,7 @@
-//!
 //! Various tools and helpers to handle cluster / compute node (Postgres)
 //! configuration.
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checker;
 pub mod config;
 pub mod configurator;
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -262,7 +262,7 @@ where
    P: Into<Utf8PathBuf>,
 {
    let path: Utf8PathBuf = path.into();
-    // SAFETY
+    // SAFETY:
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
    // Long answer:  https://github.com/rust-lang/rust/issues/39575
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -1,11 +1,10 @@
-//
-// Local control plane.
-//
-// Can start, configure and stop postgres instances running as a local processes.
-//
-// Intended to be used in integration tests and in CLI tools for
-// local installations.
-//
+//! Local control plane.
+//!
+//! Can start, configure and stop postgres instances running as a local processes.
+//!
+//! Intended to be used in integration tests and in CLI tools for
+//! local installations.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod attachment_service;
 mod background_process;
--- a/libs/compute_api/src/lib.rs
+++ b/libs/compute_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod requests;
 pub mod responses;
 pub mod spec;
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -1,6 +1,6 @@
-//!
 //! Shared code for consumption metics collection
-//!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use chrono::{DateTime, Utc};
 use rand::Rng;
 use serde::{Deserialize, Serialize};
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -2,6 +2,7 @@
 //! make sure that we use the same dep version everywhere.
 //! Otherwise, we might not see all metrics registered via
 //! a default registry.
+#![deny(clippy::undocumented_unsafe_blocks)]
 use once_cell::sync::Lazy;
 use prometheus::core::{AtomicU64, Collector, GenericGauge, GenericGaugeVec};
 pub use prometheus::opts;
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -18,7 +18,7 @@ use utils::{

 use crate::reltag::RelTag;
 use anyhow::bail;
-use bytes::{BufMut, Bytes, BytesMut};
+use bytes::{Buf, BufMut, Bytes, BytesMut};

 /// The state of a tenant in this pageserver.
 ///
@@ -572,15 +572,18 @@ pub enum PagestreamFeMessage {
    Nblocks(PagestreamNblocksRequest),
    GetPage(PagestreamGetPageRequest),
    DbSize(PagestreamDbSizeRequest),
+    NoOp,
 }

 // Wrapped in libpq CopyData
+#[derive(Debug)]
 pub enum PagestreamBeMessage {
    Exists(PagestreamExistsResponse),
    Nblocks(PagestreamNblocksResponse),
    GetPage(PagestreamGetPageResponse),
    Error(PagestreamErrorResponse),
    DbSize(PagestreamDbSizeResponse),
+    NoOp,
 }

 #[derive(Debug, PartialEq, Eq)]
@@ -679,6 +682,10 @@ impl PagestreamFeMessage {
                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.dbnode);
            }
+
+            Self::NoOp => {
+                bytes.put_u8(4);
+            }
        }

        bytes.into()
@@ -729,6 +736,7 @@ impl PagestreamFeMessage {
                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                dbnode: body.read_u32::<BigEndian>()?,
            })),
+            4 => Ok(PagestreamFeMessage::NoOp),
            _ => bail!("unknown smgr message tag: {:?}", msg_tag),
        }
    }
@@ -763,10 +771,46 @@ impl PagestreamBeMessage {
                bytes.put_u8(104); /* tag from pagestore_client.h */
                bytes.put_i64(resp.db_size);
            }
+            Self::NoOp => {
+                bytes.put_u8(105);
+            }
        }

        bytes.into()
    }
+
+    pub fn deserialize(buf: Bytes) -> anyhow::Result<Self> {
+        let mut buf = buf.reader();
+        let msg_tag = buf.read_u8()?;
+        match msg_tag {
+            100 => todo!(),
+            101 => todo!(),
+            102 => {
+                let buf = buf.get_ref();
+                /* TODO use constant */
+                if buf.len() == 8192 {
+                    Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
+                        page: buf.clone(),
+                    }))
+                } else {
+                    anyhow::bail!("invalid page size: {}", buf.len());
+                }
+            }
+            103 => {
+                let buf = buf.get_ref();
+                let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
+                let rust_str = cstr.to_str()?;
+                Ok(PagestreamBeMessage::Error(PagestreamErrorResponse {
+                    message: rust_str.to_owned(),
+                }))
+            }
+            104 => todo!(),
+            105 => {
+                Ok(PagestreamBeMessage::NoOp)
+            },
+            _ => bail!("unknown tag: {:?}", msg_tag),
+        }
+    }
 }

 #[cfg(test)]
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -2,6 +2,8 @@
 //! To use, create PostgresBackend and run() it, passing the Handler
 //! implementation determining how to process the queries. Currently its API
 //! is rather narrow, but we can extend it once required.
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::Context;
 use bytes::Bytes;
 use futures::pin_mut;
--- a/libs/postgres_connection/src/lib.rs
+++ b/libs/postgres_connection/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use anyhow::{bail, Context};
 use itertools::Itertools;
 use std::borrow::Cow;
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -8,6 +8,7 @@
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
 #![allow(clippy::duplicate_mod)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use bytes::Bytes;
 use utils::bin_ser::SerializeError;
@@ -20,6 +21,7 @@ macro_rules! postgres_ffi {
            pub mod bindings {
                // bindgen generates bindings for a lot of stuff we don't need
                #![allow(dead_code)]
+                #![allow(clippy::undocumented_unsafe_blocks)]

                use serde::{Deserialize, Serialize};
                include!(concat!(
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -1,6 +1,7 @@
 //! Postgres protocol messages serialization-deserialization. See
 //! <https://www.postgresql.org/docs/devel/protocol-message-formats.html>
 //! on message formats.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod framed;

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -6,6 +6,8 @@
 //!   * [`s3_bucket`] uses AWS S3 bucket as an external storage
 //!   * [`azure_blob`] allows to use Azure Blob storage as an external storage
 //!
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
 mod local_fs;
@@ -112,7 +114,7 @@ impl RemotePath {
        self.0.file_name()
    }

-    pub fn join(&self, segment: &Utf8Path) -> Self {
+    pub fn join<P: AsRef<Utf8Path>>(&self, segment: P) -> Self {
        Self(self.0.join(segment))
    }

--- a/libs/safekeeper_api/src/lib.rs
+++ b/libs/safekeeper_api/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 use const_format::formatcp;

 /// Public API types
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,4 +1,6 @@
 //! Synthetic size calculation
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 mod calculation;
 pub mod svg;
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -32,6 +32,8 @@
 //!         .init();
 //! }
 //! ```
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]

 use opentelemetry::sdk::Resource;
 use opentelemetry::KeyValue;
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -5,6 +5,7 @@ edition.workspace = true
 license.workspace = true

 [dependencies]
+arc-swap.workspace = true
 sentry.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -1,7 +1,8 @@
 // For details about authentication see docs/authentication.md

+use arc_swap::ArcSwap;
 use serde;
-use std::fs;
+use std::{fs, sync::Arc};

 use anyhow::Result;
 use camino::Utf8Path;
@@ -44,31 +45,88 @@ impl Claims {
    }
 }

+pub struct SwappableJwtAuth(ArcSwap<JwtAuth>);
+
+impl SwappableJwtAuth {
+    pub fn new(jwt_auth: JwtAuth) -> Self {
+        SwappableJwtAuth(ArcSwap::new(Arc::new(jwt_auth)))
+    }
+    pub fn swap(&self, jwt_auth: JwtAuth) {
+        self.0.swap(Arc::new(jwt_auth));
+    }
+    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
+        self.0.load().decode(token)
+    }
+}
+
+impl std::fmt::Debug for SwappableJwtAuth {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Swappable({:?})", self.0.load())
+    }
+}
+
 pub struct JwtAuth {
-    decoding_key: DecodingKey,
+    decoding_keys: Vec<DecodingKey>,
    validation: Validation,
 }

 impl JwtAuth {
-    pub fn new(decoding_key: DecodingKey) -> Self {
+    pub fn new(decoding_keys: Vec<DecodingKey>) -> Self {
        let mut validation = Validation::default();
        validation.algorithms = vec![STORAGE_TOKEN_ALGORITHM];
        // The default 'required_spec_claims' is 'exp'. But we don't want to require
        // expiration.
        validation.required_spec_claims = [].into();
        Self {
-            decoding_key,
+            decoding_keys,
            validation,
        }
    }

    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
-        let public_key = fs::read(key_path)?;
-        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
+        let metadata = key_path.metadata()?;
+        let decoding_keys = if metadata.is_dir() {
+            let mut keys = Vec::new();
+            for entry in fs::read_dir(key_path)? {
+                let path = entry?.path();
+                if !path.is_file() {
+                    // Ignore directories (don't recurse)
+                    continue;
+                }
+                let public_key = fs::read(path)?;
+                keys.push(DecodingKey::from_ed_pem(&public_key)?);
+            }
+            keys
+        } else if metadata.is_file() {
+            let public_key = fs::read(key_path)?;
+            vec![DecodingKey::from_ed_pem(&public_key)?]
+        } else {
+            anyhow::bail!("path is neither a directory or a file")
+        };
+        if decoding_keys.is_empty() {
+            anyhow::bail!("Configured for JWT auth with zero decoding keys. All JWT gated requests would be rejected.");
+        }
+        Ok(Self::new(decoding_keys))
    }

+    /// Attempt to decode the token with the internal decoding keys.
+    ///
+    /// The function tries the stored decoding keys in succession,
+    /// and returns the first yielding a successful result.
+    /// If there is no working decoding key, it returns the last error.
    pub fn decode(&self, token: &str) -> Result<TokenData<Claims>> {
-        Ok(decode(token, &self.decoding_key, &self.validation)?)
+        let mut res = None;
+        for decoding_key in &self.decoding_keys {
+            res = Some(decode(token, decoding_key, &self.validation));
+            if let Some(Ok(res)) = res {
+                return Ok(res);
+            }
+        }
+        if let Some(res) = res {
+            res.map_err(anyhow::Error::new)
+        } else {
+            anyhow::bail!("no JWT decoding keys configured")
+        }
    }
 }

@@ -129,7 +187,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded_eddsa = "eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJzY29wZSI6InRlbmFudCIsInRlbmFudF9pZCI6IjNkMWY3NTk1YjQ2ODIzMDMwNGUwYjczY2VjYmNiMDgxIiwiaXNzIjoibmVvbi5jb250cm9scGxhbmUiLCJleHAiOjE3MDkyMDA4NzksImlhdCI6MTY3ODQ0MjQ3OX0.U3eA8j-uU-JnhzeO3EDHRuXLwkAUFCPxtGHEgw6p7Ccc3YRbFs2tmCdbD9PZEXP-XsxSeBQi1FY0YPcT3NXADw";

        // Check it can be validated with the public key
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]);
        let claims_from_token = auth.decode(encoded_eddsa)?.claims;
        assert_eq!(claims_from_token, expected_claims);

@@ -146,7 +204,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        let encoded = encode_from_key_file(&claims, TEST_PRIV_KEY_ED25519)?;

        // decode it back
-        let auth = JwtAuth::new(DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?);
+        let auth = JwtAuth::new(vec![DecodingKey::from_ed_pem(TEST_PUB_KEY_ED25519)?]);
        let decoded = auth.decode(&encoded)?;

        assert_eq!(decoded.claims, claims);
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -1,4 +1,4 @@
-use crate::auth::{Claims, JwtAuth};
+use crate::auth::{Claims, SwappableJwtAuth};
 use crate::http::error::{api_error_handler, route_error_handler, ApiError};
 use anyhow::Context;
 use hyper::header::{HeaderName, AUTHORIZATION};
@@ -389,7 +389,7 @@ fn parse_token(header_value: &str) -> Result<&str, ApiError> {
 }

 pub fn auth_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
-    provide_auth: fn(&Request<Body>) -> Option<&JwtAuth>,
+    provide_auth: fn(&Request<Body>) -> Option<&SwappableJwtAuth>,
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        if let Some(auth) = provide_auth(&req) {
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -120,6 +120,8 @@ impl Id {
            chunk[0] = HEX[((b >> 4) & 0xf) as usize];
            chunk[1] = HEX[(b & 0xf) as usize];
        }
+
+        // SAFETY: vec constructed out of `HEX`, it can only be ascii
        unsafe { String::from_utf8_unchecked(buf) }
    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -1,5 +1,6 @@
 //! `utils` is intended to be a place to put code that is shared
 //! between other crates in this repository.
+#![deny(clippy::undocumented_unsafe_blocks)]

 pub mod backoff;

--- a/libs/utils/src/shutdown.rs
+++ b/libs/utils/src/shutdown.rs
@@ -1,6 +1,7 @@
 /// Immediately terminate the calling process without calling
 /// atexit callbacks, C runtime destructors etc. We mainly use
 /// this to protect coverage data from concurrent writes.
-pub fn exit_now(code: u8) {
+pub fn exit_now(code: u8) -> ! {
+    // SAFETY: exiting is safe, the ffi is not safe
    unsafe { nix::libc::_exit(code as _) };
 }
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -85,6 +85,13 @@ impl Gate {
        warn_if_stuck(self.do_close(), &self.name, Duration::from_millis(1000)).await
    }

+    /// Check if [`Self::close()`] has finished waiting for all [`Self::enter()`] users to finish.  This
+    /// is usually analogous to "Did shutdown finish?" for types that include a Gate, whereas checking
+    /// the CancellationToken on such types is analogous to "Did shutdown start?"
+    pub fn close_complete(&self) -> bool {
+        self.sem.is_closed()
+    }
+
    async fn do_close(&self) {
        tracing::debug!(gate = self.name, "Closing Gate...");
        match self.sem.acquire_many(Self::MAX_UNITS).await {
--- a/libs/vm_monitor/Cargo.toml
+++ b/libs/vm_monitor/Cargo.toml
@@ -19,13 +19,12 @@ inotify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 sysinfo.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["rt-multi-thread"] }
 tokio-postgres.workspace = true
 tokio-stream.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-subscriber.workspace = true
-workspace_hack = { version = "0.1", path = "../../workspace_hack" }

 [target.'cfg(target_os = "linux")'.dependencies]
 cgroups-rs = "0.3.3"
--- a/libs/vm_monitor/src/lib.rs
+++ b/libs/vm_monitor/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 #![cfg(target_os = "linux")]

 use anyhow::Context;
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -82,6 +82,8 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
+tokio-stream.workspace = true
+tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }

 [dev-dependencies]
 criterion.workspace = true
--- a/pageserver/src/bin/getpage_bench_http.rs
+++ b/pageserver/src/bin/getpage_bench_http.rs
@@ -0,0 +1,245 @@
+use clap::Parser;
+use hyper::client::conn::Parts;
+use hyper::client::HttpConnector;
+use hyper::{Body, Client, Uri};
+use pageserver::{repository, tenant};
+use rand::prelude::*;
+use std::env::args;
+use std::future::Future;
+use std::str::FromStr;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread;
+use tokio::sync::mpsc::{channel, Sender};
+use tokio::sync::Mutex as AsyncMutex;
+use tokio::task::JoinHandle;
+
+struct Key(repository::Key);
+
+impl std::str::FromStr for Key {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        repository::Key::from_hex(s).map(Key)
+    }
+}
+
+struct KeyRange {
+    start: Key,
+    end: Key,
+}
+
+impl KeyRange {
+    fn len(&self) -> i128 {
+        self.end.0.to_i128() - self.start.0.to_i128()
+    }
+}
+
+#[derive(clap::Parser)]
+struct Args {
+    #[clap(long, default_value = "http://localhost:9898")]
+    ps_endpoint: String,
+    // tenant_id: String,
+    // timeline_id: String,
+    num_tasks: usize,
+    num_requests: usize,
+    tenants: Option<Vec<String>>,
+    #[clap(long)]
+    pick_n_tenants: Option<usize>,
+}
+
+#[derive(Debug, Default)]
+struct Stats {
+    completed_requests: AtomicU64,
+}
+
+impl Stats {
+    fn inc(&self) {
+        self.completed_requests.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+#[tokio::main]
+async fn main() {
+    let args: &'static Args = Box::leak(Box::new(Args::parse()));
+
+    let client = Client::new();
+
+    let tenants = if let Some(tenants) = &args.tenants {
+        tenants.clone()
+    } else {
+        // let tenant_id = "b97965931096047b2d54958756baee7b";
+        // let timeline_id = "2868f84a8d166779e4c651b116c45059";
+
+        let resp = client
+            .get(Uri::try_from(&format!("{}/v1/tenant", args.ps_endpoint)).unwrap())
+            .await
+            .unwrap();
+
+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let tenants: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let mut out = Vec::new();
+        for t in tenants.as_array().unwrap() {
+            if let Some(limit) = args.pick_n_tenants {
+                if out.len() >= limit {
+                    break;
+                }
+            }
+            out.push(t.get("id").unwrap().as_str().unwrap().to_owned());
+        }
+        if let Some(limit) = args.pick_n_tenants {
+            assert_eq!(out.len(), limit);
+        }
+        out
+    };
+
+    let mut tenant_timelines = Vec::new();
+    for tenant_id in tenants {
+        let resp = client
+            .get(
+                Uri::try_from(&format!(
+                    "{}/v1/tenant/{}/timeline",
+                    args.ps_endpoint, tenant_id
+                ))
+                .unwrap(),
+            )
+            .await
+            .unwrap();
+
+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let timelines: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        for t in timelines.as_array().unwrap() {
+            let timeline_id = t.get("timeline_id").unwrap().as_str().unwrap().to_owned();
+            tenant_timelines.push((tenant_id.clone(), timeline_id));
+        }
+    }
+    println!("tenant_timelines:\n{:?}", tenant_timelines);
+
+    let mut stats = Arc::new(Stats::default());
+
+    tokio::spawn({
+        let stats = Arc::clone(&stats);
+        async move {
+            loop {
+                let start = std::time::Instant::now();
+                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
+                let elapsed = start.elapsed();
+                println!(
+                    "RPS: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64()
+                );
+            }
+        }
+    });
+
+    let mut tasks = Vec::new();
+    for (tenant_id, timeline_id) in tenant_timelines {
+        let t = tokio::spawn(timeline(
+            args,
+            client.clone(),
+            tenant_id,
+            timeline_id,
+            Arc::clone(&stats),
+        ));
+        tasks.push(t);
+    }
+
+    for t in tasks {
+        t.await.unwrap();
+    }
+}
+
+fn timeline(
+    args: &'static Args,
+    client: Client<HttpConnector, Body>,
+    tenant_id: String,
+    timeline_id: String,
+    stats: Arc<Stats>,
+) -> impl Future<Output = ()> {
+    async move {
+        let mut resp = client
+            .get(
+                Uri::try_from(&format!(
+                    "{}/v1/tenant/{}/timeline/{}/keyspace",
+                    args.ps_endpoint, tenant_id, timeline_id
+                ))
+                .unwrap(),
+            )
+            .await
+            .unwrap();
+        if !resp.status().is_success() {
+            panic!("Failed to get keyspace: {resp:?}");
+        }
+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let keyspace: serde_json::Value = serde_json::from_slice(&body).unwrap();
+
+        let lsn = Arc::new(keyspace["at_lsn"].as_str().unwrap().to_owned());
+
+        let ranges = keyspace["keys"]
+            .as_array()
+            .unwrap()
+            .iter()
+            .map(|r| {
+                let r = r.as_array().unwrap();
+                assert_eq!(r.len(), 2);
+                let start = Key::from_str(r[0].as_str().unwrap()).unwrap();
+                let end = Key::from_str(r[1].as_str().unwrap()).unwrap();
+                KeyRange { start, end }
+            })
+            .collect::<Vec<_>>();
+
+        // weighted ranges
+        let weights = ranges.iter().map(|r| r.len()).collect::<Vec<_>>();
+
+        let ranges = Arc::new(ranges);
+        let weights = Arc::new(weights);
+
+        let (tx, mut rx) = channel::<i32>(1000);
+        let tx = Arc::new(AsyncMutex::new(tx));
+
+        let mut tasks = Vec::<JoinHandle<()>>::new();
+
+        let start = std::time::Instant::now();
+
+        for i in 0..args.num_tasks {
+            let ranges = ranges.clone();
+            let weights = weights.clone();
+            let lsn = lsn.clone();
+            let client = client.clone();
+            let tenant_id = tenant_id.clone();
+            let timeline_id = timeline_id.clone();
+            let stats = Arc::clone(&stats);
+            let task = tokio::spawn(async move {
+                for i in 0..args.num_requests {
+                    let key = {
+                        let mut rng = rand::thread_rng();
+                        let r = ranges.choose_weighted(&mut rng, |r| r.len()).unwrap();
+                        let key = rng.gen_range((r.start.0.to_i128()..r.end.0.to_i128()));
+                        key
+                    };
+                    let url = format!(
+                        "{}/v1/tenant/{}/timeline/{}/getpage?key={:036x}&lsn={}",
+                        args.ps_endpoint, tenant_id, timeline_id, key, lsn
+                    );
+                    let uri = url.parse::<Uri>().unwrap();
+                    let resp = client.get(uri).await.unwrap();
+                    stats.inc();
+                }
+            });
+            tasks.push(task);
+        }
+
+        drop(tx);
+
+        for task in tasks {
+            task.await.unwrap();
+        }
+
+        let elapsed = start.elapsed();
+        println!(
+            "RPS: {:.0}",
+            (args.num_requests * args.num_tasks) as f64 / elapsed.as_secs_f64()
+        );
+    }
+}
--- a/pageserver/src/bin/getpage_bench_libpq.rs
+++ b/pageserver/src/bin/getpage_bench_libpq.rs
@@ -0,0 +1,411 @@
+use anyhow::Context;
+use clap::Parser;
+use futures::{SinkExt, TryStreamExt};
+use hyper::client::conn::Parts;
+use hyper::client::HttpConnector;
+use hyper::{Client, Uri};
+use pageserver::page_cache::PAGE_SZ;
+use pageserver::pgdatadir_mapping::{is_rel_block_key, key_to_rel_block};
+use pageserver::{repository, tenant};
+use pageserver_api::models::{
+    PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
+};
+use pageserver_api::reltag::RelTag;
+use rand::prelude::*;
+use scopeguard::defer;
+use std::env::args;
+use std::future::Future;
+use std::str::FromStr;
+use std::sync::atomic::{AtomicU64, Ordering};
+use std::sync::{Arc, Mutex};
+use std::thread;
+use tokio::sync::mpsc::{channel, Sender};
+use tokio::sync::Mutex as AsyncMutex;
+use tokio::task::JoinHandle;
+use tokio_stream::{Stream, StreamExt};
+use utils::completion;
+use utils::lsn::Lsn;
+
+/// Newtype around [`repository::Key`] so this binary can give it a
+/// [`FromStr`] implementation based on `repository::Key::from_hex`.
+struct Key(repository::Key);
+
+impl std::str::FromStr for Key {
+    type Err = anyhow::Error;
+
+    /// Parses `s` via `repository::Key::from_hex` and wraps the result.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        let inner = repository::Key::from_hex(s)?;
+        Ok(Key(inner))
+    }
+}
+
+/// A range of keys in their `i128` form; sampled as `start..end`, i.e. `end`
+/// is exclusive.
+struct KeyRange {
+    start: i128,
+    end: i128,
+}
+
+impl KeyRange {
+    /// Width of the range (`end - start`); used as the weight when a range is
+    /// picked at random.
+    fn len(&self) -> i128 {
+        let (lo, hi) = (self.start, self.end);
+        hi - lo
+    }
+}
+
+/// A relation tag plus a block number within that relation; this is the `key`
+/// argument of `getpage_client::Client::getpage`.
+struct RelTagBlockNo {
+    rel_tag: RelTag,
+    block_no: u32,
+}
+
+// Command-line arguments. Plain `//` comments are used on purpose: `///` doc
+// comments would be picked up by clap and change the generated help text.
+#[derive(clap::Parser)]
+struct Args {
+    // Base URL for the HTTP requests that list tenants/timelines and fetch the keyspace.
+    #[clap(long, default_value = "http://localhost:9898")]
+    ps_endpoint: String,
+    // Connection string handed to `getpage_client::Client::new` for the pagestream connections.
+    #[clap(long, default_value = "postgres://postgres@localhost:64000")]
+    pq_client_connstring: String,
+    // tenant_id: String,
+    // timeline_id: String,
+    // Number of tasks spawned per timeline; each task opens its own connection.
+    num_tasks: usize,
+    // Number of requests each task issues before shutting its client down.
+    num_requests: usize,
+    // Tenant ids to benchmark; when absent, `main` lists them via `ps_endpoint`.
+    tenants: Option<Vec<String>>,
+    // When tenants are listed over HTTP, take exactly this many (asserted in `main`).
+    #[clap(long)]
+    pick_n_tenants: Option<usize>,
+    // What each request does, see `Mode`.
+    #[clap(subcommand)]
+    mode: Mode,
+}
+
+// Kind of request issued by every benchmark task (clap subcommand).
+// `//` rather than `///` so that clap's help output is unchanged.
+#[derive(clap::Parser, Clone)]
+enum Mode {
+    // Request a randomly chosen relation block via `Client::getpage`.
+    GetPage,
+    // Send the `NoOp` pagestream message via `Client::noop`.
+    NoOp,
+}
+
+/// Counters shared between the benchmark tasks and the reporter task in `main`.
+#[derive(Debug, Default)]
+struct Stats {
+    /// Requests completed since the reporter last swapped this back to zero.
+    completed_requests: AtomicU64,
+}
+
+impl Stats {
+    /// Records one completed request.
+    fn inc(&self) {
+        // Relaxed: the value is only a counter, nothing is synchronized through it.
+        self.completed_requests.fetch_add(1, Ordering::Relaxed);
+    }
+}
+
+/// Entry point: determines the tenants (from the command line or the
+/// pageserver HTTP API), lists their timelines, then runs [`timeline`] for
+/// every (tenant, timeline) pair concurrently while a background task prints
+/// the request rate once per second.
+#[tokio::main]
+async fn main() {
+    // Leaked so that every spawned task can borrow the arguments as `&'static`.
+    let args: &'static Args = Box::leak(Box::new(Args::parse()));
+
+    // std::env::set_var("RUST_LOG", "info,tokio_postgres=trace");
+    // tracing_subscriber::fmt::init();
+
+    let client = Client::new();
+
+    let tenants = if let Some(tenants) = &args.tenants {
+        tenants.clone()
+    } else {
+        // let tenant_id = "b97965931096047b2d54958756baee7b";
+        // let timeline_id = "2868f84a8d166779e4c651b116c45059";
+
+        let tenants = get_json(&client, format!("{}/v1/tenant", args.ps_endpoint)).await;
+        let mut out = Vec::new();
+        for t in tenants.as_array().unwrap() {
+            if let Some(limit) = args.pick_n_tenants {
+                if out.len() >= limit {
+                    break;
+                }
+            }
+            out.push(t.get("id").unwrap().as_str().unwrap().to_owned());
+        }
+        if let Some(limit) = args.pick_n_tenants {
+            assert_eq!(
+                out.len(),
+                limit,
+                "pageserver returned fewer tenants than requested via --pick-n-tenants"
+            );
+        }
+        out
+    };
+
+    let mut tenant_timelines = Vec::new();
+    for tenant_id in tenants {
+        let timelines = get_json(
+            &client,
+            format!("{}/v1/tenant/{}/timeline", args.ps_endpoint, tenant_id),
+        )
+        .await;
+        for t in timelines.as_array().unwrap() {
+            let timeline_id = t.get("timeline_id").unwrap().as_str().unwrap().to_owned();
+            tenant_timelines.push((tenant_id.clone(), timeline_id));
+        }
+    }
+    println!("tenant_timelines:\n{:?}", tenant_timelines);
+
+    let stats = Arc::new(Stats::default());
+
+    // Reporter: once per second, take-and-reset the counter and print the rate.
+    tokio::spawn({
+        let stats = Arc::clone(&stats);
+        async move {
+            loop {
+                let start = std::time::Instant::now();
+                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
+                let elapsed = start.elapsed();
+                println!(
+                    "RPS: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64()
+                );
+            }
+        }
+    });
+
+    let mut tasks = Vec::new();
+    for (tenant_id, timeline_id) in tenant_timelines {
+        let stats = Arc::clone(&stats);
+        let t = tokio::spawn(timeline(
+            args,
+            client.clone(),
+            tenant_id,
+            timeline_id,
+            stats,
+        ));
+        tasks.push(t);
+    }
+
+    for t in tasks {
+        t.await.unwrap();
+    }
+}
+
+/// Issues a GET for `url` and parses the response body as JSON.
+///
+/// Panics if the request fails, the status is not a success, or the body is
+/// not valid JSON. Checking the status here gives a clear message instead of
+/// a confusing `unwrap` failure later when an error document is returned.
+async fn get_json(client: &Client<HttpConnector, hyper::Body>, url: String) -> serde_json::Value {
+    let resp = client.get(Uri::try_from(&url).unwrap()).await.unwrap();
+    if !resp.status().is_success() {
+        panic!("GET {url} failed: {resp:?}");
+    }
+    let body = hyper::body::to_bytes(resp).await.unwrap();
+    serde_json::from_slice(&body).unwrap()
+}
+
+/// Benchmarks a single timeline.
+///
+/// Fetches the timeline's keyspace over HTTP, keeps the key ranges whose both
+/// ends are relation-block keys, then spawns `args.num_tasks` tasks. Each task
+/// opens its own pagestream connection and issues `args.num_requests`
+/// requests of the kind selected by `args.mode`, bumping `stats` after each.
+///
+/// Panics on any HTTP, parsing or request failure, and (via `unimplemented!`)
+/// on a key range that is only partially made of relation-block keys.
+fn timeline(
+    args: &'static Args,
+    http_client: Client<HttpConnector, hyper::Body>,
+    tenant_id: String,
+    timeline_id: String,
+    stats: Arc<Stats>,
+) -> impl Future<Output = ()> + Send + Sync {
+    async move {
+        let resp = http_client
+            .get(
+                Uri::try_from(&format!(
+                    "{}/v1/tenant/{}/timeline/{}/keyspace",
+                    args.ps_endpoint, tenant_id, timeline_id
+                ))
+                .unwrap(),
+            )
+            .await
+            .unwrap();
+        if !resp.status().is_success() {
+            panic!("Failed to get keyspace: {resp:?}");
+        }
+        let body = hyper::body::to_bytes(resp).await.unwrap();
+        let keyspace: serde_json::Value = serde_json::from_slice(&body).unwrap();
+        let lsn: Lsn = keyspace["at_lsn"].as_str().unwrap().parse().unwrap();
+
+        let ranges = keyspace["keys"]
+            .as_array()
+            .unwrap()
+            .iter()
+            .filter_map(|r| {
+                let r = r.as_array().unwrap();
+                assert_eq!(r.len(), 2);
+                let start = Key::from_str(r[0].as_str().unwrap()).unwrap();
+                let end = Key::from_str(r[1].as_str().unwrap()).unwrap();
+                // filter out non-relblock keys
+                match (is_rel_block_key(start.0), is_rel_block_key(end.0)) {
+                    (true, true) => Some(KeyRange {
+                        start: start.0.to_i128(),
+                        end: end.0.to_i128(),
+                    }),
+                    (true, false) | (false, true) => {
+                        unimplemented!("split up range")
+                    }
+                    (false, false) => None,
+                }
+            })
+            .collect::<Vec<_>>();
+
+        // Shared read-only between all tasks of this timeline.
+        let ranges = Arc::new(ranges);
+
+        let mut tasks = Vec::<JoinHandle<()>>::new();
+
+        for _ in 0..args.num_tasks {
+            let ranges = ranges.clone();
+            let tenant_id = tenant_id.clone();
+            let timeline_id = timeline_id.clone();
+            let task = tokio::spawn({
+                let stats = Arc::clone(&stats);
+                async move {
+                    let mut client = getpage_client::Client::new(
+                        args.pq_client_connstring.clone(),
+                        tenant_id.clone(),
+                        timeline_id.clone(),
+                    )
+                    .await
+                    .unwrap();
+                    for _ in 0..args.num_requests {
+                        match args.mode {
+                            Mode::GetPage => {
+                                // The RNG is confined to this block so that it is
+                                // dropped before the `.await` below.
+                                let key = {
+                                    let mut rng = rand::thread_rng();
+                                    // Ranges are weighted by their width.
+                                    let r = ranges
+                                        .choose_weighted(&mut rng, |r| r.len())
+                                        .expect("keyspace has no non-empty relation-block key range to sample from");
+                                    let key: i128 = rng.gen_range(r.start..r.end);
+                                    let key = repository::Key::from_i128(key);
+                                    // XXX filter these out when we iterate the keyspace
+                                    assert!(
+                                        is_rel_block_key(key),
+                                        "we filter non-relblock keys out above"
+                                    );
+                                    let (rel_tag, block_no) =
+                                        key_to_rel_block(key).expect("we just checked");
+                                    RelTagBlockNo { rel_tag, block_no }
+                                };
+                                client
+                                    .getpage(key, lsn)
+                                    .await
+                                    .with_context(|| {
+                                        format!(
+                                            "getpage for tenant {} timeline {}",
+                                            tenant_id, timeline_id
+                                        )
+                                    })
+                                    .unwrap();
+                            }
+                            Mode::NoOp => {
+                                client.noop().await.unwrap();
+                            }
+                        }
+                        stats.inc();
+                    }
+                    client.shutdown().await;
+                }
+            });
+            tasks.push(task);
+        }
+
+        for task in tasks {
+            task.await.unwrap();
+        }
+    }
+}
+
+mod getpage_client {
+    use std::pin::Pin;
+
+    use futures::SinkExt;
+    use pageserver_api::models::{
+        PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest,
+        PagestreamGetPageResponse,
+    };
+    use tokio::task::JoinHandle;
+    use tokio_stream::StreamExt;
+    use tokio_util::sync::CancellationToken;
+    use utils::lsn::Lsn;
+
+    use crate::RelTagBlockNo;
+
+    /// One pagestream connection: the CopyBoth stream used for requests and
+    /// responses, plus the background task that drives the connection.
+    pub(crate) struct Client {
+        /// Duplex stream obtained from `copy_both_simple("pagestream ...")`.
+        copy_both: Pin<Box<tokio_postgres::CopyBothDuplex<bytes::Bytes>>>,
+        /// Drop guard of the token `conn_task` selects on; `shutdown` takes
+        /// and drops it before awaiting the task.
+        cancel_on_client_drop: Option<tokio_util::sync::DropGuard>,
+        /// Task that polls the `tokio_postgres` connection object.
+        conn_task: JoinHandle<()>,
+    }
+
+    impl Client {
+        /// Connects to `connstring` (no TLS), spawns a task that drives the
+        /// connection, and switches the session into CopyBoth mode with
+        /// `pagestream {tenant_id} {timeline_id}`.
+        ///
+        /// Returns an error if connecting or starting the pagestream fails.
+        pub fn new(
+            connstring: String,
+            tenant_id: String,
+            timeline_id: String,
+        ) -> impl std::future::Future<Output = anyhow::Result<Self>> + Send {
+            async move {
+                let (client, connection) =
+                    tokio_postgres::connect(&connstring, postgres::NoTls).await?;
+
+                let conn_task_cancel = CancellationToken::new();
+                // The connection object has to be polled for the client to make
+                // progress; stop polling once the token is cancelled.
+                let conn_task = tokio::spawn({
+                    let conn_task_cancel = conn_task_cancel.clone();
+                    async move {
+                        tokio::select! {
+                            _ = conn_task_cancel.cancelled() => {
+                                return;
+                            }
+                            res = connection => {
+                                res.unwrap();
+                            }
+                        }
+                    }
+                });
+
+                let copy_both: tokio_postgres::CopyBothDuplex<bytes::Bytes> = client
+                    .copy_both_simple(&format!("pagestream {tenant_id} {timeline_id}"))
+                    .await?;
+
+                Ok(Self {
+                    copy_both: Box::pin(copy_both),
+                    conn_task,
+                    cancel_on_client_drop: Some(conn_task_cancel.drop_guard()),
+                })
+            }
+        }
+
+        /// Cancels the connection task by dropping the guard, then waits for
+        /// the task to finish. Panics if that task panicked.
+        pub async fn shutdown(mut self) {
+            let _ = self.cancel_on_client_drop.take();
+            self.conn_task.await.unwrap();
+        }
+
+        /// Requests the page identified by `key` at `lsn` (with `latest: false`).
+        ///
+        /// Returns an error if the request fails or the server answers with
+        /// anything other than a `GetPage` response.
+        pub async fn getpage(
+            &mut self,
+            key: RelTagBlockNo,
+            lsn: Lsn,
+        ) -> anyhow::Result<PagestreamGetPageResponse> {
+            let req = PagestreamGetPageRequest {
+                latest: false,
+                rel: key.rel_tag,
+                blkno: key.block_no,
+                lsn,
+            };
+            let req = PagestreamFeMessage::GetPage(req);
+            match self.do_request(req).await? {
+                PagestreamBeMessage::GetPage(p) => Ok(p),
+                x => anyhow::bail!("Unexpected response: {:?}", x),
+            }
+        }
+
+        /// Sends a `NoOp` message and expects a `NoOp` response.
+        pub async fn noop(&mut self) -> anyhow::Result<()> {
+            match self.do_request(PagestreamFeMessage::NoOp).await? {
+                PagestreamBeMessage::NoOp => Ok(()),
+                x => anyhow::bail!("Unexpected response: {:?}", x),
+            }
+        }
+
+        /// Sends one serialized request and waits for exactly one response.
+        ///
+        /// Returns an error (instead of panicking) when the stream ends before
+        /// a response arrives, when reading from it fails, when the response
+        /// cannot be deserialized, or when it is an `Error` message.
+        async fn do_request(
+            &mut self,
+            req: PagestreamFeMessage,
+        ) -> Result<PagestreamBeMessage, anyhow::Error> {
+            let req: bytes::Bytes = req.serialize();
+            // let mut req = tokio_util::io::ReaderStream::new(&req);
+            let mut req = tokio_stream::once(Ok(req));
+
+            self.copy_both.send_all(&mut req).await?;
+
+            let next: Option<Result<bytes::Bytes, _>> = self.copy_both.next().await;
+            let next = next
+                .ok_or_else(|| anyhow::anyhow!("pagestream ended before a response was received"))?;
+            let next = next?;
+
+            match PagestreamBeMessage::deserialize(next)? {
+                PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e),
+                x => Ok(x),
+            }
+        }
+    }
+}
--- a/pageserver/src/bin/noop_server.rs
+++ b/pageserver/src/bin/noop_server.rs
@@ -0,0 +1,109 @@
+use anyhow::Context;
+use bytes::Buf;
+use clap::Parser;
+use pageserver_api::models::{PagestreamBeMessage, PagestreamErrorResponse, PagestreamFeMessage};
+use postgres_backend::{AuthType, PostgresBackend, QueryError};
+use pq_proto::{BeMessage, FeMessage};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tokio_util::sync::CancellationToken;
+
+// Command-line arguments of the no-op server (`//` comments so that clap's
+// help output is unchanged).
+#[derive(clap::Parser)]
+struct Args {
+    // Address passed to `TcpListener::bind`.
+    bind: String,
+}
+
+/// Binds the listener on `args.bind` and serves every accepted connection in
+/// its own task; a failing connection panics only that task.
+#[tokio::main]
+async fn main() {
+    let Args { bind } = Args::parse();
+
+    let listener = tokio::net::TcpListener::bind(&bind).await.unwrap();
+    loop {
+        let (stream, _peer) = listener.accept().await.unwrap();
+        tokio::spawn(async move { handle_connection(stream).await.unwrap() });
+    }
+}
+
+/// Serves one client connection: enables TCP_NODELAY, wraps the socket in a
+/// `TimeoutReader`, and runs a `PostgresBackend` with [`NoOpHandler`] on it.
+async fn handle_connection(socket: tokio::net::TcpStream) -> anyhow::Result<()> {
+    socket
+        .set_nodelay(true)
+        .context("could not set TCP_NODELAY")?;
+    let peer_addr = socket.peer_addr().context("get peer address")?;
+
+    let socket = tokio_io_timeout::TimeoutReader::new(socket);
+    tokio::pin!(socket);
+
+    let backend = PostgresBackend::new_from_io(socket, peer_addr, AuthType::Trust, None)?;
+    let mut handler = NoOpHandler;
+    // Nothing in this function ever cancels the token; it only provides the
+    // future that `run` expects.
+    let cancel = CancellationToken::new();
+    let shutdown_watcher = || {
+        let cancel = cancel.clone();
+        async move { cancel.cancelled().await }
+    };
+    backend.run(&mut handler, shutdown_watcher).await?;
+    Ok(())
+}
+
+/// Connection handler that accepts `pagestream ...` queries and answers
+/// `NoOp` requests; any other pagestream message gets an error response.
+struct NoOpHandler;
+
+#[async_trait::async_trait]
+impl<IO> postgres_backend::Handler<IO> for NoOpHandler
+where
+    IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+{
+    /// Accepts every startup packet.
+    fn startup(
+        &mut self,
+        _pgb: &mut PostgresBackend<IO>,
+        _sm: &pq_proto::FeStartupPacket,
+    ) -> Result<(), QueryError> {
+        Ok(())
+    }
+
+    /// Handles a `pagestream ` query: switches the client to COPYBOTH and then
+    /// answers one message at a time until the client terminates or disconnects.
+    async fn process_query(
+        &mut self,
+        pgb: &mut PostgresBackend<IO>,
+        query_string: &str,
+    ) -> Result<(), QueryError> {
+        if !query_string.starts_with("pagestream ") {
+            return Err(QueryError::Other(anyhow::anyhow!("not a pagestream query")));
+        }
+
+        // switch client to COPYBOTH
+        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
+        pgb.flush().await?;
+
+        loop {
+            let Some(msg) = pgb.read_message().await? else {
+                // client disconnected
+                return Ok(());
+            };
+
+            let copy_data_bytes = match msg {
+                FeMessage::CopyData(bytes) => bytes,
+                FeMessage::Terminate => return Ok(()),
+                m => {
+                    return Err(QueryError::Other(anyhow::anyhow!(
+                        "unexpected message: {m:?} during COPY"
+                    )));
+                }
+            };
+
+            let neon_fe_msg = PagestreamFeMessage::parse(&mut copy_data_bytes.reader())?;
+
+            // Unsupported requests are reported to the client as an error
+            // message; the connection stays open.
+            let response = match neon_fe_msg {
+                PagestreamFeMessage::NoOp => PagestreamBeMessage::NoOp,
+                x => {
+                    let err = QueryError::Other(anyhow::anyhow!(
+                        "this server only supports no-op: {x:?}"
+                    ));
+                    PagestreamBeMessage::Error(PagestreamErrorResponse {
+                        message: err.to_string(),
+                    })
+                }
+            };
+
+            let payload = response.serialize();
+            pgb.write_message_noflush(&BeMessage::CopyData(&payload))?;
+            pgb.flush().await?;
+        }
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -34,8 +34,11 @@ use postgres_backend::AuthType;
 use utils::logging::TracingErrorLayerEnablement;
 use utils::signals::ShutdownSignals;
 use utils::{
-    auth::JwtAuth, logging, project_build_tag, project_git_version, sentry_init::init_sentry,
-    signals::Signal, tcp_listener,
+    auth::{JwtAuth, SwappableJwtAuth},
+    logging, project_build_tag, project_git_version,
+    sentry_init::init_sentry,
+    signals::Signal,
+    tcp_listener,
 };

 project_git_version!(GIT_VERSION);
@@ -321,13 +324,12 @@ fn start_pageserver(
    let http_auth;
    let pg_auth;
    if conf.http_auth_type == AuthType::NeonJWT || conf.pg_auth_type == AuthType::NeonJWT {
-        // unwrap is ok because check is performed when creating config, so path is set and file exists
+        // unwrap is ok because check is performed when creating config, so path is set and exists
        let key_path = conf.auth_validation_public_key_path.as_ref().unwrap();
-        info!(
-            "Loading public key for verifying JWT tokens from {:#?}",
-            key_path
-        );
-        let auth: Arc<JwtAuth> = Arc::new(JwtAuth::from_key_path(key_path)?);
+        info!("Loading public key(s) for verifying JWT tokens from {key_path:?}");
+
+        let jwt_auth = JwtAuth::from_key_path(key_path)?;
+        let auth: Arc<SwappableJwtAuth> = Arc::new(SwappableJwtAuth::new(jwt_auth));

        http_auth = match &conf.http_auth_type {
            AuthType::Trust => None,
@@ -410,7 +412,7 @@ fn start_pageserver(

    // Scan the local 'tenants/' directory and start loading the tenants
    let deletion_queue_client = deletion_queue.new_client();
-    BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
+    let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
        conf,
        TenantSharedResources {
            broker_client: broker_client.clone(),
@@ -420,6 +422,7 @@ fn start_pageserver(
        order,
        shutdown_pageserver.clone(),
    ))?;
+    let tenant_manager = Arc::new(tenant_manager);

    BACKGROUND_RUNTIME.spawn({
        let init_done_rx = init_done_rx;
@@ -548,6 +551,7 @@ fn start_pageserver(
        let router_state = Arc::new(
            http::routes::State::new(
                conf,
+                tenant_manager,
                http_auth.clone(),
                remote_storage.clone(),
                broker_client.clone(),
--- a/pageserver/src/bin/tokio_tcp_bench.rs
+++ b/pageserver/src/bin/tokio_tcp_bench.rs
@@ -0,0 +1,130 @@
+use std::env::args;
+
+use clap::Parser;
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+
+// Command-line interface of the TCP ping-pong benchmark. `//` comments are
+// used throughout because `///` would change clap's generated help text.
+#[derive(clap::Parser)]
+struct Args {
+    // Whether this process acts as the client or the server.
+    #[clap(subcommand)]
+    mode: Mode,
+}
+
+// The two roles; `main` dispatches to `client::client` or `server::server`.
+#[derive(clap::Parser)]
+enum Mode {
+    Client(Client),
+    Server(Server),
+}
+
+// Client options.
+#[derive(clap::Parser)]
+struct Client {
+    // Number of concurrent client tasks, each with its own connection.
+    num_tasks: usize,
+}
+// Server options (none at the moment).
+#[derive(clap::Parser)]
+struct Server {}
+
+/// Parses the command line and runs the selected role until it returns.
+#[tokio::main]
+async fn main() {
+    // Leaked so the role functions can hold their options as `&'static`.
+    let args: &'static Args = Box::leak(Box::new(Args::parse()));
+
+    match &args.mode {
+        Mode::Client(client_args) => client::client(client_args).await,
+        Mode::Server(server_args) => server::server(server_args).await,
+    }
+}
+
+mod client {
+    use std::sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    };
+
+    use tokio::io::{AsyncReadExt, AsyncWriteExt};
+
+    use super::Client;
+
+    /// Counter shared between the client tasks and the reporter task.
+    #[derive(Debug, Default)]
+    struct Stats {
+        completed_requests: AtomicU64,
+    }
+
+    impl Stats {
+        /// Records one completed round trip.
+        fn inc(&self) {
+            self.completed_requests.fetch_add(1, Ordering::Relaxed);
+        }
+    }
+
+    /// Runs `args.num_tasks` ping-pong clients and prints the aggregate and
+    /// per-client request rate once per second.
+    ///
+    /// Panics as soon as an awaited client task has panicked or returned an
+    /// I/O error, so that connection problems are not silently ignored.
+    pub(crate) async fn client(args: &'static Client) {
+        let stats = Arc::new(Stats::default());
+
+        // Reporter: take-and-reset the counter every second.
+        tokio::spawn({
+            let stats = Arc::clone(&stats);
+            async move {
+                loop {
+                    let start = std::time::Instant::now();
+                    tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                    let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
+                    let elapsed = start.elapsed();
+                    println!(
+                        "RPS: {:.0} RPS/client: {:.2}",
+                        completed_requests as f64 / elapsed.as_secs_f64(),
+                        completed_requests as f64 / elapsed.as_secs_f64() / args.num_tasks as f64,
+                    );
+                }
+            }
+        });
+
+        let mut tasks = Vec::new();
+        for _ in 0..args.num_tasks {
+            let stats = Arc::clone(&stats);
+            let t = tokio::spawn(client_task(args, stats));
+            tasks.push(t);
+        }
+
+        for t in tasks {
+            // First unwrap: the task panicked or was cancelled.
+            // Second unwrap: the client loop returned an I/O error.
+            t.await.unwrap().unwrap();
+        }
+    }
+
+    /// Connects to `localhost:65000` and loops forever: write one byte, read
+    /// one byte back, count the round trip. Returns only on an I/O error.
+    async fn client_task(_args: &'static Client, stats: Arc<Stats>) -> anyhow::Result<()> {
+        let mut conn = tokio::net::TcpStream::connect("localhost:65000").await?;
+        conn.set_nodelay(true)?;
+
+        loop {
+            let mut buf = [0u8; 1];
+            conn.write_all(&buf).await?;
+            conn.read_exact(&mut buf).await?;
+            stats.inc();
+        }
+    }
+}
+
+mod server {
+
+    use anyhow::Context;
+    use tokio::io::{AsyncReadExt, AsyncWriteExt};
+
+    use super::Server;
+
+    /// Listens on `localhost:65000` and serves every accepted connection in
+    /// its own task.
+    pub(crate) async fn server(args: &'static Server) {
+        let listener = tokio::net::TcpListener::bind("localhost:65000").await.unwrap();
+        loop {
+            let (socket, _) = listener.accept().await.unwrap();
+            tokio::spawn(async move {
+                server_handle_connection(args, socket).await.unwrap();
+            });
+        }
+    }
+
+    /// Echoes back whatever the peer sends until the peer closes the
+    /// connection.
+    ///
+    /// The previous implementation used `read_exact` on a 4096-byte buffer.
+    /// The client sends a single byte and then waits for the echo, so the
+    /// server never saw a full buffer and both sides blocked forever. Reading
+    /// whatever is available and echoing exactly that works for any client
+    /// message size, and a clean disconnect (`read` returning 0) now ends the
+    /// task with `Ok` instead of an `UnexpectedEof` error.
+    async fn server_handle_connection(
+        _args: &'static Server,
+        socket: tokio::net::TcpStream,
+    ) -> anyhow::Result<()> {
+        socket
+            .set_nodelay(true)
+            .context("could not set TCP_NODELAY")?;
+        // let socket = tokio_io_timeout::TimeoutReader::new(socket);
+        tokio::pin!(socket);
+
+        let mut buf = [0u8; 4096];
+        loop {
+            let n = socket.read(&mut buf).await?;
+            if n == 0 {
+                // peer closed the connection
+                return Ok(());
+            }
+            socket.write_all(&buf[..n]).await?;
+        }
+    }
+}
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -161,7 +161,7 @@ pub struct PageServerConf {
    pub http_auth_type: AuthType,
    /// authentication method for libpq connections from compute
    pub pg_auth_type: AuthType,
-    /// Path to a file containing public key for verifying JWT tokens.
+    /// Path to a file or directory containing public key(s) for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -345,7 +345,7 @@ impl DeletionList {
                result.extend(
                    timeline_layers
                        .into_iter()
-                        .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
+                        .map(|l| timeline_remote_path.join(Utf8PathBuf::from(l))),
                );
            }
        }
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -55,21 +55,24 @@ impl Deleter {

    /// Wrap the remote `delete_objects` with a failpoint
    async fn remote_delete(&self) -> Result<(), anyhow::Error> {
-        fail::fail_point!("deletion-queue-before-execute", |_| {
-            info!("Skipping execution, failpoint set");
-            metrics::DELETION_QUEUE
-                .remote_errors
-                .with_label_values(&["failpoint"])
-                .inc();
-            Err(anyhow::anyhow!("failpoint hit"))
-        });
-
        // A backoff::retry is used here for two reasons:
        // - To provide a backoff rather than busy-polling the API on errors
        // - To absorb transient 429/503 conditions without hitting our error
        //   logging path for issues deleting objects.
        backoff::retry(
-            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            || async {
+                fail::fail_point!("deletion-queue-before-execute", |_| {
+                    info!("Skipping execution, failpoint set");
+
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
+                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
+                });
+
+                self.remote_storage.delete_objects(&self.accumulator).await
+            },
            |_| false,
            3,
            10,
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -52,6 +52,31 @@ paths:
              schema:
                type: object

+  /v1/reload_auth_validation_keys:
+    post:
+      description: Reloads the JWT public keys from their pre-configured location on disk.
+      responses:
+        "200":
+          description: The reload completed successfully.
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "500":
+          description: Generic operation error (also hits if no keys were found)
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}:
    parameters:
      - name: tenant_id
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -20,6 +20,7 @@ use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::auth::JwtAuth;
 use utils::http::endpoint::request_span;
 use utils::http::json::json_request_or_empty_body;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -35,8 +36,8 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
-    GetTenantError, SetNewTenantConfigError, TenantMapError, TenantMapInsertError, TenantSlotError,
-    TenantSlotUpsertError, TenantStateError,
+    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
+    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
@@ -45,7 +46,7 @@ use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSha
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use utils::{
-    auth::JwtAuth,
+    auth::SwappableJwtAuth,
    generation::Generation,
    http::{
        endpoint::{self, attach_openapi_ui, auth_middleware, check_permission_with},
@@ -63,7 +64,8 @@ use super::models::ConfigureFailpointsRequest;

 pub struct State {
    conf: &'static PageServerConf,
-    auth: Option<Arc<JwtAuth>>,
+    tenant_manager: Arc<TenantManager>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    allowlist_routes: Vec<Uri>,
    remote_storage: Option<GenericRemoteStorage>,
    broker_client: storage_broker::BrokerClientChannel,
@@ -74,7 +76,8 @@ pub struct State {
 impl State {
    pub fn new(
        conf: &'static PageServerConf,
-        auth: Option<Arc<JwtAuth>>,
+        tenant_manager: Arc<TenantManager>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        remote_storage: Option<GenericRemoteStorage>,
        broker_client: storage_broker::BrokerClientChannel,
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
@@ -86,6 +89,7 @@ impl State {
            .collect::<Vec<_>>();
        Ok(Self {
            conf,
+            tenant_manager,
            auth,
            allowlist_routes,
            remote_storage,
@@ -389,6 +393,32 @@ async fn status_handler(
    json_response(StatusCode::OK, StatusResponse { id: config.id })
 }

+/// Reloads the JWT public key(s) from the configured path and swaps them into
+/// the shared auth object.
+///
+/// Responds with 400 when auth is not configured, 500 when the keys cannot be
+/// loaded (the previous keys stay in place), and 200 on success.
+async fn reload_auth_validation_keys_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let config = get_config(&request);
+    let state = get_state(&request);
+    let shared_auth = match &state.auth {
+        Some(auth) => auth,
+        None => return json_response(StatusCode::BAD_REQUEST, ()),
+    };
+    // unwrap is ok because check is performed when creating config, so path is set and exists
+    let key_path = config.auth_validation_public_key_path.as_ref().unwrap();
+    info!("Reloading public key(s) for verifying JWT tokens from {key_path:?}");
+
+    let new_auth = match JwtAuth::from_key_path(key_path) {
+        Ok(loaded) => loaded,
+        Err(e) => {
+            warn!("Error reloading public keys from {key_path:?}: {e:}");
+            return json_response(StatusCode::INTERNAL_SERVER_ERROR, ());
+        }
+    };
+    shared_auth.swap(new_auth);
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_create_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -717,6 +747,46 @@ async fn tenant_ignore_handler(
    json_response(StatusCode::OK, ())
 }

+/// HTTP handler that duplicates the tenant named by the `tenant_id` path
+/// parameter into a new tenant described by the `TenantCreateRequest` body.
+///
+/// Responds with 201 and the new tenant id on success; request parsing,
+/// permission, configuration and `mgr::duplicate_tenant` errors are
+/// propagated with `?`.
+async fn tenant_duplicate_handler(
+    mut request: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let src_tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+
+    let request_data: TenantCreateRequest = json_request(&mut request).await?;
+    let new_tenant_id = request_data.new_tenant_id;
+    check_permission(&request, None)?;
+
+    // Held until the end of the function; presumably records the elapsed time
+    // when dropped. TODO confirm against the metrics type.
+    let _timer = STORAGE_TIME_GLOBAL
+        .get_metric_with_label_values(&[StorageTimeOperation::DuplicateTenant.into()])
+        .expect("bug")
+        .start_timer();
+
+    // An invalid tenant config in the request is reported as a bad request.
+    let tenant_conf =
+        TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+
+    let state = get_state(&request);
+
+    let generation = get_request_generation(state, request_data.generation)?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+
+    mgr::duplicate_tenant(
+        state.conf,
+        tenant_conf,
+        src_tenant_id,
+        new_tenant_id,
+        generation,
+        state.tenant_resources(),
+        &ctx,
+        &cancel,
+    )
+    .instrument(info_span!("tenant_duplicate", %src_tenant_id, tenant_id = %new_tenant_id))
+    .await?;
+
+    json_response(StatusCode::CREATED, TenantCreateResponse(new_tenant_id))
+}
+
 async fn tenant_list_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -1140,20 +1210,14 @@ async fn put_tenant_location_config_handler(
    let location_conf =
        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;

-    mgr::upsert_location(
-        state.conf,
-        tenant_id,
-        location_conf,
-        state.broker_client.clone(),
-        state.remote_storage.clone(),
-        state.deletion_queue_client.clone(),
-        &ctx,
-    )
-    .await
-    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-    // principle we might have hit something like concurrent API calls to the same tenant,
-    // which is not a 400 but a 409.
-    .map_err(ApiError::BadRequest)?;
+    state
+        .tenant_manager
+        .upsert_location(tenant_id, location_conf, &ctx)
+        .await
+        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+        // principle we might have hit something like concurrent API calls to the same tenant,
+        // which is not a 400 but a 409.
+        .map_err(ApiError::BadRequest)?;

    json_response(StatusCode::OK, ())
 }
@@ -1695,7 +1759,7 @@ where
 pub fn make_router(
    state: Arc<State>,
    launch_ts: &'static LaunchTimestamp,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
 ) -> anyhow::Result<RouterBuilder<hyper::Body, ApiError>> {
    let spec = include_bytes!("openapi_spec.yml");
    let mut router = attach_openapi_ui(endpoint::make_router(), spec, "/swagger.yml", "/v1/doc");
@@ -1724,6 +1788,9 @@ pub fn make_router(
        .put("/v1/failpoints", |r| {
            testing_api_handler("manage failpoints", r, failpoints_handler)
        })
+        .post("/v1/reload_auth_validation_keys", |r| {
+            api_handler(r, reload_auth_validation_keys_handler)
+        })
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
@@ -1760,6 +1827,9 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/ignore", |r| {
            api_handler(r, tenant_ignore_handler)
        })
+        .post("/v1/tenant/:tenant_id/duplicate", |r| {
+            api_handler(r, tenant_duplicate_handler)
+        })
        .get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
            api_handler(r, timeline_detail_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
+
 mod auth;
 pub mod basebackup;
 pub mod config;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -51,6 +51,9 @@ pub enum StorageTimeOperation {

    #[strum(serialize = "create tenant")]
    CreateTenant,
+
+    #[strum(serialize = "duplicate tenant")]
+    DuplicateTenant,
 }

 pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
@@ -757,6 +760,7 @@ pub enum SmgrQueryType {
    GetRelSize,
    GetPageAtLsn,
    GetDbSize,
+    NoOp,
 }

 #[derive(Debug)]
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -14,6 +14,7 @@ use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
 use futures::Stream;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
    PagestreamErrorResponse, PagestreamExistsRequest, PagestreamExistsResponse,
@@ -39,7 +40,7 @@ use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
 use utils::{
-    auth::{Claims, JwtAuth, Scope},
+    auth::{Claims, Scope, SwappableJwtAuth},
    id::{TenantId, TimelineId},
    lsn::Lsn,
    simple_rcu::RcuReadGuard,
@@ -121,7 +122,7 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()
 pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    listener: TcpListener,
    auth_type: AuthType,
    listener_ctx: RequestContext,
@@ -189,7 +190,7 @@ pub async fn libpq_listener_main(
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    socket: tokio::net::TcpStream,
    auth_type: AuthType,
    connection_ctx: RequestContext,
@@ -252,7 +253,7 @@ async fn page_service_conn_main(
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
-    auth: Option<Arc<JwtAuth>>,
+    auth: Option<Arc<SwappableJwtAuth>>,
    claims: Option<Claims>,

    /// The context created for the lifetime of the connection
@@ -266,7 +267,7 @@ impl PageServerHandler {
    pub fn new(
        conf: &'static PageServerConf,
        broker_client: storage_broker::BrokerClientChannel,
-        auth: Option<Arc<JwtAuth>>,
+        auth: Option<Arc<SwappableJwtAuth>>,
        connection_ctx: RequestContext,
    ) -> Self {
        PageServerHandler {
@@ -490,6 +491,11 @@ impl PageServerHandler {
                        span,
                    )
                }
+                PagestreamFeMessage::NoOp => {
+                    let _timer = metrics.start_timer(metrics::SmgrQueryType::NoOp);
+                    let span = tracing::info_span!("no_op");
+                    (Ok(PagestreamBeMessage::NoOp), span)
+                }
            };

            if let Err(e) = &response {
@@ -1330,6 +1336,9 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
+            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+                QueryError::Shutdown
+            }
            e => QueryError::Other(anyhow::anyhow!(e)),
        }
    }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -589,11 +589,7 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self
-                .list_rels(*spcnode, *dbnode, lsn, ctx)
-                .await
-                .context("list rels")?
-            {
+            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
                if cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
@@ -1704,6 +1700,7 @@ const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

+/// Guaranteed to return `Ok()` if [`is_rel_block_key`] returns `true` for `key`.
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    Ok(match key.field1 {
        0x00 => (
@@ -1719,7 +1716,8 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    })
 }

-fn is_rel_block_key(key: Key) -> bool {
+/// See [`key_to_rel_block`].
+pub fn is_rel_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0
 }

--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -573,10 +573,10 @@ impl<const L: usize> BuildNode<L> {
        BuildNode {
            num_children: 0,
            level,
-            prefix: Vec::with_capacity(16),
+            prefix: Vec::new(),
            suffix_len: 0,
-            keys: Vec::with_capacity(5024),
-            values: Vec::with_capacity(3140),
+            keys: Vec::new(),
+            values: Vec::new(),
            size: NODE_HDR_SIZE,
        }
    }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -6,9 +6,11 @@ use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::ops::Deref;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio::fs;
+use tokio::io::AsyncSeekExt;
 use utils::timeout::{timeout_cancellable, TimeoutCancellableError};

 use anyhow::Context;
@@ -30,7 +32,11 @@ use crate::metrics::TENANT_MANAGER as METRICS;
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
 use crate::tenant::delete::DeleteTenantFlow;
-use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
+use crate::tenant::span::debug_assert_current_span_has_tenant_id;
+use crate::tenant::storage_layer::{DeltaLayer, ImageLayer, LayerFileName};
+use crate::tenant::{
+    create_tenant_files, remote_timeline_client, AttachedTenantConf, IndexPart, Tenant, TenantState,
+};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -40,7 +46,7 @@ use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
 use super::timeline::delete::DeleteTimelineFlow;
-use super::TenantSharedResources;
+use super::{SpawnMode, TenantSharedResources};

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
@@ -200,6 +206,22 @@ async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
    Ok(())
 }

+/// The TenantManager is responsible for storing and mutating the collection of all tenants
+/// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
+/// lives inside the TenantManager.
+///
+/// The most important role of the TenantManager is to prevent conflicts: e.g. trying to attach
+/// the same tenant twice concurrently, or trying to configure the same tenant into secondary
+/// and attached modes concurrently.
+pub struct TenantManager {
+    conf: &'static PageServerConf, // process-wide pageserver configuration
+    // TODO: currently this is a `&'static` reference to the `TENANTS` static.  When we finish
+    // refactoring out of that static variable, the TenantManager can own the map itself.
+    // See https://github.com/neondatabase/neon/issues/5796
+    tenants: &'static std::sync::RwLock<TenantsMap>,
+    resources: TenantSharedResources, // cloned into every tenant spawned by `upsert_location`
+}
+
 fn emergency_generations(
    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
 ) -> HashMap<TenantId, Generation> {
@@ -366,7 +388,7 @@ pub async fn init_tenant_mgr(
    resources: TenantSharedResources,
    init_order: InitializationOrder,
    cancel: CancellationToken,
-) -> anyhow::Result<()> {
+) -> anyhow::Result<TenantManager> {
    let mut tenants = HashMap::new();

    let ctx = RequestContext::todo_child(TaskKind::Startup, DownloadBehavior::Warn);
@@ -468,7 +490,12 @@ pub async fn init_tenant_mgr(
    assert!(matches!(&*tenants_map, &TenantsMap::Initializing));
    METRICS.tenant_slots.set(tenants.len() as u64);
    *tenants_map = TenantsMap::Open(tenants);
-    Ok(())
+
+    Ok(TenantManager {
+        conf,
+        tenants: &TENANTS,
+        resources,
+    })
 }

 /// Wrapper for Tenant::spawn that checks invariants before running, and inserts
@@ -714,6 +741,171 @@ pub(crate) async fn create_tenant(
    Ok(created_tenant)
 }

+/// Duplicates tenant `src_tenant_id` into a new tenant `new_tenant_id` and attaches the copy.
+///
+/// For every remote timeline of the source tenant, each object under the timeline's remote
+/// prefix is downloaded into a local scratch directory, patched if it is a layer file (the
+/// tenant id in its summary is rewritten, the timeline id is kept), and uploaded under the
+/// new tenant's remote timeline path.  Afterwards the scratch directory is removed and the
+/// new tenant is attached with `generation` and `tenant_conf`.
+///
+/// Errors if no remote storage is configured, if the source tenant has remote prefixes other
+/// than timelines, if an object is neither the index part nor a parseable layer file name,
+/// or if any filesystem / remote storage operation fails.  Nothing is rolled back on
+/// failure: objects uploaded so far stay in remote storage, and the scratch directory is
+/// only wiped at the start of the next attempt for the same `new_tenant_id`.
+#[allow(clippy::too_many_arguments)]
+pub(crate) async fn duplicate_tenant(
+    conf: &'static PageServerConf,
+    tenant_conf: TenantConfOpt,
+    src_tenant_id: TenantId,
+    new_tenant_id: TenantId,
+    generation: Generation,
+    resources: TenantSharedResources,
+    ctx: &RequestContext,
+    cancel: &CancellationToken,
+) -> Result<(), TenantMapInsertError> {
+    debug_assert_current_span_has_tenant_id();
+
+    // Check the precondition before touching the local filesystem, so that a pageserver
+    // without remote storage does not leave an empty scratch directory behind.
+    let remote_storage = resources
+        .remote_storage
+        .as_ref()
+        .context("only works with remote storage")?;
+
+    // TODO: would be nice to use tenant_map_insert here, but, we're not ready to create a Tenant object yet
+    let tempdir = path_with_suffix_extension(
+        conf.tenants_path().join(&new_tenant_id.to_string()),
+        &format!("duplication.{TEMP_FILE_SUFFIX}"),
+    );
+    // A previous, failed attempt may have left its scratch directory behind.
+    tokio::fs::remove_dir_all(&tempdir)
+        .await
+        .or_else(|e| match e.kind() {
+            std::io::ErrorKind::NotFound => Ok(()),
+            _ => Err(e),
+        })
+        .context("pre-run clean up tempdir")?;
+
+    tokio::fs::create_dir(&tempdir)
+        .await
+        .context("create tempdir")?;
+
+    // Copy the tenant's data in S3
+    let (remote_src_timelines, other_prefixes) = remote_timeline_client::list_remote_timelines(
+        remote_storage,
+        src_tenant_id,
+        cancel.clone(),
+    )
+    .await
+    .context("list src timelines")?;
+
+    if !other_prefixes.is_empty() {
+        return Err(TenantMapInsertError::Other(anyhow::anyhow!(
+            "unimplemented: handling of other prefixes in src tenant: {:?}",
+            other_prefixes
+        )));
+    }
+
+    info!(?remote_src_timelines, "got src timelines");
+
+    for timeline_id in remote_src_timelines {
+        async {
+            // Per-timeline scratch directory; shadows the tenant-level one inside this block.
+            let tempdir = tempdir.join(&timeline_id.to_string());
+
+            tokio::fs::create_dir(&tempdir)
+                .await
+                .context("create tempdir for timeline")?;
+
+            let remote_src_tl =
+                remote_timeline_client::remote_timeline_path(&src_tenant_id, &timeline_id);
+            let remote_dst_tl =
+                remote_timeline_client::remote_timeline_path(&new_tenant_id, &timeline_id);
+
+            let object_names = remote_storage
+                .list_prefixes(Some(&remote_src_tl))
+                .await
+                .context("list timeline remote prefix")?;
+
+            for name in object_names {
+                async {
+                    let name = name.object_name().context(
+                        "list_prefixes return values should always have object_name()=Some",
+                    )?;
+                    let remote_src_obj = remote_src_tl.join(name);
+                    let remote_dst_obj = remote_dst_tl.join(name);
+
+                    // `create_new` makes a leftover file of the same name a hard error.
+                    let tmp_obj_filepath = tempdir.join(name);
+                    let mut tmp_obj_file = tokio::fs::OpenOptions::new()
+                        .read(true)
+                        .write(true)
+                        .create_new(true)
+                        .open(&tmp_obj_filepath)
+                        .await
+                        .context("create temp file")?;
+                    let mut tmp_dl = remote_storage
+                        .download(&remote_src_obj)
+                        .await
+                        .context("start download")?;
+                    let tmp_obj_size =
+                        tokio::io::copy(&mut tmp_dl.download_stream, &mut tmp_obj_file)
+                            .await
+                            .context("do the download")?;
+
+                    if name == IndexPart::FILE_NAME {
+                        // needs no patching
+                    } else {
+                        let name = LayerFileName::from_str(name).map_err(|e: String| {
+                            anyhow::anyhow!("unknown key in timeline s3 prefix: {name:?}: {e}")
+                        })?;
+                        // Layer files embed the tenant id in their summary; patch it in place.
+                        match name {
+                            LayerFileName::Image(_) => {
+                                ImageLayer::rewrite_tenant_timeline(
+                                    &tmp_obj_filepath,
+                                    new_tenant_id,
+                                    timeline_id, /* leave as is */
+                                    ctx,
+                                )
+                                .await
+                                .context("rewrite tenant timeline")?;
+                            }
+                            LayerFileName::Delta(_) => {
+                                DeltaLayer::rewrite_tenant_timeline(
+                                    &tmp_obj_filepath,
+                                    new_tenant_id,
+                                    timeline_id, /* leave as is */
+                                    ctx,
+                                )
+                                .await
+                                .context("rewrite tenant timeline")?;
+                            }
+                        }
+                    }
+
+                    info!(?remote_dst_obj, "uploading");
+
+                    // The upload API takes the size as `usize`; refuse to truncate silently.
+                    let upload_size = usize::try_from(tmp_obj_size)
+                        .context("object size does not fit into usize")?;
+
+                    // The download left the cursor at the end of the file.
+                    tmp_obj_file
+                        .seek(std::io::SeekFrom::Start(0))
+                        .await
+                        .context("seek tmp file to beginning for upload")?;
+                    remote_storage
+                        .upload(
+                            tmp_obj_file,
+                            upload_size,
+                            &remote_dst_obj,
+                            tmp_dl.metadata,
+                        )
+                        .await
+                        .context("upload modified")?;
+
+                    tokio::fs::remove_file(tmp_obj_filepath)
+                        .await
+                        .context("remove temp file")?;
+
+                    anyhow::Ok(())
+                }
+                .instrument(info_span!("copy object", object_name=?name))
+                .await
+                .context("copy object")?;
+            }
+            anyhow::Ok(())
+        }
+        .instrument(info_span!("copy_timeline", timeline_id=%timeline_id))
+        .await
+        // Name the timeline in the error itself; the span is not part of the returned error.
+        .with_context(|| format!("copy timeline {timeline_id}"))?;
+    }
+
+    tokio::fs::remove_dir_all(&tempdir)
+        .await
+        .context("post-run clean up tempdir")?;
+
+    attach_tenant(conf, new_tenant_id, generation, tenant_conf, resources, ctx).await
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum SetNewTenantConfigError {
    #[error(transparent)]
@@ -742,139 +934,134 @@ pub(crate) async fn set_new_tenant_config(
    Ok(())
 }

-#[instrument(skip_all, fields(%tenant_id))]
-pub(crate) async fn upsert_location(
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    new_location_config: LocationConf,
-    broker_client: storage_broker::BrokerClientChannel,
-    remote_storage: Option<GenericRemoteStorage>,
-    deletion_queue_client: DeletionQueueClient,
-    ctx: &RequestContext,
-) -> Result<(), anyhow::Error> {
-    info!("configuring tenant location {tenant_id} to state {new_location_config:?}");
+impl TenantManager {
+    #[instrument(skip_all, fields(%tenant_id))]
+    pub(crate) async fn upsert_location(
+        &self,
+        tenant_id: TenantId,
+        new_location_config: LocationConf,
+        ctx: &RequestContext,
+    ) -> Result<(), anyhow::Error> {
+        info!("configuring tenant location {tenant_id} to state {new_location_config:?}");

-    // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
-    // then we do not need to set the slot to InProgress, we can just call into the
-    // existng tenant.
-    {
-        let locked = TENANTS.read().unwrap();
-        let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
-        match (&new_location_config.mode, peek_slot) {
-            (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
-                if attach_conf.generation == tenant.generation {
-                    // A transition from Attached to Attached in the same generation, we may
-                    // take our fast path and just provide the updated configuration
-                    // to the tenant.
-                    tenant.set_new_location_config(AttachedTenantConf::try_from(
-                        new_location_config,
-                    )?);
+        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
+        // then we do not need to set the slot to InProgress, we can just call into the
+        // existing tenant.
+        {
+            let locked = self.tenants.read().unwrap();
+            let peek_slot = tenant_map_peek_slot(&locked, &tenant_id, TenantSlotPeekMode::Write)?;
+            match (&new_location_config.mode, peek_slot) {
+                (LocationMode::Attached(attach_conf), Some(TenantSlot::Attached(tenant))) => {
+                    if attach_conf.generation == tenant.generation {
+                        // A transition from Attached to Attached in the same generation, we may
+                        // take our fast path and just provide the updated configuration
+                        // to the tenant.
+                        tenant.set_new_location_config(AttachedTenantConf::try_from(
+                            new_location_config,
+                        )?);

-                    // Persist the new config in the background, to avoid holding up any
-                    // locks while we do so.
-                    // TODO
+                        // Persist the new config in the background, to avoid holding up any
+                        // locks while we do so.
+                        // TODO

-                    return Ok(());
-                } else {
-                    // Different generations, fall through to general case
+                        return Ok(());
+                    } else {
+                        // Different generations, fall through to general case
+                    }
+                }
+                _ => {
+                    // Not an Attached->Attached transition, fall through to general case
                }
            }
-            _ => {
-                // Not an Attached->Attached transition, fall through to general case
-            }
        }
-    }

-    // General case for upserts to TenantsMap, excluding the case above: we will substitute an
-    // InProgress value to the slot while we make whatever changes are required.  The state for
-    // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
-    // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
-    // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
-    let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;
+        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
+        // InProgress value to the slot while we make whatever changes are required.  The state for
+        // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
+        // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
+        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_id, TenantSlotAcquireMode::Any)?;

-    if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
-        // The case where we keep a Tenant alive was covered above in the special case
-        // for Attached->Attached transitions in the same generation.  By this point,
-        // if we see an attached tenant we know it will be discarded and should be
-        // shut down.
-        let (_guard, progress) = utils::completion::channel();
+        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
+            // The case where we keep a Tenant alive was covered above in the special case
+            // for Attached->Attached transitions in the same generation.  By this point,
+            // if we see an attached tenant we know it will be discarded and should be
+            // shut down.
+            let (_guard, progress) = utils::completion::channel();

-        match tenant.get_attach_mode() {
-            AttachmentMode::Single | AttachmentMode::Multi => {
-                // Before we leave our state as the presumed holder of the latest generation,
-                // flush any outstanding deletions to reduce the risk of leaking objects.
-                deletion_queue_client.flush_advisory()
+            match tenant.get_attach_mode() {
+                AttachmentMode::Single | AttachmentMode::Multi => {
+                    // Before we leave our state as the presumed holder of the latest generation,
+                    // flush any outstanding deletions to reduce the risk of leaking objects.
+                    self.resources.deletion_queue_client.flush_advisory()
+                }
+                AttachmentMode::Stale => {
+                    // If we're stale there's no point trying to flush deletions
+                }
+            };
+
+            info!("Shutting down attached tenant");
+            match tenant.shutdown(progress, false).await {
+                Ok(()) => {}
+                Err(barrier) => {
+                    info!("Shutdown already in progress, waiting for it to complete");
+                    barrier.wait().await;
+                }
            }
-            AttachmentMode::Stale => {
-                // If we're stale there's not point trying to flush deletions
+            slot_guard.drop_old_value().expect("We just shut it down");
+        }
+
+        let tenant_path = self.conf.tenant_path(&tenant_id);
+
+        let new_slot = match &new_location_config.mode {
+            LocationMode::Secondary(_) => {
+                let tenant_path = self.conf.tenant_path(&tenant_id);
+                // Directory doesn't need to be fsync'd because if we crash it can
+                // safely be recreated next time this tenant location is configured.
+                unsafe_create_dir_all(&tenant_path)
+                    .await
+                    .with_context(|| format!("Creating {tenant_path}"))?;
+
+                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                TenantSlot::Secondary
+            }
+            LocationMode::Attached(_attach_config) => {
+                let timelines_path = self.conf.timelines_path(&tenant_id);
+
+                // Directory doesn't need to be fsync'd because we do not depend on
+                // it to exist after crashes: it may be recreated when tenant is
+                // re-attached, see https://github.com/neondatabase/neon/issues/5550
+                unsafe_create_dir_all(&timelines_path)
+                    .await
+                    .with_context(|| format!("Creating {timelines_path}"))?;
+
+                Tenant::persist_tenant_config(self.conf, &tenant_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;
+
+                let tenant = tenant_spawn(
+                    self.conf,
+                    tenant_id,
+                    &tenant_path,
+                    self.resources.clone(),
+                    AttachedTenantConf::try_from(new_location_config)?,
+                    None,
+                    self.tenants,
+                    SpawnMode::Normal,
+                    ctx,
+                )?;
+
+                TenantSlot::Attached(tenant)
            }
        };

-        info!("Shutting down attached tenant");
-        match tenant.shutdown(progress, false).await {
-            Ok(()) => {}
-            Err(barrier) => {
-                info!("Shutdown already in progress, waiting for it to complete");
-                barrier.wait().await;
-            }
-        }
-        slot_guard.drop_old_value().expect("We just shut it down");
+        slot_guard.upsert(new_slot)?;
+
+        Ok(())
    }
-
-    let tenant_path = conf.tenant_path(&tenant_id);
-
-    let new_slot = match &new_location_config.mode {
-        LocationMode::Secondary(_) => {
-            let tenant_path = conf.tenant_path(&tenant_id);
-            // Directory doesn't need to be fsync'd because if we crash it can
-            // safely be recreated next time this tenant location is configured.
-            unsafe_create_dir_all(&tenant_path)
-                .await
-                .with_context(|| format!("Creating {tenant_path}"))?;
-
-            Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
-                .await
-                .map_err(SetNewTenantConfigError::Persist)?;
-
-            TenantSlot::Secondary
-        }
-        LocationMode::Attached(_attach_config) => {
-            let timelines_path = conf.timelines_path(&tenant_id);
-
-            // Directory doesn't need to be fsync'd because we do not depend on
-            // it to exist after crashes: it may be recreated when tenant is
-            // re-attached, see https://github.com/neondatabase/neon/issues/5550
-            unsafe_create_dir_all(&timelines_path)
-                .await
-                .with_context(|| format!("Creating {timelines_path}"))?;
-
-            Tenant::persist_tenant_config(conf, &tenant_id, &new_location_config)
-                .await
-                .map_err(SetNewTenantConfigError::Persist)?;
-
-            let tenant = tenant_spawn(
-                conf,
-                tenant_id,
-                &tenant_path,
-                TenantSharedResources {
-                    broker_client,
-                    remote_storage,
-                    deletion_queue_client,
-                },
-                AttachedTenantConf::try_from(new_location_config)?,
-                None,
-                &TENANTS,
-                SpawnMode::Normal,
-                ctx,
-            )?;
-
-            TenantSlot::Attached(tenant)
-        }
-    };
-
-    slot_guard.upsert(new_slot)?;
-
-    Ok(())
 }

 #[derive(Debug, thiserror::Error)]
@@ -1430,9 +1617,6 @@ pub struct SlotGuard {
    _completion: utils::completion::Completion,
 }

-unsafe impl Send for SlotGuard {}
-unsafe impl Sync for SlotGuard {}
-
 impl SlotGuard {
    fn new(
        tenant_id: TenantId,
@@ -1539,14 +1723,7 @@ impl SlotGuard {
    /// is responsible for protecting
    fn old_value_is_shutdown(&self) -> bool {
        match self.old_value.as_ref() {
-            Some(TenantSlot::Attached(tenant)) => {
-                // TODO: PR #5711 will add a gate that enables properly checking that
-                // shutdown completed.
-                matches!(
-                    tenant.current_state(),
-                    TenantState::Stopping { .. } | TenantState::Broken { .. }
-                )
-            }
+            Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
            Some(TenantSlot::Secondary) => {
                // TODO: when adding secondary mode tenants, this will check for shutdown
                // in the same way that we do for `Tenant` above
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -609,6 +609,49 @@ impl Drop for DeltaLayerWriter {
    }
 }

+impl DeltaLayer {
+    /// Patches the summary stored in block 0 of the delta layer file at `path`, replacing its
+    /// tenant and timeline ids with `new_tenant` / `new_timeline`.  All other summary fields
+    /// are carried over unchanged.
+    ///
+    /// If this function returns an error, the file at `path` must be assumed corrupt.
+    pub(crate) async fn rewrite_tenant_timeline(
+        path: &Utf8Path,
+        new_tenant: TenantId,
+        new_timeline: TimelineId,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut open_options = std::fs::OpenOptions::new();
+        open_options.read(true).write(true);
+        let reader = FileBlockReader::new(
+            VirtualFile::open_with_options(path, &open_options)
+                .await
+                .with_context(|| format!("Failed to open file '{}'", path))?,
+        );
+
+        // Read and decode the existing summary before taking the file back for writing.
+        let first_blk = reader.read_blk(0, ctx).await?;
+        let old_summary = Summary::des_prefix(first_blk.as_ref())?;
+        let mut layer_file = reader.file;
+        anyhow::ensure!(
+            old_summary.magic == DELTA_FILE_MAGIC,
+            "File '{}' is not a delta layer",
+            path
+        );
+
+        let patched_summary = Summary {
+            tenant_id: new_tenant,
+            timeline_id: new_timeline,
+            ..old_summary
+        };
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&patched_summary, &mut buf)?;
+        // Per the original note, ImageLayerWriterInner merely warn!()s when the summary
+        // spills past the inline buffer; here it is treated as a hard error.
+        anyhow::ensure!(
+            !buf.spilled(),
+            "Used more than one page size for summary buffer: {}",
+            buf.len()
+        );
+
+        // Overwrite the old summary in place, starting at the beginning of the file.
+        layer_file.seek(SeekFrom::Start(0)).await?;
+        layer_file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl DeltaLayerInner {
    pub(super) async fn load(
        path: &Utf8Path,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -294,6 +294,49 @@ impl ImageLayer {
    }
 }

+impl ImageLayer {
+    /// Rewrites tenant/timeline IDs in the summary; on error, assume the file at `path` is corrupt.
+    pub(crate) async fn rewrite_tenant_timeline(
+        path: &Utf8Path,
+        new_tenant: TenantId,
+        new_timeline: TimelineId,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let file = VirtualFile::open_with_options(
+            path,
+            &*std::fs::OpenOptions::new().read(true).write(true),
+        )
+        .await
+        .with_context(|| format!("Failed to open file '{}'", path))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0, ctx).await?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let mut file = file.file;
+        if actual_summary.magic != IMAGE_FILE_MAGIC {
+            bail!("File '{}' is not an image layer", path);
+        }
+        let new_summary = Summary {
+            tenant_id: new_tenant,
+            timeline_id: new_timeline,
+            ..actual_summary
+        };
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&new_summary, &mut buf)?;
+        if buf.spilled() {
+            // The code in ImageLayerWriterInner just warn!()s for this.
+            // It should probably error out as well.
+            anyhow::bail!(
+                "Used more than one page size for summary buffer: {}",
+                buf.len()
+            );
+        }
+        file.seek(SeekFrom::Start(0)).await?;
+        file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl ImageLayerInner {
    pub(super) async fn load(
        path: &Utf8Path,
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -596,21 +596,21 @@ trait CloseFileDescriptors: CommandExt {

 impl<C: CommandExt> CloseFileDescriptors for C {
    fn close_fds(&mut self) -> &mut Command {
+        // SAFETY: Code executed inside pre_exec should have async-signal-safety,
+        // which means it should be safe to execute inside a signal handler.
+        // The precise meaning depends on platform. See `man signal-safety`
+        // for the linux definition.
+        //
+        // The set_fds_cloexec_threadsafe function is documented to be
+        // async-signal-safe.
+        //
+        // Aside from this function, the rest of the code is re-entrant and
+        // doesn't make any syscalls. We're just passing constants.
+        //
+        // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
+        // which is not async-signal-safe. Be careful.
        unsafe {
            self.pre_exec(move || {
-                // SAFETY: Code executed inside pre_exec should have async-signal-safety,
-                // which means it should be safe to execute inside a signal handler.
-                // The precise meaning depends on platform. See `man signal-safety`
-                // for the linux definition.
-                //
-                // The set_fds_cloexec_threadsafe function is documented to be
-                // async-signal-safe.
-                //
-                // Aside from this function, the rest of the code is re-entrant and
-                // doesn't make any syscalls. We're just passing constants.
-                //
-                // NOTE: It's easy to indirectly cause a malloc or lock a mutex,
-                // which is not async-signal-safe. Be careful.
                close_fds::set_fds_cloexec_threadsafe(3, &[]);
                Ok(())
            })
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,7 +19,10 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
+#include "storage/lwlock.h"
+#include "storage/ipc.h"
 #include "c.h"
+#include "postmaster/interrupt.h"

 #include "libpq-fe.h"
 #include "libpq/pqformat.h"
@@ -61,23 +64,63 @@ int			flush_every_n_requests = 8;
 int			n_reconnect_attempts = 0;
 int			max_reconnect_attempts = 60;

+#define MAX_PAGESERVER_CONNSTRING_SIZE 256
+
+typedef struct
+{
+    LWLockId lock;
+    pg_atomic_uint64 update_counter;
+    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+} PagestoreShmemState;
+
+#if PG_VERSION_NUM >= 150000
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+static void pagestore_shmem_request(void);
+#endif
+static shmem_startup_hook_type prev_shmem_startup_hook;
+static PagestoreShmemState *pagestore_shared; /* NULL until PagestoreShmemInit() has run */
+static uint64 pagestore_local_counter = 0; /* update_counter value at the last ReloadConnstring() */
+static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+
 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);

-
-static pqsigfunc	 prev_signal_handler;
+static bool
+CheckPageserverConnstring(char **newval, void **extra, GucSource source)
+{
+    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+}

 static void
-pageserver_sighup_handler(SIGNAL_ARGS)
+AssignPageserverConnstring(const char *newval, void *extra)
 {
-	if (prev_signal_handler)
-	{
-        	prev_signal_handler(postgres_signal_arg);
-	}
-	neon_log(LOG, "Received SIGHUP, disconnecting pageserver. New pageserver connstring is %s", page_server_connstring);
-	pageserver_disconnect();
+    if(!pagestore_shared)
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
+    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
+    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
+    LWLockRelease(pagestore_shared->lock);
+}
+
+static bool
+CheckConnstringUpdated(void)
+{
+    if(!pagestore_shared)
+        return false; /* shared state not set up: nothing to compare against */
+    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+}
+
+static void
+ReloadConnstring(void)
+{
+    if(!pagestore_shared)
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
+    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
+    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
@@ -91,6 +134,11 @@ pageserver_connect(int elevel)

 	Assert(!connected);

+        if(CheckConnstringUpdated())
+        {
+            ReloadConnstring();
+        }
+
 	/*
 	 * Connect using the connection string we got from the
 	 * neon.pageserver_connstring GUC. If the NEON_AUTH_TOKEN environment
@@ -110,7 +158,7 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = page_server_connstring;
+	values[n] = local_pageserver_connstring;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
@@ -254,6 +302,12 @@ pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;

+        if(CheckConnstringUpdated())
+        {
+            pageserver_disconnect();
+            ReloadConnstring();
+        }
+
 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
@@ -274,6 +328,7 @@ pageserver_send(NeonRequest * request)
 	{
 		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
+			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
 			pg_usleep(RECONNECT_INTERVAL_USEC);
 		}
@@ -391,7 +446,8 @@ pageserver_flush(void)
 	return true;
 }

-page_server_api api = {
+page_server_api api =
+{
 	.send = pageserver_send,
 	.flush = pageserver_flush,
 	.receive = pageserver_receive
@@ -405,12 +461,72 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }

+static Size
+PagestoreShmemSize(void)
+{
+    return sizeof(PagestoreShmemState);
+}
+
+static bool
+PagestoreShmemInit(void)
+{
+    bool found;
+    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+    pagestore_shared = ShmemInitStruct("libpagestore shared state",
+                                       PagestoreShmemSize(),
+                                       &found);
+    if(!found)
+    {
+        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
+        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
+        AssignPageserverConnstring(page_server_connstring, NULL);
+    }
+    LWLockRelease(AddinShmemInitLock);
+    return found;
+}
+
+static void
+pagestore_shmem_startup_hook(void)
+{
+    if(prev_shmem_startup_hook)
+        prev_shmem_startup_hook();
+
+    PagestoreShmemInit();
+}
+
+static void
+pagestore_shmem_request(void)
+{
+#if PG_VERSION_NUM >= 150000
+    if(prev_shmem_request_hook)
+        prev_shmem_request_hook();
+#endif
+
+    RequestAddinShmemSpace(PagestoreShmemSize());
+    RequestNamedLWLockTranche("neon_libpagestore", 1);
+}
+
+static void
+pagestore_prepare_shmem(void)
+{
+#if PG_VERSION_NUM >= 150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = pagestore_shmem_request;
+#else
+        pagestore_shmem_request();
+#endif
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = pagestore_shmem_startup_hook;
+}
+
 /*
 * Module initialization function
 */
 void
 pg_init_libpagestore(void)
 {
+        pagestore_prepare_shmem();
+
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
@@ -418,7 +534,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
-							   NULL, NULL, NULL);
+							   CheckPageserverConnstring, AssignPageserverConnstring, NULL);

 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
@@ -499,7 +615,5 @@ pg_init_libpagestore(void)
 		redo_read_buffer_filter = neon_redo_read_buffer_filter;
 	}

-        prev_signal_handler = pqsignal(SIGHUP, pageserver_sighup_handler);
-
 	lfc_init();
 }
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,6 +1,8 @@
 //! User credentials used in authentication.

-use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
+use crate::{
+    auth::password_hack::parse_endpoint_param, error::UserFacingError, proxy::neon_options,
+};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use std::collections::HashSet;
@@ -38,6 +40,8 @@ pub struct ClientCredentials<'a> {
    pub user: &'a str,
    // TODO: this is a severe misnomer! We should think of a new name ASAP.
    pub project: Option<String>,
+
+    pub cache_key: String,
 }

 impl ClientCredentials<'_> {
@@ -53,6 +57,7 @@ impl<'a> ClientCredentials<'a> {
        ClientCredentials {
            user: "",
            project: None,
+            cache_key: "".to_string(),
        }
    }

@@ -120,7 +125,17 @@ impl<'a> ClientCredentials<'a> {

        info!(user, project = project.as_deref(), "credentials");

-        Ok(Self { user, project })
+        let cache_key = format!(
+            "{}{}",
+            project.as_deref().unwrap_or(""),
+            neon_options(params).unwrap_or("".to_string())
+        );
+
+        Ok(Self {
+            user,
+            project,
+            cache_key,
+        })
    }
 }

@@ -176,6 +191,7 @@ mod tests {
        let creds = ClientCredentials::parse(&options, sni, common_names)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));
+        assert_eq!(creds.cache_key, "foo");

        Ok(())
    }
@@ -303,4 +319,23 @@ mod tests {
            _ => panic!("bad error: {err:?}"),
        }
    }
+
+    #[test]
+    fn parse_neon_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "neon_lsn:0/2 neon_endpoint_type:read_write"),
+        ]);
+
+        let sni = Some("project.localhost");
+        let common_names = Some(["localhost".into()].into());
+        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        assert_eq!(creds.project.as_deref(), Some("project"));
+        assert_eq!(
+            creds.cache_key,
+            "projectneon_endpoint_type:read_write neon_lsn:0/2"
+        );
+
+        Ok(())
+    }
 }
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -3,6 +3,7 @@ use crate::{
    cancellation::CancelClosure,
    console::errors::WakeComputeError,
    error::{io_error, UserFacingError},
+    proxy::is_neon_param,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -278,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
    #[allow(unstable_name_collisions)]
    let options: String = params
        .options_raw()?
-        .filter(|opt| parse_endpoint_param(opt).is_none())
+        .filter(|opt| parse_endpoint_param(opt).is_none() && !is_neon_param(opt))
        .intersperse(" ") // TODO: use impl from std once it's stabilized
        .collect();

@@ -313,5 +314,11 @@ mod tests {

        let params = StartupMessageParams::new([("options", "project = foo")]);
        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+
+        let params = StartupMessageParams::new([(
+            "options",
+            "project = foo neon_endpoint_type:read_write   neon_lsn:0/2",
+        )]);
+        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
    }
 }
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -178,6 +178,7 @@ pub struct ConsoleReqExtra<'a> {
    pub session_id: uuid::Uuid,
    /// Name of client application, if set.
    pub application_name: Option<&'a str>,
+    pub options: Option<&'a str>,
 }

 /// Auth secret which is managed by the cloud.
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -99,6 +99,7 @@ impl Api {
                .query(&[
                    ("application_name", extra.application_name),
                    ("project", Some(project)),
+                    ("options", extra.options),
                ])
                .build()?;

@@ -151,7 +152,7 @@ impl super::Api for Api {
        extra: &ConsoleReqExtra<'_>,
        creds: &ClientCredentials,
    ) -> Result<CachedNodeInfo, WakeComputeError> {
-        let key = creds.project().expect("impossible");
+        let key: &str = &creds.cache_key;

        // Every time we do a wakeup http request, the compute node will stay up
        // for some time (highly depends on the console's scale-to-zero policy);
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
+
 use std::convert::Infallible;

 use anyhow::{bail, Context};
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -15,10 +15,12 @@ use crate::{
 use anyhow::{bail, Context};
 use async_trait::async_trait;
 use futures::TryFutureExt;
+use itertools::Itertools;
 use metrics::{exponential_buckets, register_int_counter_vec, IntCounterVec};
-use once_cell::sync::Lazy;
+use once_cell::sync::{Lazy, OnceCell};
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use prometheus::{register_histogram_vec, HistogramVec};
+use regex::Regex;
 use std::{error::Error, io, ops::ControlFlow, sync::Arc, time::Instant};
 use tokio::{
    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
@@ -881,9 +883,12 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
            allow_self_signed_compute,
        } = self;

+        let console_options = neon_options(params);
+
        let extra = console::ConsoleReqExtra {
            session_id, // aka this connection's id
            application_name: params.get("application_name"),
+            options: console_options.as_deref(),
        };

        let mut latency_timer = LatencyTimer::new(mode.protocol_label());
@@ -945,3 +950,27 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        proxy_pass(stream, node.stream, &aux).await
    }
 }
+
+/// Joins the `neon_*:` entries of the startup `options` into one space-separated
+/// string; `None` if `options_raw()` is `None` or no entry matches.
+pub fn neon_options(params: &StartupMessageParams) -> Option<String> {
+    #[allow(unstable_name_collisions)]
+    let joined: String = params
+        .options_raw()?
+        .filter(|opt| is_neon_param(opt))
+        .sorted() // sorted so that equal option sets produce the same cache key
+        .intersperse(" ") // TODO: use impl from std once it's stabilized
+        .collect();
+    if joined.is_empty() {
+        None
+    } else {
+        Some(joined)
+    }
+}
+
+/// Returns true if `bytes` starts with `neon_<word chars>:`, i.e. looks like a neon option.
+pub fn is_neon_param(bytes: &str) -> bool {
+    static RE: OnceCell<Regex> = OnceCell::new();
+    let re = RE.get_or_init(|| Regex::new(r"^neon_\w+:").unwrap());
+    re.is_match(bytes)
+}
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -440,6 +440,7 @@ fn helper_create_connect_info(
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some("TEST"),
+        options: None,
    };
    let creds = auth::BackendType::Test(mechanism);
    (cache, extra, creds)
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -22,7 +22,10 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};

 use crate::{
    auth, console,
-    proxy::{LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER, NUM_DB_CONNECTIONS_OPENED_COUNTER},
+    proxy::{
+        neon_options, LatencyTimer, NUM_DB_CONNECTIONS_CLOSED_COUNTER,
+        NUM_DB_CONNECTIONS_OPENED_COUNTER,
+    },
    usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};
@@ -41,6 +44,7 @@ pub struct ConnInfo {
    pub dbname: String,
    pub hostname: String,
    pub password: String,
+    pub options: Option<String>,
 }

 impl ConnInfo {
@@ -401,26 +405,25 @@ async fn connect_to_compute(
    let tls = config.tls_config.as_ref();
    let common_names = tls.and_then(|tls| tls.common_names.clone());

-    let credential_params = StartupMessageParams::new([
+    let params = StartupMessageParams::new([
        ("user", &conn_info.username),
        ("database", &conn_info.dbname),
        ("application_name", APP_NAME),
+        ("options", conn_info.options.as_deref().unwrap_or("")),
    ]);

    let creds = config
        .auth_backend
        .as_ref()
-        .map(|_| {
-            auth::ClientCredentials::parse(
-                &credential_params,
-                Some(&conn_info.hostname),
-                common_names,
-            )
-        })
+        .map(|_| auth::ClientCredentials::parse(&params, Some(&conn_info.hostname), common_names))
        .transpose()?;
+
+    let console_options = neon_options(&params);
+
    let extra = console::ConsoleReqExtra {
        session_id: uuid::Uuid::new_v4(),
        application_name: Some(APP_NAME),
+        options: console_options.as_deref(),
    };

    let node_info = creds
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -174,11 +174,23 @@ fn get_conn_info(
        }
    }

+    let pairs = connection_url.query_pairs();
+
+    let mut options = Option::None;
+
+    for (key, value) in pairs {
+        if key == "options" {
+            options = Some(value.to_string());
+            break;
+        }
+    }
+
    Ok(ConnInfo {
        username: username.to_owned(),
        dbname: dbname.to_owned(),
        hostname: hostname.to_owned(),
        password: password.to_owned(),
+        options,
    })
 }

--- a/s3_scrubber/src/lib.rs
+++ b/s3_scrubber/src/lib.rs
@@ -1,3 +1,5 @@
+#![deny(unsafe_code)]
+#![deny(clippy::undocumented_unsafe_blocks)]
 pub mod checks;
 pub mod cloud_admin_api;
 pub mod garbage;
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -38,7 +38,7 @@ use safekeeper::{http, WAL_REMOVER_RUNTIME};
 use safekeeper::{remove_wal, WAL_BACKUP_RUNTIME};
 use safekeeper::{wal_backup, HTTP_RUNTIME};
 use storage_broker::DEFAULT_ENDPOINT;
-use utils::auth::{JwtAuth, Scope};
+use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
 use utils::{
    id::NodeId,
    logging::{self, LogFormat},
@@ -251,10 +251,9 @@ async fn main() -> anyhow::Result<()> {
            None
        }
        Some(path) => {
-            info!("loading http auth JWT key from {path}");
-            Some(Arc::new(
-                JwtAuth::from_key_path(path).context("failed to load the auth key")?,
-            ))
+            info!("loading http auth JWT key(s) from {path}");
+            let jwt_auth = JwtAuth::from_key_path(path).context("failed to load the auth key")?;
+            Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
        }
    };

--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -30,7 +30,7 @@ use crate::timelines_global_map::TimelineDeleteForceResult;
 use crate::GlobalTimelines;
 use crate::SafeKeeperConf;
 use utils::{
-    auth::JwtAuth,
+    auth::SwappableJwtAuth,
    http::{
        endpoint::{self, auth_middleware, check_permission_with},
        error::ApiError,
@@ -428,8 +428,11 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder<hyper::Body, ApiError>
            if ALLOWLIST_ROUTES.contains(request.uri()) {
                None
            } else {
-                // Option<Arc<JwtAuth>> is always provided as data below, hence unwrap().
-                request.data::<Option<Arc<JwtAuth>>>().unwrap().as_deref()
+                // Option<Arc<SwappableJwtAuth>> is always provided as data below, hence unwrap().
+                request
+                    .data::<Option<Arc<SwappableJwtAuth>>>()
+                    .unwrap()
+                    .as_deref()
            }
        }))
    }
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -1,3 +1,4 @@
+#![deny(clippy::undocumented_unsafe_blocks)]
 use camino::Utf8PathBuf;
 use once_cell::sync::Lazy;
 use remote_storage::RemoteStorageConfig;
@@ -6,7 +7,10 @@ use tokio::runtime::Runtime;
 use std::time::Duration;
 use storage_broker::Uri;

-use utils::id::{NodeId, TenantId, TenantTimelineId};
+use utils::{
+    auth::SwappableJwtAuth,
+    id::{NodeId, TenantId, TenantTimelineId},
+};

 mod auth;
 pub mod broker;
@@ -69,7 +73,7 @@ pub struct SafeKeeperConf {
    pub wal_backup_enabled: bool,
    pub pg_auth: Option<Arc<JwtAuth>>,
    pub pg_tenant_only_auth: Option<Arc<JwtAuth>>,
-    pub http_auth: Option<Arc<JwtAuth>>,
+    pub http_auth: Option<Arc<SwappableJwtAuth>>,
    pub current_thread_runtime: bool,
 }

--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -111,7 +111,7 @@ impl WalReceivers {
            .count()
    }

-    /// Unregister walsender.
+    /// Unregister walreceiver.
    fn unregister(self: &Arc<WalReceivers>, id: WalReceiverId) {
        let mut shared = self.mutex.lock();
        shared.slots[id] = None;
@@ -138,8 +138,8 @@ pub enum WalReceiverStatus {
    Streaming,
 }

-/// Scope guard to access slot in WalSenders registry and unregister from it in
-/// Drop.
+/// Scope guard to access slot in WalReceivers registry and unregister from
+/// it in Drop.
 pub struct WalReceiverGuard {
    id: WalReceiverId,
    walreceivers: Arc<WalReceivers>,
--- a/test_runner/duplicate_tenant.py
+++ b/test_runner/duplicate_tenant.py
@@ -0,0 +1,43 @@
+# Usage from top of repo (args: <initial_tenant_id> <ncopies> <numthreads>):
+#  poetry run python3 test_runner/duplicate_tenant.py b97965931096047b2d54958756baee7b 10 4
+from queue import Queue
+import sys
+import threading
+
+import requests
+from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.types import TenantId
+
+initial_tenant = sys.argv[1]
+ncopies = int(sys.argv[2])
+numthreads = int(sys.argv[3])
+
+
+# class DuckTypedNeonEnv:
+#     pass
+
+
+# cli = NeonCli(DuckTypedNeonEnv())
+
+q = Queue()
+for i in range(0, ncopies):
+    q.put(i)
+
+for i in range(0, numthreads):
+    q.put(None)
+
+
+def create():
+    """Worker: duplicate `initial_tenant` once per queue item until a `None` sentinel arrives."""
+    # One sentinel is enqueued per thread, so every worker terminates.
+    while q.get() is not None:
+        new_tenant = TenantId.generate()
+        res = requests.post(
+            f"http://localhost:9898/v1/tenant/{initial_tenant}/duplicate",
+            json={"new_tenant_id": str(new_tenant)},
+        )
+        res.raise_for_status()
+
+
+for i in range(0, numthreads):
+    threading.Thread(target=create).start()
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -361,7 +361,6 @@ class PgProtocol:

@dataclass
 class AuthKeys:
-    pub: str
    priv: str

    def generate_token(self, *, scope: str, **token_data: str) -> str:
@@ -877,9 +876,31 @@ class NeonEnv:

    @cached_property
    def auth_keys(self) -> AuthKeys:
-        pub = (Path(self.repo_dir) / "auth_public_key.pem").read_text()
        priv = (Path(self.repo_dir) / "auth_private_key.pem").read_text()
-        return AuthKeys(pub=pub, priv=priv)
+        return AuthKeys(priv=priv)
+
+    def regenerate_keys_at(self, privkey_path: Path, pubkey_path: Path):
+        # compare generate_auth_keys() in local_env.rs
+        subprocess.run(
+            ["openssl", "genpkey", "-algorithm", "ed25519", "-out", privkey_path],
+            cwd=self.repo_dir,
+            check=True,
+        )
+
+        subprocess.run(
+            [
+                "openssl",
+                "pkey",
+                "-in",
+                privkey_path,
+                "-pubout",
+                "-out",
+                pubkey_path,
+            ],
+            cwd=self.repo_dir,
+            check=True,
+        )
+        del self.auth_keys

    def generate_endpoint_id(self) -> str:
        """
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -189,6 +189,10 @@ class PageserverHttpClient(requests.Session):
        assert res_json is None
        return res_json

+    def reload_auth_validation_keys(self):
+        res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
+        self.verbose_error(res)
+
    def tenant_list(self) -> List[Dict[Any, Any]]:
        res = self.get(f"http://localhost:{self.port}/v1/tenant")
        self.verbose_error(res)
@@ -215,6 +219,25 @@ class PageserverHttpClient(requests.Session):
        assert isinstance(new_tenant_id, str)
        return TenantId(new_tenant_id)

+    def tenant_duplicate(
+        self, src_tenant_id: TenantId, new_tenant_id: TenantId, conf: Optional[Dict[str, Any]] = None
+    ) -> TenantId:
+        if conf is not None:
+            assert "new_tenant_id" not in conf.keys()
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{src_tenant_id}/duplicate",
+            json={
+                "new_tenant_id": str(new_tenant_id),
+                **(conf or {}),
+            },
+        )
+        self.verbose_error(res)
+        if res.status_code == 409:
+            raise Exception(f"could not create tenant: already exists for id {new_tenant_id}")
+        new_tenant_id = res.json()
+        assert isinstance(new_tenant_id, str)
+        return TenantId(new_tenant_id)
+
    def tenant_attach(
        self,
        tenant_id: TenantId,
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -1,12 +1,35 @@
+import os
 from contextlib import closing
+from pathlib import Path

 import psycopg2
 import pytest
-from fixtures.neon_fixtures import NeonEnvBuilder, PgProtocol
-from fixtures.pageserver.http import PageserverApiException
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    PgProtocol,
+)
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.types import TenantId, TimelineId


+def assert_client_authorized(env: NeonEnv, http_client: PageserverHttpClient):
+    http_client.timeline_create(
+        pg_version=env.pg_version,
+        tenant_id=env.initial_tenant,
+        new_timeline_id=TimelineId.generate(),
+        ancestor_timeline_id=env.initial_timeline,
+    )
+
+
+def assert_client_not_authorized(env: NeonEnv, http_client: PageserverHttpClient):
+    with pytest.raises(
+        PageserverApiException,
+        match="Unauthorized: malformed jwt token",
+    ):
+        assert_client_authorized(env, http_client)
+
+
 def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True
    env = neon_env_builder.init_start()
@@ -27,30 +50,16 @@ def test_pageserver_auth(neon_env_builder: NeonEnvBuilder):
    ps.safe_psql("set FOO", password=pageserver_token)

    # tenant can create branches
-    tenant_http_client.timeline_create(
-        pg_version=env.pg_version,
-        tenant_id=env.initial_tenant,
-        new_timeline_id=TimelineId.generate(),
-        ancestor_timeline_id=env.initial_timeline,
-    )
+    assert_client_authorized(env, tenant_http_client)
+
    # console can create branches for tenant
-    pageserver_http_client.timeline_create(
-        pg_version=env.pg_version,
-        tenant_id=env.initial_tenant,
-        new_timeline_id=TimelineId.generate(),
-        ancestor_timeline_id=env.initial_timeline,
-    )
+    assert_client_authorized(env, pageserver_http_client)

    # fail to create branch using token with different tenant_id
    with pytest.raises(
        PageserverApiException, match="Forbidden: Tenant id mismatch. Permission denied"
    ):
-        invalid_tenant_http_client.timeline_create(
-            pg_version=env.pg_version,
-            tenant_id=env.initial_tenant,
-            new_timeline_id=TimelineId.generate(),
-            ancestor_timeline_id=env.initial_timeline,
-        )
+        assert_client_authorized(env, invalid_tenant_http_client)

    # create tenant using management token
    pageserver_http_client.tenant_create(TenantId.generate())
@@ -82,6 +91,94 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder):
            assert cur.fetchone() == (5000050000,)


+def test_pageserver_multiple_keys(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
+
+    pageserver_token_old = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
+
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # This test is to ensure that the pageserver supports multiple keys.
+    # The neon_local tool generates one key pair at a hardcoded path by default.
+    # As a preparation for our test, move the public key of the key pair into a
+    # directory at the same location as the hardcoded path by:
+    # 1. moving the file at `configured_pub_key_path` to a temporary location
+    # 2. creating a new directory at `configured_pub_key_path`
+    # 3. moving the file from the temporary location into the newly created directory
+    configured_pub_key_path = Path(env.repo_dir) / "auth_public_key.pem"
+    os.rename(configured_pub_key_path, Path(env.repo_dir) / "auth_public_key.pem.file")
+    os.mkdir(configured_pub_key_path)
+    os.rename(
+        Path(env.repo_dir) / "auth_public_key.pem.file",
+        configured_pub_key_path / "auth_public_key_old.pem",
+    )
+
+    # Add a new key pair
+    # This invalidates env.auth_keys and makes them be regenerated
+    env.regenerate_keys_at(
+        Path("auth_private_key.pem"), Path("auth_public_key.pem/auth_public_key_new.pem")
+    )
+
+    # Reload the keys on the pageserver side
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # We can continue doing things using the old token
+    assert_client_authorized(env, pageserver_http_client_old)
+
+    pageserver_token_new = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
+
+    # The new token also works
+    assert_client_authorized(env, pageserver_http_client_new)
+
+    # Remove the old public key and reload
+    os.remove(Path(env.repo_dir) / "auth_public_key.pem" / "auth_public_key_old.pem")
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # Requests with the old token now fail, but the new token still works
+    assert_client_not_authorized(env, pageserver_http_client_old)
+    assert_client_authorized(env, pageserver_http_client_new)
+
+
+def test_pageserver_key_reload(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+    env.pageserver.allowed_errors.append(".*Unauthorized: malformed jwt token.*")
+
+    pageserver_token_old = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_old = env.pageserver.http_client(pageserver_token_old)
+
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # Regenerate the keys
+    env.regenerate_keys_at(Path("auth_private_key.pem"), Path("auth_public_key.pem"))
+
+    # Reload the keys on the pageserver side
+    pageserver_http_client_old.reload_auth_validation_keys()
+
+    # Next attempt fails as we use the old auth token
+    with pytest.raises(
+        PageserverApiException,
+        match="Unauthorized: malformed jwt token",
+    ):
+        pageserver_http_client_old.reload_auth_validation_keys()
+
+    # same goes for attempts trying to create a timeline
+    assert_client_not_authorized(env, pageserver_http_client_old)
+
+    pageserver_token_new = env.auth_keys.generate_pageserver_token()
+    pageserver_http_client_new = env.pageserver.http_client(pageserver_token_new)
+
+    # timeline creation works with the new token
+    assert_client_authorized(env, pageserver_http_client_new)
+
+    # reloading also works with the new token
+    pageserver_http_client_new.reload_auth_validation_keys()
+
+
@pytest.mark.parametrize("auth_enabled", [False, True])
 def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    neon_env_builder.auth_enabled = auth_enabled
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -1,9 +1,13 @@
+import asyncio
+
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnvBuilder
 from fixtures.remote_storage import RemoteStorageKind


 def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
+    num_connections = 3
+
    neon_env_builder.num_pageservers = 2
    neon_env_builder.enable_pageserver_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
@@ -16,15 +20,24 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    alt_pageserver_id = env.pageservers[1].id
    env.pageservers[1].tenant_attach(env.initial_tenant)

-    pg_conn = endpoint.connect()
-    cur = pg_conn.cursor()
+    pg_conns = [endpoint.connect() for i in range(num_connections)]
+    curs = [pg_conn.cursor() for pg_conn in pg_conns]
+
+    def execute(statement: str):
+        for cur in curs:
+            cur.execute(statement)
+
+    def fetchone():
+        results = [cur.fetchone() for cur in curs]
+        assert all(result == results[0] for result in results)
+        return results[0]

    # Create table, and insert some rows. Make it big enough that it doesn't fit in
    # shared_buffers, otherwise the SELECT after restart will just return answer
    # from shared_buffers without hitting the page server, which defeats the point
    # of this test.
-    cur.execute("CREATE TABLE foo (t text)")
-    cur.execute(
+    curs[0].execute("CREATE TABLE foo (t text)")
+    curs[0].execute(
        """
        INSERT INTO foo
            SELECT 'long string to consume some space' || g
@@ -33,25 +46,25 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    )

    # Verify that the table is larger than shared_buffers
-    cur.execute(
+    curs[0].execute(
        """
        select setting::int * pg_size_bytes(unit) as shared_buffers, pg_relation_size('foo') as tbl_size
        from pg_settings where name = 'shared_buffers'
        """
    )
-    row = cur.fetchone()
+    row = curs[0].fetchone()
    assert row is not None
    log.info(f"shared_buffers is {row[0]}, table size {row[1]}")
    assert int(row[0]) < int(row[1])

-    cur.execute("SELECT count(*) FROM foo")
-    assert cur.fetchone() == (100000,)
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)

    endpoint.reconfigure(pageserver_id=alt_pageserver_id)

    # Verify that the neon.pageserver_connstring GUC is set to the correct thing
-    cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
-    connstring = cur.fetchone()
+    execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
+    connstring = fetchone()
    assert connstring is not None
    expected_connstring = f"postgresql://no_user:@localhost:{env.pageservers[1].service_port.pg}"
    assert expected_connstring == expected_connstring
@@ -60,5 +73,45 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
        0
    ].stop()  # Stop the old pageserver just to make sure we're reading from the new one

-    cur.execute("SELECT count(*) FROM foo")
-    assert cur.fetchone() == (100000,)
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)
+
+    # Try failing back, and this time we will stop the current pageserver before reconfiguring
+    # the endpoint.  Whereas the previous reconfiguration was like a healthy migration, this
+    # is more like what happens in an unexpected pageserver failure.
+    env.pageservers[0].start()
+    env.pageservers[1].stop()
+
+    endpoint.reconfigure(pageserver_id=env.pageservers[0].id)
+
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)
+
+    env.pageservers[0].stop()
+    env.pageservers[1].start()
+
+    # Test a (former) bug where a child process spins without updating its connection string
+    # by executing a query separately. This query will hang until we issue the reconfigure.
+    async def reconfigure_async():
+        await asyncio.sleep(
+            1
+        )  # Sleep for 1 second just to make sure we actually started our count(*) query
+        endpoint.reconfigure(pageserver_id=env.pageservers[1].id)
+
+    def execute_count():
+        execute("SELECT count(*) FROM FOO")
+
+    async def execute_and_reconfigure():
+        task_exec = asyncio.to_thread(execute_count)
+        task_reconfig = asyncio.create_task(reconfigure_async())
+        await asyncio.gather(
+            task_exec,
+            task_reconfig,
+        )
+
+    asyncio.run(execute_and_reconfigure())
+    assert fetchone() == (100000,)
+
+    # One final check that nothing hangs
+    execute("SELECT count(*) FROM foo")
+    assert fetchone() == (100000,)
--- a/test_runner/regress/test_pageserver_generations.py
+++ b/test_runner/regress/test_pageserver_generations.py
@@ -366,11 +366,17 @@ def test_deletion_queue_recovery(
    assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0

    if validate_before == ValidateBefore.VALIDATE:
+        # At this point, one or more DeletionLists have been written.  We have set a failpoint
+        # to prevent them successfully executing, but we want to see them get validated.
+        #
+        # We await _some_ validations instead of _all_ validations, because our execution failpoint
+        # will prevent validation proceeding for any but the first DeletionList.  Usually the workload
+        # just generates one, but if it generates two due to timing, then we must not expect that the
+        # second one will be validated.
+        def assert_some_validations():
+            assert get_deletion_queue_validated(ps_http) > 0

-        def assert_validation_complete():
-            assert get_deletion_queue_submitted(ps_http) == get_deletion_queue_validated(ps_http)
-
-        wait_until(20, 1, assert_validation_complete)
+        wait_until(20, 1, assert_some_validations)

        # The validated keys statistic advances before the header is written, so we
        # also wait to see the header hit the disk: this seems paranoid but the race
@@ -380,6 +386,11 @@ def test_deletion_queue_recovery(

        wait_until(20, 1, assert_header_written)

+        # If we will lose attachment, then our expectation on restart is that only the ones
+        # we already validated will execute.  Act like only those were present in the queue.
+        if keep_attachment == KeepAttachment.LOSE:
+            before_restart_depth = get_deletion_queue_validated(ps_http)
+
    log.info(f"Restarting pageserver with {before_restart_depth} deletions enqueued")
    env.pageserver.stop(immediate=True)

@@ -402,11 +413,13 @@ def test_deletion_queue_recovery(
    ps_http.deletion_queue_flush(execute=True)
    wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0))

-    if keep_attachment == KeepAttachment.KEEP or validate_before == ValidateBefore.VALIDATE:
+    if keep_attachment == KeepAttachment.KEEP:
        # - If we kept the attachment, then our pre-restart deletions should execute
        #   because on re-attach they were from the immediately preceding generation
-        # - If we validated before restart, then the deletions should execute because the
-        #   deletion queue header records a validated deletion list sequence number.
+        assert get_deletion_queue_executed(ps_http) == before_restart_depth
+    elif validate_before == ValidateBefore.VALIDATE:
+        # - If we validated before restart, then we should execute however many keys were
+        #   validated before restart.
        assert get_deletion_queue_executed(ps_http) == before_restart_depth
    else:
        env.pageserver.allowed_errors.extend([".*Dropping stale deletions.*"])
--- a/test_runner/regress/test_pageserver_restarts_under_workload.py
+++ b/test_runner/regress/test_pageserver_restarts_under_workload.py
@@ -17,10 +17,6 @@ def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgB
    n_restarts = 10
    scale = 10

-    # Pageserver currently logs requests on non-active tenants at error level
-    # https://github.com/neondatabase/neon/issues/5784
-    env.pageserver.allowed_errors.append(".* will not become active. Current state: Stopping.*")
-
    def run_pgbench(connstr: str):
        log.info(f"Start a pgbench workload on pg {connstr}")
        pg_bin.run_capture(["pgbench", "-i", f"-s{scale}", connstr])
--- a/test_runner/regress/test_tenant_duplicate.py
+++ b/test_runner/regress/test_tenant_duplicate.py
@@ -0,0 +1,54 @@
+import time
+from fixtures.neon_fixtures import (
+    NeonEnvBuilder,
+    last_flush_lsn_upload,
+)
+from fixtures.remote_storage import (
+    RemoteStorageKind,
+)
+from fixtures.types import TenantId
+from fixtures.log_helper import log
+
+
+def test_tenant_duplicate(
+    neon_env_builder: NeonEnvBuilder,
+):
+    neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS)
+
+    env = neon_env_builder.init_start()
+
+    with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as ep_main:
+        ep_main.safe_psql("CREATE TABLE foo (i int);")
+        ep_main.safe_psql("INSERT INTO foo VALUES (1), (2), (3);")
+        last_flush_lsn = last_flush_lsn_upload(
+            env, ep_main, env.initial_tenant, env.initial_timeline
+        )
+
+    new_tenant_id = TenantId.generate()
+    # timeline id remains unchanged with tenant_duplicate
+    # TODO: implement a remapping scheme so timeline ids remain globally unique
+    new_timeline_id = env.initial_timeline
+
+    log.info(f"Duplicate tenant/timeline will be: {new_tenant_id}/{new_timeline_id}")
+
+    ps_http = env.pageserver.http_client()
+
+    ps_http.tenant_duplicate(env.initial_tenant, new_tenant_id)
+
+    ps_http.tenant_delete(env.initial_tenant)
+
+    env.neon_cli.map_branch("duplicate", new_tenant_id, new_timeline_id)
+
+    # start read-only replicate and validate
+    with env.endpoints.create_start(
+        "duplicate", tenant_id=new_tenant_id, lsn=last_flush_lsn
+    ) as ep_dup:
+        with ep_dup.connect() as conn:
+            with conn.cursor() as cur:
+                cur.execute("SELECT * FROM foo ORDER BY i;")
+                cur.fetchall() == [(1,), (2,), (3,)]
+
+    # ensure restarting PS works
+    env.pageserver.stop()
+    env.pageserver.start()
+
--- a/trace/src/main.rs
+++ b/trace/src/main.rs
@@ -74,6 +74,7 @@ fn analyze_trace<R: std::io::Read>(mut reader: R) {
                prev = Some(req);
            }
            PagestreamFeMessage::DbSize(_) => {}
+            PagestreamFeMessage::NoOp => {},
        };
    }
Author	SHA1	Message	Date
Christian Schwarz	2d90b95b6a	implement a benchmark for tokio tcp handling to figure out perf bottleneck in no_op libpq benchmark	2023-11-08 16:34:57 +00:00
Christian Schwarz	06b57361e4	implement a standalone no-op server usable by getpage_bench_libpq by running it on a different port than the pageserver libpq listener, and overriding connstring for getpage_bench_libpq to point to the noop_server	2023-11-08 16:34:57 +00:00
Christian Schwarz	36be29d0b8	getpage_bench_libpq: support for the no-op mode	2023-11-08 16:34:56 +00:00
Christian Schwarz	6a202cdf08	no-op pagestream request/response type (server-side impl)	2023-11-08 16:34:56 +00:00
Christian Schwarz	f51e608193	pq bench: avoid repeated conversion to_i128	2023-11-08 16:34:56 +00:00
Christian Schwarz	78a28f787c	per-second RPS	2023-11-08 16:34:56 +00:00
Christian Schwarz	001a0e4006	pq bench: proper shutdown	2023-11-08 16:34:56 +00:00
Christian Schwarz	daa2ea7ebe	http bench: sligthly improved stats	2023-11-08 16:34:56 +00:00
Christian Schwarz	37e8eba57f	WIP: libpq-based client depends on https://github.com/neondatabase/rust-postgres/pull/25	2023-11-08 16:34:56 +00:00
Christian Schwarz	f45882ef3c	rename getpage_bench to getpage_bench_http	2023-11-08 16:34:56 +00:00
Christian Schwarz	d16d02d61d	WIP: benchmark that does random getpage requests against the keyspace backup of pageserver.toml d =1 pg_distrib_dir ='/home/admin/neon-main/pg_install' http_auth_type ='Trust' pg_auth_type ='Trust' listen_http_addr ='127.0.0.1:9898' listen_pg_addr ='127.0.0.1:64000' broker_endpoint ='http://127.0.0.1:50051/' #control_plane_api ='http://127.0.0.1:1234/' # Initial configuration file created by 'pageserver --init' #listen_pg_addr = '127.0.0.1:64000' #listen_http_addr = '127.0.0.1:9898' #wait_lsn_timeout = '60 s' #wal_redo_timeout = '60 s' #max_file_descriptors = 10000 #page_cache_size = 160000 # initial superuser role name to use when creating a new tenant #initial_superuser_name = 'cloud_admin' #broker_endpoint = 'http://127.0.0.1:50051' #log_format = 'plain' #concurrent_tenant_size_logical_size_queries = '1' #metric_collection_interval = '10 min' #cached_metric_collection_interval = '0s' #synthetic_size_calculation_interval = '10 min' #disk_usage_based_eviction = { max_usage_pct = .., min_avail_bytes = .., period = "10s"} #background_task_maximum_delay = '10s' [tenant_config] #checkpoint_distance = 268435456 # in bytes #checkpoint_timeout = 10 m #compaction_target_size = 134217728 # in bytes #compaction_period = '20 s' #compaction_threshold = 10 #gc_period = '1 hr' #gc_horizon = 67108864 #image_creation_threshold = 3 #pitr_interval = '7 days' #min_resident_size_override = .. # in bytes #evictions_low_residence_duration_metric_threshold = '24 hour' #gc_feedback = false # make it determinsitic gc_period = '0s' checkpoint_timeout = '3650 day' compaction_period = '20 s' compaction_threshold = 10 compaction_target_size = 134217728 checkpoint_distance = 268435456 image_creation_threshold = 3 [remote_storage] local_path = '/home/admin/neon-main/bench_repo_dir/repo/remote_storage_local_fs'	2023-11-08 16:34:56 +00:00
Christian Schwarz	947f6d9491	API to duplicate a tenant	2023-11-08 16:34:56 +00:00
John Spray	40441f8ada	pageserver: use `Gate` for stronger safety check in `SlotGuard` (#5793 ) ## Problem #5711 and #5367 raced -- the `SlotGuard` type needs `Gate` to properly enforce its invariant that we may not drop an `Arc<Tenant>` from a slot. ## Summary of changes Replace the TODO with the intended check of Gate.	2023-11-08 13:00:11 +00:00
John Spray	a8a39cd464	test: de-flake test_deletion_queue_recovery (#5822 ) ## Problem This test could fail if timing is unlucky, and the deletions in the test land in two deletion lists instead of one. ## Summary of changes We await _some_ validations instead of _all_ validations, because our execution failpoint will prevent validation proceeding for any but the first DeletionList. Usually the workload just generates one, but if it generates two due to timing, then we must not expect that the second one will be validated.	2023-11-08 12:41:48 +00:00
John Spray	b989ad1922	extend test_change_pageserver for failure case, rework changing pageserver (#5693 ) Reproducer for https://github.com/neondatabase/neon/issues/5692 The test change in this PR intentionally fails, to demonstrate the issue. --------- Co-authored-by: Sasha Krassovsky <krassovskysasha@gmail.com>	2023-11-08 11:26:56 +00:00
Em Sharnoff	acef742a6e	vm-monitor: Remove dependency on workspace_hack (#5752 ) neondatabase/autoscaling builds libs/vm-monitor during CI because it's a necessary component of autoscaling. workspace_hack includes a lot of crates that are not necessary for vm-monitor, which artificially inflates the build time on the autoscaling side, so hopefully removing the dependency should speed things up. Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-11-07 09:41:20 -08:00
duguorong009	11d9d801b5	pageserver: improve the shutdown log error (#5792 ) ## Problem - Close #5784 ## Summary of changes - Update the `GetActiveTenantError` -> `QueryError` conversion process in `pageserver/src/page_service.rs` - Update the pytest logging exceptions in `./test_runner/regress/test_tenant_detach.py`	2023-11-07 16:57:26 +00:00
Andrew Rudenko	fc47af156f	Passing neon options to the console (#5781 ) The idea is to pass neon_* prefixed options to control plane. It can be used by cplane to dynamically create timelines and computes. Such options also should be excluded from passing to compute. Another issue is how connection caching is working now, because compute's instance now depends not only on hostname but probably on such options too I included them to cache key.	2023-11-07 16:49:26 +01:00
Arpad Müller	e310533ed3	Support JWT key reload in pageserver (#5594 ) ## Problem For quickly rotating JWT secrets, we want to be able to reload the JWT public key file in the pageserver, and also support multiple JWT keys. See #4897. ## Summary of changes * Allow directories for the `auth_validation_public_key_path` config param instead of just files. for the safekeepers, all of their config options also support multiple JWT keys. * For the pageservers, make the JWT public keys easily globally swappable by using the `arc-swap` crate. * Add an endpoint to the pageserver, triggered by a POST to `/v1/reload_auth_validation_keys`, that reloads the JWT public keys from the pre-configured path (for security reasons, you cannot upload any keys yourself). Fixes #4897 --------- Co-authored-by: Heikki Linnakangas <heikki@neon.tech> Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-11-07 15:43:29 +01:00
John Spray	1d68f52b57	pageserver: move deletion failpoint inside backoff (#5814 ) ## Problem When enabled, this failpoint would busy-spin in a loop that emits log messages. ## Summary of changes Move the failpoint inside a backoff::exponential block: it will still spam the log, but at much lower rate. --------- Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-11-07 14:25:51 +00:00
Alexander Bayandin	4cd47b7d4b	Dockerfile: Set BUILD_TAG for storage services (#5812 ) ## Problem https://github.com/neondatabase/neon/pull/5576 added `build-tag` reporting to `libmetrics_build_info`, but it's not reported because we didn't set the corresponding env variable in the build process. ## Summary of changes - Add `BUILD_TAG` env var while building services	2023-11-07 13:45:59 +00:00
Fernando Luz	0141c95788	build: Add warning when missing postgres submodule during the build (#5614 ) I forked the project and in my local repo, I wasn't able to compile the project and in my search, I found the solution in neon forum. After a PR discussion, I made a change in the makefile to alert the missing `git submodules update` step. --------- Signed-off-by: Fernando Luz <prof.fernando.luz@gmail.com> Co-authored-by: Joonas Koivunen <joonas@neon.tech>	2023-11-07 12:13:05 +00:00
Shany Pozin	0ac4cf67a6	Use self.tenants instead of TENANTS (#5811 )	2023-11-07 11:38:02 +00:00
Joonas Koivunen	4be6bc7251	refactor: remove unnecessary unsafe (#5802 ) unsafe impls for `Send` and `Sync` should not be added by default. in the case of `SlotGuard` removing them does not cause any issues, as the compiler automatically derives those. This PR adds requirement to document the unsafety (see [clippy::undocumented_unsafe_blocks]) and opportunistically adds `#![deny(unsafe_code)]` to most places where we don't have unsafe code right now. TRPL on Send and Sync: https://doc.rust-lang.org/book/ch16-04-extensible-concurrency-sync-and-send.html [clippy::undocumented_unsafe_blocks]: https://rust-lang.github.io/rust-clippy/master/#/undocumented_unsafe_blocks	2023-11-07 10:26:25 +00:00
John Spray	a394f49e0d	pageserver: avoid converting an error to anyhow::Error (#5803 ) This was preventing it getting cleanly converted to a CalculateLogicalSizeError::Cancelled, resulting in "Logical size calculation failed" errors in logs.	2023-11-07 09:35:45 +00:00
John Spray	c00651ff9b	pageserver: start refactoring into TenantManager (#5797 ) ## Problem See: https://github.com/neondatabase/neon/issues/5796 ## Summary of changes Completing the refactor is quite verbose and can be done in stages: each interface that is currently called directly from a top-level mgr.rs function can be moved into TenantManager once the relevant subsystems have access to it. Landing the initial change to create of TenantManager is useful because it enables new code to use it without having to be altered later, and sets us up to incrementally fix the existing code to use an explicit Arc<TenantManager> instead of relying on the static TENANTS.	2023-11-07 09:06:53 +00:00
Richy Wang	bea8efac24	Fix comments in 'receive_wal.rs'. (#5807 ) ## Problem Some comments in 'receive_wal.rs' is not suitable. It may copy from 'send_wal.rs' and leave it unchanged. ## Summary of changes This commit fixes two comments in the code: Changed "/// Unregister walsender." to "/// Unregister walreceiver." Changed "///Scope guard to access slot in WalSenders registry" to "///Scope guard to access slot in WalReceivers registry."	2023-11-07 09:13:01 +01:00