Merge branch 'main' into khanova-test

Increase
fix(metrics): correct maxrss metrics on macos (#7487)
2026-01-27 23:30:38 +00:00 · 2024-04-24 17:28:28 +02:00 · 2024-04-24 17:25:18 +02:00 · 2024-04-24 15:09:23 +00:00 · 2024-04-24 17:06:50 +02:00 · 2024-04-24 15:36:23 +01:00
145 changed files with 6252 additions and 4682 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -150,7 +150,7 @@ runs:

        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
        # and to keep files on the host to upload them to the database
-        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+        time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -18,6 +18,7 @@ on:

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
+  cancel-in-progress: false

 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -21,6 +21,7 @@ defaults:

 concurrency:
  group: build-build-tools-image-${{ inputs.image-tag }}
+  cancel-in-progress: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -735,7 +735,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2

      - uses: docker/login-action@v3
        with:
@@ -792,7 +792,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v3
+      - uses: docker/setup-buildx-action@v2
        with:
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -865,7 +865,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.23.2
+      VM_BUILDER_VERSION: v0.28.1

    steps:
      - name: Checkout
@@ -1133,8 +1133,6 @@ jobs:
              -f deployPreprodRegion=true

            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-              -f deployPgSniRouter=false \
-              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -28,7 +28,9 @@ jobs:
      - name: Get build-tools image tag for the current commit
        id: get-build-tools-tag
        env:
-          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
+          # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
+          COMMIT_SHA: ${{ github.sha }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          LAST_BUILD_TOOLS_SHA=$(
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -20,6 +20,7 @@ defaults:

 concurrency:
  group: pin-build-tools-image-${{ inputs.from-tag }}
+  cancel-in-progress: false

 permissions: {}

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -599,7 +599,7 @@ dependencies = [
 "once_cell",
 "pin-project-lite",
 "pin-utils",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "tokio",
 "tracing",
 ]
@@ -2519,7 +2519,7 @@ dependencies = [
 "http 0.2.9",
 "hyper 0.14.26",
 "log",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-native-certs 0.6.2",
 "tokio",
 "tokio-rustls 0.24.0",
@@ -2932,9 +2932,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "measured"
-version = "0.0.20"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3cbf033874bea03565f2449572c8640ca37ec26300455faf36001f24755da452"
+checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5"
 dependencies = [
 "bytes",
 "crossbeam-utils",
@@ -2950,9 +2950,9 @@ dependencies = [

 [[package]]
 name = "measured-derive"
-version = "0.0.20"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "be9e29b682b38f8af2a89f960455054ab1a9f5a06822f6f3500637ad9fa57def"
+checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d"
 dependencies = [
 "heck 0.5.0",
 "proc-macro2",
@@ -2962,9 +2962,9 @@ dependencies = [

 [[package]]
 name = "measured-process"
-version = "0.0.20"
+version = "0.0.21"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a20849acdd04c5d6a88f565559044546904648a1842a2937cfff0b48b4ca7ef2"
+checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000"
 dependencies = [
 "libc",
 "measured",
@@ -3658,6 +3658,7 @@ dependencies = [
 "tokio-util",
 "toml_edit",
 "tracing",
+ "twox-hash",
 "url",
 "utils",
 "walkdir",
@@ -4059,7 +4060,7 @@ dependencies = [
 "futures",
 "once_cell",
 "pq_proto",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pemfile 2.1.1",
 "serde",
 "thiserror",
@@ -4322,6 +4323,7 @@ dependencies = [
 "itertools",
 "lasso",
 "md5",
+ "measured",
 "metrics",
 "native-tls",
 "once_cell",
@@ -4349,7 +4351,7 @@ dependencies = [
 "routerify",
 "rstest",
 "rustc-hash",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pemfile 2.1.1",
 "scopeguard",
 "serde",
@@ -4541,7 +4543,7 @@ dependencies = [
 "itoa",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-native-certs 0.7.0",
 "rustls-pemfile 2.1.1",
 "rustls-pki-types",
@@ -4695,7 +4697,7 @@ dependencies = [
 "once_cell",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-pemfile 1.0.2",
 "serde",
 "serde_json",
@@ -4955,9 +4957,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.21.9"
+version = "0.21.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "629648aced5775d558af50b2b4c7b02983a04b312126d45eeead26e7caa498b9"
+checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4"
 dependencies = [
 "log",
 "ring 0.17.6",
@@ -4967,9 +4969,9 @@ dependencies = [

 [[package]]
 name = "rustls"
-version = "0.22.2"
+version = "0.22.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e87c9956bd9807afa1f77e0f7594af32566e830e088a5576d27c5b6f30f49d41"
+checksum = "bf4ef73721ac7bcd79b2b315da7779d8fc09718c6b3d2d1b2d94850eb8c18432"
 dependencies = [
 "log",
 "ring 0.17.6",
@@ -5281,7 +5283,7 @@ checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -5829,8 +5831,7 @@ checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc"
 [[package]]
 name = "svg_fmt"
 version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f83ba502a3265efb76efb89b0a2f7782ad6f2675015d4ce37e4b547dda42b499"
+source = "git+https://github.com/neondatabase/fork--nical--rust_debug?branch=neon#b9501105e746629004bc6d0473639320939dbe10"

 [[package]]
 name = "syn"
@@ -6192,7 +6193,7 @@ checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677"
 dependencies = [
 "futures",
 "ring 0.17.6",
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "tokio",
 "tokio-postgres",
 "tokio-rustls 0.25.0",
@@ -6205,7 +6206,7 @@ version = "0.24.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5"
 dependencies = [
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "tokio",
 ]

@@ -6215,7 +6216,7 @@ version = "0.25.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "775e0c0f0adb3a2f22a00c4745d728b479985fc15ee7ca6a2608388c5569860f"
 dependencies = [
- "rustls 0.22.2",
+ "rustls 0.22.4",
 "rustls-pki-types",
 "tokio",
 ]
@@ -6676,7 +6677,7 @@ dependencies = [
 "base64 0.21.1",
 "log",
 "once_cell",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "rustls-webpki 0.100.2",
 "url",
 "webpki-roots 0.23.1",
@@ -7353,7 +7354,7 @@ dependencies = [
 "regex-automata 0.4.3",
 "regex-syntax 0.8.2",
 "reqwest",
- "rustls 0.21.9",
+ "rustls 0.21.11",
 "scopeguard",
 "serde",
 "serde_json",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -107,8 +107,8 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.20", features=["lasso"] }
-measured-process = { version = "0.0.20" }
+measured = { version = "0.0.21", features=["lasso"] }
+measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -157,7 +157,8 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-svg_fmt = "0.4.1"
+# https://github.com/nical/rust_debug/pull/4
+svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -58,6 +58,12 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
    && mv protoc/include/google /usr/local/include/google \
    && rm -rf protoc.zip protoc

+# s5cmd
+ENV S5CMD_VERSION=2.2.2
+RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
+    && chmod +x s5cmd \
+    && mv s5cmd /usr/local/bin/s5cmd
+
 # LLVM
 ENV LLVM_VERSION=17
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -818,9 +818,15 @@ impl ComputeNode {
                        Client::connect(zenith_admin_connstr.as_str(), NoTls)
                            .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
                    // Disable forwarding so that users don't get a cloud_admin role
-                    client.simple_query("SET neon.forward_ddl = false")?;
-                    client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
-                    client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+
+                    let mut func = || {
+                        client.simple_query("SET neon.forward_ddl = false")?;
+                        client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
+                        client.simple_query("GRANT zenith_admin TO cloud_admin")?;
+                        Ok::<_, anyhow::Error>(())
+                    };
+                    func().context("apply_config setup cloud_admin")?;
+
                    drop(client);

                    // reconnect with connstring with expected name
@@ -832,24 +838,29 @@ impl ComputeNode {
        };

        // Disable DDL forwarding because control plane already knows about these roles/databases.
-        client.simple_query("SET neon.forward_ddl = false")?;
+        client
+            .simple_query("SET neon.forward_ddl = false")
+            .context("apply_config SET neon.forward_ddl = false")?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client)?;
-        cleanup_instance(&mut client)?;
-        handle_roles(spec, &mut client)?;
-        handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
+        create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
+        cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
+        handle_roles(spec, &mut client).context("apply_config handle_roles")?;
+        handle_databases(spec, &mut client).context("apply_config handle_databases")?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)
+            .context("apply_config handle_role_deletions")?;
        handle_grants(
            spec,
            &mut client,
            connstr.as_str(),
            self.has_feature(ComputeFeature::AnonExtension),
-        )?;
-        handle_extensions(spec, &mut client)?;
-        handle_extension_neon(&mut client)?;
-        create_availability_check_data(&mut client)?;
+        )
+        .context("apply_config handle_grants")?;
+        handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
+        handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
+        create_availability_check_data(&mut client)
+            .context("apply_config create_availability_check_data")?;

        // 'Close' connection
        drop(client);
@@ -857,7 +868,7 @@ impl ComputeNode {
        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
            let mut client = Client::connect(connstr.as_str(), NoTls)?;
-            handle_migrations(&mut client)
+            handle_migrations(&mut client).context("apply_config handle_migrations")
        });
        Ok(())
    }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,8 +6,8 @@ use std::path::Path;
 use anyhow::Result;

 use crate::pg_helpers::escape_conf_value;
-use crate::pg_helpers::PgOptionsSerialize;
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
+use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -92,6 +92,27 @@ pub fn write_postgres_conf(
        }
    }

+    if cfg!(target_os = "linux") {
+        // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
+        // disabled), then the control plane has enabled swap and we should set
+        // dynamic_shared_memory_type = 'mmap'.
+        //
+        // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
+        let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
+            // ignore any errors - they may be expected to occur under certain situations (e.g. when
+            // not running in Linux).
+            .unwrap_or_else(|_| String::new());
+        if overcommit_memory_contents.trim() == "2" {
+            let opt = GenericOption {
+                name: "dynamic_shared_memory_type".to_owned(),
+                value: Some("mmap".to_owned()),
+                vartype: "enum".to_owned(),
+            };
+
+            write!(file, "{}", opt.to_pg_setting())?;
+        }
+    }
+
    // If there are any extra options in the 'settings' field, append those
    if spec.cluster.settings.is_some() {
        writeln!(file, "# Managed by compute_ctl: begin")?;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
    format!("'{}'", res)
 }

-trait GenericOptionExt {
+pub trait GenericOptionExt {
    fn to_pg_option(&self) -> String;
    fn to_pg_setting(&self) -> String;
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -2,7 +2,7 @@ use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;

-use anyhow::{anyhow, bail, Result};
+use anyhow::{anyhow, bail, Context, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
 use reqwest::StatusCode;
@@ -698,7 +698,8 @@ pub fn handle_grants(

        // it is important to run this after all grants
        if enable_anon_extension {
-            handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
+            handle_extension_anon(spec, &db.owner, &mut db_client, false)
+                .context("handle_grants handle_extension_anon")?;
        }
    }

@@ -813,28 +814,36 @@ $$;"#,
        // Add new migrations below.
    ];

-    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-    client.simple_query(query)?;
+    let mut func = || {
+        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+        client.simple_query(query)?;

-    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-    client.simple_query(query)?;
+        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+        client.simple_query(query)?;

-    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-    client.simple_query(query)?;
+        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+        client.simple_query(query)?;

-    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-    client.simple_query(query)?;
+        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+        client.simple_query(query)?;

-    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-    client.simple_query(query)?;
+        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+        client.simple_query(query)?;
+        Ok::<_, anyhow::Error>(())
+    };
+    func().context("handle_migrations prepare")?;

-    query = "SELECT id FROM neon_migration.migration_id";
-    let row = client.query_one(query, &[])?;
+    let query = "SELECT id FROM neon_migration.migration_id";
+    let row = client
+        .query_one(query, &[])
+        .context("handle_migrations get migration_id")?;
    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
    let starting_migration_id = current_migration;

-    query = "BEGIN";
-    client.simple_query(query)?;
+    let query = "BEGIN";
+    client
+        .simple_query(query)
+        .context("handle_migrations begin")?;

    while current_migration < migrations.len() {
        let migration = &migrations[current_migration];
@@ -842,7 +851,9 @@ $$;"#,
            info!("Skip migration id={}", current_migration);
        } else {
            info!("Running migration:\n{}\n", migration);
-            client.simple_query(migration)?;
+            client.simple_query(migration).with_context(|| {
+                format!("handle_migrations current_migration={}", current_migration)
+            })?;
        }
        current_migration += 1;
    }
@@ -850,10 +861,14 @@ $$;"#,
        "UPDATE neon_migration.migration_id SET id={}",
        migrations.len()
    );
-    client.simple_query(&setval)?;
+    client
+        .simple_query(&setval)
+        .context("handle_migrations update id")?;

-    query = "COMMIT";
-    client.simple_query(query)?;
+    let query = "COMMIT";
+    client
+        .simple_query(query)
+        .context("handle_migrations commit")?;

    info!(
        "Ran {} migrations",
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -1231,7 +1231,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
            for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
+                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
                    eprintln!("postgres stop failed: {e:#}");
                }
            }
@@ -1417,6 +1417,7 @@ fn cli() -> Command {
        .subcommand(
            Command::new("timeline")
            .about("Manage timelines")
+            .arg_required_else_help(true)
            .subcommand(Command::new("list")
                .about("List all timelines, available to this pageserver")
                .arg(tenant_id_arg.clone()))
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -156,6 +156,7 @@ pub struct SafekeeperConf {
    pub remote_storage: Option<String>,
    pub backup_threads: Option<u32>,
    pub auth_enabled: bool,
+    pub listen_addr: Option<String>,
 }

 impl Default for SafekeeperConf {
@@ -169,6 +170,7 @@ impl Default for SafekeeperConf {
            remote_storage: None,
            backup_threads: None,
            auth_enabled: false,
+            listen_addr: None,
        }
    }
 }
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -70,24 +70,31 @@ pub struct SafekeeperNode {
    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
    pub http_client: reqwest::Client,
+    pub listen_addr: String,
    pub http_base_url: String,
 }

 impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
+        let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
+            listen_addr.clone()
+        } else {
+            "127.0.0.1".to_string()
+        };
        SafekeeperNode {
            id: conf.id,
            conf: conf.clone(),
-            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
+            pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
            env: env.clone(),
            http_client: reqwest::Client::new(),
-            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
+            http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
+            listen_addr,
        }
    }

    /// Construct libpq connection string for connecting to this safekeeper.
-    fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
-        PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
+    fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
+        PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
    }

    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -111,8 +118,8 @@ impl SafekeeperNode {
        );
        io::stdout().flush().unwrap();

-        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
-        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
+        let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
+        let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
        let id = self.id;
        let datadir = self.datadir_path();

@@ -139,7 +146,7 @@ impl SafekeeperNode {
            availability_zone,
        ];
        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
-            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
+            let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
        }
        if !self.conf.sync {
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,15 +1,15 @@
-use std::{collections::HashMap, str::FromStr};
+use std::{collections::HashMap, str::FromStr, time::Duration};

 use clap::{Parser, Subcommand};
-use hyper::Method;
+use hyper::{Method, StatusCode};
 use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
-        ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest,
-        TenantShardSplitRequest, TenantShardSplitResponse,
+        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
+        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
@@ -120,6 +120,12 @@ enum Command {
        #[arg(long)]
        tenant_id: TenantId,
    },
+    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
+    /// mode so that it can warm up content on a pageserver.
+    TenantWarmup {
+        #[arg(long)]
+        tenant_id: TenantId,
+    },
 }

 #[derive(Parser)]
@@ -581,6 +587,94 @@ async fn main() -> anyhow::Result<()> {
            }
            println!("{table}");
        }
+        Command::TenantWarmup { tenant_id } => {
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await;
+            match describe_response {
+                Ok(describe) => {
+                    if matches!(describe.policy, PlacementPolicy::Secondary) {
+                        // Fine: it's already known to controller in secondary mode: calling
+                        // again to put it into secondary mode won't cause problems.
+                    } else {
+                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
+                    }
+                }
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
+                    // Fine: this tenant isn't known to the storage controller yet.
+                }
+                Err(e) => {
+                    // Unexpected API error
+                    return Err(e.into());
+                }
+            }
+
+            vps_client
+                .location_config(
+                    TenantShardId::unsharded(tenant_id),
+                    pageserver_api::models::LocationConfig {
+                        mode: pageserver_api::models::LocationConfigMode::Secondary,
+                        generation: None,
+                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
+                        shard_number: 0,
+                        shard_count: 0,
+                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
+                        tenant_conf: TenantConfig::default(),
+                    },
+                    None,
+                    true,
+                )
+                .await?;
+
+            let describe_response = storcon_client
+                .dispatch::<(), TenantDescribeResponse>(
+                    Method::GET,
+                    format!("control/v1/tenant/{tenant_id}"),
+                    None,
+                )
+                .await?;
+
+            let secondary_ps_id = describe_response
+                .shards
+                .first()
+                .unwrap()
+                .node_secondary
+                .first()
+                .unwrap();
+
+            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
+            loop {
+                let (status, progress) = vps_client
+                    .tenant_secondary_download(
+                        TenantShardId::unsharded(tenant_id),
+                        Some(Duration::from_secs(10)),
+                    )
+                    .await?;
+                println!(
+                    "Progress: {}/{} layers, {}/{} bytes",
+                    progress.layers_downloaded,
+                    progress.layers_total,
+                    progress.bytes_downloaded,
+                    progress.bytes_total
+                );
+                match status {
+                    StatusCode::OK => {
+                        println!("Download complete");
+                        break;
+                    }
+                    StatusCode::ACCEPTED => {
+                        // Loop
+                    }
+                    _ => {
+                        anyhow::bail!("Unexpected download status: {status}");
+                    }
+                }
+            }
+        }
    }

    Ok(())
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -0,0 +1,150 @@
+# Storage Controller
+
+## Concepts
+
+The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
+which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
+
+It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
+the underlying details of how data is spread across multiple nodes.
+
+The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
+
+## APIs
+
+The storage controller’s HTTP server implements four logically separate APIs:
+
+- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
+- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits.
+- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
+- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
+  to ensure data safety with generation numbers.
+
+The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
+
+See the `http.rs` file in the source for where the HTTP APIs are implemented.
+
+## Database
+
+The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
+persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
+rebuilt on startup.
+
+The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
+
+The `diesel` crate is used for defining models & migrations.
+
+Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database.
+
+### Diesel tip: migrations
+
+If you need to modify the database schema, here’s how to create a migration:
+
+- Install the diesel CLI with `cargo install diesel_cli`
+- Use `diesel migration generate <name>` to create a new migration
+- Populate the SQL files in the `migrations/` subdirectory
+- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
+  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
+- Commit the migration files and the changes to schema.rs
+- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
+- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
+
+## storcon_cli
+
+The `storcon_cli` tool enables interactive management of the storage controller. This is usually
+only necessary for debugging, but may also be used to manage nodes (e.g. marking a node as offline).
+
+`storcon_cli --help` includes details on commands.
+
+# Deploying
+
+This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
+part of a self-hosted system.
+
+_General note: since the default `neon_local` environment includes a storage controller, this is a useful
+reference when figuring out deployment._
+
+## Database
+
+It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
+local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
+
+The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
+
+Set the URL to the database using the `--database-url` CLI option.
+
+There is no need to run migrations manually: the storage controller automatically applies migrations
+when it starts up.
+
+## Configure pageservers to use the storage controller
+
+1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
+   point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
+2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
+   with the storage controller when it starts up. See the example below for the format of this file.
+
+### Example `metadata.json`
+
+```
+{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
+```
+
+- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
+  postgres runs.
+- `http_port` and `http_host` refer to the pageserver's HTTP API; these must be accessible from where
+  the storage controller runs.
+
+## Handle compute notifications
+
+The storage controller independently moves tenant attachments between pageservers in response to
+changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
+postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
+location changes.
+
+The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
+JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
+
+In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
+the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
+the compute hook.
+
+When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
+the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
+
+```
+struct ComputeHookNotifyRequestShard {
+    node_id: NodeId,
+    shard_number: ShardNumber,
+}
+
+struct ComputeHookNotifyRequest {
+    tenant_id: TenantId,
+    stripe_size: Option<ShardStripeSize>,
+    shards: Vec<ComputeHookNotifyRequestShard>,
+}
+```
+
+When a notification is received:
+
+1. Modify postgres configuration for this tenant:
+
+   - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
+     shards identified by `NodeId` must be converted to the address+port of the node.
+   - if stripe_size is not None, set `neon.stripe_size` to this value
+
+2. Send SIGHUP to postgres to reload configuration
+3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
+   will retry the notification until it succeeds.
+
+### Example notification body
+
+```
+{
+  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
+  "stripe_size": 32768,
+  "shards": [
+      {"node_id": 344, "shard_number": 0},
+      {"node_id": 722, "shard_number": 1}
+  ]
+}
+```
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -7,14 +7,19 @@
 //! use significantly less memory than this, but can only approximate the cardinality.

 use std::{
-    collections::HashMap,
-    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
-    sync::{atomic::AtomicU8, Arc, RwLock},
+    hash::{BuildHasher, BuildHasherDefault, Hash},
+    sync::atomic::AtomicU8,
 };

-use prometheus::{
-    core::{self, Describer},
-    proto, Opts,
+use measured::{
+    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
+    metric::{
+        group::{Encoding, MetricValue},
+        name::MetricNameEncoder,
+        Metric, MetricType, MetricVec,
+    },
+    text::TextEncoder,
+    LabelGroup,
 };
 use twox_hash::xxh3;

@@ -93,203 +98,25 @@ macro_rules! register_hll {
 /// ```
 ///
 /// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
-#[derive(Clone)]
-pub struct HyperLogLogVec<const N: usize> {
-    core: Arc<HyperLogLogVecCore<N>>,
+pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
+pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
+
+pub struct HyperLogLogState<const N: usize> {
+    shards: [AtomicU8; N],
 }
-
-struct HyperLogLogVecCore<const N: usize> {
-    pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
-    pub desc: core::Desc,
-    pub opts: Opts,
-}
-
-impl<const N: usize> core::Collector for HyperLogLogVec<N> {
-    fn desc(&self) -> Vec<&core::Desc> {
-        vec![&self.core.desc]
-    }
-
-    fn collect(&self) -> Vec<proto::MetricFamily> {
-        let mut m = proto::MetricFamily::default();
-        m.set_name(self.core.desc.fq_name.clone());
-        m.set_help(self.core.desc.help.clone());
-        m.set_field_type(proto::MetricType::GAUGE);
-
-        let mut metrics = Vec::new();
-        for child in self.core.children.read().unwrap().values() {
-            child.core.collect_into(&mut metrics);
-        }
-        m.set_metric(metrics);
-
-        vec![m]
+impl<const N: usize> Default for HyperLogLogState<N> {
+    fn default() -> Self {
+        #[allow(clippy::declare_interior_mutable_const)]
+        const ZERO: AtomicU8 = AtomicU8::new(0);
+        Self { shards: [ZERO; N] }
    }
 }

-impl<const N: usize> HyperLogLogVec<N> {
-    /// Create a new [`HyperLogLogVec`] based on the provided
-    /// [`Opts`] and partitioned by the given label names. At least one label name must be
-    /// provided.
-    pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
-        assert!(N.is_power_of_two());
-        let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
-        let opts = opts.variable_labels(variable_names);
-
-        let desc = opts.describe()?;
-        let v = HyperLogLogVecCore {
-            children: RwLock::new(HashMap::default()),
-            desc,
-            opts,
-        };
-
-        Ok(Self { core: Arc::new(v) })
-    }
-
-    /// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
-    /// of label values (same order as the VariableLabels in Desc). If that combination of
-    /// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
-    ///
-    /// An error is returned if the number of label values is not the same as the
-    /// number of VariableLabels in Desc.
-    pub fn get_metric_with_label_values(
-        &self,
-        vals: &[&str],
-    ) -> prometheus::Result<HyperLogLog<N>> {
-        self.core.get_metric_with_label_values(vals)
-    }
-
-    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
-    /// occurs.
-    pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
-        self.get_metric_with_label_values(vals).unwrap()
-    }
+impl<const N: usize> MetricType for HyperLogLogState<N> {
+    type Metadata = ();
 }

-impl<const N: usize> HyperLogLogVecCore<N> {
-    pub fn get_metric_with_label_values(
-        &self,
-        vals: &[&str],
-    ) -> prometheus::Result<HyperLogLog<N>> {
-        let h = self.hash_label_values(vals)?;
-
-        if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
-            return Ok(metric);
-        }
-
-        self.get_or_create_metric(h, vals)
-    }
-
-    pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
-        if vals.len() != self.desc.variable_labels.len() {
-            return Err(prometheus::Error::InconsistentCardinality {
-                expect: self.desc.variable_labels.len(),
-                got: vals.len(),
-            });
-        }
-
-        let mut h = xxh3::Hash64::default();
-        for val in vals {
-            h.write(val.as_bytes());
-        }
-
-        Ok(h.finish())
-    }
-
-    fn get_or_create_metric(
-        &self,
-        hash: u64,
-        label_values: &[&str],
-    ) -> prometheus::Result<HyperLogLog<N>> {
-        let mut children = self.children.write().unwrap();
-        // Check exist first.
-        if let Some(metric) = children.get(&hash).cloned() {
-            return Ok(metric);
-        }
-
-        let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
-        children.insert(hash, metric.clone());
-        Ok(metric)
-    }
-}
-
-/// HLL is a probabilistic cardinality measure.
-///
-/// How to use this time-series for a metric name `my_metrics_total_hll`:
-///
-/// ```promql
-/// # harmonic mean
-/// 1 / (
-///     sum (
-///         2 ^ -(
-///             # HLL merge operation
-///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
-///         )
-///     ) without (hll_shard)
-/// )
-/// * alpha
-/// * shards_count
-/// * shards_count
-/// ```
-///
-/// If you want an estimate over time, you can use the following query:
-///
-/// ```promql
-/// # harmonic mean
-/// 1 / (
-///     sum (
-///         2 ^ -(
-///             # HLL merge operation
-///             max (
-///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
-///             ) by (hll_shard, other_labels...)
-///         )
-///     ) without (hll_shard)
-/// )
-/// * alpha
-/// * shards_count
-/// * shards_count
-/// ```
-///
-/// In the case of low cardinality, you might want to use the linear counting approximation:
-///
-/// ```promql
-/// # LinearCounting(m, V) = m log (m / V)
-/// shards_count * ln(shards_count /
-///     # calculate V = how many shards contain a 0
-///     count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
-/// )
-/// ```
-///
-/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
-#[derive(Clone)]
-pub struct HyperLogLog<const N: usize> {
-    core: Arc<HyperLogLogCore<N>>,
-}
-
-impl<const N: usize> HyperLogLog<N> {
-    /// Create a [`HyperLogLog`] with the `name` and `help` arguments.
-    pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
-        assert!(N.is_power_of_two());
-        let opts = Opts::new(name, help);
-        Self::with_opts(opts)
-    }
-
-    /// Create a [`HyperLogLog`] with the `opts` options.
-    pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
-        Self::with_opts_and_label_values(&opts, &[])
-    }
-
-    fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
-        let desc = opts.describe()?;
-        let labels = make_label_pairs(&desc, label_values)?;
-
-        let v = HyperLogLogCore {
-            shards: [0; N].map(AtomicU8::new),
-            desc,
-            labels,
-        };
-        Ok(Self { core: Arc::new(v) })
-    }
-
+impl<const N: usize> HyperLogLogState<N> {
    pub fn measure(&self, item: &impl Hash) {
        // changing the hasher will break compatibility with previous measurements.
        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -299,42 +126,11 @@ impl<const N: usize> HyperLogLog<N> {
        let p = N.ilog2() as u8;
        let j = hash & (N as u64 - 1);
        let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
-        self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
-    }
-}
-
-struct HyperLogLogCore<const N: usize> {
-    shards: [AtomicU8; N],
-    desc: core::Desc,
-    labels: Vec<proto::LabelPair>,
-}
-
-impl<const N: usize> core::Collector for HyperLogLog<N> {
-    fn desc(&self) -> Vec<&core::Desc> {
-        vec![&self.core.desc]
+        self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
    }

-    fn collect(&self) -> Vec<proto::MetricFamily> {
-        let mut m = proto::MetricFamily::default();
-        m.set_name(self.core.desc.fq_name.clone());
-        m.set_help(self.core.desc.help.clone());
-        m.set_field_type(proto::MetricType::GAUGE);
-
-        let mut metrics = Vec::new();
-        self.core.collect_into(&mut metrics);
-        m.set_metric(metrics);
-
-        vec![m]
-    }
-}
-
-impl<const N: usize> HyperLogLogCore<N> {
-    fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
-        self.shards.iter().enumerate().for_each(|(i, x)| {
-            let mut shard_label = proto::LabelPair::default();
-            shard_label.set_name("hll_shard".to_owned());
-            shard_label.set_value(format!("{i}"));
-
+    fn take_sample(&self) -> [u8; N] {
+        self.shards.each_ref().map(|x| {
            // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.

            // This seems like it would be a race condition,
@@ -344,85 +140,90 @@ impl<const N: usize> HyperLogLogCore<N> {

            // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
            // this would mean that a dev port-forwarding the metrics url won't break the sampling.
-            let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
-
-            let mut m = proto::Metric::default();
-            let mut c = proto::Gauge::default();
-            c.set_value(v as f64);
-            m.set_gauge(c);
-
-            let mut labels = Vec::with_capacity(self.labels.len() + 1);
-            labels.extend_from_slice(&self.labels);
-            labels.push(shard_label);
-
-            m.set_label(labels);
-            metrics.push(m);
+            x.swap(0, std::sync::atomic::Ordering::Relaxed)
        })
    }
 }
-
-fn make_label_pairs(
-    desc: &core::Desc,
-    label_values: &[&str],
-) -> prometheus::Result<Vec<proto::LabelPair>> {
-    if desc.variable_labels.len() != label_values.len() {
-        return Err(prometheus::Error::InconsistentCardinality {
-            expect: desc.variable_labels.len(),
-            got: label_values.len(),
-        });
+impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
+    for HyperLogLogState<N>
+{
+    fn write_type(
+        name: impl MetricNameEncoder,
+        enc: &mut TextEncoder<W>,
+    ) -> Result<(), std::io::Error> {
+        enc.write_type(&name, measured::text::MetricType::Gauge)
    }
+    fn collect_into(
+        &self,
+        _: &(),
+        labels: impl LabelGroup,
+        name: impl MetricNameEncoder,
+        enc: &mut TextEncoder<W>,
+    ) -> Result<(), std::io::Error> {
+        struct I64(i64);
+        impl LabelValue for I64 {
+            fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
+                v.write_int(self.0)
+            }
+        }

-    let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
-    if total_len == 0 {
-        return Ok(vec![]);
-    }
+        struct HllShardLabel {
+            hll_shard: i64,
+        }

-    if desc.variable_labels.is_empty() {
-        return Ok(desc.const_label_pairs.clone());
-    }
+        impl LabelGroup for HllShardLabel {
+            fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
+                const LE: &LabelName = LabelName::from_str("hll_shard");
+                v.write_value(LE, &I64(self.hll_shard));
+            }
+        }

-    let mut label_pairs = Vec::with_capacity(total_len);
-    for (i, n) in desc.variable_labels.iter().enumerate() {
-        let mut label_pair = proto::LabelPair::default();
-        label_pair.set_name(n.clone());
-        label_pair.set_value(label_values[i].to_owned());
-        label_pairs.push(label_pair);
+        self.take_sample()
+            .into_iter()
+            .enumerate()
+            .try_for_each(|(hll_shard, val)| {
+                enc.write_metric_value(
+                    name.by_ref(),
+                    labels.by_ref().compose_with(HllShardLabel {
+                        hll_shard: hll_shard as i64,
+                    }),
+                    MetricValue::Int(val as i64),
+                )
+            })
    }
-
-    for label_pair in &desc.const_label_pairs {
-        label_pairs.push(label_pair.clone());
-    }
-    label_pairs.sort();
-    Ok(label_pairs)
 }

 #[cfg(test)]
 mod tests {
    use std::collections::HashSet;

-    use prometheus::{proto, Opts};
+    use measured::{label::StaticLabelSet, FixedCardinalityLabel};
    use rand::{rngs::StdRng, Rng, SeedableRng};
    use rand_distr::{Distribution, Zipf};

    use crate::HyperLogLogVec;

-    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
-        let mut metrics = vec![];
-        hll.core
-            .children
-            .read()
-            .unwrap()
-            .values()
-            .for_each(|c| c.core.collect_into(&mut metrics));
-        metrics
+    #[derive(FixedCardinalityLabel, Clone, Copy)]
+    #[label(singleton = "x")]
+    enum Label {
+        A,
+        B,
    }
-    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
+
+    fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
+        // cannot go through the `hll.collect_family_into` interface yet...
+        // need to see if I can fix the conflicting impls problem in measured.
+        (
+            hll.get_metric(hll.with_labels(Label::A)).take_sample(),
+            hll.get_metric(hll.with_labels(Label::B)).take_sample(),
+        )
+    }
+
+    fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
        let mut buckets = [0.0; 32];
-        for metric in metrics.chunks_exact(32) {
-            if filter(&metric[0]) {
-                for (i, m) in metric.iter().enumerate() {
-                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
-                }
+        for &sample in samples {
+            for (i, m) in sample.into_iter().enumerate() {
+                buckets[i] = f64::max(buckets[i], m as f64);
            }
        }

@@ -437,7 +238,7 @@ mod tests {
    }

    fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
-        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
+        let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();

        let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
        let mut set_a = HashSet::new();
@@ -445,18 +246,20 @@ mod tests {

        for x in iter.by_ref().take(n) {
            set_a.insert(x.to_bits());
-            hll.with_label_values(&["a"]).measure(&x.to_bits());
+            hll.get_metric(hll.with_labels(Label::A))
+                .measure(&x.to_bits());
        }
        for x in iter.by_ref().take(n) {
            set_b.insert(x.to_bits());
-            hll.with_label_values(&["b"]).measure(&x.to_bits());
+            hll.get_metric(hll.with_labels(Label::B))
+                .measure(&x.to_bits());
        }
        let merge = &set_a | &set_b;

-        let metrics = collect(&hll);
-        let len = get_cardinality(&metrics, |_| true);
-        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
-        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
+        let (a, b) = collect(&hll);
+        let len = get_cardinality(&[a, b]);
+        let len_a = get_cardinality(&[a]);
+        let len_b = get_cardinality(&[b]);

        ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -5,7 +5,7 @@
 #![deny(clippy::undocumented_unsafe_blocks)]

 use measured::{
-    label::{LabelGroupVisitor, LabelName, NoLabels},
+    label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
    metric::{
        counter::CounterState,
        gauge::GaugeState,
@@ -40,7 +40,7 @@ pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
-pub use hll::{HyperLogLog, HyperLogLogVec};
+pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;

@@ -256,7 +256,16 @@ fn update_rusage_metrics() {
    DISK_IO_BYTES
        .with_label_values(&["write"])
        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-    MAXRSS_KB.set(rusage_stats.ru_maxrss);
+
+    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
+    #[cfg(target_os = "macos")]
+    {
+        MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
+    }
+    #[cfg(not(target_os = "macos"))]
+    {
+        MAXRSS_KB.set(rusage_stats.ru_maxrss);
+    }
 }

 fn get_rusage_stats() -> libc::rusage {
@@ -421,3 +430,171 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;

 /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
 pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
+
+pub trait CounterPairAssoc {
+    const INC_NAME: &'static MetricName;
+    const DEC_NAME: &'static MetricName;
+
+    const INC_HELP: &'static str;
+    const DEC_HELP: &'static str;
+
+    type LabelGroupSet: LabelGroupSet;
+}
+
+pub struct CounterPairVec<A: CounterPairAssoc> {
+    vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
+}
+
+impl<A: CounterPairAssoc> Default for CounterPairVec<A>
+where
+    A::LabelGroupSet: Default,
+{
+    fn default() -> Self {
+        Self {
+            vec: Default::default(),
+        }
+    }
+}
+
+impl<A: CounterPairAssoc> CounterPairVec<A> {
+    pub fn guard(
+        &self,
+        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
+    ) -> MeasuredCounterPairGuard<'_, A> {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).inc.inc();
+        MeasuredCounterPairGuard { vec: &self.vec, id }
+    }
+    pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).inc.inc();
+    }
+    pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
+        let id = self.vec.with_labels(labels);
+        self.vec.get_metric(id).dec.inc();
+    }
+    pub fn remove_metric(
+        &self,
+        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
+    ) -> Option<MeasuredCounterPairState> {
+        let id = self.vec.with_labels(labels);
+        self.vec.remove_metric(id)
+    }
+}
+
+impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
+where
+    T: ::measured::metric::group::Encoding,
+    A: CounterPairAssoc,
+    ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
+{
+    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
+        // write decrement first to avoid a race condition where inc - dec < 0
+        T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
+        self.vec
+            .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
+
+        T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
+        self.vec
+            .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
+
+        Ok(())
+    }
+}
+
+#[derive(MetricGroup, Default)]
+pub struct MeasuredCounterPairState {
+    pub inc: CounterState,
+    pub dec: CounterState,
+}
+
+impl measured::metric::MetricType for MeasuredCounterPairState {
+    type Metadata = ();
+}
+
+pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
+    vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
+    id: measured::metric::LabelId<A::LabelGroupSet>,
+}
+
+impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
+    fn drop(&mut self) {
+        self.vec.get_metric(self.id).dec.inc();
+    }
+}
+
+/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
+struct Inc<T>(T);
+/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
+struct Dec<T>(T);
+
+impl<T: Encoding> Encoding for Inc<T> {
+    type Err = T::Err;
+
+    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
+        self.0.write_help(name, help)
+    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
+}
+
+impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
+        CounterState::write_type(name, &mut enc.0)
+    }
+    fn collect_into(
+        &self,
+        metadata: &(),
+        labels: impl LabelGroup,
+        name: impl MetricNameEncoder,
+        enc: &mut Inc<T>,
+    ) -> Result<(), T::Err> {
+        self.inc.collect_into(metadata, labels, name, &mut enc.0)
+    }
+}
+
+impl<T: Encoding> Encoding for Dec<T> {
+    type Err = T::Err;
+
+    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
+        self.0.write_help(name, help)
+    }
+
+    fn write_metric_value(
+        &mut self,
+        name: impl MetricNameEncoder,
+        labels: impl LabelGroup,
+        value: MetricValue,
+    ) -> Result<(), Self::Err> {
+        self.0.write_metric_value(name, labels, value)
+    }
+}
+
+/// Write the dec counter to the encoder
+impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
+where
+    CounterState: MetricEncoding<T>,
+{
+    fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
+        CounterState::write_type(name, &mut enc.0)
+    }
+    fn collect_into(
+        &self,
+        metadata: &(),
+        labels: impl LabelGroup,
+        name: impl MetricNameEncoder,
+        enc: &mut Dec<T>,
+    ) -> Result<(), T::Err> {
+        self.dec.collect_into(metadata, labels, name, &mut enc.0)
+    }
+}
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,8 +1,10 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
+use std::ops::RangeInclusive;
 use std::{fmt, ops::Range};

 use crate::reltag::{BlockNumber, RelTag, SlruKind};
@@ -21,9 +23,81 @@ pub struct Key {
    pub field6: u32,
 }

+/// The storage key size.
 pub const KEY_SIZE: usize = 18;

+/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
+/// See [`Key::to_i128`] for more information on the encoding.
+pub const METADATA_KEY_SIZE: usize = 16;
+
+/// The key prefix start range for the metadata keys. Any key whose first byte is >= 0x80 is a metadata key.
+pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x80;
+
+/// The (reserved) key prefix of relation sizes.
+pub const RELATION_SIZE_PREFIX: u8 = 0x81;
+
+/// The key prefix of AUX file keys.
+pub const AUX_KEY_PREFIX: u8 = 0x82;
+
+/// Check if the key falls in the range of metadata keys.
+pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
+    key[0] >= METADATA_KEY_BEGIN_PREFIX
+}
+
 impl Key {
+    /// Check if the key falls in the range of metadata keys.
+    pub const fn is_metadata_key(&self) -> bool {
+        self.field1 >= METADATA_KEY_BEGIN_PREFIX
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
+        assert!(is_metadata_key_slice(key), "key not in metadata key range");
+        Key {
+            field1: key[0],
+            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
+            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
+            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
+            field5: key[11],
+            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
+        }
+    }
+
+    /// Encode a metadata key to a storage key.
+    pub fn from_metadata_key(key: &[u8]) -> Self {
+        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
+    }
+
+    /// Extract a metadata key to a writer. The result should always be 16 bytes.
+    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
+        writer.put_u8(self.field1);
+        assert!(self.field2 <= 0xFFFF);
+        writer.put_u16(self.field2 as u16);
+        writer.put_u32(self.field3);
+        writer.put_u32(self.field4);
+        writer.put_u8(self.field5);
+        writer.put_u32(self.field6);
+    }
+
+    /// Get the range of metadata keys.
+    pub fn metadata_key_range() -> RangeInclusive<Self> {
+        Key {
+            field1: METADATA_KEY_BEGIN_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..=Key {
+            field1: u8::MAX,
+            field2: u16::MAX as u32,
+            field3: u32::MAX,
+            field4: u32::MAX,
+            field5: u8::MAX,
+            field6: u32::MAX,
+        }
+    }
+
    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
@@ -48,11 +122,11 @@ impl Key {
        }
    }

-    pub fn next(&self) -> Key {
+    pub const fn next(&self) -> Key {
        self.add(1)
    }

-    pub fn add(&self, x: u32) -> Key {
+    pub const fn add(&self, x: u32) -> Key {
        let mut key = *self;

        let r = key.field6.overflowing_add(x);
@@ -81,6 +155,8 @@ impl Key {
        key
    }

+    /// Convert an 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::from_metadata_key`] instead.
    pub fn from_slice(b: &[u8]) -> Self {
        Key {
            field1: b[0],
@@ -92,6 +168,8 @@ impl Key {
        }
    }

+    /// Convert a key to an 18B slice. This function should not be used for metadata keys because field2 is handled differently.
+    /// Use [`Key::extract_metadata_key_to_writer`] instead.
    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
        buf[0] = self.field1;
        BE::write_u32(&mut buf[1..5], self.field2);
@@ -475,12 +553,14 @@ pub const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

+pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
+
 // AUX_FILES currently stores only data for logical replication (slots etc), and
 // we don't preserve these on a branch because safekeepers can't follow timeline
 // switch (and generally it likely should be optional), so ignore these.
 #[inline(always)]
 pub fn is_inherited_key(key: Key) -> bool {
-    key != AUX_FILES_KEY
+    !NON_INHERITED_RANGE.contains(&key)
 }

 #[inline(always)]
@@ -556,11 +636,14 @@ impl std::str::FromStr for Key {
 mod tests {
    use std::str::FromStr;

+    use crate::key::is_metadata_key_slice;
    use crate::key::Key;

    use rand::Rng;
    use rand::SeedableRng;

+    use super::AUX_KEY_PREFIX;
+
    #[test]
    fn display_fromstr_bijection() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -576,4 +659,16 @@ mod tests {

        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
    }
+
+    #[test]
+    fn test_metadata_keys() {
+        let mut metadata_key = vec![AUX_KEY_PREFIX];
+        metadata_key.extend_from_slice(&[0xFF; 15]);
+        let encoded_key = Key::from_metadata_key(&metadata_key);
+        let mut output_key = Vec::new();
+        encoded_key.extract_metadata_key_to_writer(&mut output_key);
+        assert_eq!(metadata_key, output_key);
+        assert!(encoded_key.is_metadata_key());
+        assert!(is_metadata_key_slice(&metadata_key));
+    }
 }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -94,12 +94,13 @@ impl KeySpace {

    /// Remove all keys in `other` from `self`.
    /// This can involve splitting or removing of existing ranges.
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
+    /// Returns the removed keyspace
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
            _ => {
                // self is empty
-                return;
+                return KeySpace::default();
            }
        };

@@ -112,30 +113,37 @@ impl KeySpace {
            .skip_while(|range| self_start >= range.end)
            .take_while(|range| self_end > range.start);

+        let mut removed_accum = KeySpaceRandomAccum::new();
        for range in other_ranges {
            while let Some(overlap_at) = self.overlaps_at(range) {
                let overlapped = self.ranges[overlap_at].clone();

                if overlapped.start < range.start && overlapped.end <= range.end {
                    // Higher part of the range is completely overlapped.
+                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                    self.ranges[overlap_at].end = range.start;
                }
                if overlapped.start >= range.start && overlapped.end > range.end {
                    // Lower part of the range is completely overlapped.
+                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                    self.ranges[overlap_at].start = range.end;
                }
                if overlapped.start < range.start && overlapped.end > range.end {
                    // Middle part of the range is overlapped.
+                    removed_accum.add_range(range.clone());
                    self.ranges[overlap_at].end = range.start;
                    self.ranges
                        .insert(overlap_at + 1, range.end..overlapped.end);
                }
                if overlapped.start >= range.start && overlapped.end <= range.end {
                    // Whole range is overlapped
+                    removed_accum.add_range(self.ranges[overlap_at].clone());
                    self.ranges.remove(overlap_at);
                }
            }
        }
+
+        removed_accum.to_keyspace()
    }

    pub fn start(&self) -> Option<Key> {
@@ -154,6 +162,10 @@ impl KeySpace {
            .sum()
    }

+    pub fn is_empty(&self) -> bool {
+        self.total_size() == 0
+    }
+
    fn overlaps_at(&self, range: &Range<Key>) -> Option<usize> {
        match self.ranges.binary_search_by_key(&range.end, |r| r.start) {
            Ok(0) => None,
@@ -553,7 +565,16 @@ mod tests {
                Key::from_i128(11)..Key::from_i128(13),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace {
+            ranges: vec![
+                Key::from_i128(2)..Key::from_i128(3),
+                Key::from_i128(6)..Key::from_i128(7),
+                Key::from_i128(11)..Key::from_i128(12),
+            ],
+        };
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -583,7 +604,17 @@ mod tests {
                Key::from_i128(14)..Key::from_i128(17),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace {
+            ranges: vec![
+                Key::from_i128(3)..Key::from_i128(5),
+                Key::from_i128(8)..Key::from_i128(10),
+                Key::from_i128(14)..Key::from_i128(15),
+            ],
+        };
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -610,7 +641,11 @@ mod tests {
                Key::from_i128(15)..Key::from_i128(17),
            ],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace::default();
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -637,7 +672,17 @@ mod tests {
        let key_space2 = KeySpace {
            ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
        };
-        key_space1.remove_overlapping_with(&key_space2);
+
+        let removed = key_space1.remove_overlapping_with(&key_space2);
+        let removed_expected = KeySpace {
+            ranges: vec![
+                Key::from_i128(9)..Key::from_i128(10),
+                Key::from_i128(12)..Key::from_i128(15),
+                Key::from_i128(17)..Key::from_i128(19),
+            ],
+        };
+        assert_eq!(removed, removed_expected);
+
        assert_eq!(
            key_space1.ranges,
            vec![
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -429,6 +429,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    #[serde(skip_serializing_if = "Option::is_none")]
    pub tenant_id: Option<TenantShardId>,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
@@ -747,10 +748,18 @@ pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
 }

+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct WalRedoManagerProcessStatus {
+    pub pid: u32,
+    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
+    /// `ProcessKind`s are a transitory thing, so they have no enum representation in `pageserver_api`.
+    pub kind: Cow<'static, str>,
+}
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerStatus {
    pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
-    pub pid: Option<u32>,
+    pub process: Option<WalRedoManagerProcessStatus>,
 }

 /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -5,15 +5,93 @@ use crate::{
    models::ShardParameters,
 };
 use hex::FromHex;
+use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
 use utils::id::TenantId;

+/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
+///
+/// This module contains a variety of types used to represent the concept of sharding
+/// a Neon tenant across multiple physical shards.  Since there are quite a few of these,
+/// we provide a summary here.
+///
+/// Types used to describe shards:
+/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
+///   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
+///   a shard suffix.
+/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
+/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
+///   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
+///   tenant, such as layer files.
+/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
+///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
+/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
+///   four hex digits.  An unsharded tenant is `0000`.
+/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
+///
+/// Types used to describe the parameters for data distribution in a sharded tenant:
+/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
+///   multiple shards.  Its value is given in 8kiB pages.
+/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
+///   always zero: this is provided for future upgrades that might introduce different
+///   data distribution schemes.
+///
+/// Examples:
+/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
+/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
+/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
+///   and their slugs are 0004, 0104, 0204, and 0304.
+
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardNumber(pub u8);

 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(u8);

+/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
+/// when we need to know which shard we're dealing with, but do not need to know the full
+/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
+/// the fully qualified TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
+/// and to check whether that [`ShardNumber`] is the same as the current shard.
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardIdentity {
+    pub number: ShardNumber,
+    pub count: ShardCount,
+    pub stripe_size: ShardStripeSize,
+    layout: ShardLayout,
+}
+
+/// Formatting helper, for generating the `shard_id` label in traces.
+struct ShardSlug<'a>(&'a TenantShardId);
+
+/// TenantShardId globally identifies a particular shard in a particular tenant.
+///
+/// These are written as `<TenantId>-<ShardSlug>`, for example:
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
+/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
+/// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
+///
+/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible with TenantId: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);

@@ -38,6 +116,7 @@ impl ShardCount {
        self.0
    }

+    /// Returns true if this is the special zero value denoting an unsharded tenant.
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }
@@ -53,33 +132,6 @@ impl ShardNumber {
    pub const MAX: Self = Self(u8::MAX);
 }

-/// TenantShardId identify the units of work for the Pageserver.
-///
-/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
-///
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// Historically, tenants could not have multiple shards, and were identified
-/// by TenantId.  To support this, TenantShardId has a special legacy
-/// mode where `shard_count` is equal to zero: this represents a single-sharded
-/// tenant which should be written as a TenantId with no suffix.
-///
-/// The human-readable encoding of TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-///
-/// Note that the binary encoding is _not_ backward compatible, because
-/// at the time sharding is introduced, there are no existing binary structures
-/// containing TenantId that we need to handle.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
 impl TenantShardId {
    pub fn unsharded(tenant_id: TenantId) -> Self {
        Self {
@@ -111,10 +163,13 @@ impl TenantShardId {
    }

    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_zero(&self) -> bool {
+    pub fn is_shard_zero(&self) -> bool {
        self.shard_number == ShardNumber(0)
    }

+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
    }
@@ -150,9 +205,6 @@ impl TenantShardId {
    }
 }

-/// Formatting helper
-struct ShardSlug<'a>(&'a TenantShardId);
-
 impl<'a> std::fmt::Display for ShardSlug<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
@@ -222,16 +274,6 @@ impl From<[u8; 18]> for TenantShardId {
    }
 }

-/// For use within the context of a particular tenant, when we need to know which
-/// shard we're dealing with, but do not need to know the full ShardIdentity (because
-/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
-/// TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
 impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
@@ -246,6 +288,9 @@ impl ShardIndex {
        }
    }

+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }
@@ -313,6 +358,8 @@ impl Serialize for TenantShardId {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
+            // Note: while human encoding of [`TenantShardId`] is backward and forward
+            // compatible, this binary encoding is not.
            let mut packed: [u8; 18] = [0; 18];
            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
            packed[16] = self.shard_number.0;
@@ -390,16 +437,6 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);

-/// The ShardIdentity contains the information needed for one member of map
-/// to resolve a key to a shard, and then check whether that shard is ==self.
-#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
-pub struct ShardIdentity {
-    pub number: ShardNumber,
-    pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
-    layout: ShardLayout,
-}
-
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
    #[error("Invalid shard count")]
@@ -439,6 +476,9 @@ impl ShardIdentity {
        }
    }

+    /// The "unsharded" value is distinct from simply having a single shard: it represents
+    /// a tenant which is not shard-aware at all, and whose storage paths will not include
+    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }
@@ -487,6 +527,8 @@ impl ShardIdentity {
    }

    /// Return true if the key should be ingested by this shard
+    ///
+    /// Shards must ingest _at least_ keys which return true from this check.
    pub fn is_key_local(&self, key: &Key) -> bool {
        assert!(!self.is_broken());
        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -496,8 +538,28 @@ impl ShardIdentity {
        }
    }

+    /// Special case for issue `<https://github.com/neondatabase/neon/issues/7451>`
+    ///
+    /// When we fail to read a forknum block, this tells us whether we may ignore the error as a
+    /// symptom of that issue. Always false with fewer than two shards (every key is on shard 0).
+    pub fn is_key_buggy_forknum(&self, key: &Key) -> bool {
+        if !is_rel_block_key(key) || key.field5 != INIT_FORKNUM || self.count < ShardCount(2) {
+            return false;
+        }
+
+        let mut hash = murmurhash32(key.field4);
+        hash = hash_combine(hash, murmurhash32(key.field6 / self.stripe_size.0));
+        let mapped_shard = ShardNumber((hash % self.count.0 as u32) as u8);
+
+        // The key may be affected by issue #7451: it is an initfork and it would not
+        // have mapped to shard 0 until we fixed that issue.
+        mapped_shard != ShardNumber(0)
+    }
+
    /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split
+    /// data store, e.g. during compaction after a split.
+    ///
+    /// Shards _may_ drop keys which return true here, but are not obliged to.
    pub fn is_key_disposable(&self, key: &Key) -> bool {
        if key_is_shard0(key) {
            // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -523,7 +585,7 @@ impl ShardIdentity {

    /// Convenience for checking if this identity is the 0th shard in a tenant,
    /// for special cases on shard 0 such as ingesting relation sizes.
-    pub fn is_zero(&self) -> bool {
+    pub fn is_shard_zero(&self) -> bool {
        self.number == ShardNumber(0)
    }
 }
@@ -606,7 +668,13 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    !is_rel_block_key(key)
+    //
+    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
+    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
+    // because they must be included in basebackups.
+    let is_initfork = key.field5 == INIT_FORKNUM;
+
+    !is_rel_block_key(key) || is_initfork
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -118,7 +118,9 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+pub use v14::xlog_utils::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};

 pub use v14::bindings::{CheckPoint, ControlFileData};

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,7 +4,9 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
+use postgres_ffi::{
+    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
+};
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -262,11 +264,21 @@ fn craft_internal<C: postgres::GenericClient>(
        intermediate_lsns.insert(0, initial_lsn);
    }

-    // Some records may be not flushed, e.g. non-transactional logical messages.
+    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
    //
-    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
-    // because pg_current_wal_insert_lsn skips page headers.
-    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
+    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
+    // returns the position just after the page header on the next page. That's where the next
+    // record will be inserted. But the page header hasn't actually been written to the WAL
+    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
+    // error. Because of that, if the insert location is just after a page header, back off to
+    // previous page boundary.
+    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
+    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
+    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
+        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
+    }
+    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
    Ok(intermediate_lsns)
 }

@@ -320,38 +332,49 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {

        client.execute("CREATE table t(x int)", &[])?;

-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
-        // We will use logical message as the padding. We start with detecting how much WAL
-        // it takes for one logical message, considering all alignments and headers.
-        let base_wal_advance = {
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.  We
+        // will use carefully-sized logical messages to advance WAL insert location such
+        // that there is just enough space on the page for the XLOG_SWITCH record.
+        loop {
+            // We start with measuring how much WAL it takes for one logical message,
+            // considering all alignments and headers.
            let before_lsn = client.pg_current_wal_insert_lsn()?;
-            // Small non-empty message bigger than few bytes is more likely than an empty
-            // message to have the same format as the big padding message.
            client.execute(
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
                &[],
            )?;
-            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
-            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
-                + XLOG_SIZE_OF_XLOG_RECORD
-        };
-        let mut remaining_lsn =
-            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
-        if remaining_lsn < base_wal_advance {
-            remaining_lsn += XLOG_BLCKSZ;
+            let after_lsn = client.pg_current_wal_insert_lsn()?;
+
+            // Did the record cross a page boundary? If it did, start over. Crossing a
+            // page boundary adds to the apparent size of the record because of the page
+            // header, which throws off the calculation.
+            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
+                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
+            {
+                continue;
+            }
+            // base_size is the size of a logical message without the payload
+            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
+
+            // Is there enough space on the page for another logical message (base_size)
+            // plus an XLOG_SWITCH record? If not, start over; `repeats` would underflow.
+            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
+            if page_remain < base_size + XLOG_SIZE_OF_XLOG_RECORD as u64 {
+                continue;
+            }
+
+            // We will write another logical message, such that after the logical message
+            // record, there will be space for exactly one XLOG_SWITCH. How large should
+            // the logical message's payload be? An XLOG_SWITCH record has no data => its
+            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
+
+            client.execute(
+                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+                &[&(repeats as i32)],
+            )?;
+            break;
        }
-        let repeats = 10 + remaining_lsn - base_wal_advance;
-        info!(
-            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
-            client.pg_current_wal_insert_lsn()?,
-            remaining_lsn,
-            base_wal_advance,
-            repeats
-        );
-        client.execute(
-            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
-            &[&(repeats as i32)],
-        )?;
        info!(
            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
            client.pg_current_wal_insert_lsn()?,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -134,6 +134,11 @@ impl RemotePath {
    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
+
+    pub fn add_trailing_slash(&self) -> Self {
+        // Unwrap safety: inputs are guaranteed to be valid UTF-8
+        Self(format!("{}/", self.0).try_into().unwrap())
+    }
 }

 /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -157,47 +162,21 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// Lists all top level subdirectories for a given prefix
-    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
-    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
-    /// so this method doesnt need to.
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::WithDelimiter, None, cancel)
-            .await?
-            .prefixes;
-        Ok(result)
-    }
-    /// Lists all files in directory "recursively"
-    /// (not really recursively, because AWS has a flat namespace)
-    /// Note: This is subtely different than list_prefixes,
-    /// because it is for listing files instead of listing
-    /// names sharing common prefixes.
-    /// For example,
-    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
-    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
-    /// whereas,
-    /// list_prefixes("foo/bar/") = ["cat", "dog"]
-    /// See `test_real_s3.rs` for more details.
+    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
+    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
+    ///
+    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
+    /// from the absolute root of the bucket.
+    ///
+    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
+    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
+    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
+    /// returned in `keys`.
+    ///
+    /// `max_keys` controls the maximum number of keys that will be returned.  If this is None, this function
+    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
+    /// unlimited size buckets, as the full list of objects is allocated into a monolithic data structure.
    ///
-    /// max_keys limits max number of keys returned; None means unlimited.
-    async fn list_files(
-        &self,
-        prefix: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
-            .await?
-            .keys;
-        Ok(result)
-    }
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -336,41 +315,6 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

-    // A function for listing all the files in a "directory"
-    // Example:
-    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
-    //
-    // max_keys limits max number of keys returned; None means unlimited.
-    pub async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
-        }
-    }
-
-    // lists common *prefixes*, if any of files
-    // Example:
-    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
-    pub async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        match self {
-            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
-            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
-        }
-    }
-
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,11 +5,9 @@
 //! volume is mounted to the local FS.

 use std::{
-    borrow::Cow,
-    future::Future,
+    collections::HashSet,
    io::ErrorKind,
    num::NonZeroU32,
-    pin::Pin,
    time::{Duration, SystemTime, UNIX_EPOCH},
 };

@@ -22,11 +20,11 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use tracing::*;
-use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
+use utils::crashsafe::path_with_suffix_extension;

 use crate::{
    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -93,7 +91,47 @@ impl LocalFs {

    #[cfg(test)]
    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        Ok(get_all_files(&self.storage_root, true)
+        use std::{future::Future, pin::Pin};
+        fn get_all_files<'a, P>(
+            directory_path: P,
+        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
+        where
+            P: AsRef<Utf8Path> + Send + Sync + 'a,
+        {
+            Box::pin(async move {
+                let directory_path = directory_path.as_ref();
+                if directory_path.exists() {
+                    if directory_path.is_dir() {
+                        let mut paths = Vec::new();
+                        let mut dir_contents = fs::read_dir(directory_path).await?;
+                        while let Some(dir_entry) = dir_contents.next_entry().await? {
+                            let file_type = dir_entry.file_type().await?;
+                            let entry_path =
+                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                                    anyhow::Error::msg(format!(
+                                        "non-Unicode path: {}",
+                                        pb.to_string_lossy()
+                                    ))
+                                })?;
+                            if file_type.is_symlink() {
+                                tracing::debug!("{entry_path:?} is a symlink, skipping")
+                            } else if file_type.is_dir() {
+                                paths.extend(get_all_files(&entry_path).await?.into_iter())
+                            } else {
+                                paths.push(entry_path);
+                            }
+                        }
+                        Ok(paths)
+                    } else {
+                        bail!("Path {directory_path:?} is not a directory")
+                    }
+                } else {
+                    Ok(Vec::new())
+                }
+            })
+        }
+
+        Ok(get_all_files(&self.storage_root)
            .await?
            .into_iter()
            .map(|path| {
@@ -120,6 +158,14 @@ impl LocalFs {
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
+
+        // If there's no trailing slash, we have to start looking from one above: even if
+        // `initial_dir` is a directory, we should still list any prefixes in the parent
+        // that start with the same string.
+        if !full_path.to_string().ends_with('/') {
+            initial_dir.pop();
+        }
+
        loop {
            // Did we make it to the root?
            if initial_dir.parent().is_none() {
@@ -295,61 +341,66 @@ impl RemoteStorage for LocalFs {
        let op = async {
            let mut result = Listing::default();

-            if let ListingMode::NoDelimiter = mode {
-                let keys = self
-                    .list_recursive(prefix)
-                    .await
-                    .map_err(DownloadError::Other)?;
-
-                result.keys = keys
-                    .into_iter()
-                    .filter(|k| {
-                        let path = k.with_base(&self.storage_root);
-                        !path.is_dir()
-                    })
-                    .collect();
-
-                if let Some(max_keys) = max_keys {
-                    result.keys.truncate(max_keys.get() as usize);
-                }
-
-                return Ok(result);
-            }
-
-            let path = match prefix {
-                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-                None => Cow::Borrowed(&self.storage_root),
-            };
-
-            let prefixes_to_filter = get_all_files(path.as_ref(), false)
+            // Filter out directories: in S3 directories don't exist, only the keys within them do.
+            let keys = self
+                .list_recursive(prefix)
                .await
                .map_err(DownloadError::Other)?;
+            let keys = keys
+                .into_iter()
+                .filter(|k| {
+                    let path = k.with_base(&self.storage_root);
+                    !path.is_dir()
+                })
+                .collect();

-            // filter out empty directories to mirror s3 behavior.
-            for prefix in prefixes_to_filter {
-                if prefix.is_dir()
-                    && is_directory_empty(&prefix)
-                        .await
-                        .map_err(DownloadError::Other)?
-                {
-                    continue;
-                }
-
-                let stripped = prefix
-                    .strip_prefix(&self.storage_root)
-                    .context("Failed to strip prefix")
-                    .and_then(RemotePath::new)
-                    .expect(
-                        "We list files for storage root, hence should be able to remote the prefix",
-                    );
-
-                if prefix.is_dir() {
-                    result.prefixes.push(stripped);
-                } else {
-                    result.keys.push(stripped);
+            if let ListingMode::NoDelimiter = mode {
+                result.keys = keys;
+            } else {
+                let mut prefixes = HashSet::new();
+                for key in keys {
+                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
+                    let relative_key = if let Some(prefix) = prefix {
+                        let mut prefix = prefix.clone();
+                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
+                        // end up with full file/dir names.
+                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
+                        let has_slash = prefix.0.to_string().ends_with('/');
+                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
+                            prefix
+                        } else {
+                            prefix.0.pop();
+                            prefix
+                        };
+
+                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
+                    } else {
+                        key
+                    };
+
+                    let relative_key = format!("{}", relative_key);
+                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
+                        let first_part = relative_key
+                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                            .next()
+                            .unwrap()
+                            .to_owned();
+                        prefixes.insert(first_part);
+                    } else {
+                        result
+                            .keys
+                            .push(RemotePath::from_string(&relative_key).unwrap());
+                    }
                }
+                result.prefixes = prefixes
+                    .into_iter()
+                    .map(|s| RemotePath::from_string(&s).unwrap())
+                    .collect();
            }

+            if let Some(max_keys) = max_keys {
+                result.keys.truncate(max_keys.get() as usize);
+            }
            Ok(result)
        };

@@ -560,50 +611,6 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }

-fn get_all_files<'a, P>(
-    directory_path: P,
-    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
-where
-    P: AsRef<Utf8Path> + Send + Sync + 'a,
-{
-    Box::pin(async move {
-        let directory_path = directory_path.as_ref();
-        if directory_path.exists() {
-            if directory_path.is_dir() {
-                let mut paths = Vec::new();
-                let mut dir_contents = fs::read_dir(directory_path).await?;
-                while let Some(dir_entry) = dir_contents.next_entry().await? {
-                    let file_type = dir_entry.file_type().await?;
-                    let entry_path =
-                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
-                            anyhow::Error::msg(format!(
-                                "non-Unicode path: {}",
-                                pb.to_string_lossy()
-                            ))
-                        })?;
-                    if file_type.is_symlink() {
-                        debug!("{entry_path:?} is a symlink, skipping")
-                    } else if file_type.is_dir() {
-                        if recursive {
-                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
-                        } else {
-                            paths.push(entry_path)
-                        }
-                    } else {
-                        paths.push(entry_path);
-                    }
-                }
-                Ok(paths)
-            } else {
-                bail!("Path {directory_path:?} is not a directory")
-            }
-        } else {
-            Ok(Vec::new())
-        }
-    })
-}
-
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
@@ -923,13 +930,18 @@ mod fs_tests {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
+        let child_sibling =
+            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;

        let listing = storage
            .list(None, ListingMode::NoDelimiter, None, &cancel)
            .await?;
        assert!(listing.prefixes.is_empty());
-        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
+        assert_eq!(
+            listing.keys.into_iter().collect::<HashSet<_>>(),
+            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
+        );

        // Delimiter: should only go one deep
        let listing = storage
@@ -942,7 +954,25 @@ mod fs_tests {
        );
        assert!(listing.keys.is_empty());

-        // Delimiter & prefix
+        // Delimiter & prefix with a trailing slash
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(
+            listing.keys,
+            [RemotePath::from_string("uncle").unwrap()].to_vec()
+        );
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("parent").unwrap()].to_vec()
+        );
+
+        // Delimiter and prefix without a trailing slash
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -951,12 +981,66 @@ mod fs_tests {
                &cancel,
            )
            .await?;
+        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
-            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
-                .to_vec()
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+        );
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+        assert_eq!(
+            listing.prefixes,
+            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+        );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn list_part_component() -> anyhow::Result<()> {
+        // With delimiter: a prefix without a trailing slash is a freeform string prefix, not a directory
+        let (storage, cancel) = create_storage()?;
+
+        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
+        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
+        // a freeform prefix.
+        let _child_a =
+            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
+        let _child_b =
+            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
+
+        // Delimiter and prefix that's partway through a path component
+        let listing = storage
+            .list(
+                Some(
+                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
+                ),
+                ListingMode::WithDelimiter,
+                None,
+                &cancel,
+            )
+            .await?;
+        assert_eq!(listing.keys, [].to_vec());
+
+        let mut found_prefixes = listing.prefixes.clone();
+        found_prefixes.sort();
+        assert_eq!(
+            found_prefixes,
+            [
+                RemotePath::from_string("tenant").unwrap(),
+                RemotePath::from_string("tenant-01").unwrap(),
+            ]
+            .to_vec()
        );
-        assert_eq!(listing.keys, [uncle.clone()].to_vec());

        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -178,10 +178,7 @@ impl S3Bucket {

    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path
-            .get_path()
-            .as_str()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
+        let path_string = path.get_path().as_str();
        match &self.prefix_in_bucket {
            Some(prefix) => prefix.clone() + "/" + path_string,
            None => path_string.to_string(),
@@ -471,16 +468,11 @@ impl RemoteStorage for S3Bucket {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
+            .or_else(|| {
+                self.prefix_in_bucket.clone().map(|mut s| {
+                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    s
+                })
            });

        let _permit = self.permit(kind, cancel).await?;
@@ -549,11 +541,15 @@ impl RemoteStorage for S3Bucket {
                }
            }

-            result.prefixes.extend(
-                prefixes
-                    .iter()
-                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
-            );
+            // S3 gives us prefixes like "foo/", we return them like "foo"
+            result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                Some(
+                    self.s3_object_to_relative_path(
+                        o.prefix()?
+                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
+                    ),
+                )
+            }));

            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
@@ -1050,22 +1046,22 @@ mod tests {
            Some("/test/prefix/"),
        ];
        let expected_outputs = [
-            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path"],
+            vec!["", "some/path", "some/path/"],
+            vec!["/", "/some/path", "/some/path/"],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path",
+                "test/prefix/some/path/",
            ],
        ];

--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -107,27 +107,6 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;

 impl RemoteStorage for UnreliableWrapper {
-    async fn list_prefixes(
-        &self,
-        prefix: Option<&RemotePath>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_prefixes(prefix, cancel).await
-    }
-
-    async fn list_files(
-        &self,
-        folder: Option<&RemotePath>,
-        max_keys: Option<NonZeroU32>,
-        cancel: &CancellationToken,
-    ) -> Result<Vec<RemotePath>, DownloadError> {
-        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
-            .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys, cancel).await
-    }
-
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,5 +1,6 @@
 use anyhow::Context;
 use camino::Utf8Path;
+use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -54,9 +55,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
-        .list_prefixes(None, &cancel)
-        .await
-        .context("client list root prefixes failure")?
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
@@ -65,9 +66,14 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    );

    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix), &cancel)
-        .await
-        .context("client list nested prefixes failure")?
+        .list(
+            Some(&base_prefix.add_trailing_slash()),
+            ListingMode::WithDelimiter,
+            None,
+            &cancel,
+        )
+        .await?
+        .prefixes
        .into_iter()
        .collect::<HashSet<_>>();
    let remote_only_prefixes = nested_remote_prefixes
@@ -90,11 +96,13 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+///    1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
+async fn list_no_delimiter_works(
+    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
+) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -107,29 +115,36 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list_files(None, None, &cancel)
+        .list(None, ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list root files failure")?
+        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
+        "remote storage list on root mismatches with the uploads."
    );

    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
+        .list(
+            None,
+            ListingMode::NoDelimiter,
+            Some(NonZeroU32::new(2).unwrap()),
+            &cancel,
+        )
        .await
        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.len(), 2);
+    assert_eq!(limited_root_files.keys.len(), 2);

    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None, &cancel)
+        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
        .await
        .context("client list nested files failure")?
+        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    let trim_remote_blobs: HashSet<_> = ctx
@@ -141,7 +156,7 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
+        "remote storage list on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
@@ -199,7 +214,11 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(

    ctx.client.delete_objects(&[path1, path2], &cancel).await?;

-    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
+    let prefixes = ctx
+        .client
+        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .await?
+        .prefixes;

    assert_eq!(prefixes.len(), 1);

--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -132,10 +132,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    S3Config,
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
+    RemoteStorageKind, S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,11 +75,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
        client: &Arc<GenericRemoteStorage>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None, cancel))
-            .await
-            .context("list root files failure")?
-            .into_iter()
-            .collect::<HashSet<_>>())
+        Ok(
+            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
+                .await
+                .context("list root files failure")?
+                .keys
+                .into_iter()
+                .collect::<HashSet<_>>(),
+        )
    }

    let cancel = CancellationToken::new();
@@ -294,10 +297,6 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

-// NOTE: the setups for the list_prefixes test and the list_files test are very similar
-// However, they are not idential. The list_prefixes function is concerned with listing prefixes,
-// whereas the list_files function is concerned with listing files.
-// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -92,6 +92,8 @@ pub mod zstd;

 pub mod env;

+pub mod poison;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -0,0 +1,121 @@
+//!  Protect a piece of state from reuse after it is left in an inconsistent state.
+//!
+//!  # Example
+//!
+//!  ```
+//!  # tokio_test::block_on(async {
+//!  use utils::poison::Poison;
+//!  use std::time::Duration;
+//!
+//!  struct State {
+//!    clean: bool,
+//!  }
+//!  let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
+//!
+//!  let mut mutex_guard = state.lock().await;
+//!  let mut poison_guard = mutex_guard.check_and_arm()?;
+//!  let state = poison_guard.data_mut();
+//!  state.clean = false;
+//!  // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
+//!  tokio::time::sleep(Duration::from_secs(10)).await;
+//!  state.clean = true;
+//!  poison_guard.disarm();
+//!  # Ok::<(), utils::poison::Error>(())
+//!  # });
+//!  ```
+
+use tracing::warn;
+
+pub struct Poison<T> {
+    what: &'static str,
+    state: State,
+    data: T,
+}
+
+#[derive(Clone, Copy)]
+enum State {
+    Clean,
+    Armed,
+    Poisoned { at: chrono::DateTime<chrono::Utc> },
+}
+
+impl<T> Poison<T> {
+    /// We log `what` at `warn!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
+    pub fn new(what: &'static str, data: T) -> Self {
+        Self {
+            what,
+            state: State::Clean,
+            data,
+        }
+    }
+
+    /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
+    pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
+        match self.state {
+            State::Clean => {
+                self.state = State::Armed;
+                Ok(Guard(self))
+            }
+            State::Armed => unreachable!("transient state"),
+            State::Poisoned { at } => Err(Error::Poisoned {
+                what: self.what,
+                at,
+            }),
+        }
+    }
+}
+
+/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
+/// Once modifications are done, use [`Self::disarm`].
+/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
+/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
+pub struct Guard<'a, T>(&'a mut Poison<T>);
+
+impl<'a, T> Guard<'a, T> {
+    pub fn data(&self) -> &T {
+        &self.0.data
+    }
+    pub fn data_mut(&mut self) -> &mut T {
+        &mut self.0.data
+    }
+
+    pub fn disarm(self) {
+        match self.0.state {
+            State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
+            State::Armed => {
+                self.0.state = State::Clean;
+            }
+            State::Poisoned { at } => {
+                unreachable!("we fail check_and_arm() if it's in that state: {at}")
+            }
+        }
+    }
+}
+
+impl<'a, T> Drop for Guard<'a, T> {
+    fn drop(&mut self) {
+        match self.0.state {
+            State::Clean => {
+                // set by disarm()
+            }
+            State::Armed => {
+                // still armed => poison it
+                let at = chrono::Utc::now();
+                self.0.state = State::Poisoned { at };
+                warn!(at=?at, "poisoning {}", self.0.what);
+            }
+            State::Poisoned { at } => {
+                unreachable!("we fail check_and_arm() if it's in that state: {at}")
+            }
+        }
+    }
+}
+
+#[derive(thiserror::Error, Debug)]
+pub enum Error {
+    #[error("poisoned at {at}: {what}")]
+    Poisoned {
+        what: &'static str,
+        at: chrono::DateTime<chrono::Utc>,
+    },
+}
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -192,6 +192,14 @@ impl<T> OnceCell<T> {
        }
    }

+    /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
+    /// initialized.
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let inner = self.inner.get_mut().unwrap();
+
+        inner.take_and_deinit()
+    }
+
    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
    pub fn initializer_count(&self) -> usize {
        self.initializers.load(Ordering::Relaxed)
@@ -246,15 +254,23 @@ impl<'a, T> Guard<'a, T> {
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
    pub fn take_and_deinit(mut self) -> (T, InitPermit) {
+        self.0
+            .take_and_deinit()
+            .expect("guard is not created unless value has been initialized")
+    }
+}
+
+impl<T> Inner<T> {
+    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
+        let value = self.value.take()?;
+
        let mut swapped = Inner::default();
        let sem = swapped.init_semaphore.clone();
        // acquire and forget right away, moving the control over to InitPermit
        sem.try_acquire().expect("we just created this").forget();
-        std::mem::swap(&mut *self.0, &mut swapped);
-        swapped
-            .value
-            .map(|v| (v, InitPermit(sem)))
-            .expect("guard is not created unless value has been initialized")
+        let permit = InitPermit(sem);
+        std::mem::swap(self, &mut swapped);
+        Some((value, permit))
    }
 }

@@ -263,6 +279,13 @@ impl<'a, T> Guard<'a, T> {
 /// On drop, this type will return the permit.
 pub struct InitPermit(Arc<tokio::sync::Semaphore>);

+impl std::fmt::Debug for InitPermit {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let ptr = Arc::as_ptr(&self.0) as *const ();
+        f.debug_tuple("InitPermit").field(&ptr).finish()
+    }
+}
+
 impl Drop for InitPermit {
    fn drop(&mut self) {
        assert_eq!(
@@ -559,4 +582,22 @@ mod tests {

        assert_eq!(*target.get().unwrap(), 11);
    }
+
+    #[tokio::test]
+    async fn take_and_deinit_on_mut() {
+        use std::convert::Infallible;
+
+        let mut target = OnceCell::<u32>::default();
+        assert!(target.take_and_deinit().is_none());
+
+        target
+            .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
+            .await
+            .unwrap();
+
+        let again = target.take_and_deinit();
+        assert!(matches!(again, Some((42, _))), "{again:?}");
+
+        assert!(target.take_and_deinit().is_none());
+    }
 }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -70,6 +70,7 @@ tokio-stream.workspace = true
 tokio-util.workspace = true
 toml_edit = { workspace = true, features = [ "serde" ] }
 tracing.workspace = true
+twox-hash.workspace = true
 url.workspace = true
 walkdir.workspace = true
 metrics.workspace = true
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -27,30 +27,50 @@
 //!
 //! # Reference Numbers
 //!
-//! 2024-04-04 on i3en.3xlarge
+//! 2024-04-15 on i3en.3xlarge
 //!
 //! ```text
-//! short/1                 time:   [25.925 µs 26.060 µs 26.209 µs]
-//! short/2                 time:   [31.277 µs 31.483 µs 31.722 µs]
-//! short/4                 time:   [45.496 µs 45.831 µs 46.182 µs]
-//! short/8                 time:   [84.298 µs 84.920 µs 85.566 µs]
-//! short/16                time:   [185.04 µs 186.41 µs 187.88 µs]
-//! short/32                time:   [385.01 µs 386.77 µs 388.70 µs]
-//! short/64                time:   [770.24 µs 773.04 µs 776.04 µs]
-//! short/128               time:   [1.5017 ms 1.5064 ms 1.5113 ms]
-//! medium/1                time:   [106.65 µs 107.20 µs 107.85 µs]
-//! medium/2                time:   [153.28 µs 154.24 µs 155.56 µs]
-//! medium/4                time:   [325.67 µs 327.01 µs 328.71 µs]
-//! medium/8                time:   [646.82 µs 650.17 µs 653.91 µs]
-//! medium/16               time:   [1.2645 ms 1.2701 ms 1.2762 ms]
-//! medium/32               time:   [2.4409 ms 2.4550 ms 2.4692 ms]
-//! medium/64               time:   [4.6814 ms 4.7114 ms 4.7408 ms]
-//! medium/128              time:   [8.7790 ms 8.9037 ms 9.0282 ms]
+//! async-short/1           time:   [24.584 µs 24.737 µs 24.922 µs]
+//! async-short/2           time:   [33.479 µs 33.660 µs 33.888 µs]
+//! async-short/4           time:   [42.713 µs 43.046 µs 43.440 µs]
+//! async-short/8           time:   [71.814 µs 72.478 µs 73.240 µs]
+//! async-short/16          time:   [132.73 µs 134.45 µs 136.22 µs]
+//! async-short/32          time:   [258.31 µs 260.73 µs 263.27 µs]
+//! async-short/64          time:   [511.61 µs 514.44 µs 517.51 µs]
+//! async-short/128         time:   [992.64 µs 998.23 µs 1.0042 ms]
+//! async-medium/1          time:   [110.11 µs 110.50 µs 110.96 µs]
+//! async-medium/2          time:   [153.06 µs 153.85 µs 154.99 µs]
+//! async-medium/4          time:   [317.51 µs 319.92 µs 322.85 µs]
+//! async-medium/8          time:   [638.30 µs 644.68 µs 652.12 µs]
+//! async-medium/16         time:   [1.2651 ms 1.2773 ms 1.2914 ms]
+//! async-medium/32         time:   [2.5117 ms 2.5410 ms 2.5720 ms]
+//! async-medium/64         time:   [4.8088 ms 4.8555 ms 4.9047 ms]
+//! async-medium/128        time:   [8.8311 ms 8.9849 ms 9.1263 ms]
+//! sync-short/1            time:   [25.503 µs 25.626 µs 25.771 µs]
+//! sync-short/2            time:   [30.850 µs 31.013 µs 31.208 µs]
+//! sync-short/4            time:   [45.543 µs 45.856 µs 46.193 µs]
+//! sync-short/8            time:   [84.114 µs 84.639 µs 85.220 µs]
+//! sync-short/16           time:   [185.22 µs 186.15 µs 187.13 µs]
+//! sync-short/32           time:   [377.43 µs 378.87 µs 380.46 µs]
+//! sync-short/64           time:   [756.49 µs 759.04 µs 761.70 µs]
+//! sync-short/128          time:   [1.4825 ms 1.4874 ms 1.4923 ms]
+//! sync-medium/1           time:   [105.66 µs 106.01 µs 106.43 µs]
+//! sync-medium/2           time:   [153.10 µs 153.84 µs 154.72 µs]
+//! sync-medium/4           time:   [327.13 µs 329.44 µs 332.27 µs]
+//! sync-medium/8           time:   [654.26 µs 658.73 µs 663.63 µs]
+//! sync-medium/16          time:   [1.2682 ms 1.2748 ms 1.2816 ms]
+//! sync-medium/32          time:   [2.4456 ms 2.4595 ms 2.4731 ms]
+//! sync-medium/64          time:   [4.6523 ms 4.6890 ms 4.7256 ms]
+//! sync-medium/128         time:   [8.7215 ms 8.8323 ms 8.9344 ms]
 //! ```

 use bytes::{Buf, Bytes};
 use criterion::{BenchmarkId, Criterion};
-use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager};
+use pageserver::{
+    config::PageServerConf,
+    walrecord::NeonWalRecord,
+    walredo::{PostgresRedoManager, ProcessKind},
+};
 use pageserver_api::{key::Key, shard::TenantShardId};
 use std::{
    sync::Arc,
@@ -60,33 +80,39 @@ use tokio::{sync::Barrier, task::JoinSet};
 use utils::{id::TenantId, lsn::Lsn};

 fn bench(c: &mut Criterion) {
-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("short");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::short_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
+    for process_kind in &[ProcessKind::Async, ProcessKind::Sync] {
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-short"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::short_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
        }
-    }

-    {
-        let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
-        for nclients in nclients {
-            let mut group = c.benchmark_group("medium");
-            group.bench_with_input(
-                BenchmarkId::from_parameter(nclients),
-                &nclients,
-                |b, nclients| {
-                    let redo_work = Arc::new(Request::medium_input());
-                    b.iter_custom(|iters| bench_impl(Arc::clone(&redo_work), iters, *nclients));
-                },
-            );
+        {
+            let nclients = [1, 2, 4, 8, 16, 32, 64, 128];
+            for nclients in nclients {
+                let mut group = c.benchmark_group(format!("{process_kind}-medium"));
+                group.bench_with_input(
+                    BenchmarkId::from_parameter(nclients),
+                    &nclients,
+                    |b, nclients| {
+                        let redo_work = Arc::new(Request::medium_input());
+                        b.iter_custom(|iters| {
+                            bench_impl(*process_kind, Arc::clone(&redo_work), iters, *nclients)
+                        });
+                    },
+                );
+            }
        }
    }
 }
@@ -94,10 +120,16 @@ criterion::criterion_group!(benches, bench);
 criterion::criterion_main!(benches);

 // Returns the sum of each client's wall-clock time spent executing their share of the n_redos.
-fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration {
+fn bench_impl(
+    process_kind: ProcessKind,
+    redo_work: Arc<Request>,
+    n_redos: u64,
+    nclients: u64,
+) -> Duration {
    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

-    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    let mut conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+    conf.walredo_process_kind = process_kind;
    let conf = Box::leak(Box::new(conf));
    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());

@@ -113,25 +145,40 @@ fn bench_impl(redo_work: Arc<Request>, n_redos: u64, nclients: u64) -> Duration
    let manager = PostgresRedoManager::new(conf, tenant_shard_id);
    let manager = Arc::new(manager);

+    // divide the amount of work equally among the clients.
+    let nredos_per_client = n_redos / nclients;
    for _ in 0..nclients {
        rt.block_on(async {
            tasks.spawn(client(
                Arc::clone(&manager),
                Arc::clone(&start),
                Arc::clone(&redo_work),
-                // divide the amount of work equally among the clients
-                n_redos / nclients,
+                nredos_per_client,
            ))
        });
    }

-    rt.block_on(async move {
-        let mut total_wallclock_time = std::time::Duration::from_millis(0);
+    let elapsed = rt.block_on(async move {
+        let mut total_wallclock_time = Duration::ZERO;
        while let Some(res) = tasks.join_next().await {
            total_wallclock_time += res.unwrap();
        }
        total_wallclock_time
-    })
+    });
+
+    // consistency check to ensure process kind setting worked
+    if nredos_per_client > 0 {
+        assert_eq!(
+            manager
+                .status()
+                .process
+                .map(|p| p.kind)
+                .expect("the benchmark work causes a walredo process to be spawned"),
+            std::borrow::Cow::Borrowed(process_kind.into())
+        );
+    }
+
+    elapsed
 }

 async fn client(
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -279,7 +279,7 @@ impl Client {
        lazy: bool,
    ) -> Result<()> {
        let req_body = TenantLocationConfigRequest {
-            tenant_id: Some(tenant_shard_id),
+            tenant_id: None,
            config,
        };

--- a/pageserver/ctl/src/draw_timeline_dir.rs
+++ b/pageserver/ctl/src/draw_timeline_dir.rs
@@ -9,18 +9,45 @@
 //! Coordinates in both axis are compressed for better readability.
 //! (see <https://medium.com/algorithms-digest/coordinate-compression-2fff95326fb>)
 //!
-//! Example use:
+//! The plain text API was chosen so that we can easily work with filenames from various
+//! sources; see the Usage section below for examples.
+//!
+//! # Usage
+//!
+//! ## Producing the SVG
+//!
 //! ```bash
-//! $ ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
-//! $   grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
-//! $ firefox out.svg
+//!
+//! # local timeline dir
+//! ls test_output/test_pgbench\[neon-45-684\]/repo/tenants/$TENANT/timelines/$TIMELINE | \
+//!     grep "__" | cargo run --release --bin pagectl draw-timeline-dir > out.svg
+//!
+//! # Layer map dump from `/v1/tenant/$TENANT/timeline/$TIMELINE/layer`
+//! (jq -r '.historic_layers[] | .layer_file_name' | cargo run -p pagectl draw-timeline) < layer-map.json > out.svg
+//!
+//! # From an `index_part.json` in S3
+//! (jq -r '.layer_metadata | keys[]' | cargo run -p pagectl draw-timeline) < index_part.json-00000016 > out.svg
+//!
 //! ```
 //!
-//! This API was chosen so that we can easily work with filenames extracted from ssh,
-//! or from pageserver log files.
+//! ## Viewing
 //!
-//! TODO Consider shipping this as a grafana panel plugin:
-//!      <https://grafana.com/tutorials/build-a-panel-plugin/>
+//! **Inkscape** is better than the built-in viewers in browsers.
+//!
+//! After selecting a layer file rectangle, use "Open XML Editor" (Ctrl|Cmd + Shift + X)
+//! to see the layer file name in the comment field.
+//!
+//! ```bash
+//!
+//! # Linux
+//! inkscape out.svg
+//!
+//! # macOS
+//! /Applications/Inkscape.app/Contents/MacOS/inkscape out.svg
+//!
+//! ```
+//!
+
 use anyhow::Result;
 use pageserver::repository::Key;
 use pageserver::METADATA_FILE_NAME;
@@ -65,7 +92,12 @@ fn parse_filename(name: &str) -> (Range<Key>, Range<Lsn>) {

 pub fn main() -> Result<()> {
    // Parse layer filenames from stdin
-    let mut ranges: Vec<(Range<Key>, Range<Lsn>)> = vec![];
+    struct Layer {
+        filename: String,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+    }
+    let mut files: Vec<Layer> = vec![];
    let stdin = io::stdin();
    for line in stdin.lock().lines() {
        let line = line.unwrap();
@@ -76,14 +108,23 @@ pub fn main() -> Result<()> {
            // Don't try and parse "metadata" like a key-lsn range
            continue;
        }
-        let range = parse_filename(filename);
-        ranges.push(range);
+        let (key_range, lsn_range) = parse_filename(filename);
+        files.push(Layer {
+            filename: filename.to_owned(),
+            key_range,
+            lsn_range,
+        });
    }

    // Collect all coordinates
    let mut keys: Vec<Key> = vec![];
    let mut lsns: Vec<Lsn> = vec![];
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        key_range: keyr,
+        lsn_range: lsnr,
+        ..
+    } in &files
+    {
        keys.push(keyr.start);
        keys.push(keyr.end);
        lsns.push(lsnr.start);
@@ -107,7 +148,12 @@ pub fn main() -> Result<()> {
            h: stretch * lsn_map.len() as f32
        }
    );
-    for (keyr, lsnr) in &ranges {
+    for Layer {
+        filename,
+        key_range: keyr,
+        lsn_range: lsnr,
+    } in &files
+    {
        let key_start = *key_map.get(&keyr.start).unwrap();
        let key_end = *key_map.get(&keyr.end).unwrap();
        let key_diff = key_end - key_start;
@@ -151,6 +197,7 @@ pub fn main() -> Result<()> {
            .fill(fill)
            .stroke(Stroke::Color(rgb(0, 0, 0), 0.1))
            .border_radius(0.4)
+            .comment(filename)
        );
    }
    println!("{}", EndSvg);
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -0,0 +1,112 @@
+use pageserver_api::key::{Key, AUX_KEY_PREFIX, METADATA_KEY_SIZE};
+use tracing::warn;
+
+/// Create a metadata key from a hash, encoded as [AUX_KEY_PREFIX, 2B directory prefix, first 13B of 128b xxhash].
+fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key {
+    let mut key = [0; METADATA_KEY_SIZE];
+    let hash = twox_hash::xxh3::hash128(data).to_be_bytes();
+    key[0] = AUX_KEY_PREFIX;
+    key[1] = dir_level1;
+    key[2] = dir_level2;
+    key[3..16].copy_from_slice(&hash[0..13]);
+    Key::from_metadata_key_fixed_size(&key)
+}
+
+const AUX_DIR_PG_LOGICAL: u8 = 0x01;
+const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
+const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
+
+/// Encode the aux file into a fixed-size key.
+///
+/// The first byte is the AUX key prefix. We use the next 2 bytes of the key for the directory / aux file type.
+/// We have a one-to-one mapping for each of the aux files that we support. We hash the remaining part of the path
+/// (usually a single file name, or several components) into a 13-byte hash. The way we determine the 2-byte prefix
+/// is roughly based on the first two components of the path, one unique number for one component.
+///
+/// * pg_logical/mappings -> 0x0101
+/// * pg_logical/snapshots -> 0x0102
+/// * pg_logical/replorigin_checkpoint -> 0x0103
+/// * pg_logical/others -> 0x01FF
+/// * pg_replslot/ -> 0x0201
+/// * others -> 0xFFFF
+///
+/// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
+/// The new file type must never have been written to the storage before. Otherwise, data could be corrupted:
+/// the new file belongs to a new prefix, but it might already have been stored under the `others` prefix.
+pub fn encode_aux_file_key(path: &str) -> Key {
+    if let Some(fname) = path.strip_prefix("pg_logical/mappings/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x01, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_logical/snapshots/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x02, fname.as_bytes())
+    } else if path == "pg_logical/replorigin_checkpoint" {
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0x03, b"")
+    } else if let Some(fname) = path.strip_prefix("pg_logical/") {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported pg_logical aux file type: {}, putting to 0x01FF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
+    } else {
+        if cfg!(debug_assertions) {
+            warn!(
+                "unsupported aux file type: {}, putting to 0xFFFF, would affect path scanning",
+                path
+            );
+        }
+        aux_hash_to_metadata_key(AUX_DIR_PG_UNKNOWN, 0xFF, path.as_bytes())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_hash_portable() {
+        // AUX file encoding requires the hash to be portable across all platforms. This test case checks
+        // if the algorithm produces the same hash across different environments.
+        assert_eq!(
+            305317690835051308206966631765527126151,
+            twox_hash::xxh3::hash128("test1".as_bytes())
+        );
+        assert_eq!(
+            85104974691013376326742244813280798847,
+            twox_hash::xxh3::hash128("test/test2".as_bytes())
+        );
+        assert_eq!(0, twox_hash::xxh3::hash128("".as_bytes()));
+    }
+
+    #[test]
+    fn test_encoding_portable() {
+        // To correctly retrieve AUX files, the generated keys for the same file must be the same for all versions
+        // of the page server.
+        assert_eq!(
+            "8200000101E5B20C5F8DD5AA3289D6D9EAFA",
+            encode_aux_file_key("pg_logical/mappings/test1").to_string()
+        );
+        assert_eq!(
+            "820000010239AAC544893139B26F501B97E6",
+            encode_aux_file_key("pg_logical/snapshots/test2").to_string()
+        );
+        assert_eq!(
+            "820000010300000000000000000000000000",
+            encode_aux_file_key("pg_logical/replorigin_checkpoint").to_string()
+        );
+        assert_eq!(
+            "82000001FF8635AF2134B7266EC5B4189FD6",
+            encode_aux_file_key("pg_logical/unsupported").to_string()
+        );
+        assert_eq!(
+            "8200000201772D0E5D71DE14DA86142A1619",
+            encode_aux_file_key("pg_replslot/test3").to_string()
+        );
+        assert_eq!(
+            "820000FFFF1866EBEB53B807B26A2416F317",
+            encode_aux_file_key("other_file_not_supported").to_string()
+        );
+    }
+}
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, bail, ensure, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::{key_to_slru_block, Key};
+use pageserver_api::key::{key_to_slru_block, rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::SystemTime;
@@ -297,7 +297,20 @@ where
                if rel.forknum == INIT_FORKNUM {
                    // I doubt we need _init fork itself, but having it at least
                    // serves as a marker relation is unlogged.
-                    self.add_rel(rel, rel).await?;
+                    if let Err(_e) = self.add_rel(rel, rel).await {
+                        if self
+                            .timeline
+                            .get_shard_identity()
+                            .is_key_buggy_forknum(&rel_block_to_key(rel, 0x0))
+                        {
+                            // Workaround https://github.com/neondatabase/neon/issues/7451 -- if we have an unlogged relation
+                            // whose INIT_FORKNUM is not correctly on shard zero, then omit it in the basebackup.  This allows
+                            // postgres to start up.  The relation won't work, but it will be possible to DROP TABLE on it and
+                            // recreate.
+                            tracing::warn!("Omitting relation {rel} for issue #7451: drop and recreate this unlogged relation");
+                            continue;
+                        }
+                    };
                    self.add_rel(rel, rel.with_forknum(MAIN_FORKNUM)).await?;
                    continue;
                }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -285,6 +285,7 @@ fn start_pageserver(
    ))
    .unwrap();
    pageserver::preinitialize_metrics();
+    pageserver::metrics::wal_redo::set_process_kind_metric(conf.walredo_process_kind);

    // If any failpoints were set from FAILPOINTS environment variable,
    // print them to the log for debugging purposes
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -97,6 +97,8 @@ pub mod defaults {

    pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;

+    pub const DEFAULT_WALREDO_PROCESS_KIND: &str = "sync";
+
    ///
    /// Default built-in configuration file.
    ///
@@ -140,6 +142,8 @@ pub mod defaults {

 #validate_vectored_get = '{DEFAULT_VALIDATE_VECTORED_GET}'

+#walredo_process_kind = '{DEFAULT_WALREDO_PROCESS_KIND}'
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -290,6 +294,8 @@ pub struct PageServerConf {
    ///
    /// Setting this to zero disables limits on total ephemeral layer size.
    pub ephemeral_bytes_per_memory_kb: usize,
+
+    pub walredo_process_kind: crate::walredo::ProcessKind,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -413,6 +419,8 @@ struct PageServerConfigBuilder {
    validate_vectored_get: BuilderValue<bool>,

    ephemeral_bytes_per_memory_kb: BuilderValue<usize>,
+
+    walredo_process_kind: BuilderValue<crate::walredo::ProcessKind>,
 }

 impl PageServerConfigBuilder {
@@ -500,6 +508,8 @@ impl PageServerConfigBuilder {
            )),
            validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET),
            ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
+
+            walredo_process_kind: Set(DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap()),
        }
    }
 }
@@ -683,6 +693,10 @@ impl PageServerConfigBuilder {
        self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value);
    }

+    pub fn get_walredo_process_kind(&mut self, value: crate::walredo::ProcessKind) {
+        self.walredo_process_kind = BuilderValue::Set(value);
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let default = Self::default_values();

@@ -739,6 +753,7 @@ impl PageServerConfigBuilder {
                max_vectored_read_bytes,
                validate_vectored_get,
                ephemeral_bytes_per_memory_kb,
+                walredo_process_kind,
            }
            CUSTOM LOGIC
            {
@@ -1032,6 +1047,9 @@ impl PageServerConf {
                "ephemeral_bytes_per_memory_kb" => {
                    builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize)
                }
+                "walredo_process_kind" => {
+                    builder.get_walredo_process_kind(parse_toml_from_str("walredo_process_kind", item)?)
+                }
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -1114,6 +1132,7 @@ impl PageServerConf {
            ),
            validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
            ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+            walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
        }
    }
 }
@@ -1351,7 +1370,8 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
+                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1423,7 +1443,8 @@ background_task_maximum_delay = '334 s'
                        .expect("Invalid default constant")
                ),
                validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET,
-                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB
+                ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB,
+                walredo_process_kind: defaults::DEFAULT_WALREDO_PROCESS_KIND.parse().unwrap(),
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -304,7 +304,7 @@ async fn calculate_synthetic_size_worker(
                continue;
            }

-            if !tenant_shard_id.is_zero() {
+            if !tenant_shard_id.is_shard_zero() {
                // We only send consumption metrics from shard 0, so don't waste time calculating
                // synthetic size on other shards.
                continue;
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -199,7 +199,7 @@ pub(super) async fn collect_all_metrics(
    };

    let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
-        if state != TenantState::Active || !id.is_zero() {
+        if state != TenantState::Active || !id.is_shard_zero() {
            None
        } else {
            tenant_manager
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -58,24 +58,6 @@ paths:
      responses:
        "200":
          description: The reload completed successfully.
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error (also hits if no keys were found)
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}:
    parameters:
@@ -93,62 +75,14 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/TenantInfo"
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

    delete:
      description: |
        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
        "404":
-          description: Tenant not found
+          description: Tenant not found. This is the success path.
          content:
            application/json:
              schema:
@@ -165,18 +99,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/time_travel_remote_storage:
    parameters:
@@ -206,36 +128,6 @@ paths:
            application/json:
              schema:
                type: string
-        "400":
-          description: Error when no tenant id found in path or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -255,36 +147,6 @@ paths:
                type: array
                items:
                  $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"


  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
@@ -309,60 +171,12 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

    delete:
      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
-        "400":
-          description: Error when no tenant id found in path or no timeline id
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
        "404":
-          description: Timeline not found
+          description: Timeline not found. This is the success path.
          content:
            application/json:
              schema:
@@ -379,18 +193,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_timestamp_of_lsn:
    parameters:
@@ -423,36 +225,6 @@ paths:
              schema:
                type: string
                format: date-time
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Timeline not found, or there is no timestamp information for the given lsn
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
    parameters:
@@ -484,36 +256,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/LsnByTimestampResponse"
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
@@ -537,36 +279,6 @@ paths:
            application/json:
              schema:
                type: string
-        "400":
-          description: Error when no tenant id found in path, no timeline id or invalid timestamp
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
  /v1/tenant/{tenant_shard_id}/location_config:
    parameters:
      - name: tenant_shard_id
@@ -628,24 +340,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/TenantLocationConfigResponse"
-        "503":
-          description: Tenant's state cannot be changed right now.  Wait a few seconds and retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
        "409":
          description: |
            The tenant is already known to Pageserver in some way,
@@ -662,12 +356,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
  /v1/tenant/{tenant_id}/ignore:
    parameters:
      - name: tenant_id
@@ -684,36 +372,6 @@ paths:
      responses:
        "200":
          description: Tenant ignored
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"


  /v1/tenant/{tenant_id}/load:
@@ -740,36 +398,6 @@ paths:
      responses:
        "202":
          description: Tenant scheduled to load successfully
-        "400":
-          description: Error when no tenant id found in path parameters
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/{timeline_id}/preserve_initdb_archive:
    parameters:
@@ -790,37 +418,6 @@ paths:
      responses:
        "202":
          description: Tenant scheduled to load successfully
-        "404":
-          description: No tenant or timeline found for the specified ids
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
@@ -839,31 +436,8 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/SyntheticSizeResponse"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

+  # This route has no handler. TODO: remove?
  /v1/tenant/{tenant_id}/size:
    parameters:
      - name: tenant_id
@@ -945,18 +519,6 @@ paths:
      responses:
        "200":
          description: Success
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_shard_id}/secondary/download:
    parameters:
@@ -987,20 +549,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/SecondaryProgress"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-
-

  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -1043,24 +591,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/TimelineInfo"
-        "400":
-          description: Malformed timeline create request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
        "406":
          description: Permanently unsatisfiable request, don't retry.
          content:
@@ -1079,18 +609,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/:
    get:
@@ -1104,30 +622,6 @@ paths:
                type: array
                items:
                  $ref: "#/components/schemas/TenantInfo"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

    post:
      description: |
@@ -1148,43 +642,12 @@ paths:
            application/json:
              schema:
                type: string
-        "400":
-          description: Malformed tenant create request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
        "409":
          description: Tenant already exists, creation skipped
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"
-

  /v1/tenant/config:
    put:
@@ -1206,36 +669,6 @@ paths:
                type: array
                items:
                  $ref: "#/components/schemas/TenantInfo"
-        "400":
-          description: Malformed tenant config request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/config/:
    parameters:
@@ -1255,42 +688,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/TenantConfigResponse"
-        "400":
-          description: Malformed get tenanant config request
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenand or timeline were not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "503":
-          description: Temporarily unavailable, please retry.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/utilization:
    get:
@@ -1304,12 +701,6 @@ paths:
              application/json:
                schema:
                  $ref: "#/components/schemas/PageserverUtilization"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"

 components:
  securitySchemes:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -160,6 +160,9 @@ impl From<PageReconstructError> for ApiError {
    fn from(pre: PageReconstructError) -> ApiError {
        match pre {
            PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
+            PageReconstructError::MissingKey(e) => {
+                ApiError::InternalServerError(anyhow::anyhow!("{e}"))
+            }
            PageReconstructError::Cancelled => {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
@@ -457,8 +460,12 @@ async fn reload_auth_validation_keys_handler(
            json_response(StatusCode::OK, ())
        }
        Err(e) => {
+            let err_msg = "Error reloading public keys";
            warn!("Error reloading public keys from {key_path:?}: {e:}");
-            json_response(StatusCode::INTERNAL_SERVER_ERROR, ())
+            json_response(
+                StatusCode::INTERNAL_SERVER_ERROR,
+                HttpErrorBody::from_msg(err_msg.to_string()),
+            )
        }
    }
 }
@@ -696,7 +703,7 @@ async fn get_lsn_by_timestamp_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let state = get_state(&request);

-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
        // Requires SLRU contents, which are only stored on shard zero
        return Err(ApiError::BadRequest(anyhow!(
            "Size calculations are only available on shard zero"
@@ -747,7 +754,7 @@ async fn get_timestamp_of_lsn_handler(
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
    let state = get_state(&request);

-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
        // Requires SLRU contents, which are only stored on shard zero
        return Err(ApiError::BadRequest(anyhow!(
            "Size calculations are only available on shard zero"
@@ -772,7 +779,9 @@ async fn get_timestamp_of_lsn_handler(
            let time = format_rfc3339(postgres_ffi::from_pg_timestamp(time)).to_string();
            json_response(StatusCode::OK, time)
        }
-        None => json_response(StatusCode::NOT_FOUND, ()),
+        None => Err(ApiError::NotFound(
+            anyhow::anyhow!("Timestamp for lsn {} not found", lsn).into(),
+        )),
    }
 }

@@ -1086,7 +1095,7 @@ async fn tenant_size_handler(
    let headers = request.headers();
    let state = get_state(&request);

-    if !tenant_shard_id.is_zero() {
+    if !tenant_shard_id.is_shard_zero() {
        return Err(ApiError::BadRequest(anyhow!(
            "Size calculations are only available on shard zero"
        )));
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -12,6 +12,7 @@ pub mod disk_usage_eviction_task;
 pub mod http;
 pub mod import_datadir;
 pub use pageserver_api::keyspace;
+pub mod aux_file;
 pub mod metrics;
 pub mod page_cache;
 pub mod page_service;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -86,41 +86,58 @@ pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
+pub(crate) static READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "pageserver_read_num_fs_layers",
-        "Number of persistent layers accessed for processing a read request, including those in the cache",
-        vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 10.0, 20.0, 50.0, 100.0],
+        "pageserver_layers_visited_per_read_global",
+        "Number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_visited_per_vectored_read_global",
+        "Average number of layers visited to reconstruct one key",
+        vec![1.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0],
    )
    .expect("failed to define a metric")
 });

 // Metrics collected on operations on the storage repository.
+#[derive(
+    Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr,
+)]
+pub(crate) enum GetKind {
+    Singular,
+    Vectored,
+}

 pub(crate) struct ReconstructTimeMetrics {
-    ok: Histogram,
-    err: Histogram,
+    singular: Histogram,
+    vectored: Histogram,
 }

 pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
    let inner = register_histogram_vec!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
-        &["result"],
+        &["get_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric");
+
    ReconstructTimeMetrics {
-        ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
-        err: inner.get_metric_with_label_values(&["err"]).unwrap(),
+        singular: inner.with_label_values(&[GetKind::Singular.into()]),
+        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
    }
 });

 impl ReconstructTimeMetrics {
-    pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
-        match result {
-            Ok(_) => &self.ok,
-            Err(_) => &self.err,
+    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
+        match get_kind {
+            GetKind::Singular => &self.singular,
+            GetKind::Vectored => &self.vectored,
        }
    }
 }
@@ -133,13 +150,33 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+pub(crate) struct ReconstructDataTimeMetrics {
+    singular: Histogram,
+    vectored: Histogram,
+}
+
+impl ReconstructDataTimeMetrics {
+    pub(crate) fn for_get_kind(&self, get_kind: GetKind) -> &Histogram {
+        match get_kind {
+            GetKind::Singular => &self.singular,
+            GetKind::Vectored => &self.vectored,
+        }
+    }
+}
+
+pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy<ReconstructDataTimeMetrics> = Lazy::new(|| {
+    let inner = register_histogram_vec!(
        "pageserver_getpage_get_reconstruct_data_seconds",
        "Time spent in get_reconstruct_value_data",
+        &["get_kind"],
        CRITICAL_OP_BUCKETS.into(),
    )
-    .expect("failed to define a metric")
+    .expect("failed to define a metric");
+
+    ReconstructDataTimeMetrics {
+        singular: inner.with_label_values(&[GetKind::Singular.into()]),
+        vectored: inner.with_label_values(&[GetKind::Vectored.into()]),
+    }
 });

 pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
@@ -1518,7 +1555,8 @@ pub(crate) struct SecondaryModeMetrics {
    pub(crate) download_heatmap: IntCounter,
    pub(crate) download_layer: IntCounter,
 }
-pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
+pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| {
+    SecondaryModeMetrics {
    upload_heatmap: register_int_counter!(
        "pageserver_secondary_upload_heatmap",
        "Number of heatmaps written to remote storage by attached tenants"
@@ -1536,7 +1574,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
    .expect("failed to define a metric"),
    download_heatmap: register_int_counter!(
        "pageserver_secondary_download_heatmap",
-        "Number of downloads of heatmaps by secondary mode locations"
+        "Number of downloads of heatmaps by secondary mode locations, including when it hasn't changed"
    )
    .expect("failed to define a metric"),
    download_layer: register_int_counter!(
@@ -1544,6 +1582,7 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
        "Number of downloads of layers by secondary mode locations"
    )
    .expect("failed to define a metric"),
+}
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1819,6 +1858,29 @@ impl Default for WalRedoProcessCounters {
 pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
    Lazy::new(WalRedoProcessCounters::default);

+#[cfg(not(test))]
+pub mod wal_redo {
+    use super::*;
+
+    static PROCESS_KIND: Lazy<std::sync::Mutex<UIntGaugeVec>> = Lazy::new(|| {
+        std::sync::Mutex::new(
+            register_uint_gauge_vec!(
+                "pageserver_wal_redo_process_kind",
+                "The configured process kind for walredo",
+                &["kind"],
+            )
+            .unwrap(),
+        )
+    });
+
+    pub fn set_process_kind_metric(kind: crate::walredo::ProcessKind) {
+        // use guard to avoid races around the next two steps
+        let guard = PROCESS_KIND.lock().unwrap();
+        guard.reset();
+        guard.with_label_values(&[&format!("{kind}")]).set(1);
+    }
+}
+
 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
 pub(crate) struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
@@ -2089,7 +2151,7 @@ impl TimelineMetrics {

 pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
    // Only shard zero deals in synthetic sizes
-    if tenant_shard_id.is_zero() {
+    if tenant_shard_id.is_shard_zero() {
        let tid = tenant_shard_id.tenant_id.to_string();
        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
    }
@@ -2746,7 +2808,8 @@ pub fn preinitialize_metrics() {

    // histograms
    [
-        &READ_NUM_FS_LAYERS,
+        &READ_NUM_LAYERS_VISITED,
+        &VEC_READ_NUM_LAYERS_VISITED,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -874,6 +874,11 @@ impl PageServerHandler {
            // walsender completes the authentication and starts streaming the
            // WAL.
            if lsn <= last_record_lsn {
+                // It might be better to use max(lsn, latest_gc_cutoff_lsn) instead of
+                // last_record_lsn. That would give the same result, since we know
+                // that there haven't been modifications since 'lsn'. Using an older
+                // LSN might be faster, because that could allow skipping recent
+                // layers when finding the page.
                lsn = last_record_lsn;
            } else {
                timeline
@@ -1201,6 +1206,10 @@ impl PageServerHandler {
        ))
    }

+    /// Note on "fullbackup":
+    /// Full basebackups should only be used for debugging purposes.
+    /// Originally, it was introduced to enable breaking storage format changes,
+    /// but that is not applicable anymore.
    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(shard_id, ?lsn, ?prev_lsn, %full_backup))]
    async fn handle_basebackup_request<IO>(
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -252,16 +252,8 @@ impl Timeline {
        let mut buf = version.get(self, key, ctx).await?;
        let nblocks = buf.get_u32_le();

-        if latest {
-            // Update relation size cache only if "latest" flag is set.
-            // This flag is set by compute when it is working with most recent version of relation.
-            // Typically master compute node always set latest=true.
-            // Please notice, that even if compute node "by mistake" specifies old LSN but set
-            // latest=true, then it can not cause cache corruption, because with latest=true
-            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
-            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
-        }
+        self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
+
        Ok(nblocks)
    }

@@ -456,6 +448,11 @@ impl Timeline {
        // include physical changes from later commits that will be marked
        // as aborted, and will need to be vacuumed away.
        let commit_lsn = Lsn((low - 1) * 8);
+        // This maxing operation is for the edge case that the search above did
+        // set found_smaller to true but it never increased the lsn. Then, low
+        // is still the old min_lsn, and the subtraction above could possibly give
+        // a value below the ancestor_lsn.
+        let commit_lsn = commit_lsn.max(min_lsn);
        match (found_smaller, found_larger) {
            (false, false) => {
                // This can happen if no commit records have been processed yet, e.g.
@@ -817,7 +814,7 @@ impl Timeline {
    /// Get cached size of relation if it not updated after specified LSN
    pub fn get_cached_rel_size(&self, tag: &RelTag, lsn: Lsn) -> Option<BlockNumber> {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
-        if let Some((cached_lsn, nblocks)) = rel_size_cache.get(tag) {
+        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
            if lsn >= *cached_lsn {
                return Some(*nblocks);
            }
@@ -828,7 +825,16 @@ impl Timeline {
    /// Update cached relation size if there is no more recent update
    pub fn update_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        match rel_size_cache.entry(tag) {
+
+        if lsn < rel_size_cache.complete_as_of {
+            // Do not cache old values. It's safe to cache the size on read, as long as
+            // the read was at an LSN since we started the WAL ingestion. Reasoning: we
+            // never evict values from the cache, so if the relation size changed after
+            // 'lsn', the new value is already in the cache.
+            return;
+        }
+
+        match rel_size_cache.map.entry(tag) {
            hash_map::Entry::Occupied(mut entry) => {
                let cached_lsn = entry.get_mut();
                if lsn >= cached_lsn.0 {
@@ -844,13 +850,13 @@ impl Timeline {
    /// Store cached relation size
    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.insert(tag, (lsn, nblocks));
+        rel_size_cache.map.insert(tag, (lsn, nblocks));
    }

    /// Remove cached relation size
    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
-        rel_size_cache.remove(tag);
+        rel_size_cache.map.remove(tag);
    }
 }

@@ -1401,7 +1407,7 @@ impl<'a> DatadirModification<'a> {
        let n_files;
        let mut aux_files = self.tline.aux_files.lock().await;
        if let Some(mut dir) = aux_files.dir.take() {
-            // We already updated aux files in `self`: emit a delta and update our latest value
+            // We already updated aux files in `self`: emit a delta and update our latest value.
            dir.upsert(file_path.clone(), content.clone());
            n_files = dir.files.len();
            if aux_files.n_deltas == MAX_AUX_FILE_DELTAS {
@@ -1446,10 +1452,14 @@ impl<'a> DatadirModification<'a> {
                    // reset the map.
                    return Err(e.into());
                }
-                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
-                // we are assuming that all _other_ possible errors represents a missing key.  If some
-                // other error occurs, we may incorrectly reset the map of aux files.
-                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but
+                // the original code assumes all other errors are missing keys. Therefore, we keep the code path
+                // the same for now, though in theory, we should only match the `MissingKey` variant.
+                Err(
+                    PageReconstructError::Other(_)
+                    | PageReconstructError::WalRedo(_)
+                    | PageReconstructError::MissingKey { .. },
+                ) => {
                    // Key is missing, we must insert an image as the basis for subsequent deltas.

                    let mut dir = AuxFilesDirectory {
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -33,6 +33,52 @@ impl Value {
    }
 }

+#[cfg(test)]
+#[derive(Debug, PartialEq)]
+pub(crate) enum InvalidInput {
+    TooShortValue,
+    TooShortPostgresRecord,
+}
+
+/// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, let's
+/// use this type for querying if a slice looks some particular way.
+#[cfg(test)]
+pub(crate) struct ValueBytes;
+
+#[cfg(test)]
+impl ValueBytes {
+    pub(crate) fn will_init(raw: &[u8]) -> Result<bool, InvalidInput> {
+        if raw.len() < 12 {
+            return Err(InvalidInput::TooShortValue);
+        }
+
+        let value_discriminator = &raw[0..4];
+
+        if value_discriminator == [0, 0, 0, 0] {
+            // Value::Image always initializes
+            return Ok(true);
+        }
+
+        if value_discriminator != [0, 0, 0, 1] {
+            // not a Value::WalRecord(..)
+            return Ok(false);
+        }
+
+        let walrecord_discriminator = &raw[4..8];
+
+        if walrecord_discriminator != [0, 0, 0, 0] {
+            // only NeonWalRecord::Postgres can have will_init
+            return Ok(false);
+        }
+
+        if raw.len() < 17 {
+            return Err(InvalidInput::TooShortPostgresRecord);
+        }
+
+        Ok(raw[8] == 1)
+    }
+}
+
 #[cfg(test)]
 mod test {
    use super::*;
@@ -70,6 +116,8 @@ mod test {
        ];

        roundtrip!(image, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
    }

    #[test]
@@ -93,6 +141,96 @@ mod test {
        ];

        roundtrip!(rec, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
+    }
+
+    #[test]
+    fn bytes_inspection_too_short_image() {
+        let rec = Value::Image(Bytes::from_static(b""));
+
+        #[rustfmt::skip]
+        let expected = [
+            // top level discriminator of 4 bytes
+            0x00, 0x00, 0x00, 0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(ValueBytes::will_init(&expected).unwrap());
+        assert_eq!(expected.len(), 12);
+        for len in 0..12 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortValue
+            );
+        }
+    }
+
+    #[test]
+    fn bytes_inspection_too_short_postgres_record() {
+        let rec = NeonWalRecord::Postgres {
+            will_init: false,
+            rec: Bytes::from_static(b""),
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // flattened discriminator of total 8 bytes
+            0x00, 0x00, 0x00, 0x01,
+            0x00, 0x00, 0x00, 0x00,
+            // will_init
+            0x00,
+            // 8 byte length
+            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(!ValueBytes::will_init(&expected).unwrap());
+        assert_eq!(expected.len(), 17);
+        for len in 12..17 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortPostgresRecord
+            )
+        }
+        for len in 0..12 {
+            assert_eq!(
+                ValueBytes::will_init(&expected[..len]).unwrap_err(),
+                InvalidInput::TooShortValue
+            )
+        }
+    }
+
+    #[test]
+    fn clear_visibility_map_flags_example() {
+        let rec = NeonWalRecord::ClearVisibilityMapFlags {
+            new_heap_blkno: Some(0x11),
+            old_heap_blkno: None,
+            flags: 0x03,
+        };
+        let rec = Value::WalRecord(rec);
+
+        #[rustfmt::skip]
+        let expected = [
+            // discriminators
+            0x00, 0x00, 0x00, 0x01,
+            0x00, 0x00, 0x00, 0x01,
+            // Some == 1 followed by 4 bytes
+            0x01, 0x00, 0x00, 0x00, 0x11,
+            // None == 0
+            0x00,
+            // flags
+            0x03
+        ];
+
+        roundtrip!(rec, expected);
+
+        assert!(!ValueBytes::will_init(&expected).unwrap());
    }
 }

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -386,7 +386,7 @@ impl WalRedoManager {

    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
        match self {
-            WalRedoManager::Prod(m) => m.status(),
+            WalRedoManager::Prod(m) => Some(m.status()),
            #[cfg(test)]
            WalRedoManager::Test(_) => None,
        }
@@ -559,9 +559,10 @@ impl Tenant {
            // By doing what we do here, the index part upload is retried.
            // If control plane retries timeline creation in the meantime, the mgmt API handler
            // for timeline creation will coalesce on the upload we queue here.
+            // FIXME: this branch should be dead code as we no longer write local metadata.
            let rtc = timeline.remote_client.as_ref().unwrap();
            rtc.init_upload_queue_for_empty_remote(&metadata)?;
-            rtc.schedule_index_upload_for_metadata_update(&metadata)?;
+            rtc.schedule_index_upload_for_full_metadata_update(&metadata)?;
        }

        timeline
@@ -2869,20 +2870,23 @@ impl Tenant {
                }
            }

-            if let Some(cutoff) = timeline.get_last_record_lsn().checked_sub(horizon) {
-                let branchpoints: Vec<Lsn> = all_branchpoints
-                    .range((
-                        Included((timeline_id, Lsn(0))),
-                        Included((timeline_id, Lsn(u64::MAX))),
-                    ))
-                    .map(|&x| x.1)
-                    .collect();
-                timeline
-                    .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
-                    .await?;
+            let cutoff = timeline
+                .get_last_record_lsn()
+                .checked_sub(horizon)
+                .unwrap_or(Lsn(0));

-                gc_timelines.push(timeline);
-            }
+            let branchpoints: Vec<Lsn> = all_branchpoints
+                .range((
+                    Included((timeline_id, Lsn(0))),
+                    Included((timeline_id, Lsn(u64::MAX))),
+                ))
+                .map(|&x| x.1)
+                .collect();
+            timeline
+                .update_gc_info(branchpoints, cutoff, pitr, cancel, ctx)
+                .await?;
+
+            gc_timelines.push(timeline);
        }
        drop(gc_cs);
        Ok(gc_timelines)
@@ -3027,7 +3031,7 @@ impl Tenant {
        // See also https://github.com/neondatabase/neon/issues/3865
        if let Some(remote_client) = new_timeline.remote_client.as_ref() {
            remote_client
-                .schedule_index_upload_for_metadata_update(&metadata)
+                .schedule_index_upload_for_full_metadata_update(&metadata)
                .context("branch initial metadata upload")?;
        }

@@ -3190,7 +3194,7 @@ impl Tenant {
            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;

            // Upload the created data dir to S3
-            if self.tenant_shard_id().is_zero() {
+            if self.tenant_shard_id().is_shard_zero() {
                self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
                    .await?;
            }
@@ -3437,7 +3441,7 @@ impl Tenant {
            .store(size, Ordering::Relaxed);

        // Only shard zero should be calculating synthetic sizes
-        debug_assert!(self.shard_identity.is_zero());
+        debug_assert!(self.shard_identity.is_shard_zero());

        TENANT_SYNTHETIC_SIZE_METRIC
            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
@@ -3848,6 +3852,8 @@ pub(crate) mod harness {

 #[cfg(test)]
 mod tests {
+    use std::collections::BTreeMap;
+
    use super::*;
    use crate::keyspace::KeySpaceAccum;
    use crate::repository::{Key, Value};
@@ -3856,9 +3862,10 @@ mod tests {
    use crate::DEFAULT_PG_VERSION;
    use bytes::BytesMut;
    use hex_literal::hex;
+    use pageserver_api::key::NON_INHERITED_RANGE;
    use pageserver_api::keyspace::KeySpace;
    use rand::{thread_rng, Rng};
-    use tests::timeline::ShutdownMode;
+    use tests::timeline::{GetVectoredError, ShutdownMode};

    static TEST_KEY: Lazy<Key> =
        Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001")));
@@ -4655,6 +4662,62 @@ mod tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn test_get_vectored_aux_files() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_get_vectored_aux_files")?;
+
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        let tline = tline.raw_timeline().unwrap();
+
+        let mut modification = tline.begin_modification(Lsn(0x1000));
+        modification.put_file("foo/bar1", b"content1", &ctx).await?;
+        modification.set_lsn(Lsn(0x1008))?;
+        modification.put_file("foo/bar2", b"content2", &ctx).await?;
+        modification.commit(&ctx).await?;
+
+        let child_timeline_id = TimelineId::generate();
+        tenant
+            .branch_timeline_test(
+                tline,
+                child_timeline_id,
+                Some(tline.get_last_record_lsn()),
+                &ctx,
+            )
+            .await?;
+
+        let child_timeline = tenant
+            .get_timeline(child_timeline_id, true)
+            .expect("Should have the branched timeline");
+
+        let aux_keyspace = KeySpace {
+            ranges: vec![NON_INHERITED_RANGE],
+        };
+        let read_lsn = child_timeline.get_last_record_lsn();
+
+        let vectored_res = child_timeline
+            .get_vectored_impl(aux_keyspace.clone(), read_lsn, &ctx)
+            .await;
+
+        child_timeline
+            .validate_get_vectored_impl(&vectored_res, aux_keyspace, read_lsn, &ctx)
+            .await;
+
+        let images = vectored_res?;
+        let mut key = NON_INHERITED_RANGE.start;
+        while key < NON_INHERITED_RANGE.end {
+            assert!(matches!(
+                images[&key],
+                Err(PageReconstructError::MissingKey(_))
+            ));
+            key = key.next();
+        }
+
+        Ok(())
+    }
+
    // Test that vectored get handles layer gaps correctly
    // by advancing into the next ancestor timeline if required.
    //
@@ -4794,6 +4857,166 @@ mod tests {
        Ok(())
    }

+    // Test that vectored get descends into ancestor timelines correctly and
+    // does not return an image that's newer than requested.
+    //
+    // The diagram below illustrates an interesting case. We have a parent timeline
+    // (top of the Lsn range) and a child timeline. The request key cannot be reconstructed
+    // from the child timeline, so the parent timeline must be visited. When advancing into
+    // the child timeline, the read path needs to remember what the requested Lsn was in
+    // order to avoid returning an image that's too new. The test below constructs such
+    // a timeline setup and does a few queries around the Lsn of each page image.
+    // ```
+    //    LSN
+    //     ^
+    //     |
+    //     |
+    // 500 | --------------------------------------> branch point
+    // 400 |        X
+    // 300 |        X
+    // 200 | --------------------------------------> requested lsn
+    // 100 |        X
+    //     |---------------------------------------> Key
+    //              |
+    //              ------> requested key
+    //
+    // Legend:
+    // * X - page images
+    // ```
+    #[tokio::test]
+    async fn test_get_vectored_ancestor_descent() -> anyhow::Result<()> {
+        let harness = TenantHarness::create("test_get_vectored_on_lsn_axis")?;
+        let (tenant, ctx) = harness.load().await;
+
+        let start_key = Key::from_hex("010000000033333333444444445500000000").unwrap();
+        let end_key = start_key.add(1000);
+        let child_gap_at_key = start_key.add(500);
+        let mut parent_gap_lsns: BTreeMap<Lsn, String> = BTreeMap::new();
+
+        let mut current_lsn = Lsn(0x10);
+
+        let timeline_id = TimelineId::generate();
+        let parent_timeline = tenant
+            .create_test_timeline(timeline_id, current_lsn, DEFAULT_PG_VERSION, &ctx)
+            .await?;
+
+        current_lsn += 0x100;
+
+        for _ in 0..3 {
+            let mut key = start_key;
+            while key < end_key {
+                current_lsn += 0x10;
+
+                let image_value = format!("{} at {}", child_gap_at_key, current_lsn);
+
+                let mut writer = parent_timeline.writer().await;
+                writer
+                    .put(
+                        key,
+                        current_lsn,
+                        &Value::Image(test_img(&image_value)),
+                        &ctx,
+                    )
+                    .await?;
+                writer.finish_write(current_lsn);
+
+                if key == child_gap_at_key {
+                    parent_gap_lsns.insert(current_lsn, image_value);
+                }
+
+                key = key.next();
+            }
+
+            parent_timeline.freeze_and_flush().await?;
+        }
+
+        let child_timeline_id = TimelineId::generate();
+
+        let child_timeline = tenant
+            .branch_timeline_test(&parent_timeline, child_timeline_id, Some(current_lsn), &ctx)
+            .await?;
+
+        let mut key = start_key;
+        while key < end_key {
+            if key == child_gap_at_key {
+                key = key.next();
+                continue;
+            }
+
+            current_lsn += 0x10;
+
+            let mut writer = child_timeline.writer().await;
+            writer
+                .put(
+                    key,
+                    current_lsn,
+                    &Value::Image(test_img(&format!("{} at {}", key, current_lsn))),
+                    &ctx,
+                )
+                .await?;
+            writer.finish_write(current_lsn);
+
+            key = key.next();
+        }
+
+        child_timeline.freeze_and_flush().await?;
+
+        let lsn_offsets: [i64; 5] = [-10, -1, 0, 1, 10];
+        let mut query_lsns = Vec::new();
+        for image_lsn in parent_gap_lsns.keys().rev() {
+            for offset in lsn_offsets {
+                query_lsns.push(Lsn(image_lsn
+                    .0
+                    .checked_add_signed(offset)
+                    .expect("Shouldn't overflow")));
+            }
+        }
+
+        for query_lsn in query_lsns {
+            let results = child_timeline
+                .get_vectored_impl(
+                    KeySpace {
+                        ranges: vec![child_gap_at_key..child_gap_at_key.next()],
+                    },
+                    query_lsn,
+                    &ctx,
+                )
+                .await;
+
+            let expected_item = parent_gap_lsns
+                .iter()
+                .rev()
+                .find(|(lsn, _)| **lsn <= query_lsn);
+
+            info!(
+                "Doing vectored read at LSN {}. Expecting image to be: {:?}",
+                query_lsn, expected_item
+            );
+
+            match expected_item {
+                Some((_, img_value)) => {
+                    let key_results = results.expect("No vectored get error expected");
+                    let key_result = &key_results[&child_gap_at_key];
+                    let returned_img = key_result
+                        .as_ref()
+                        .expect("No page reconstruct error expected");
+
+                    info!(
+                        "Vectored read at LSN {} returned image {}",
+                        query_lsn,
+                        std::str::from_utf8(returned_img)?
+                    );
+                    assert_eq!(*returned_img, test_img(img_value));
+                }
+                None => {
+                    assert!(matches!(results, Err(GetVectoredError::MissingKey(_))));
+                }
+            }
+        }
+
+        Ok(())
+    }
+
    #[tokio::test]
    async fn test_random_updates() -> anyhow::Result<()> {
        let harness = TenantHarness::create("test_random_updates")?;
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -436,6 +436,11 @@ impl DeleteTenantFlow {
        .await
    }

+    /// Check whether background deletion of this tenant is currently in progress
+    pub(crate) fn is_in_progress(tenant: &Tenant) -> bool {
+        tenant.delete_progress.try_lock().is_err()
+    }
+
    async fn prepare(
        tenant: &Arc<Tenant>,
    ) -> Result<tokio::sync::OwnedMutexGuard<Self>, DeleteTenantError> {
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -235,6 +235,12 @@ impl TimelineMetadata {
        let bytes = instance.to_bytes().unwrap();
        Self::from_bytes(&bytes).unwrap()
    }
+
+    pub(crate) fn apply(&mut self, update: &MetadataUpdate) {
+        self.body.disk_consistent_lsn = update.disk_consistent_lsn;
+        self.body.prev_record_lsn = update.prev_record_lsn;
+        self.body.latest_gc_cutoff_lsn = update.latest_gc_cutoff_lsn;
+    }
 }

 impl<'de> Deserialize<'de> for TimelineMetadata {
@@ -259,6 +265,27 @@ impl Serialize for TimelineMetadata {
    }
 }

+/// Parts of the metadata which are regularly modified.
+pub(crate) struct MetadataUpdate {
+    disk_consistent_lsn: Lsn,
+    prev_record_lsn: Option<Lsn>,
+    latest_gc_cutoff_lsn: Lsn,
+}
+
+impl MetadataUpdate {
+    pub(crate) fn new(
+        disk_consistent_lsn: Lsn,
+        prev_record_lsn: Option<Lsn>,
+        latest_gc_cutoff_lsn: Lsn,
+    ) -> Self {
+        Self {
+            disk_consistent_lsn,
+            prev_record_lsn,
+            latest_gc_cutoff_lsn,
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -678,12 +678,19 @@ pub async fn init_tenant_mgr(
                    }
                }
            }
-            LocationMode::Secondary(secondary_conf) => TenantSlot::Secondary(SecondaryTenant::new(
-                tenant_shard_id,
-                shard_identity,
-                location_conf.tenant_conf,
-                &secondary_conf,
-            )),
+            LocationMode::Secondary(secondary_conf) => {
+                info!(
+                    tenant_id = %tenant_shard_id.tenant_id,
+                    shard_id = %tenant_shard_id.shard_slug(),
+                    "Starting secondary tenant"
+                );
+                TenantSlot::Secondary(SecondaryTenant::new(
+                    tenant_shard_id,
+                    shard_identity,
+                    location_conf.tenant_conf,
+                    &secondary_conf,
+                ))
+            }
        };

        tenants.insert(tenant_shard_id, slot);
@@ -1410,9 +1417,15 @@ impl TenantManager {

        match tenant.current_state() {
            TenantState::Broken { .. } | TenantState::Stopping { .. } => {
-                // If a tenant is broken or stopping, DeleteTenantFlow can
-                // handle it: broken tenants proceed to delete, stopping tenants
-                // are checked for deletion already in progress.
+                // If deletion is already in progress, return success (the semantics of this
+                // function are to return success after deletion is spawned in background).
+                // Otherwise fall through and let [`DeleteTenantFlow`] handle this state.
+                if DeleteTenantFlow::is_in_progress(&tenant) {
+                    // The `delete_progress` lock is held: deletion is already happening
+                    // in the background
+                    slot_guard.revert();
+                    return Ok(());
+                }
            }
            _ => {
                tenant
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -202,7 +202,9 @@ use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 use std::time::Duration;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
+use remote_storage::{
+    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, TimeoutOrCancel,
+};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
@@ -236,6 +238,7 @@ use utils::id::{TenantId, TimelineId};

 use self::index::IndexPart;

+use super::metadata::MetadataUpdate;
 use super::storage_layer::{Layer, LayerFileName, ResidentLayer};
 use super::upload_queue::SetDeletedFlagProgress;
 use super::Generation;
@@ -536,9 +539,10 @@ impl RemoteTimelineClient {
    // Upload operations.
    //

-    ///
    /// Launch an index-file upload operation in the background, with
-    /// updated metadata.
+    /// fully updated metadata.
+    ///
+    /// This should only be used to upload initial metadata to remote storage.
    ///
    /// The upload will be added to the queue immediately, but it
    /// won't be performed until all previously scheduled layer file
@@ -550,7 +554,7 @@ impl RemoteTimelineClient {
    /// If there were any changes to the list of files, i.e. if any
    /// layer file uploads were scheduled, since the last index file
    /// upload, those will be included too.
-    pub fn schedule_index_upload_for_metadata_update(
+    pub fn schedule_index_upload_for_full_metadata_update(
        self: &Arc<Self>,
        metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
@@ -566,6 +570,27 @@ impl RemoteTimelineClient {
        Ok(())
    }

+    /// Launch an index-file upload operation in the background, with only parts of the metadata
+    /// updated.
+    ///
+    /// This is the regular way of updating metadata on layer flushes or Gc.
+    ///
+    /// Using this lighter update mechanism allows for reparenting and detaching without changes to
+    /// `index_part.json`, while being more clear on what values update regularly.
+    pub(crate) fn schedule_index_upload_for_metadata_update(
+        self: &Arc<Self>,
+        update: &MetadataUpdate,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+
+        upload_queue.latest_metadata.apply(update);
+
+        self.schedule_index_upload(upload_queue, upload_queue.latest_metadata.clone());
+
+        Ok(())
+    }
+
    ///
    /// Launch an index-file upload operation in the background, if necessary.
    ///
@@ -1122,7 +1147,7 @@ impl RemoteTimelineClient {
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
        let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

-        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
+        // Execute all pending deletions, so that when we proceed to do a listing below, we aren't
        // taking the burden of listing all the layers that we already know we should delete.
        self.flush_deletion_queue().await?;

@@ -1131,14 +1156,20 @@ impl RemoteTimelineClient {
        let remaining = download_retry(
            || async {
                self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None, &cancel)
+                    .list(
+                        Some(&timeline_storage_path),
+                        ListingMode::NoDelimiter,
+                        None,
+                        &cancel,
+                    )
                    .await
            },
            "list remaining files",
            &cancel,
        )
        .await
-        .context("list files remaining files")?;
+        .context("list files remaining files")?
+        .keys;

        // We will delete the current index_part object last, since it acts as a deletion
        // marker via its deleted_at attribute
@@ -2024,7 +2055,7 @@ mod tests {
        // Schedule upload of index. Check that it is queued
        let metadata = dummy_metadata(Lsn(0x20));
        client
-            .schedule_index_upload_for_metadata_update(&metadata)
+            .schedule_index_upload_for_full_metadata_update(&metadata)
            .unwrap();
        {
            let mut guard = client.upload_queue.lock().unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -258,7 +258,7 @@ pub async fn list_remote_timelines(
    tenant_shard_id: TenantShardId,
    cancel: CancellationToken,
 ) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-    let remote_path = remote_timelines_path(&tenant_shard_id);
+    let remote_path = remote_timelines_path(&tenant_shard_id).add_trailing_slash();

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
@@ -417,11 +417,16 @@ pub(super) async fn download_index_part(
    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());

    let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None, cancel).await },
+        || async {
+            storage
+                .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel)
+                .await
+        },
        "list index_part files",
        cancel,
    )
-    .await?;
+    .await?
+    .keys;

    // General case logic for which index to use: the latest index whose generation
    // is <= our own.  See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -167,7 +167,7 @@ pub(crate) async fn time_travel_recover_tenant(
    let warn_after = 3;
    let max_attempts = 10;
    let mut prefixes = Vec::with_capacity(2);
-    if tenant_shard_id.is_zero() {
+    if tenant_shard_id.is_shard_zero() {
        // Also recover the unsharded prefix for a shard of zero:
        // - if the tenant is totally unsharded, the unsharded prefix contains all the data
        // - if the tenant is sharded, we still want to recover the initdb data, but we only
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -312,7 +312,7 @@ impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCo
                    (detail.last_download, detail.next_download.unwrap())
                };

-                if now < next_download {
+                if now > next_download {
                    Some(PendingDownload {
                        secondary_state: secondary_tenant,
                        last_download,
@@ -647,6 +647,12 @@ impl<'a> TenantDownloader<'a> {
                progress.bytes_downloaded += layer_byte_count;
                progress.layers_downloaded += layer_count;
            }
+
+            for delete_timeline in &delete_timelines {
+                // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
+                // from disk fails that will be a fatal error.
+                detail.timelines.remove(delete_timeline);
+            }
        }

        // Execute accumulated deletions
@@ -710,13 +716,14 @@ impl<'a> TenantDownloader<'a> {
                    .await
                    .map_err(UpdateError::from)?;

+                SECONDARY_MODE.download_heatmap.inc();
+
                if Some(&download.etag) == prev_etag {
                    Ok(HeatMapDownload::Unmodified)
                } else {
                    let mut heatmap_bytes = Vec::new();
                    let mut body = tokio_util::io::StreamReader::new(download.download_stream);
                    let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?;
-                    SECONDARY_MODE.download_heatmap.inc();
                    Ok(HeatMapDownload::Modified(HeatMapModified {
                        etag: download.etag,
                        last_modified: download.last_modified,
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -118,6 +118,7 @@ pub(crate) struct ValuesReconstructState {
    pub(crate) keys: HashMap<Key, Result<VectoredValueReconstructState, PageReconstructError>>,

    keys_done: KeySpaceRandomAccum,
+    layers_visited: u32,
 }

 impl ValuesReconstructState {
@@ -125,6 +126,7 @@ impl ValuesReconstructState {
        Self {
            keys: HashMap::new(),
            keys_done: KeySpaceRandomAccum::new(),
+            layers_visited: 0,
        }
    }

@@ -138,6 +140,14 @@ impl ValuesReconstructState {
        }
    }

+    pub(crate) fn on_layer_visited(&mut self) {
+        self.layers_visited += 1;
+    }
+
+    pub(crate) fn get_layers_visited(&self) -> u32 {
+        self.layers_visited
+    }
+
    /// Update the state collected for a given key.
    /// Returns true if this was the last value needed for the key and false otherwise.
    ///
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -20,8 +20,8 @@
 //!    000000067F000032BE0000400000000020B6-000000067F000032BE0000400000000030B6__000000578C6B29-0000000057A50051
 //! ```
 //!
-//! Every delta file consists of three parts: "summary", "index", and
-//! "values". The summary is a fixed size header at the beginning of the file,
+//! Every delta file consists of three parts: "summary", "values", and
+//! "index". The summary is a fixed size header at the beginning of the file,
 //! and it contains basic information about the layer, and offsets to the other
 //! parts. The "index" is a B-tree, mapping from Key and LSN to an offset in the
 //! "values" part.  The actual page images and WAL records are stored in the
@@ -728,6 +728,9 @@ impl DeltaLayerInner {
            // production code path
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;
+
            if actual_summary != expected_summary {
                bail!(
                    "in-file summary does not match expected summary. actual = {:?} expected = {:?}",
@@ -863,7 +866,7 @@ impl DeltaLayerInner {
                .into(),
        );

-        let data_end_offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+        let data_end_offset = self.index_start_offset();

        let reads = Self::plan_reads(
            keyspace,
@@ -939,7 +942,7 @@ impl DeltaLayerInner {
            }

            if !range_end_handled {
-                tracing::info!("Handling range end fallback at {}", data_end_offset);
+                tracing::debug!("Handling range end fallback at {}", data_end_offset);
                planner.handle_range_end(data_end_offset);
            }
        }
@@ -1103,11 +1106,195 @@ impl DeltaLayerInner {
        if let Some(last) = all_keys.last_mut() {
            // Last key occupies all space till end of value storage,
            // which corresponds to beginning of the index
-            last.size = self.index_start_blk as u64 * PAGE_SZ as u64 - last.size;
+            last.size = self.index_start_offset() - last.size;
        }
        Ok(all_keys)
    }

+    /// Using the given writer, write out a truncated version, where LSNs higher than the
+    /// truncate_at are missing.
+    #[cfg(test)]
+    pub(super) async fn copy_prefix(
+        &self,
+        writer: &mut DeltaLayerWriter,
+        truncate_at: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use crate::tenant::vectored_blob_io::{
+            BlobMeta, VectoredReadBuilder, VectoredReadExtended,
+        };
+        use futures::stream::TryStreamExt;
+
+        #[derive(Debug)]
+        enum Item {
+            Actual(Key, Lsn, BlobRef),
+            Sentinel,
+        }
+
+        impl From<Item> for Option<(Key, Lsn, BlobRef)> {
+            fn from(value: Item) -> Self {
+                match value {
+                    Item::Actual(key, lsn, blob) => Some((key, lsn, blob)),
+                    Item::Sentinel => None,
+                }
+            }
+        }
+
+        impl Item {
+            fn offset(&self) -> Option<BlobRef> {
+                match self {
+                    Item::Actual(_, _, blob) => Some(*blob),
+                    Item::Sentinel => None,
+                }
+            }
+
+            fn is_last(&self) -> bool {
+                matches!(self, Item::Sentinel)
+            }
+        }
+
+        let block_reader = FileBlockReader::new(&self.file, self.file_id);
+        let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            self.index_start_blk,
+            self.index_root_blk,
+            block_reader,
+        );
+
+        let stream = self.stream_index_forwards(&tree_reader, &[0u8; DELTA_KEY_SIZE], ctx);
+        let stream = stream.map_ok(|(key, lsn, pos)| Item::Actual(key, lsn, pos));
+        // put in a sentinel value for getting the end offset for last item, and not having to
+        // repeat the whole read part
+        let stream = stream.chain(futures::stream::once(futures::future::ready(Ok(
+            Item::Sentinel,
+        ))));
+        let mut stream = std::pin::pin!(stream);
+
+        let mut prev: Option<(Key, Lsn, BlobRef)> = None;
+
+        let mut read_builder: Option<VectoredReadBuilder> = None;
+
+        let max_read_size = self
+            .max_vectored_read_bytes
+            .map(|x| x.0.get())
+            .unwrap_or(8192);
+
+        let mut buffer = Some(BytesMut::with_capacity(max_read_size));
+
+        // FIXME: buffering of DeltaLayerWriter
+        let mut per_blob_copy = Vec::new();
+
+        while let Some(item) = stream.try_next().await? {
+            tracing::debug!(?item, "popped");
+            let offset = item
+                .offset()
+                .unwrap_or(BlobRef::new(self.index_start_offset(), false));
+
+            let actionable = if let Some((key, lsn, start_offset)) = prev.take() {
+                let end_offset = offset;
+
+                Some((BlobMeta { key, lsn }, start_offset..end_offset))
+            } else {
+                None
+            };
+
+            let is_last = item.is_last();
+
+            prev = Option::from(item);
+
+            let actionable = actionable.filter(|x| x.0.lsn < truncate_at);
+
+            let builder = if let Some((meta, offsets)) = actionable {
+                // extend or create a new builder
+                if read_builder
+                    .as_mut()
+                    .map(|x| x.extend(offsets.start.pos(), offsets.end.pos(), meta))
+                    .unwrap_or(VectoredReadExtended::No)
+                    == VectoredReadExtended::Yes
+                {
+                    None
+                } else {
+                    read_builder.replace(VectoredReadBuilder::new(
+                        offsets.start.pos(),
+                        offsets.end.pos(),
+                        meta,
+                        max_read_size,
+                    ))
+                }
+            } else {
+                // nothing to do, except perhaps flush any existing for the last element
+                None
+            };
+
+            // flush the possible older builder and also the new one if the item was the last one
+            let builders = builder.into_iter();
+            let builders = if is_last {
+                builders.chain(read_builder.take())
+            } else {
+                builders.chain(None)
+            };
+
+            for builder in builders {
+                let read = builder.build();
+
+                let reader = VectoredBlobReader::new(&self.file);
+
+                let mut buf = buffer.take().unwrap();
+
+                buf.clear();
+                buf.reserve(read.size());
+                let res = reader.read_blobs(&read, buf).await?;
+
+                for blob in res.blobs {
+                    let key = blob.meta.key;
+                    let lsn = blob.meta.lsn;
+                    let data = &res.buf[blob.start..blob.end];
+
+                    #[cfg(debug_assertions)]
+                    Value::des(data)
+                        .with_context(|| {
+                            format!(
+                                "blob failed to deserialize for {}@{}, {}..{}: {:?}",
+                                blob.meta.key,
+                                blob.meta.lsn,
+                                blob.start,
+                                blob.end,
+                                utils::Hex(data)
+                            )
+                        })
+                        .unwrap();
+
+                    // is it an image or will_init walrecord?
+                    // FIXME: this could be handled by threading the BlobRef to the
+                    // VectoredReadBuilder
+                    let will_init = crate::repository::ValueBytes::will_init(data)
+                        .inspect_err(|_e| {
+                            #[cfg(feature = "testing")]
+                            tracing::error!(data=?utils::Hex(data), err=?_e, "failed to parse will_init out of serialized value");
+                        })
+                        .unwrap_or(false);
+
+                    per_blob_copy.clear();
+                    per_blob_copy.extend_from_slice(data);
+
+                    let (tmp, res) = writer
+                        .put_value_bytes(key, lsn, std::mem::take(&mut per_blob_copy), will_init)
+                        .await;
+                    per_blob_copy = tmp;
+                    res?;
+                }
+
+                buffer = Some(res.buf);
+            }
+        }
+
+        assert!(
+            read_builder.is_none(),
+            "with the sentinel above loop should had handled all"
+        );
+
+        Ok(())
+    }
+
    pub(super) async fn dump(&self, ctx: &RequestContext) -> anyhow::Result<()> {
        println!(
            "index_start_blk: {}, root {}",
@@ -1177,6 +1364,44 @@ impl DeltaLayerInner {

        Ok(())
    }
+
+    #[cfg(test)]
+    fn stream_index_forwards<'a, R>(
+        &'a self,
+        reader: &'a DiskBtreeReader<R, DELTA_KEY_SIZE>,
+        start: &'a [u8; DELTA_KEY_SIZE],
+        ctx: &'a RequestContext,
+    ) -> impl futures::stream::Stream<
+        Item = Result<(Key, Lsn, BlobRef), crate::tenant::disk_btree::DiskBtreeError>,
+    > + 'a
+    where
+        R: BlockReader,
+    {
+        use futures::stream::TryStreamExt;
+        let stream = reader.get_stream_from(start, ctx);
+        stream.map_ok(|(key, value)| {
+            let key = DeltaKey::from_slice(&key);
+            let (key, lsn) = (key.key(), key.lsn());
+            let offset = BlobRef(value);
+
+            (key, lsn, offset)
+        })
+    }
+
+    /// The file offset to the first block of index.
+    ///
+    /// The file structure is summary, values, and index. We often need this for the size of last blob.
+    fn index_start_offset(&self) -> u64 {
+        let offset = self.index_start_blk as u64 * PAGE_SZ as u64;
+        let bref = BlobRef(offset);
+        tracing::debug!(
+            index_start_blk = self.index_start_blk,
+            offset,
+            pos = bref.pos(),
+            "index_start_offset"
+        );
+        offset
+    }
 }

 /// A set of data associated with a delta layer key and its value
@@ -1538,7 +1763,7 @@ mod test {

        let resident = writer.finish(entries_meta.key_range.end, &timeline).await?;

-        let inner = resident.get_inner_delta(&ctx).await?;
+        let inner = resident.as_delta(&ctx).await?;

        let file_size = inner.file.metadata().await?.len();
        tracing::info!(
@@ -1594,4 +1819,217 @@ mod test {

        Ok(())
    }
+
+    #[tokio::test]
+    async fn copy_delta_prefix_smoke() {
+        use crate::walrecord::NeonWalRecord;
+        use bytes::Bytes;
+
+        let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke").unwrap();
+        let (tenant, ctx) = h.load().await;
+        let ctx = &ctx;
+        let timeline = tenant
+            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, ctx)
+            .await
+            .unwrap();
+
+        let initdb_layer = timeline
+            .layers
+            .read()
+            .await
+            .likely_resident_layers()
+            .next()
+            .unwrap();
+
+        {
+            let mut writer = timeline.writer().await;
+
+            let data = [
+                (0x20, 12, Value::Image(Bytes::from_static(b"foobar"))),
+                (
+                    0x30,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: false,
+                        rec: Bytes::from_static(b"1"),
+                    }),
+                ),
+                (
+                    0x40,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: Bytes::from_static(b"2"),
+                    }),
+                ),
+                // build an oversized value so we cannot extend an existing read over
+                // this
+                (
+                    0x50,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: {
+                            let mut buf =
+                                vec![0u8; tenant.conf.max_vectored_read_bytes.0.get() + 1024];
+                            buf.iter_mut()
+                                .enumerate()
+                                .for_each(|(i, slot)| *slot = (i % 256) as u8);
+                            Bytes::from(buf)
+                        },
+                    }),
+                ),
+                // because the oversized read cannot be extended further, we are sure to exercise the
+                // builder created on the last round with this:
+                (
+                    0x60,
+                    12,
+                    Value::WalRecord(NeonWalRecord::Postgres {
+                        will_init: true,
+                        rec: Bytes::from_static(b"3"),
+                    }),
+                ),
+                (
+                    0x60,
+                    9,
+                    Value::Image(Bytes::from_static(b"something for a different key")),
+                ),
+            ];
+
+            let mut last_lsn = None;
+
+            for (lsn, key, value) in data {
+                let key = Key::from_i128(key);
+                writer.put(key, Lsn(lsn), &value, ctx).await.unwrap();
+                last_lsn = Some(lsn);
+            }
+
+            writer.finish_write(Lsn(last_lsn.unwrap()));
+        }
+        timeline.freeze_and_flush().await.unwrap();
+
+        let new_layer = timeline
+            .layers
+            .read()
+            .await
+            .likely_resident_layers()
+            .find(|x| x != &initdb_layer)
+            .unwrap();
+
+        // create a copy for the timeline, so we don't overwrite the file
+        let branch = tenant
+            .branch_timeline_test(&timeline, TimelineId::generate(), None, ctx)
+            .await
+            .unwrap();
+
+        assert_eq!(branch.get_ancestor_lsn(), Lsn(0x60));
+
+        // truncating at 0x61 gives us a full copy, otherwise just go backwards until there's just
+        // a single key
+
+        for truncate_at in [0x61, 0x51, 0x41, 0x31, 0x21] {
+            let truncate_at = Lsn(truncate_at);
+
+            let mut writer = DeltaLayerWriter::new(
+                tenant.conf,
+                branch.timeline_id,
+                tenant.tenant_shard_id,
+                Key::MIN,
+                Lsn(0x11)..truncate_at,
+            )
+            .await
+            .unwrap();
+
+            let new_layer = new_layer.download_and_keep_resident().await.unwrap();
+
+            new_layer
+                .copy_delta_prefix(&mut writer, truncate_at, ctx)
+                .await
+                .unwrap();
+
+            let copied_layer = writer.finish(Key::MAX, &branch).await.unwrap();
+
+            copied_layer.as_delta(ctx).await.unwrap();
+
+            assert_keys_and_values_eq(
+                new_layer.as_delta(ctx).await.unwrap(),
+                copied_layer.as_delta(ctx).await.unwrap(),
+                truncate_at,
+                ctx,
+            )
+            .await;
+        }
+    }
+
+    async fn assert_keys_and_values_eq(
+        source: &DeltaLayerInner,
+        truncated: &DeltaLayerInner,
+        truncated_at: Lsn,
+        ctx: &RequestContext,
+    ) {
+        use futures::future::ready;
+        use futures::stream::TryStreamExt;
+
+        let start_key = [0u8; DELTA_KEY_SIZE];
+
+        let source_reader = FileBlockReader::new(&source.file, source.file_id);
+        let source_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            source.index_start_blk,
+            source.index_root_blk,
+            &source_reader,
+        );
+        let source_stream = source.stream_index_forwards(&source_tree, &start_key, ctx);
+        let source_stream = source_stream.filter(|res| match res {
+            Ok((_, lsn, _)) => ready(lsn < &truncated_at),
+            _ => ready(true),
+        });
+        let mut source_stream = std::pin::pin!(source_stream);
+
+        let truncated_reader = FileBlockReader::new(&truncated.file, truncated.file_id);
+        let truncated_tree = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new(
+            truncated.index_start_blk,
+            truncated.index_root_blk,
+            &truncated_reader,
+        );
+        let truncated_stream = truncated.stream_index_forwards(&truncated_tree, &start_key, ctx);
+        let mut truncated_stream = std::pin::pin!(truncated_stream);
+
+        let mut scratch_left = Vec::new();
+        let mut scratch_right = Vec::new();
+
+        loop {
+            let (src, truncated) = (source_stream.try_next(), truncated_stream.try_next());
+            let (src, truncated) = tokio::try_join!(src, truncated).unwrap();
+
+            if src.is_none() {
+                assert!(truncated.is_none());
+                break;
+            }
+
+            let (src, truncated) = (src.unwrap(), truncated.unwrap());
+
+            // because we've filtered the source with Lsn, we should always have the same keys from both.
+            assert_eq!(src.0, truncated.0);
+            assert_eq!(src.1, truncated.1);
+
+            // if this is needed for something else, just drop this assert.
+            assert!(
+                src.2.pos() >= truncated.2.pos(),
+                "value position should not go backwards {} vs. {}",
+                src.2.pos(),
+                truncated.2.pos()
+            );
+
+            scratch_left.clear();
+            let src_cursor = source_reader.block_cursor();
+            let left = src_cursor.read_blob_into_buf(src.2.pos(), &mut scratch_left, ctx);
+            scratch_right.clear();
+            let trunc_cursor = truncated_reader.block_cursor();
+            let right = trunc_cursor.read_blob_into_buf(truncated.2.pos(), &mut scratch_right, ctx);
+
+            tokio::try_join!(left, right).unwrap();
+
+            assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right));
+        }
+    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -396,6 +396,8 @@ impl ImageLayerInner {
            // production code path
            expected_summary.index_start_blk = actual_summary.index_start_blk;
            expected_summary.index_root_blk = actual_summary.index_root_blk;
+            // mask out the timeline_id, but still require the layers to be from the same tenant
+            expected_summary.timeline_id = actual_summary.timeline_id;

            if actual_summary != expected_summary {
                bail!(
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -26,7 +26,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
 use std::cmp::Ordering;
-use std::fmt::Write as _;
+use std::fmt::Write;
 use std::ops::Range;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::atomic::{AtomicU64, AtomicUsize};
@@ -54,6 +54,12 @@ pub struct InMemoryLayer {
    /// Writes are only allowed when this is `None`.
    end_lsn: OnceLock<Lsn>,

+    /// Used for traversal path. Cached representation of the in-memory layer before frozen.
+    local_path_str: Arc<str>,
+
+    /// Used for traversal path. Cached representation of the in-memory layer after frozen.
+    frozen_local_path_str: OnceLock<Arc<str>>,
+
    opened_at: Instant,

    /// The above fields never change, except for `end_lsn`, which is only set once.
@@ -241,6 +247,12 @@ impl InMemoryLayer {
        self.start_lsn..self.end_lsn_or_max()
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        self.frozen_local_path_str
+            .get()
+            .unwrap_or(&self.local_path_str)
+    }
+
    /// debugging function to print out the contents of the layer
    ///
    /// this is likely completly unused
@@ -430,10 +442,24 @@ impl InMemoryLayer {
    }
 }

+fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result {
+    write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0)
+}
+
+fn inmem_layer_log_display(
+    mut f: impl Write,
+    timeline: TimelineId,
+    start_lsn: Lsn,
+    end_lsn: Lsn,
+) -> std::fmt::Result {
+    write!(f, "timeline {} in-memory ", timeline)?;
+    inmem_layer_display(f, start_lsn, end_lsn)
+}
+
 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let end_lsn = self.end_lsn_or_max();
-        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
+        inmem_layer_display(f, self.start_lsn, end_lsn)
    }
 }

@@ -458,6 +484,12 @@ impl InMemoryLayer {

        Ok(InMemoryLayer {
            file_id: key,
+            local_path_str: {
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, timeline_id, start_lsn, Lsn::MAX).unwrap();
+                buf.into()
+            },
+            frozen_local_path_str: OnceLock::new(),
            conf,
            timeline_id,
            tenant_shard_id,
@@ -552,6 +584,15 @@ impl InMemoryLayer {
        );
        self.end_lsn.set(end_lsn).expect("end_lsn set only once");

+        self.frozen_local_path_str
+            .set({
+                let mut buf = String::new();
+                inmem_layer_log_display(&mut buf, self.get_timeline_id(), self.start_lsn, end_lsn)
+                    .unwrap();
+                buf.into()
+            })
+            .expect("frozen_local_path_str set only once");
+
        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
                assert!(*lsn < end_lsn);
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -116,6 +116,12 @@ impl AsLayerDesc for Layer {
    }
 }

+impl PartialEq for Layer {
+    fn eq(&self, other: &Self) -> bool {
+        Arc::as_ptr(&self.0) == Arc::as_ptr(&other.0)
+    }
+}
+
 impl Layer {
    /// Creates a layer value for a file we know to not be resident.
    pub(crate) fn for_evicted(
@@ -389,6 +395,10 @@ impl Layer {
        &self.0.path
    }

+    pub(crate) fn local_path_str(&self) -> &Arc<str> {
+        &self.0.path_str
+    }
+
    pub(crate) fn metadata(&self) -> LayerFileMetadata {
        self.0.metadata()
    }
@@ -511,6 +521,9 @@ struct LayerInner {
    /// Full path to the file; unclear if this should exist anymore.
    path: Utf8PathBuf,

+    /// String representation of the full path, used for traversal id.
+    path_str: Arc<str>,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -604,9 +617,17 @@ enum Status {

 impl Drop for LayerInner {
    fn drop(&mut self) {
+        // if there was a pending eviction, mark it cancelled here to balance metrics
+        if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
+        {
+            // eviction has already been started
+            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
+
+            // eviction request is intentionally not honored as no one is present to wait for it
+            // and we could be delaying shutdown for nothing.
+        }
+
        if !*self.wanted_deleted.get_mut() {
-            // should we try to evict if the last wish was for eviction? seems more like a hazard
-            // than a clear win.
            return;
        }

@@ -708,6 +729,7 @@ impl LayerInner {

        LayerInner {
            conf,
+            path_str: path.to_string().into(),
            path,
            desc,
            timeline: Arc::downgrade(timeline),
@@ -1552,8 +1574,8 @@ impl Drop for DownloadedLayer {
        if let Some(owner) = self.owner.upgrade() {
            owner.on_downloaded_layer_drop(self.version);
        } else {
-            // no need to do anything, we are shutting down
-            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
+            // Layer::drop will handle cancelling the eviction; because of drop order and
+            // `DownloadedLayer` never leaking, we cannot know here if eviction was requested.
        }
    }
 }
@@ -1752,6 +1774,28 @@ impl ResidentLayer {
        }
    }

+    /// Test-only helper: forwards `writer`, `truncate_at` and `ctx` to the `copy_prefix` of the
+    /// downloaded delta layer. Returns an error if this layer is an image layer.
+    ///
+    /// FIXME: "truncate" (in `truncate_at` and in the error messages) is a bad name, because
+    /// nothing is truncated: the filtered parts are copied.
+    #[cfg(test)]
+    pub(super) async fn copy_delta_prefix(
+        &self,
+        writer: &mut super::delta_layer::DeltaLayerWriter,
+        truncate_at: Lsn,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        let kind = self.downloaded.get(&self.owner.0, ctx).await?;
+
+        // Only delta layers can be copied; reject image layers up front.
+        let delta = match kind {
+            LayerKind::Delta(delta) => delta,
+            LayerKind::Image(_) => anyhow::bail!("cannot truncate image layer {self}"),
+        };
+
+        delta
+            .copy_prefix(writer, truncate_at, ctx)
+            .await
+            .with_context(|| format!("truncate {self}"))
+    }
+
    pub(crate) fn local_path(&self) -> &Utf8Path {
        &self.owner.0.path
    }
@@ -1761,14 +1805,14 @@ impl ResidentLayer {
    }

    #[cfg(test)]
-    pub(crate) async fn get_inner_delta<'a>(
-        &'a self,
+    pub(crate) async fn as_delta(
+        &self,
        ctx: &RequestContext,
-    ) -> anyhow::Result<&'a delta_layer::DeltaLayerInner> {
-        let owner = &self.owner.0;
-        match self.downloaded.get(owner, ctx).await? {
-            LayerKind::Delta(d) => Ok(d),
-            LayerKind::Image(_) => Err(anyhow::anyhow!("Expected a delta layer")),
+    ) -> anyhow::Result<&delta_layer::DeltaLayerInner> {
+        use LayerKind::*;
+        match self.downloaded.get(&self.owner.0, ctx).await? {
+            Delta(ref d) => Ok(d),
+            Image(_) => Err(anyhow::anyhow!("image layer")),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -721,11 +721,110 @@ async fn evict_and_wait_does_not_wait_for_download() {
    layer.evict_and_wait(FOREVER).await.unwrap();
 }

+/// Regression test for the eviction metrics accounting when a `Layer` is dropped.
+///
+/// A layer dropped while an eviction is still pending (the eviction was requested while the
+/// layer was being kept resident) must be counted exactly once as
+/// `EvictionCancelled::LayerGone`.
+///
+/// Also checks that dropping a layer for which no eviction was requested does not count as a
+/// cancelled eviction.
+#[tokio::test(start_paused = true)]
+async fn eviction_cancellation_on_drop() {
+    use crate::repository::Value;
+    use bytes::Bytes;
+
+    // the runtime on which Layer spawns its blocking tasks
+    let handle = tokio::runtime::Handle::current();
+
+    let h = TenantHarness::create("eviction_cancellation_on_drop").unwrap();
+    utils::logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    let (tenant, ctx) = h.load().await;
+
+    let timeline = tenant
+        .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
+        .await
+        .unwrap();
+
+    {
+        // create_test_timeline wrote us one layer, write another
+        let mut writer = timeline.writer().await;
+        writer
+            .put(
+                Key::from_i128(5),
+                Lsn(0x20),
+                &Value::Image(Bytes::from_static(b"this does not matter either")),
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+        writer.finish_write(Lsn(0x20));
+    }
+
+    timeline.freeze_and_flush().await.unwrap();
+
+    // wait for the upload to complete so our Arc::strong_count assertion holds
+    timeline
+        .remote_client
+        .as_ref()
+        .unwrap()
+        .wait_completion()
+        .await
+        .unwrap();
+
+    let (evicted_layer, not_evicted) = {
+        let mut layers = {
+            let mut guard = timeline.layers.write().await;
+            let layers = guard.likely_resident_layers().collect::<Vec<_>>();
+            // remove the layers from layermap
+            guard.finish_gc_timeline(&layers);
+
+            layers
+        };
+
+        assert_eq!(layers.len(), 2);
+
+        (layers.pop().unwrap(), layers.pop().unwrap())
+    };
+
+    // the evicted layer goes first, so the counters asserted below are already at their final
+    // values when the non-evicted layer is processed
+    let victims = [(evicted_layer, true), (not_evicted, false)];
+
+    for (victim, evict) in victims {
+        let resident = victim.keep_resident().await.unwrap();
+        drop(victim);
+
+        assert_eq!(Arc::strong_count(&resident.owner.0), 1);
+
+        if evict {
+            let evict_and_wait = resident.owner.evict_and_wait(FOREVER);
+
+            // drive the future to await on the status channel, and then drop it
+            tokio::time::timeout(ADVANCE, evict_and_wait)
+                .await
+                .expect_err("should had been a timeout since we are holding the layer resident");
+        }
+
+        // 1 == we only evict one of the layers
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+
+        drop(resident);
+
+        // give any spawned tasks a chance to run
+        tokio::time::sleep(ADVANCE).await;
+
+        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;
+
+        // exactly one cancellation in total: it must still be 1 after the non-evicted layer is
+        // dropped
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
+        );
+    }
+}
+
+/// A test case to remind you the cost of these structures. You can bump the size limit
+/// below if it is really necessary to add more fields to the structures.
 #[test]
 fn layer_size() {
    assert_eq!(std::mem::size_of::<LayerAccessStats>(), 2040);
    assert_eq!(std::mem::size_of::<PersistentLayerDesc>(), 104);
-    assert_eq!(std::mem::size_of::<LayerInner>(), 2328);
+    assert_eq!(std::mem::size_of::<LayerInner>(), 2344);
    // it also has the utf8 path
 }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -62,7 +62,7 @@ impl BackgroundLoopKind {
 pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
-) -> impl Drop {
+) -> tokio::sync::SemaphorePermit<'static> {
    let _guard = crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_GAUGE
        .with_label_values(&[loop_kind.as_static_str()])
        .guard();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -16,14 +16,14 @@ use enumset::EnumSet;
 use fail::fail_point;
 use once_cell::sync::Lazy;
 use pageserver_api::{
-    key::AUX_FILES_KEY,
+    key::{AUX_FILES_KEY, NON_INHERITED_RANGE},
    keyspace::KeySpaceAccum,
    models::{
        CompactionAlgorithm, DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
        EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, TimelineState,
    },
    reltag::BlockNumber,
-    shard::{ShardIdentity, TenantShardId},
+    shard::{ShardIdentity, ShardNumber, TenantShardId},
 };
 use rand::Rng;
 use serde_with::serde_as;
@@ -86,7 +86,7 @@ use crate::{
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{
-    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
+    GetKind, TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::config::TenantConfOpt;
@@ -182,6 +182,16 @@ pub(crate) struct AuxFilesState {
    pub(crate) n_deltas: usize,
 }

+/// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL
+/// ingestion considerably, because WAL ingestion needs to check on most records if the record
+/// implicitly extends the relation.  At startup, `complete_as_of` is initialized to the current end
+/// of the timeline (disk_consistent_lsn).  It's used on reads of relation sizes to check if the
+/// value can be used to also update the cache, see [`Timeline::update_cached_rel_size`].
+pub(crate) struct RelSizeCache {
+    /// Initialized to the timeline's `disk_consistent_lsn` when the timeline is constructed.
+    pub(crate) complete_as_of: Lsn,
+    /// Cached relation size (`BlockNumber`) per relation, paired with an LSN.
+    /// NOTE(review): presumably the LSN at which that size was recorded; confirm against
+    /// [`Timeline::update_cached_rel_size`].
+    pub(crate) map: HashMap<RelTag, (Lsn, BlockNumber)>,
+}
+
 pub struct Timeline {
    conf: &'static PageServerConf,
    tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
@@ -324,7 +334,7 @@ pub struct Timeline {
    pub walreceiver: Mutex<Option<WalReceiver>>,

    /// Relation size cache
-    pub rel_size_cache: RwLock<HashMap<RelTag, (Lsn, BlockNumber)>>,
+    pub(crate) rel_size_cache: RwLock<RelSizeCache>,

    download_all_remote_layers_task_info: RwLock<Option<DownloadRemoteLayersTaskInfo>>,

@@ -428,6 +438,62 @@ pub(crate) enum PageReconstructError {
    /// An error happened replaying WAL records
    #[error(transparent)]
    WalRedo(anyhow::Error),
+
+    #[error("{0}")]
+    MissingKey(MissingKeyError),
+}
+
+/// Details of a failed page reconstruction where the data for a key could not be found.
+/// Carried by `PageReconstructError::MissingKey` and rendered by the `Display` impl.
+#[derive(Debug)]
+pub struct MissingKeyError {
+    /// `true` when the layer traversal stopped making progress (records were found, but no
+    /// image layer or initial delta record); `false` when no data for the key was found.
+    stuck_at_lsn: bool,
+    /// The key that could not be reconstructed.
+    key: Key,
+    /// Shard number computed for `key` from the timeline's shard identity.
+    shard: ShardNumber,
+    /// LSN at which the search gave up.
+    cont_lsn: Lsn,
+    /// LSN of the original request.
+    request_lsn: Lsn,
+    /// Ancestor LSN of the timeline being traversed; only set and printed when `stuck_at_lsn`.
+    ancestor_lsn: Option<Lsn>,
+    /// Layers visited before giving up; printed one per line.
+    traversal_path: Vec<TraversalPathItem>,
+    /// Backtrace captured at the error site (only under `cfg!(test)`); appended when present.
+    backtrace: Option<std::backtrace::Backtrace>,
+}
+
+impl std::fmt::Display for MissingKeyError {
+    /// Writes the headline (its wording depends on `stuck_at_lsn`), then one line per visited
+    /// layer, then the backtrace if one was captured.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Self {
+            key,
+            shard,
+            cont_lsn,
+            request_lsn,
+            ..
+        } = self;
+
+        if !self.stuck_at_lsn {
+            // No records in this timeline.
+            write!(
+                f,
+                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                key, shard, cont_lsn, request_lsn
+            )?;
+        } else {
+            // Records are found in this timeline but no image layer or initial delta record was found.
+            write!(
+                f,
+                "could not find layer with more data for key {} (shard {:?}) at LSN {}, request LSN {}",
+                key, shard, cont_lsn, request_lsn
+            )?;
+            if let Some(ancestor_lsn) = &self.ancestor_lsn {
+                write!(f, ", ancestor {}", ancestor_lsn)?;
+            }
+        }
+
+        // Separate the headline from the traversal listing only when there is a listing.
+        if !self.traversal_path.is_empty() {
+            writeln!(f)?;
+        }
+
+        self.traversal_path
+            .iter()
+            .try_for_each(|(result, lsn, layer)| {
+                writeln!(
+                    f,
+                    "layer traversal: result {:?}, cont_lsn {}, layer: {}",
+                    result, lsn, layer,
+                )
+            })?;
+
+        match &self.backtrace {
+            Some(backtrace) => write!(f, "\n{}", backtrace),
+            None => Ok(()),
+        }
+    }
 }

 impl PageReconstructError {
@@ -439,6 +505,7 @@ impl PageReconstructError {
            AncestorLsnTimeout(_) => false,
            Cancelled | AncestorStopping(_) => true,
            WalRedo(_) => false,
+            MissingKey { .. } => false,
        }
    }
 }
@@ -730,7 +797,9 @@ impl Timeline {
            img: cached_page_img,
        };

-        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
+        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
+            .for_get_kind(GetKind::Singular)
+            .start_timer();
        let path = self
            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
@@ -740,7 +809,7 @@ impl Timeline {
        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
        let elapsed = start.elapsed();
        crate::metrics::RECONSTRUCT_TIME
-            .for_result(&res)
+            .for_get_kind(GetKind::Singular)
            .observe(elapsed.as_secs_f64());

        if cfg!(feature = "testing") && res.is_err() {
@@ -753,7 +822,7 @@ impl Timeline {
                writeln!(
                    msg,
                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
-                    layer(),
+                    layer,
                )
                .expect("string grows")
            });
@@ -872,8 +941,16 @@ impl Timeline {
                    Err(Cancelled | AncestorStopping(_)) => {
                        return Err(GetVectoredError::Cancelled)
                    }
-                    Err(Other(err)) if err.to_string().contains("could not find data for key") => {
-                        return Err(GetVectoredError::MissingKey(key))
+                    // we only capture stuck_at_lsn=false now until we figure out https://github.com/neondatabase/neon/issues/7380
+                    Err(MissingKey(MissingKeyError {
+                        stuck_at_lsn: false,
+                        ..
+                    })) if !NON_INHERITED_RANGE.contains(&key) => {
+                        // The vectored read path handles non inherited keys specially.
+                        // If such a key cannot be reconstructed from the current timeline,
+                        // the vectored read path returns a key level error as opposed to a top
+                        // level error.
+                        return Err(GetVectoredError::MissingKey(key));
                    }
                    _ => {
                        values.insert(key, block);
@@ -894,10 +971,24 @@ impl Timeline {
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
        let mut reconstruct_state = ValuesReconstructState::new();

+        let get_kind = if keyspace.total_size() == 1 {
+            GetKind::Singular
+        } else {
+            GetKind::Vectored
+        };
+
+        let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME
+            .for_get_kind(get_kind)
+            .start_timer();
        self.get_vectored_reconstruct_data(keyspace, lsn, &mut reconstruct_state, ctx)
            .await?;
+        get_data_timer.stop_and_record();

+        let reconstruct_timer = crate::metrics::RECONSTRUCT_TIME
+            .for_get_kind(get_kind)
+            .start_timer();
        let mut results: BTreeMap<Key, Result<Bytes, PageReconstructError>> = BTreeMap::new();
+        let layers_visited = reconstruct_state.get_layers_visited();
        for (key, res) in reconstruct_state.keys {
            match res {
                Err(err) => {
@@ -911,6 +1002,13 @@ impl Timeline {
                }
            }
        }
+        reconstruct_timer.stop_and_record();
+
+        // Note that this is an approximation. Tracking the exact number of layers visited
+        // per key requires virtually unbounded memory usage and is inefficient
+        // (i.e. segment tree tracking each range queried from a layer)
+        //
+        // Skip the observation when no keys were returned: dividing by zero would yield NaN
+        // (or infinity), and a non-finite observation corrupts the histogram's running sum.
+        if !results.is_empty() {
+            crate::metrics::VEC_READ_NUM_LAYERS_VISITED
+                .observe(layers_visited as f64 / results.len() as f64);
+        }

        Ok(results)
    }
@@ -1344,7 +1442,7 @@ impl Timeline {
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
-        if self.tenant_shard_id.is_zero() {
+        if self.tenant_shard_id.is_shard_zero() {
            // Logical size is only maintained accurately on shard zero.
            self.spawn_initial_logical_size_computation_task(ctx);
        }
@@ -1892,7 +1990,10 @@ impl Timeline {
                last_image_layer_creation_check_at: AtomicLsn::new(0),

                last_received_wal: Mutex::new(None),
-                rel_size_cache: RwLock::new(HashMap::new()),
+                rel_size_cache: RwLock::new(RelSizeCache {
+                    complete_as_of: disk_consistent_lsn,
+                    map: HashMap::new(),
+                }),

                download_all_remote_layers_task_info: RwLock::new(None),

@@ -2237,7 +2338,7 @@ impl Timeline {
        priority: GetLogicalSizePriority,
        ctx: &RequestContext,
    ) -> logical_size::CurrentLogicalSize {
-        if !self.tenant_shard_id.is_zero() {
+        if !self.tenant_shard_id.is_shard_zero() {
            // Logical size is only accurately maintained on shard zero: when called elsewhere, for example
            // when HTTP API is serving a GET for timeline zero, return zero
            return logical_size::CurrentLogicalSize::Approximate(logical_size::Approximate::zero());
@@ -2533,7 +2634,7 @@ impl Timeline {
        crate::span::debug_assert_current_span_has_tenant_and_timeline_id();
        // We should never be calculating logical sizes on shard !=0, because these shards do not have
        // accurate relation sizes, and they do not emit consumption metrics.
-        debug_assert!(self.tenant_shard_id.is_zero());
+        debug_assert!(self.tenant_shard_id.is_shard_zero());

        let guard = self
            .gate
@@ -2692,7 +2793,7 @@ impl Timeline {
    }
 }

-type TraversalId = String;
+type TraversalId = Arc<str>;

 trait TraversalLayerExt {
    fn traversal_id(&self) -> TraversalId;
@@ -2700,13 +2801,13 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Layer {
    fn traversal_id(&self) -> TraversalId {
-        self.local_path().to_string()
+        Arc::clone(self.local_path_str())
    }
 }

 impl TraversalLayerExt for Arc<InMemoryLayer> {
    fn traversal_id(&self) -> TraversalId {
-        format!("timeline {} in-memory {self}", self.get_timeline_id())
+        Arc::clone(self.local_path_str())
    }
 }

@@ -2735,7 +2836,7 @@ impl Timeline {
        let mut timeline = self;

        let mut read_count = scopeguard::guard(0, |cnt| {
-            crate::metrics::READ_NUM_FS_LAYERS.observe(cnt as f64)
+            crate::metrics::READ_NUM_LAYERS_VISITED.observe(cnt as f64)
        });

        // For debugging purposes, collect the path of layers that we traversed
@@ -2775,32 +2876,35 @@ impl Timeline {
                        if prev <= cont_lsn {
                            // Didn't make any progress in last iteration. Error out to avoid
                            // getting stuck in the loop.
-                            return Err(layer_traversal_error(format!(
-                                "could not find layer with more data for key {} at LSN {}, request LSN {}, ancestor {}",
+                            return Err(PageReconstructError::MissingKey(MissingKeyError {
+                                stuck_at_lsn: true,
                                key,
-                                Lsn(cont_lsn.0 - 1),
+                                shard: self.shard_identity.get_shard_number(&key),
+                                cont_lsn: Lsn(cont_lsn.0 - 1),
                                request_lsn,
-                                timeline.ancestor_lsn
-                            ), traversal_path));
+                                ancestor_lsn: Some(timeline.ancestor_lsn),
+                                traversal_path,
+                                backtrace: None,
+                            }));
                        }
                    }
                    prev_lsn = Some(cont_lsn);
                }
                ValueReconstructResult::Missing => {
-                    return Err(layer_traversal_error(
-                        if cfg!(test) {
-                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
-                            )
-                        } else {
-                            format!(
-                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
-                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
-                            )
-                        },
+                    return Err(PageReconstructError::MissingKey(MissingKeyError {
+                        stuck_at_lsn: false,
+                        key,
+                        shard: self.shard_identity.get_shard_number(&key),
+                        cont_lsn,
+                        request_lsn,
+                        ancestor_lsn: None,
                        traversal_path,
-                    ));
+                        backtrace: if cfg!(test) {
+                            Some(std::backtrace::Backtrace::force_capture())
+                        } else {
+                            None
+                        },
+                    }));
                }
            }

@@ -2847,12 +2951,8 @@ impl Timeline {
                        Err(e) => return Err(PageReconstructError::from(e)),
                    };
                    cont_lsn = lsn_floor;
-                    // metrics: open_layer does not count as fs access, so we are not updating `read_count`
-                    traversal_path.push((
-                        result,
-                        cont_lsn,
-                        Box::new(move || open_layer.traversal_id()),
-                    ));
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, open_layer.traversal_id()));
                    continue 'outer;
                }
            }
@@ -2878,12 +2978,8 @@ impl Timeline {
                        Err(e) => return Err(PageReconstructError::from(e)),
                    };
                    cont_lsn = lsn_floor;
-                    // metrics: open_layer does not count as fs access, so we are not updating `read_count`
-                    traversal_path.push((
-                        result,
-                        cont_lsn,
-                        Box::new(move || frozen_layer.traversal_id()),
-                    ));
+                    *read_count += 1;
+                    traversal_path.push((result, cont_lsn, frozen_layer.traversal_id()));
                    continue 'outer;
                }
            }
@@ -2904,14 +3000,7 @@ impl Timeline {
                };
                cont_lsn = lsn_floor;
                *read_count += 1;
-                traversal_path.push((
-                    result,
-                    cont_lsn,
-                    Box::new({
-                        let layer = layer.to_owned();
-                        move || layer.traversal_id()
-                    }),
-                ));
+                traversal_path.push((result, cont_lsn, layer.traversal_id()));
                continue 'outer;
            } else if timeline.ancestor_timeline.is_some() {
                // Nothing on this timeline. Traverse to parent
@@ -2964,11 +3053,47 @@ impl Timeline {
            .await?;

            keyspace.remove_overlapping_with(&completed);
+
+            // Do not descend into the ancestor timeline for aux files.
+            // We don't return a blanket [`GetVectoredError::MissingKey`] to avoid
+            // stalling compaction.
+            // TODO(chi): this will need to be updated for aux files v2 storage
+            if keyspace.overlaps(&NON_INHERITED_RANGE) {
+                let removed = keyspace.remove_overlapping_with(&KeySpace {
+                    ranges: vec![NON_INHERITED_RANGE],
+                });
+
+                for range in removed.ranges {
+                    let mut key = range.start;
+                    while key < range.end {
+                        reconstruct_state.on_key_error(
+                            key,
+                            PageReconstructError::MissingKey(MissingKeyError {
+                                stuck_at_lsn: false,
+                                key,
+                                shard: self.shard_identity.get_shard_number(&key),
+                                cont_lsn,
+                                request_lsn,
+                                ancestor_lsn: None,
+                                traversal_path: Vec::default(),
+                                backtrace: if cfg!(test) {
+                                    Some(std::backtrace::Backtrace::force_capture())
+                                } else {
+                                    None
+                                },
+                            }),
+                        );
+                        key = key.next();
+                    }
+                }
+            }
+
            if keyspace.total_size() == 0 || timeline.ancestor_timeline.is_none() {
                break;
            }

-            cont_lsn = Lsn(timeline.ancestor_lsn.0 + 1);
+            // Take the min to avoid reconstructing a page with data newer than request Lsn.
+            cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
            timeline_owned = timeline
                .get_ready_ancestor_timeline(ctx)
                .await
@@ -3018,55 +3143,61 @@ impl Timeline {
            unmapped_keyspace.remove_overlapping_with(&keys_done_last_step);
            completed_keyspace.merge(&keys_done_last_step);

-            let guard = timeline.layers.read().await;
-            let layers = guard.layer_map();
+            // Do not descend any further if the last layer we visited
+            // completed all keys in the keyspace it inspected. This is not
+            // required for correctness, but avoids visiting extra layers
+            // which turns out to be a perf bottleneck in some cases.
+            if !unmapped_keyspace.is_empty() {
+                let guard = timeline.layers.read().await;
+                let layers = guard.layer_map();

-            let in_memory_layer = layers.find_in_memory_layer(|l| {
-                let start_lsn = l.get_lsn_range().start;
-                cont_lsn > start_lsn
-            });
+                let in_memory_layer = layers.find_in_memory_layer(|l| {
+                    let start_lsn = l.get_lsn_range().start;
+                    cont_lsn > start_lsn
+                });

-            match in_memory_layer {
-                Some(l) => {
-                    let lsn_range = l.get_lsn_range().start..cont_lsn;
-                    fringe.update(
-                        ReadableLayer::InMemoryLayer(l),
-                        unmapped_keyspace.clone(),
-                        lsn_range,
-                    );
-                }
-                None => {
-                    for range in unmapped_keyspace.ranges.iter() {
-                        let results = layers.range_search(range.clone(), cont_lsn);
+                match in_memory_layer {
+                    Some(l) => {
+                        let lsn_range = l.get_lsn_range().start..cont_lsn;
+                        fringe.update(
+                            ReadableLayer::InMemoryLayer(l),
+                            unmapped_keyspace.clone(),
+                            lsn_range,
+                        );
+                    }
+                    None => {
+                        for range in unmapped_keyspace.ranges.iter() {
+                            let results = layers.range_search(range.clone(), cont_lsn);

-                        results
-                            .found
-                            .into_iter()
-                            .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                                (
-                                    ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
-                                    keyspace_accum.to_keyspace(),
-                                    lsn_floor..cont_lsn,
-                                )
-                            })
-                            .for_each(|(layer, keyspace, lsn_range)| {
-                                fringe.update(layer, keyspace, lsn_range)
-                            });
+                            results
+                                .found
+                                .into_iter()
+                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                                    (
+                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
+                                        keyspace_accum.to_keyspace(),
+                                        lsn_floor..cont_lsn,
+                                    )
+                                })
+                                .for_each(|(layer, keyspace, lsn_range)| {
+                                    fringe.update(layer, keyspace, lsn_range)
+                                });
+                        }
                    }
                }
-            }

-            // It's safe to drop the layer map lock after planning the next round of reads.
-            // The fringe keeps readable handles for the layers which are safe to read even
-            // if layers were compacted or flushed.
-            //
-            // The more interesting consideration is: "Why is the read algorithm still correct
-            // if the layer map changes while it is operating?". Doing a vectored read on a
-            // timeline boils down to pushing an imaginary lsn boundary downwards for each range
-            // covered by the read. The layer map tells us how to move the lsn downwards for a
-            // range at *a particular point in time*. It is fine for the answer to be different
-            // at two different time points.
-            drop(guard);
+                // It's safe to drop the layer map lock after planning the next round of reads.
+                // The fringe keeps readable handles for the layers which are safe to read even
+                // if layers were compacted or flushed.
+                //
+                // The more interesting consideration is: "Why is the read algorithm still correct
+                // if the layer map changes while it is operating?". Doing a vectored read on a
+                // timeline boils down to pushing an imaginary lsn boundary downwards for each range
+                // covered by the read. The layer map tells us how to move the lsn downwards for a
+                // range at *a particular point in time*. It is fine for the answer to be different
+                // at two different time points.
+                drop(guard);
+            }

            if let Some((layer_to_read, keyspace_to_read, lsn_range)) = fringe.next_layer() {
                let next_cont_lsn = lsn_range.start;
@@ -3081,6 +3212,8 @@ impl Timeline {

                unmapped_keyspace = keyspace_to_read;
                cont_lsn = next_cont_lsn;
+
+                reconstruct_state.on_layer_visited();
            } else {
                break;
            }
@@ -3524,7 +3657,7 @@ impl Timeline {
        &self,
        disk_consistent_lsn: Lsn,
        layers_to_upload: impl IntoIterator<Item = ResidentLayer>,
-    ) -> anyhow::Result<TimelineMetadata> {
+    ) -> anyhow::Result<()> {
        // We can only save a valid 'prev_record_lsn' value on disk if we
        // flushed *all* in-memory changes to disk. We only track
        // 'prev_record_lsn' in memory for the latest processed record, so we
@@ -3541,19 +3674,10 @@ impl Timeline {
            None
        };

-        let ancestor_timeline_id = self
-            .ancestor_timeline
-            .as_ref()
-            .map(|ancestor| ancestor.timeline_id);
-
-        let metadata = TimelineMetadata::new(
+        let update = crate::tenant::metadata::MetadataUpdate::new(
            disk_consistent_lsn,
            ondisk_prev_record_lsn,
-            ancestor_timeline_id,
-            self.ancestor_lsn,
            *self.latest_gc_cutoff_lsn.read(),
-            self.initdb_lsn,
-            self.pg_version,
        );

        fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -3565,10 +3689,10 @@ impl Timeline {
            for layer in layers_to_upload {
                remote_client.schedule_layer_file_upload(layer)?;
            }
-            remote_client.schedule_index_upload_for_metadata_update(&metadata)?;
+            remote_client.schedule_index_upload_for_metadata_update(&update)?;
        }

-        Ok(metadata)
+        Ok(())
    }

    pub(crate) async fn preserve_initdb_archive(&self) -> anyhow::Result<()> {
@@ -4142,9 +4266,8 @@ impl Timeline {
                *self.get_latest_gc_cutoff_lsn()
            }
        } else {
-            // No time-based retention was configured. Set time-based cutoff to
-            // same as LSN based.
-            cutoff_horizon
+            // No time-based retention was configured. Interpret this as "keep no history".
+            self.get_last_record_lsn()
        };

        // Grab the lock and update the values
@@ -4664,35 +4787,7 @@ impl Timeline {
    }
 }

-type TraversalPathItem = (
-    ValueReconstructResult,
-    Lsn,
-    Box<dyn Send + FnOnce() -> TraversalId>,
-);
-
-/// Helper function for get_reconstruct_data() to add the path of layers traversed
-/// to an error, as anyhow context information.
-fn layer_traversal_error(msg: String, path: Vec<TraversalPathItem>) -> PageReconstructError {
-    // We want the original 'msg' to be the outermost context. The outermost context
-    // is the most high-level information, which also gets propagated to the client.
-    let mut msg_iter = path
-        .into_iter()
-        .map(|(r, c, l)| {
-            format!(
-                "layer traversal: result {:?}, cont_lsn {}, layer: {}",
-                r,
-                c,
-                l(),
-            )
-        })
-        .chain(std::iter::once(msg));
-    // Construct initial message from the first traversed layer
-    let err = anyhow!(msg_iter.next().unwrap());
-
-    // Append all subsequent traversals, and the error message 'msg', as contexts.
-    let msg = msg_iter.fold(err, |err, msg| err.context(msg));
-    PageReconstructError::from(msg)
-}
+type TraversalPathItem = (ValueReconstructResult, Lsn, TraversalId);

 struct TimelineWriterState {
    open_layer: Arc<InMemoryLayer>,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -188,24 +188,10 @@ impl Timeline {
    ) -> ControlFlow<()> {
        let now = SystemTime::now();

-        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
-            BackgroundLoopKind::Eviction,
-            ctx,
-        );
+        let permit = self.acquire_imitation_permit(cancel, ctx).await?;

-        let _permit = tokio::select! {
-            permit = acquire_permit => permit,
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
-        };
-
-        match self
-            .imitate_layer_accesses(tenant, p, cancel, gate, ctx)
-            .await
-        {
-            ControlFlow::Break(()) => return ControlFlow::Break(()),
-            ControlFlow::Continue(()) => (),
-        }
+        self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
+            .await?;

        #[derive(Debug, Default)]
        struct EvictionStats {
@@ -330,19 +316,27 @@ impl Timeline {
        gate: &GateGuard,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
+        let permit = self.acquire_imitation_permit(cancel, ctx).await?;
+
+        self.imitate_layer_accesses(tenant, p, cancel, gate, permit, ctx)
+            .await
+    }
+
+    async fn acquire_imitation_permit(
+        &self,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> ControlFlow<(), tokio::sync::SemaphorePermit<'static>> {
        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
            BackgroundLoopKind::Eviction,
            ctx,
        );

-        let _permit = tokio::select! {
-            permit = acquire_permit => permit,
-            _ = cancel.cancelled() => return ControlFlow::Break(()),
-            _ = self.cancel.cancelled() => return ControlFlow::Break(()),
-        };
-
-        self.imitate_layer_accesses(tenant, p, cancel, gate, ctx)
-            .await
+        tokio::select! {
+            permit = acquire_permit => ControlFlow::Continue(permit),
+            _ = cancel.cancelled() => ControlFlow::Break(()),
+            _ = self.cancel.cancelled() => ControlFlow::Break(()),
+        }
    }

    /// If we evict layers but keep cached values derived from those layers, then
@@ -376,9 +370,10 @@ impl Timeline {
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
+        permit: tokio::sync::SemaphorePermit<'static>,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
-        if !self.tenant_shard_id.is_zero() {
+        if !self.tenant_shard_id.is_shard_zero() {
            // Shards !=0 do not maintain accurate relation sizes, and do not need to calculate logical size
            // for consumption metrics (consumption metrics are only sent from shard 0).  We may therefore
            // skip imitating logical size accesses for eviction purposes.
@@ -408,7 +403,28 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let mut state = tenant.eviction_task_tenant_state.lock().await;
+        let (mut state, _permit) = {
+            if let Ok(locked) = tenant.eviction_task_tenant_state.try_lock() {
+                (locked, permit)
+            } else {
+                // we might need to wait for a long time here in case of pathological synthetic
+                // size calculation performance
+                drop(permit);
+                let locked = tokio::select! {
+                    locked = tenant.eviction_task_tenant_state.lock() => locked,
+                    _ = self.cancel.cancelled() => {
+                        return ControlFlow::Break(())
+                    },
+                    _ = cancel.cancelled() => {
+                        return ControlFlow::Break(())
+                    }
+                };
+                // then reacquire -- this will be bad if there is a lot of traffic, but because we
+                // released the permit, the overall latency will be much better.
+                let permit = self.acquire_imitation_permit(cancel, ctx).await?;
+                (locked, permit)
+            }
+        };
        match state.last_layer_access_imitation {
            Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
            _ => {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -427,7 +427,7 @@ pub(super) async fn handle_walreceiver_connection(

            // Send the replication feedback message.
            // Regular standby_status_update fields are put into this message.
-            let current_timeline_size = if timeline.tenant_shard_id.is_zero() {
+            let current_timeline_size = if timeline.tenant_shard_id.is_shard_zero() {
                timeline
                    .get_current_logical_size(
                        crate::tenant::timeline::GetLogicalSizePriority::User,
--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -61,18 +61,18 @@ pub struct VectoredRead {
 }

 impl VectoredRead {
-    pub fn size(&self) -> usize {
+    pub(crate) fn size(&self) -> usize {
        (self.end - self.start) as usize
    }
 }

 #[derive(Eq, PartialEq)]
-enum VectoredReadExtended {
+pub(crate) enum VectoredReadExtended {
    Yes,
    No,
 }

-struct VectoredReadBuilder {
+pub(crate) struct VectoredReadBuilder {
    start: u64,
    end: u64,
    blobs_at: VecMap<u64, BlobMeta>,
@@ -80,7 +80,17 @@ struct VectoredReadBuilder {
 }

 impl VectoredReadBuilder {
-    fn new(start_offset: u64, end_offset: u64, meta: BlobMeta, max_read_size: usize) -> Self {
+    /// Start building a new vectored read.
+    ///
+    /// Note that, by design, the initial blob is not checked against `max_read_size`, so that
+    /// blobs larger than the configured value can still be read. In that case, however, the
+    /// builder is effectively single-use: it cannot be extended with further blobs.
+    pub(crate) fn new(
+        start_offset: u64,
+        end_offset: u64,
+        meta: BlobMeta,
+        max_read_size: usize,
+    ) -> Self {
        let mut blobs_at = VecMap::default();
        blobs_at
            .append(start_offset, meta)
@@ -97,7 +107,8 @@ impl VectoredReadBuilder {
    /// Attempt to extend the current read with a new blob if the start
    /// offset matches with the current end of the vectored read
    /// and the resuting size is below the max read size
-    fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
+    pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended {
+        tracing::trace!(start, end, "trying to extend");
        let size = (end - start) as usize;
        if self.end == start && self.size() + size <= self.max_read_size {
            self.end = end;
@@ -111,11 +122,11 @@ impl VectoredReadBuilder {
        VectoredReadExtended::No
    }

-    fn size(&self) -> usize {
+    pub(crate) fn size(&self) -> usize {
        (self.end - self.start) as usize
    }

-    fn build(self) -> VectoredRead {
+    pub(crate) fn build(self) -> VectoredRead {
        VectoredRead {
            start: self.start,
            end: self.end,
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -403,7 +403,7 @@ impl WalIngest {
            );

            if !key_is_local {
-                if self.shard.is_zero() {
+                if self.shard.is_shard_zero() {
                    // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
                    // its blkno in case it implicitly extends a relation.
                    self.observe_decoded_block(modification, blk, ctx).await?;
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -55,6 +55,7 @@ impl NeonWalRecord {
    /// Does replaying this WAL record initialize the page from scratch, or does
    /// it need to be applied over the previous image of the page?
    pub fn will_init(&self) -> bool {
+        // If you change this function, you'll also need to change ValueBytes::will_init
        match self {
            NeonWalRecord::Postgres { will_init, rec: _ } => *will_init,

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -20,6 +20,7 @@

 /// Process lifecycle and abstracction for the IPC protocol.
 mod process;
+pub use process::Kind as ProcessKind;

 /// Code to apply [`NeonWalRecord`]s.
 pub(crate) mod apply_neon;
@@ -34,7 +35,7 @@ use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Bytes, BytesMut};
 use pageserver_api::key::key_to_rel_block;
-use pageserver_api::models::WalRedoManagerStatus;
+use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus};
 use pageserver_api::shard::TenantShardId;
 use std::sync::Arc;
 use std::time::Duration;
@@ -54,7 +55,7 @@ pub struct PostgresRedoManager {
    tenant_shard_id: TenantShardId,
    conf: &'static PageServerConf,
    last_redo_at: std::sync::Mutex<Option<Instant>>,
-    /// The current [`process::WalRedoProcess`] that is used by new redo requests.
+    /// The current [`process::Process`] that is used by new redo requests.
    /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo
    /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the
    /// their process object; we use [`Arc::clone`] for that.
@@ -66,7 +67,7 @@ pub struct PostgresRedoManager {
    /// still be using the old redo process. But, those other tasks will most likely
    /// encounter an error as well, and errors are an unexpected condition anyway.
    /// So, probably we could get rid of the `Arc` in the future.
-    redo_process: heavier_once_cell::OnceCell<Arc<process::WalRedoProcess>>,
+    redo_process: heavier_once_cell::OnceCell<Arc<process::Process>>,
 }

 ///
@@ -139,8 +140,8 @@ impl PostgresRedoManager {
        }
    }

-    pub(crate) fn status(&self) -> Option<WalRedoManagerStatus> {
-        Some(WalRedoManagerStatus {
+    pub fn status(&self) -> WalRedoManagerStatus {
+        WalRedoManagerStatus {
            last_redo_at: {
                let at = *self.last_redo_at.lock().unwrap();
                at.and_then(|at| {
@@ -149,8 +150,14 @@ impl PostgresRedoManager {
                    chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?)
                })
            },
-            pid: self.redo_process.get().map(|p| p.id()),
-        })
+            process: self
+                .redo_process
+                .get()
+                .map(|p| WalRedoManagerProcessStatus {
+                    pid: p.id(),
+                    kind: std::borrow::Cow::Borrowed(p.kind().into()),
+                }),
+        }
    }
 }

@@ -208,37 +215,33 @@ impl PostgresRedoManager {
        const MAX_RETRY_ATTEMPTS: u32 = 1;
        let mut n_attempts = 0u32;
        loop {
-            let proc: Arc<process::WalRedoProcess> =
-                match self.redo_process.get_or_init_detached().await {
-                    Ok(guard) => Arc::clone(&guard),
-                    Err(permit) => {
-                        // don't hold poison_guard, the launch code can bail
-                        let start = Instant::now();
-                        let proc = Arc::new(
-                            process::WalRedoProcess::launch(
-                                self.conf,
-                                self.tenant_shard_id,
-                                pg_version,
-                            )
+            let proc: Arc<process::Process> = match self.redo_process.get_or_init_detached().await {
+                Ok(guard) => Arc::clone(&guard),
+                Err(permit) => {
+                    // don't hold poison_guard, the launch code can bail
+                    let start = Instant::now();
+                    let proc = Arc::new(
+                        process::Process::launch(self.conf, self.tenant_shard_id, pg_version)
                            .context("launch walredo process")?,
-                        );
-                        let duration = start.elapsed();
-                        WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
-                        info!(
-                            duration_ms = duration.as_millis(),
-                            pid = proc.id(),
-                            "launched walredo process"
-                        );
-                        self.redo_process.set(Arc::clone(&proc), permit);
-                        proc
-                    }
-                };
+                    );
+                    let duration = start.elapsed();
+                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64());
+                    info!(
+                        duration_ms = duration.as_millis(),
+                        pid = proc.id(),
+                        "launched walredo process"
+                    );
+                    self.redo_process.set(Arc::clone(&proc), permit);
+                    proc
+                }
+            };

            let started_at = std::time::Instant::now();

            // Relational WAL records are applied using wal-redo-postgres
            let result = proc
                .apply_wal_records(rel, blknum, &base_img, records, wal_redo_timeout)
+                .await
                .context("apply_wal_records");

            let duration = started_at.elapsed();
--- a/pageserver/src/walredo/process.rs
+++ b/pageserver/src/walredo/process.rs
@@ -1,186 +1,67 @@
-use self::no_leak_child::NoLeakChild;
-use crate::{
-    config::PageServerConf,
-    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
-    walrecord::NeonWalRecord,
-};
-use anyhow::Context;
+use std::time::Duration;
+
 use bytes::Bytes;
-use nix::poll::{PollFd, PollFlags};
 use pageserver_api::{reltag::RelTag, shard::TenantShardId};
-use postgres_ffi::BLCKSZ;
-use std::os::fd::AsRawFd;
-#[cfg(feature = "testing")]
-use std::sync::atomic::AtomicUsize;
-use std::{
-    collections::VecDeque,
-    io::{Read, Write},
-    process::{ChildStdin, ChildStdout, Command, Stdio},
-    sync::{Mutex, MutexGuard},
-    time::Duration,
-};
-use tracing::{debug, error, instrument, Instrument};
-use utils::{lsn::Lsn, nonblock::set_nonblock};
+use utils::lsn::Lsn;
+
+use crate::{config::PageServerConf, walrecord::NeonWalRecord};

 mod no_leak_child;
 /// The IPC protocol that pageserver and walredo process speak over their shared pipe.
 mod protocol;

-pub struct WalRedoProcess {
-    #[allow(dead_code)]
-    conf: &'static PageServerConf,
-    tenant_shard_id: TenantShardId,
-    // Some() on construction, only becomes None on Drop.
-    child: Option<NoLeakChild>,
-    stdout: Mutex<ProcessOutput>,
-    stdin: Mutex<ProcessInput>,
-    /// Counter to separate same sized walredo inputs failing at the same millisecond.
-    #[cfg(feature = "testing")]
-    dump_sequence: AtomicUsize,
+mod process_impl {
+    pub(super) mod process_async;
+    pub(super) mod process_std;
 }

-struct ProcessInput {
-    stdin: ChildStdin,
-    n_requests: usize,
+#[derive(
+    Clone,
+    Copy,
+    Debug,
+    PartialEq,
+    Eq,
+    strum_macros::EnumString,
+    strum_macros::Display,
+    strum_macros::IntoStaticStr,
+    serde_with::DeserializeFromStr,
+    serde_with::SerializeDisplay,
+)]
+#[strum(serialize_all = "kebab-case")]
+#[repr(u8)]
+pub enum Kind {
+    Sync,
+    Async,
 }

-struct ProcessOutput {
-    stdout: ChildStdout,
-    pending_responses: VecDeque<Option<Bytes>>,
-    n_processed_responses: usize,
+pub(crate) enum Process {
+    Sync(process_impl::process_std::WalRedoProcess),
+    Async(process_impl::process_async::WalRedoProcess),
 }

-impl WalRedoProcess {
-    //
-    // Start postgres binary in special WAL redo mode.
-    //
-    #[instrument(skip_all,fields(pg_version=pg_version))]
-    pub(crate) fn launch(
+impl Process {
+    #[inline(always)]
+    pub fn launch(
        conf: &'static PageServerConf,
        tenant_shard_id: TenantShardId,
        pg_version: u32,
    ) -> anyhow::Result<Self> {
-        crate::span::debug_assert_current_span_has_tenant_id();
-
-        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
-        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
-
-        use no_leak_child::NoLeakChildCommandExt;
-        // Start postgres itself
-        let child = Command::new(pg_bin_dir_path.join("postgres"))
-            // the first arg must be --wal-redo so the child process enters into walredo mode
-            .arg("--wal-redo")
-            // the child doesn't process this arg, but, having it in the argv helps indentify the
-            // walredo process for a particular tenant when debugging a pagserver
-            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
-            .stdin(Stdio::piped())
-            .stderr(Stdio::piped())
-            .stdout(Stdio::piped())
-            .env_clear()
-            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
-            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
-            // NB: The redo process is not trusted after we sent it the first
-            // walredo work. Before that, it is trusted. Specifically, we trust
-            // it to
-            // 1. close all file descriptors except stdin, stdout, stderr because
-            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
-            //    the files it opens, and
-            // 2. to use seccomp to sandbox itself before processing the first
-            //    walredo request.
-            .spawn_no_leak_child(tenant_shard_id)
-            .context("spawn process")?;
-        WAL_REDO_PROCESS_COUNTERS.started.inc();
-        let mut child = scopeguard::guard(child, |child| {
-            error!("killing wal-redo-postgres process due to a problem during launch");
-            child.kill_and_wait(WalRedoKillCause::Startup);
-        });
-
-        let stdin = child.stdin.take().unwrap();
-        let stdout = child.stdout.take().unwrap();
-        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
-        macro_rules! set_nonblock_or_log_err {
-        ($file:ident) => {{
-            let res = set_nonblock($file.as_raw_fd());
-            if let Err(e) = &res {
-                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
-            }
-            res
-        }};
-    }
-        set_nonblock_or_log_err!(stdin)?;
-        set_nonblock_or_log_err!(stdout)?;
-
-        // all fallible operations post-spawn are complete, so get rid of the guard
-        let child = scopeguard::ScopeGuard::into_inner(child);
-
-        tokio::spawn(
-        async move {
-            scopeguard::defer! {
-                debug!("wal-redo-postgres stderr_logger_task finished");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
-            }
-            debug!("wal-redo-postgres stderr_logger_task started");
-            crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-            use tokio::io::AsyncBufReadExt;
-            let mut stderr_lines = tokio::io::BufReader::new(stderr);
-            let mut buf = Vec::new();
-            let res = loop {
-                buf.clear();
-                // TODO we don't trust the process to cap its stderr length.
-                // Currently it can do unbounded Vec allocation.
-                match stderr_lines.read_until(b'\n', &mut buf).await {
-                    Ok(0) => break Ok(()), // eof
-                    Ok(num_bytes) => {
-                        let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                        error!(%output, "received output");
-                    }
-                    Err(e) => {
-                        break Err(e);
-                    }
-                }
-            };
-            match res {
-                Ok(()) => (),
-                Err(e) => {
-                    error!(error=?e, "failed to read from walredo stderr");
-                }
-            }
-        }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
-    );
-
-        Ok(Self {
-            conf,
-            tenant_shard_id,
-            child: Some(child),
-            stdin: Mutex::new(ProcessInput {
-                stdin,
-                n_requests: 0,
-            }),
-            stdout: Mutex::new(ProcessOutput {
-                stdout,
-                pending_responses: VecDeque::new(),
-                n_processed_responses: 0,
-            }),
-            #[cfg(feature = "testing")]
-            dump_sequence: AtomicUsize::default(),
+        Ok(match conf.walredo_process_kind {
+            Kind::Sync => Self::Sync(process_impl::process_std::WalRedoProcess::launch(
+                conf,
+                tenant_shard_id,
+                pg_version,
+            )?),
+            Kind::Async => Self::Async(process_impl::process_async::WalRedoProcess::launch(
+                conf,
+                tenant_shard_id,
+                pg_version,
+            )?),
        })
    }

-    pub(crate) fn id(&self) -> u32 {
-        self.child
-            .as_ref()
-            .expect("must not call this during Drop")
-            .id()
-    }
-
-    // Apply given WAL records ('records') over an old page image. Returns
-    // new page image.
-    //
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
-    pub(crate) fn apply_wal_records(
+    #[inline(always)]
+    pub(crate) async fn apply_wal_records(
        &self,
        rel: RelTag,
        blknum: u32,
@@ -188,221 +69,29 @@ impl WalRedoProcess {
        records: &[(Lsn, NeonWalRecord)],
        wal_redo_timeout: Duration,
    ) -> anyhow::Result<Bytes> {
-        let tag = protocol::BufferTag { rel, blknum };
-        let input = self.stdin.lock().unwrap();
-
-        // Serialize all the messages to send the WAL redo process first.
-        //
-        // This could be problematic if there are millions of records to replay,
-        // but in practice the number of records is usually so small that it doesn't
-        // matter, and it's better to keep this code simple.
-        //
-        // Most requests start with a before-image with BLCKSZ bytes, followed by
-        // by some other WAL records. Start with a buffer that can hold that
-        // comfortably.
-        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
-        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
-        if let Some(img) = base_img {
-            protocol::build_push_page_msg(tag, img, &mut writebuf);
-        }
-        for (lsn, rec) in records.iter() {
-            if let NeonWalRecord::Postgres {
-                will_init: _,
-                rec: postgres_rec,
-            } = rec
-            {
-                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
-            } else {
-                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+        match self {
+            Process::Sync(p) => {
+                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
            }
-        }
-        protocol::build_get_page_msg(tag, &mut writebuf);
-        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
-
-        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
-
-        if res.is_err() {
-            // not all of these can be caused by this particular input, however these are so rare
-            // in tests so capture all.
-            self.record_and_log(&writebuf);
-        }
-
-        res
-    }
-
-    fn apply_wal_records0(
-        &self,
-        writebuf: &[u8],
-        input: MutexGuard<ProcessInput>,
-        wal_redo_timeout: Duration,
-    ) -> anyhow::Result<Bytes> {
-        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
-        let mut nwrite = 0usize;
-
-        while nwrite < writebuf.len() {
-            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
-            let n = loop {
-                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
-                    Err(nix::errno::Errno::EINTR) => continue,
-                    res => break res,
-                }
-            }?;
-
-            if n == 0 {
-                anyhow::bail!("WAL redo timed out");
+            Process::Async(p) => {
+                p.apply_wal_records(rel, blknum, base_img, records, wal_redo_timeout)
+                    .await
            }
-
-            // If 'stdin' is writeable, do write.
-            let in_revents = stdin_pollfds[0].revents().unwrap();
-            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
-                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
-            }
-            if in_revents.contains(PollFlags::POLLHUP) {
-                // We still have more data to write, but the process closed the pipe.
-                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
-            }
-        }
-        let request_no = proc.n_requests;
-        proc.n_requests += 1;
-        drop(proc);
-
-        // To improve walredo performance we separate sending requests and receiving
-        // responses. Them are protected by different mutexes (output and input).
-        // If thread T1, T2, T3 send requests D1, D2, D3 to walredo process
-        // then there is not warranty that T1 will first granted output mutex lock.
-        // To address this issue we maintain number of sent requests, number of processed
-        // responses and ring buffer with pending responses. After sending response
-        // (under input mutex), threads remembers request number. Then it releases
-        // input mutex, locks output mutex and fetch in ring buffer all responses until
-        // its stored request number. The it takes correspondent element from
-        // pending responses ring buffer and truncate all empty elements from the front,
-        // advancing processed responses number.
-
-        let mut output = self.stdout.lock().unwrap();
-        let n_processed_responses = output.n_processed_responses;
-        while n_processed_responses + output.pending_responses.len() <= request_no {
-            // We expect the WAL redo process to respond with an 8k page image. We read it
-            // into this buffer.
-            let mut resultbuf = vec![0; BLCKSZ.into()];
-            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
-            while nresult < BLCKSZ.into() {
-                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
-                // We do two things simultaneously: reading response from stdout
-                // and forward any logging information that the child writes to its stderr to the page server's log.
-                let n = loop {
-                    match nix::poll::poll(
-                        &mut stdout_pollfds[..],
-                        wal_redo_timeout.as_millis() as i32,
-                    ) {
-                        Err(nix::errno::Errno::EINTR) => continue,
-                        res => break res,
-                    }
-                }?;
-
-                if n == 0 {
-                    anyhow::bail!("WAL redo timed out");
-                }
-
-                // If we have some data in stdout, read it to the result buffer.
-                let out_revents = stdout_pollfds[0].revents().unwrap();
-                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
-                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
-                }
-                if out_revents.contains(PollFlags::POLLHUP) {
-                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
-                }
-            }
-            output
-                .pending_responses
-                .push_back(Some(Bytes::from(resultbuf)));
-        }
-        // Replace our request's response with None in `pending_responses`.
-        // Then make space in the ring buffer by clearing out any seqence of contiguous
-        // `None`'s from the front of `pending_responses`.
-        // NB: We can't pop_front() because other requests' responses because another
-        // requester might have grabbed the output mutex before us:
-        // T1: grab input mutex
-        // T1: send request_no 23
-        // T1: release input mutex
-        // T2: grab input mutex
-        // T2: send request_no 24
-        // T2: release input mutex
-        // T2: grab output mutex
-        // T2: n_processed_responses + output.pending_responses.len() <= request_no
-        //            23                                0                   24
-        // T2: enters poll loop that reads stdout
-        // T2: put response for 23 into pending_responses
-        // T2: put response for 24 into pending_resposnes
-        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
-        // T2: takes its response_24
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Some(response_23) None Back
-        // T2: releases output mutex
-        // T1: grabs output mutex
-        // T1: n_processed_responses + output.pending_responses.len() > request_no
-        //            23                                2                   23
-        // T1: skips poll loop that reads stdout
-        // T1: takes its response_23
-        // pending_responses now looks like this: Front None None Back
-        // T2: does the while loop below
-        // pending_responses now looks like this: Front Back
-        // n_processed_responses now has value 25
-        let res = output.pending_responses[request_no - n_processed_responses]
-            .take()
-            .expect("we own this request_no, nobody else is supposed to take it");
-        while let Some(front) = output.pending_responses.front() {
-            if front.is_none() {
-                output.pending_responses.pop_front();
-                output.n_processed_responses += 1;
-            } else {
-                break;
-            }
-        }
-        Ok(res)
-    }
-
-    #[cfg(feature = "testing")]
-    fn record_and_log(&self, writebuf: &[u8]) {
-        use std::sync::atomic::Ordering;
-
-        let millis = std::time::SystemTime::now()
-            .duration_since(std::time::SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_millis();
-
-        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
-
-        // these files will be collected to an allure report
-        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
-
-        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
-
-        let res = std::fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .read(true)
-            .open(path)
-            .and_then(|mut f| f.write_all(writebuf));
-
-        // trip up allowed_errors
-        if let Err(e) = res {
-            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
-        } else {
-            tracing::error!(filename, "erroring walredo input saved");
        }
    }

-    #[cfg(not(feature = "testing"))]
-    fn record_and_log(&self, _: &[u8]) {}
-}
+    pub(crate) fn id(&self) -> u32 { // forwards to the `id()` of whichever implementation is wrapped
+        match self {
+            Process::Sync(p) => p.id(),
+            Process::Async(p) => p.id(),
+        }
+    }

-impl Drop for WalRedoProcess {
-    fn drop(&mut self) {
-        self.child
-            .take()
-            .expect("we only do this once")
-            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        // no way to wait for stderr_logger_task from Drop because that is async only
+    pub(crate) fn kind(&self) -> Kind { // maps the wrapped variant to the matching `Kind`
+        match self {
+            Process::Sync(_) => Kind::Sync,
+            Process::Async(_) => Kind::Async,
+        }
    }
 }
--- a/pageserver/src/walredo/process/process_impl/process_async.rs
+++ b/pageserver/src/walredo/process/process_impl/process_async.rs
@@ -0,0 +1,374 @@
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    walrecord::NeonWalRecord,
+    walredo::process::{no_leak_child, protocol},
+};
+use anyhow::Context;
+use bytes::Bytes;
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    process::{Command, Stdio},
+    time::Duration,
+};
+use tokio::io::{AsyncReadExt, AsyncWriteExt};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, poison::Poison};
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf, // only read by the `testing` build of `record_and_log`
+    tenant_shard_id: TenantShardId,
+    // `Some` from construction onwards; `Drop` takes it out to call `kill_and_wait` on it.
+    child: Option<NoLeakChild>,
+    stdout: tokio::sync::Mutex<Poison<ProcessOutput>>, // response side; locked after the stdin lock is released
+    stdin: tokio::sync::Mutex<Poison<ProcessInput>>, // request side; request numbers are assigned under this lock
+    /// Counter that keeps dump file names apart when same-sized walredo inputs fail in the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+struct ProcessInput {
+    stdin: tokio::process::ChildStdin, // pipe into the walredo child's stdin
+    n_requests: usize, // requests fully written so far; its value is the next request's number
+}
+
+struct ProcessOutput {
+    stdout: tokio::process::ChildStdout, // pipe from the walredo child's stdout
+    pending_responses: VecDeque<Option<Bytes>>, // responses read but not yet popped; `None` = already taken by its requester
+    n_processed_responses: usize, // responses popped off the front so far, i.e. the request number of `pending_responses[0]`
+}
+
+impl WalRedoProcess {
+    // Start the postgres binary in special WAL redo mode (`--wal-redo`) with piped
+    // stdin/stdout/stderr, and spawn a tokio task that logs every line the child
+    // writes to its stderr at error level. The child is killed if setup fails midway.
+    #[instrument(skip_all,fields(pg_version=pg_version))]
+    pub(crate) fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        crate::span::debug_assert_current_span_has_tenant_id();
+
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we have sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        let stdin =
+            tokio::process::ChildStdin::from_std(stdin).context("convert to tokio::ChildStdin")?;
+        let stdout = tokio::process::ChildStdout::from_std(stdout)
+            .context("convert to tokio::ChildStdout")?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: tokio::sync::Mutex::new(Poison::new(
+                "stdin",
+                ProcessInput {
+                    stdin,
+                    n_requests: 0,
+                },
+            )),
+            stdout: tokio::sync::Mutex::new(Poison::new(
+                "stdout",
+                ProcessOutput {
+                    stdout,
+                    pending_responses: VecDeque::new(),
+                    n_processed_responses: 0,
+                },
+            )),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    pub(crate) fn id(&self) -> u32 { // id of the child; recorded as `pid` in the tracing spans of this type
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    /// Apply the given WAL records (`records`) on top of an old page image (`base_img`, if any)
+    /// and return the new page image; fails if that does not complete within `wal_redo_timeout`.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// Cancellation safe.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    pub(crate) async fn apply_wal_records(
+        &self,
+        rel: RelTag,
+        blknum: u32,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let tag = protocol::BufferTag { rel, blknum };
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let Ok(res) =
+            tokio::time::timeout(wal_redo_timeout, self.apply_wal_records0(&writebuf)).await
+        else {
+            anyhow::bail!("WAL redo timed out");
+        };
+
+        if res.is_err() {
+            // Not every error is caused by this particular input, but errors are rare enough
+            // in tests that we capture the input for all of them.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    /// # Cancel-Safety
+    ///
+    /// When not polled to completion (e.g. because in `tokio::select!` another
+    /// branch becomes ready before this future), concurrent and subsequent
+    /// calls may fail due to [`utils::poison::Poison::check_and_arm`] calls.
+    /// Dispose of this process instance and create a new one.
+    async fn apply_wal_records0(&self, writebuf: &[u8]) -> anyhow::Result<Bytes> {
+        let request_no = {
+            let mut lock_guard = self.stdin.lock().await;
+            let mut poison_guard = lock_guard.check_and_arm()?;
+            let input = poison_guard.data_mut();
+            input
+                .stdin
+                .write_all(writebuf)
+                .await
+                .context("write to walredo stdin")?;
+            let request_no = input.n_requests;
+            input.n_requests += 1;
+            poison_guard.disarm();
+            request_no
+        };
+
+        // To improve walredo performance we separate sending requests from receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If tasks T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 is the first to be granted the output mutex.
+        // To address this we maintain the number of sent requests, the number of processed
+        // responses, and a ring buffer of pending responses. After sending its request
+        // (under the input mutex), a task remembers its request number. Then it releases
+        // the input mutex, locks the output mutex and reads responses into the ring buffer
+        // up to and including its own request number. Then it takes the corresponding element
+        // from the ring buffer and pops all empty elements off the front,
+        // advancing the number of processed responses.
+
+        let mut lock_guard = self.stdout.lock().await;
+        let mut poison_guard = lock_guard.check_and_arm()?;
+        let output = poison_guard.data_mut();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            output
+                .stdout
+                .read_exact(&mut resultbuf)
+                .await
+                .context("read walredo stdout")?;
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't just pop_front() our own response, because another
+        // requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters the loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips the loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        poison_guard.disarm();
+        Ok(res)
+    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // name = timestamp, input length, sequence number; these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        use std::io::Write;
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // both outcomes are deliberately logged at error level, to trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {} // no-op: failed inputs are only dumped when the `testing` feature is on
+}
+
+impl Drop for WalRedoProcess {
+    fn drop(&mut self) {
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        // The stderr_logger_task is not waited for here: waiting for it is async-only, which Drop cannot do.
+    }
+}
--- a/pageserver/src/walredo/process/process_impl/process_std.rs
+++ b/pageserver/src/walredo/process/process_impl/process_std.rs
@@ -0,0 +1,405 @@
+use self::no_leak_child::NoLeakChild;
+use crate::{
+    config::PageServerConf,
+    metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER},
+    walrecord::NeonWalRecord,
+    walredo::process::{no_leak_child, protocol},
+};
+use anyhow::Context;
+use bytes::Bytes;
+use nix::poll::{PollFd, PollFlags};
+use pageserver_api::{reltag::RelTag, shard::TenantShardId};
+use postgres_ffi::BLCKSZ;
+use std::os::fd::AsRawFd;
+#[cfg(feature = "testing")]
+use std::sync::atomic::AtomicUsize;
+use std::{
+    collections::VecDeque,
+    io::{Read, Write},
+    process::{ChildStdin, ChildStdout, Command, Stdio},
+    sync::{Mutex, MutexGuard},
+    time::Duration,
+};
+use tracing::{debug, error, instrument, Instrument};
+use utils::{lsn::Lsn, nonblock::set_nonblock};
+
+pub struct WalRedoProcess {
+    #[allow(dead_code)]
+    conf: &'static PageServerConf,
+    tenant_shard_id: TenantShardId,
+    // `Some` from construction onwards; only becomes `None` on Drop.
+    child: Option<NoLeakChild>,
+    stdout: Mutex<ProcessOutput>, // response side; locked only after the stdin guard has been dropped
+    stdin: Mutex<ProcessInput>, // request side; its guard is held while a request is serialized and written
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,
+}
+
+struct ProcessInput {
+    stdin: ChildStdin, // pipe into the walredo child's stdin (set non-blocking in `launch`)
+    n_requests: usize, // requests fully written so far; its value is the next request's number
+}
+
+struct ProcessOutput {
+    stdout: ChildStdout, // pipe from the walredo child's stdout (set non-blocking in `launch`)
+    pending_responses: VecDeque<Option<Bytes>>, // responses read from stdout, pushed in arrival order
+    n_processed_responses: usize, // added to `pending_responses.len()` to decide whether more responses must be read
+}
+
+impl WalRedoProcess {
+    // Start the postgres binary in special WAL redo mode (`--wal-redo`) with piped
+    // stdin/stdout/stderr, switch stdin and stdout to non-blocking via `set_nonblock`, and
+    // spawn a tokio task that logs the child's stderr. The child is killed if setup fails.
+    #[instrument(skip_all,fields(pg_version=pg_version))]
+    pub(crate) fn launch(
+        conf: &'static PageServerConf,
+        tenant_shard_id: TenantShardId,
+        pg_version: u32,
+    ) -> anyhow::Result<Self> {
+        crate::span::debug_assert_current_span_has_tenant_id();
+
+        let pg_bin_dir_path = conf.pg_bin_dir(pg_version).context("pg_bin_dir")?; // TODO these should be infallible.
+        let pg_lib_dir_path = conf.pg_lib_dir(pg_version).context("pg_lib_dir")?;
+
+        use no_leak_child::NoLeakChildCommandExt;
+        // Start postgres itself
+        let child = Command::new(pg_bin_dir_path.join("postgres"))
+            // the first arg must be --wal-redo so the child process enters into walredo mode
+            .arg("--wal-redo")
+            // the child doesn't process this arg, but having it in the argv helps identify the
+            // walredo process for a particular tenant when debugging a pageserver
+            .args(["--tenant-shard-id", &format!("{tenant_shard_id}")])
+            .stdin(Stdio::piped())
+            .stderr(Stdio::piped())
+            .stdout(Stdio::piped())
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &pg_lib_dir_path)
+            .env("DYLD_LIBRARY_PATH", &pg_lib_dir_path)
+            // NB: The redo process is not trusted after we have sent it the first
+            // walredo work. Before that, it is trusted. Specifically, we trust
+            // it to
+            // 1. close all file descriptors except stdin, stdout, stderr because
+            //    pageserver might not be 100% diligent in setting FD_CLOEXEC on all
+            //    the files it opens, and
+            // 2. to use seccomp to sandbox itself before processing the first
+            //    walredo request.
+            .spawn_no_leak_child(tenant_shard_id)
+            .context("spawn process")?;
+        WAL_REDO_PROCESS_COUNTERS.started.inc();
+        let mut child = scopeguard::guard(child, |child| {
+            error!("killing wal-redo-postgres process due to a problem during launch");
+            child.kill_and_wait(WalRedoKillCause::Startup);
+        });
+
+        let stdin = child.stdin.take().unwrap();
+        let stdout = child.stdout.take().unwrap();
+        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
+        macro_rules! set_nonblock_or_log_err {
+        ($file:ident) => {{
+            let res = set_nonblock($file.as_raw_fd());
+            if let Err(e) = &res {
+                error!(error = %e, file = stringify!($file), pid = child.id(), "set_nonblock failed");
+            }
+            res
+        }};
+    }
+        set_nonblock_or_log_err!(stdin)?;
+        set_nonblock_or_log_err!(stdout)?;
+
+        // all fallible operations post-spawn are complete, so get rid of the guard
+        let child = scopeguard::ScopeGuard::into_inner(child);
+
+        tokio::spawn(
+            async move {
+                scopeguard::defer! {
+                    debug!("wal-redo-postgres stderr_logger_task finished");
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                }
+                debug!("wal-redo-postgres stderr_logger_task started");
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
+                        }
+                        Err(e) => {
+                            break Err(e);
+                        }
+                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
+                }
+            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %pg_version))
+        );
+
+        Ok(Self {
+            conf,
+            tenant_shard_id,
+            child: Some(child),
+            stdin: Mutex::new(ProcessInput {
+                stdin,
+                n_requests: 0,
+            }),
+            stdout: Mutex::new(ProcessOutput {
+                stdout,
+                pending_responses: VecDeque::new(),
+                n_processed_responses: 0,
+            }),
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
+        })
+    }
+
+    pub(crate) fn id(&self) -> u32 { // id of the child; recorded as `pid` in the tracing spans of this type
+        self.child
+            .as_ref()
+            .expect("must not call this during Drop")
+            .id()
+    }
+
+    // Apply the given WAL records ('records') over an old page image and return the
+    // new page image. The stdin lock is taken up front, before the request is
+    // serialized, and the guard is handed on to `apply_wal_records0`.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), pid=%self.id()))]
+    pub(crate) async fn apply_wal_records(
+        &self,
+        rel: RelTag,
+        blknum: u32,
+        base_img: &Option<Bytes>,
+        records: &[(Lsn, NeonWalRecord)],
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let tag = protocol::BufferTag { rel, blknum };
+        let input = self.stdin.lock().unwrap();
+
+        // Serialize all the messages to send the WAL redo process first.
+        //
+        // This could be problematic if there are millions of records to replay,
+        // but in practice the number of records is usually so small that it doesn't
+        // matter, and it's better to keep this code simple.
+        //
+        // Most requests start with a before-image with BLCKSZ bytes, followed by
+        // some other WAL records. Start with a buffer that can hold that
+        // comfortably.
+        let mut writebuf: Vec<u8> = Vec::with_capacity((BLCKSZ as usize) * 3);
+        protocol::build_begin_redo_for_block_msg(tag, &mut writebuf);
+        if let Some(img) = base_img {
+            protocol::build_push_page_msg(tag, img, &mut writebuf);
+        }
+        for (lsn, rec) in records.iter() {
+            if let NeonWalRecord::Postgres {
+                will_init: _,
+                rec: postgres_rec,
+            } = rec
+            {
+                protocol::build_apply_record_msg(*lsn, postgres_rec, &mut writebuf);
+            } else {
+                anyhow::bail!("tried to pass neon wal record to postgres WAL redo");
+            }
+        }
+        protocol::build_get_page_msg(tag, &mut writebuf);
+        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);
+
+        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
+
+        if res.is_err() {
+            // Not every error is caused by this particular input, but errors are rare enough
+            // in tests that we capture the input for all of them.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    fn apply_wal_records0(
+        &self,
+        writebuf: &[u8],
+        input: MutexGuard<ProcessInput>,
+        wal_redo_timeout: Duration,
+    ) -> anyhow::Result<Bytes> {
+        let mut proc = { input }; // TODO: remove this legacy rename, but this keep the patch small.
+        let mut nwrite = 0usize;
+
+        while nwrite < writebuf.len() {
+            let mut stdin_pollfds = [PollFd::new(&proc.stdin, PollFlags::POLLOUT)];
+            let n = loop {
+                match nix::poll::poll(&mut stdin_pollfds[..], wal_redo_timeout.as_millis() as i32) {
+                    Err(nix::errno::Errno::EINTR) => continue,
+                    res => break res,
+                }
+            }?;
+
+            if n == 0 {
+                anyhow::bail!("WAL redo timed out");
+            }
+
+            // If 'stdin' is writeable, do write.
+            let in_revents = stdin_pollfds[0].revents().unwrap();
+            if in_revents & (PollFlags::POLLERR | PollFlags::POLLOUT) != PollFlags::empty() {
+                nwrite += proc.stdin.write(&writebuf[nwrite..])?;
+            }
+            if in_revents.contains(PollFlags::POLLHUP) {
+                // We still have more data to write, but the process closed the pipe.
+                anyhow::bail!("WAL redo process closed its stdin unexpectedly");
+            }
+        }
+        let request_no = proc.n_requests;
+        proc.n_requests += 1;
+        drop(proc);
+
+        // To improve walredo performance we separate sending requests from receiving
+        // responses. They are protected by different mutexes (input and output).
+        // If threads T1, T2, T3 send requests D1, D2, D3 to the walredo process,
+        // there is no guarantee that T1 is the first to be granted the output mutex.
+        // To address this we maintain the number of sent requests, the number of processed
+        // responses, and a ring buffer of pending responses. After sending its request
+        // (under the input mutex), a thread remembers its request number. Then it releases
+        // the input mutex, locks the output mutex and reads responses into the ring buffer
+        // up to and including its own request number. Then it takes the corresponding element
+        // from the ring buffer and pops all empty elements off the front,
+        // advancing the number of processed responses.
+
+        let mut output = self.stdout.lock().unwrap();
+        let n_processed_responses = output.n_processed_responses;
+        while n_processed_responses + output.pending_responses.len() <= request_no {
+            // We expect the WAL redo process to respond with an 8k page image. We read it
+            // into this buffer.
+            let mut resultbuf = vec![0; BLCKSZ.into()];
+            let mut nresult: usize = 0; // # of bytes read into 'resultbuf' so far
+            while nresult < BLCKSZ.into() {
+                let mut stdout_pollfds = [PollFd::new(&output.stdout, PollFlags::POLLIN)];
+                // We do two things simultaneously: reading response from stdout
+                // and forward any logging information that the child writes to its stderr to the page server's log.
+                let n = loop {
+                    match nix::poll::poll(
+                        &mut stdout_pollfds[..],
+                        wal_redo_timeout.as_millis() as i32,
+                    ) {
+                        Err(nix::errno::Errno::EINTR) => continue,
+                        res => break res,
+                    }
+                }?;
+
+                if n == 0 {
+                    anyhow::bail!("WAL redo timed out");
+                }
+
+                // If we have some data in stdout, read it to the result buffer.
+                let out_revents = stdout_pollfds[0].revents().unwrap();
+                if out_revents & (PollFlags::POLLERR | PollFlags::POLLIN) != PollFlags::empty() {
+                    nresult += output.stdout.read(&mut resultbuf[nresult..])?;
+                }
+                if out_revents.contains(PollFlags::POLLHUP) {
+                    anyhow::bail!("WAL redo process closed its stdout unexpectedly");
+                }
+            }
+            output
+                .pending_responses
+                .push_back(Some(Bytes::from(resultbuf)));
+        }
+        // Replace our request's response with None in `pending_responses`.
+        // Then make space in the ring buffer by clearing out any sequence of contiguous
+        // `None`'s from the front of `pending_responses`.
+        // NB: We can't just pop_front(), because the front may hold other requests'
+        // responses: another requester might have grabbed the output mutex before us:
+        // T1: grab input mutex
+        // T1: send request_no 23
+        // T1: release input mutex
+        // T2: grab input mutex
+        // T2: send request_no 24
+        // T2: release input mutex
+        // T2: grab output mutex
+        // T2: n_processed_responses + output.pending_responses.len() <= request_no
+        //            23                                0                   24
+        // T2: enters poll loop that reads stdout
+        // T2: put response for 23 into pending_responses
+        // T2: put response for 24 into pending_responses
+        // pending_responses now looks like this: Front Some(response_23) Some(response_24) Back
+        // T2: takes its response_24
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: does the while loop below
+        // pending_responses now looks like this: Front Some(response_23) None Back
+        // T2: releases output mutex
+        // T1: grabs output mutex
+        // T1: n_processed_responses + output.pending_responses.len() > request_no
+        //            23                                2                   23
+        // T1: skips poll loop that reads stdout
+        // T1: takes its response_23
+        // pending_responses now looks like this: Front None None Back
+        // T1: does the while loop below
+        // pending_responses now looks like this: Front Back
+        // n_processed_responses now has value 25
+        let res = output.pending_responses[request_no - n_processed_responses]
+            .take()
+            .expect("we own this request_no, nobody else is supposed to take it");
+        while let Some(front) = output.pending_responses.front() {
+            if front.is_none() {
+                output.pending_responses.pop_front();
+                output.n_processed_responses += 1;
+            } else {
+                break;
+            }
+        }
+        Ok(res)
+    }
+
+    /// Saves the bytes of a walredo request to a file and reports the outcome.
+    ///
+    /// The file is created inside the tenant directory and named
+    /// `walredo-<millis since epoch>-<request length>-<sequence number>.walredo`.
+    /// Both outcomes (saved, or failed to save) are logged at error level.
+    ///
+    /// Panics if the system clock reports a time before the Unix epoch.
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        use std::sync::atomic::Ordering;
+
+        let since_epoch = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap();
+        let millis = since_epoch.as_millis();
+
+        // Each dump gets its own sequence number, which ends up in the file name.
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+        let length = writebuf.len();
+
+        // these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{length}-{seq}.walredo");
+        let path = self.conf.tenant_path(&self.tenant_shard_id).join(&filename);
+
+        // `create_new` makes the open fail rather than overwrite an existing dump.
+        let mut options = std::fs::OpenOptions::new();
+        options.write(true).create_new(true).read(true);
+        let res = options.open(path).and_then(|mut f| f.write_all(writebuf));
+
+        // trip up allowed_errors
+        match res {
+            Err(e) => {
+                tracing::error!(target=%filename, length, "failed to write out the walredo errored input: {e}");
+            }
+            Ok(()) => {
+                tracing::error!(filename, "erroring walredo input saved");
+            }
+        }
+    }
+
+    /// No-op variant used when the `testing` feature is disabled: the request
+    /// bytes are ignored and nothing is written or logged.
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _writebuf: &[u8]) {}
+}
+
+impl Drop for WalRedoProcess {
+    /// Takes the child handle out of `self.child` and hands it to `kill_and_wait`,
+    /// recording the drop of the `WalRedoProcess` as the kill cause.
+    ///
+    /// Panics if the child handle has already been taken.
+    fn drop(&mut self) {
+        let cause = WalRedoKillCause::WalRedoProcessDrop;
+        self.child
+            .take()
+            .expect("we only do this once")
+            .kill_and_wait(cause);
+        // no way to wait for stderr_logger_task from Drop because that is async only
+    }
+}
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,88 +1,88 @@
-# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
-version = "3.9.2"
+version = "3.9.4"
 description = "Async http client/server framework (asyncio)"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:772fbe371788e61c58d6d3d904268e48a594ba866804d08c995ad71b144f94cb"},
-    {file = "aiohttp-3.9.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:edd4f1af2253f227ae311ab3d403d0c506c9b4410c7fc8d9573dec6d9740369f"},
-    {file = "aiohttp-3.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cfee9287778399fdef6f8a11c9e425e1cb13cc9920fd3a3df8f122500978292b"},
-    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3cc158466f6a980a6095ee55174d1de5730ad7dec251be655d9a6a9dd7ea1ff9"},
-    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:54ec82f45d57c9a65a1ead3953b51c704f9587440e6682f689da97f3e8defa35"},
-    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:abeb813a18eb387f0d835ef51f88568540ad0325807a77a6e501fed4610f864e"},
-    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc91d07280d7d169f3a0f9179d8babd0ee05c79d4d891447629ff0d7d8089ec2"},
-    {file = "aiohttp-3.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b65e861f4bebfb660f7f0f40fa3eb9f2ab9af10647d05dac824390e7af8f75b7"},
-    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:04fd8ffd2be73d42bcf55fd78cde7958eeee6d4d8f73c3846b7cba491ecdb570"},
-    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3d8d962b439a859b3ded9a1e111a4615357b01620a546bc601f25b0211f2da81"},
-    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:8ceb658afd12b27552597cf9a65d9807d58aef45adbb58616cdd5ad4c258c39e"},
-    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:0e4ee4df741670560b1bc393672035418bf9063718fee05e1796bf867e995fad"},
-    {file = "aiohttp-3.9.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:2dec87a556f300d3211decf018bfd263424f0690fcca00de94a837949fbcea02"},
-    {file = "aiohttp-3.9.2-cp310-cp310-win32.whl", hash = "sha256:3e1a800f988ce7c4917f34096f81585a73dbf65b5c39618b37926b1238cf9bc4"},
-    {file = "aiohttp-3.9.2-cp310-cp310-win_amd64.whl", hash = "sha256:ea510718a41b95c236c992b89fdfc3d04cc7ca60281f93aaada497c2b4e05c46"},
-    {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6aaa6f99256dd1b5756a50891a20f0d252bd7bdb0854c5d440edab4495c9f973"},
-    {file = "aiohttp-3.9.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a27d8c70ad87bcfce2e97488652075a9bdd5b70093f50b10ae051dfe5e6baf37"},
-    {file = "aiohttp-3.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:54287bcb74d21715ac8382e9de146d9442b5f133d9babb7e5d9e453faadd005e"},
-    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bb3d05569aa83011fcb346b5266e00b04180105fcacc63743fc2e4a1862a891"},
-    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c8534e7d69bb8e8d134fe2be9890d1b863518582f30c9874ed7ed12e48abe3c4"},
-    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bd9d5b989d57b41e4ff56ab250c5ddf259f32db17159cce630fd543376bd96b"},
-    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa6904088e6642609981f919ba775838ebf7df7fe64998b1a954fb411ffb4663"},
-    {file = "aiohttp-3.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:bda42eb410be91b349fb4ee3a23a30ee301c391e503996a638d05659d76ea4c2"},
-    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:193cc1ccd69d819562cc7f345c815a6fc51d223b2ef22f23c1a0f67a88de9a72"},
-    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b9f1cb839b621f84a5b006848e336cf1496688059d2408e617af33e3470ba204"},
-    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d22a0931848b8c7a023c695fa2057c6aaac19085f257d48baa24455e67df97ec"},
-    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:4112d8ba61fbd0abd5d43a9cb312214565b446d926e282a6d7da3f5a5aa71d36"},
-    {file = "aiohttp-3.9.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c4ad4241b52bb2eb7a4d2bde060d31c2b255b8c6597dd8deac2f039168d14fd7"},
-    {file = "aiohttp-3.9.2-cp311-cp311-win32.whl", hash = "sha256:ee2661a3f5b529f4fc8a8ffee9f736ae054adfb353a0d2f78218be90617194b3"},
-    {file = "aiohttp-3.9.2-cp311-cp311-win_amd64.whl", hash = "sha256:4deae2c165a5db1ed97df2868ef31ca3cc999988812e82386d22937d9d6fed52"},
-    {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6f4cdba12539215aaecf3c310ce9d067b0081a0795dd8a8805fdb67a65c0572a"},
-    {file = "aiohttp-3.9.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:84e843b33d5460a5c501c05539809ff3aee07436296ff9fbc4d327e32aa3a326"},
-    {file = "aiohttp-3.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8008d0f451d66140a5aa1c17e3eedc9d56e14207568cd42072c9d6b92bf19b52"},
-    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:61c47ab8ef629793c086378b1df93d18438612d3ed60dca76c3422f4fbafa792"},
-    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bc71f748e12284312f140eaa6599a520389273174b42c345d13c7e07792f4f57"},
-    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1c3a4d0ab2f75f22ec80bca62385db2e8810ee12efa8c9e92efea45c1849133"},
-    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9a87aa0b13bbee025faa59fa58861303c2b064b9855d4c0e45ec70182bbeba1b"},
-    {file = "aiohttp-3.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2cc0d04688b9f4a7854c56c18aa7af9e5b0a87a28f934e2e596ba7e14783192"},
-    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:1956e3ac376b1711c1533266dec4efd485f821d84c13ce1217d53e42c9e65f08"},
-    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:114da29f39eccd71b93a0fcacff178749a5c3559009b4a4498c2c173a6d74dff"},
-    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3f17999ae3927d8a9a823a1283b201344a0627272f92d4f3e3a4efe276972fe8"},
-    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f31df6a32217a34ae2f813b152a6f348154f948c83213b690e59d9e84020925c"},
-    {file = "aiohttp-3.9.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:7a75307ffe31329928a8d47eae0692192327c599113d41b278d4c12b54e1bd11"},
-    {file = "aiohttp-3.9.2-cp312-cp312-win32.whl", hash = "sha256:972b63d589ff8f305463593050a31b5ce91638918da38139b9d8deaba9e0fed7"},
-    {file = "aiohttp-3.9.2-cp312-cp312-win_amd64.whl", hash = "sha256:200dc0246f0cb5405c80d18ac905c8350179c063ea1587580e3335bfc243ba6a"},
-    {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:158564d0d1020e0d3fe919a81d97aadad35171e13e7b425b244ad4337fc6793a"},
-    {file = "aiohttp-3.9.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:da1346cd0ccb395f0ed16b113ebb626fa43b7b07fd7344fce33e7a4f04a8897a"},
-    {file = "aiohttp-3.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:eaa9256de26ea0334ffa25f1913ae15a51e35c529a1ed9af8e6286dd44312554"},
-    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1543e7fb00214fb4ccead42e6a7d86f3bb7c34751ec7c605cca7388e525fd0b4"},
-    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:186e94570433a004e05f31f632726ae0f2c9dee4762a9ce915769ce9c0a23d89"},
-    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d52d20832ac1560f4510d68e7ba8befbc801a2b77df12bd0cd2bcf3b049e52a4"},
-    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c45e4e815ac6af3b72ca2bde9b608d2571737bb1e2d42299fc1ffdf60f6f9a1"},
-    {file = "aiohttp-3.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aa906b9bdfd4a7972dd0628dbbd6413d2062df5b431194486a78f0d2ae87bd55"},
-    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:68bbee9e17d66f17bb0010aa15a22c6eb28583edcc8b3212e2b8e3f77f3ebe2a"},
-    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4c189b64bd6d9a403a1a3f86a3ab3acbc3dc41a68f73a268a4f683f89a4dec1f"},
-    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:8a7876f794523123bca6d44bfecd89c9fec9ec897a25f3dd202ee7fc5c6525b7"},
-    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d23fba734e3dd7b1d679b9473129cd52e4ec0e65a4512b488981a56420e708db"},
-    {file = "aiohttp-3.9.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:b141753be581fab842a25cb319f79536d19c2a51995d7d8b29ee290169868eab"},
-    {file = "aiohttp-3.9.2-cp38-cp38-win32.whl", hash = "sha256:103daf41ff3b53ba6fa09ad410793e2e76c9d0269151812e5aba4b9dd674a7e8"},
-    {file = "aiohttp-3.9.2-cp38-cp38-win_amd64.whl", hash = "sha256:328918a6c2835861ff7afa8c6d2c70c35fdaf996205d5932351bdd952f33fa2f"},
-    {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:5264d7327c9464786f74e4ec9342afbbb6ee70dfbb2ec9e3dfce7a54c8043aa3"},
-    {file = "aiohttp-3.9.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:07205ae0015e05c78b3288c1517afa000823a678a41594b3fdc870878d645305"},
-    {file = "aiohttp-3.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ae0a1e638cffc3ec4d4784b8b4fd1cf28968febc4bd2718ffa25b99b96a741bd"},
-    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d43302a30ba1166325974858e6ef31727a23bdd12db40e725bec0f759abce505"},
-    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:16a967685907003765855999af11a79b24e70b34dc710f77a38d21cd9fc4f5fe"},
-    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6fa3ee92cd441d5c2d07ca88d7a9cef50f7ec975f0117cd0c62018022a184308"},
-    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b500c5ad9c07639d48615a770f49618130e61be36608fc9bc2d9bae31732b8f"},
-    {file = "aiohttp-3.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c07327b368745b1ce2393ae9e1aafed7073d9199e1dcba14e035cc646c7941bf"},
-    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:cc7d6502c23a0ec109687bf31909b3fb7b196faf198f8cff68c81b49eb316ea9"},
-    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:07be2be7071723c3509ab5c08108d3a74f2181d4964e869f2504aaab68f8d3e8"},
-    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:122468f6fee5fcbe67cb07014a08c195b3d4c41ff71e7b5160a7bcc41d585a5f"},
-    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:00a9abcea793c81e7f8778ca195a1714a64f6d7436c4c0bb168ad2a212627000"},
-    {file = "aiohttp-3.9.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:7a9825fdd64ecac5c670234d80bb52bdcaa4139d1f839165f548208b3779c6c6"},
-    {file = "aiohttp-3.9.2-cp39-cp39-win32.whl", hash = "sha256:5422cd9a4a00f24c7244e1b15aa9b87935c85fb6a00c8ac9b2527b38627a9211"},
-    {file = "aiohttp-3.9.2-cp39-cp39-win_amd64.whl", hash = "sha256:7d579dcd5d82a86a46f725458418458fa43686f6a7b252f2966d359033ffc8ab"},
-    {file = "aiohttp-3.9.2.tar.gz", hash = "sha256:b0ad0a5e86ce73f5368a164c10ada10504bf91869c05ab75d982c6048217fbf7"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:76d32588ef7e4a3f3adff1956a0ba96faabbdee58f2407c122dd45aa6e34f372"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:56181093c10dbc6ceb8a29dfeea1e815e1dfdc020169203d87fd8d37616f73f9"},
+    {file = "aiohttp-3.9.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c7a5b676d3c65e88b3aca41816bf72831898fcd73f0cbb2680e9d88e819d1e4d"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1df528a85fb404899d4207a8d9934cfd6be626e30e5d3a5544a83dbae6d8a7e"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f595db1bceabd71c82e92df212dd9525a8a2c6947d39e3c994c4f27d2fe15b11"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9c0b09d76e5a4caac3d27752027fbd43dc987b95f3748fad2b924a03fe8632ad"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:689eb4356649ec9535b3686200b231876fb4cab4aca54e3bece71d37f50c1d13"},
+    {file = "aiohttp-3.9.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a3666cf4182efdb44d73602379a66f5fdfd5da0db5e4520f0ac0dcca644a3497"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b65b0f8747b013570eea2f75726046fa54fa8e0c5db60f3b98dd5d161052004a"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:a1885d2470955f70dfdd33a02e1749613c5a9c5ab855f6db38e0b9389453dce7"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0593822dcdb9483d41f12041ff7c90d4d1033ec0e880bcfaf102919b715f47f1"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:47f6eb74e1ecb5e19a78f4a4228aa24df7fbab3b62d4a625d3f41194a08bd54f"},
+    {file = "aiohttp-3.9.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c8b04a3dbd54de6ccb7604242fe3ad67f2f3ca558f2d33fe19d4b08d90701a89"},
+    {file = "aiohttp-3.9.4-cp310-cp310-win32.whl", hash = "sha256:8a78dfb198a328bfb38e4308ca8167028920fb747ddcf086ce706fbdd23b2926"},
+    {file = "aiohttp-3.9.4-cp310-cp310-win_amd64.whl", hash = "sha256:e78da6b55275987cbc89141a1d8e75f5070e577c482dd48bd9123a76a96f0bbb"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c111b3c69060d2bafc446917534150fd049e7aedd6cbf21ba526a5a97b4402a5"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:efbdd51872cf170093998c87ccdf3cb5993add3559341a8e5708bcb311934c94"},
+    {file = "aiohttp-3.9.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7bfdb41dc6e85d8535b00d73947548a748e9534e8e4fddd2638109ff3fb081df"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bd9d334412961125e9f68d5b73c1d0ab9ea3f74a58a475e6b119f5293eee7ba"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:35d78076736f4a668d57ade00c65d30a8ce28719d8a42471b2a06ccd1a2e3063"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:824dff4f9f4d0f59d0fa3577932ee9a20e09edec8a2f813e1d6b9f89ced8293f"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52b8b4e06fc15519019e128abedaeb56412b106ab88b3c452188ca47a25c4093"},
+    {file = "aiohttp-3.9.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:eae569fb1e7559d4f3919965617bb39f9e753967fae55ce13454bec2d1c54f09"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:69b97aa5792428f321f72aeb2f118e56893371f27e0b7d05750bcad06fc42ca1"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4d79aad0ad4b980663316f26d9a492e8fab2af77c69c0f33780a56843ad2f89e"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:d6577140cd7db19e430661e4b2653680194ea8c22c994bc65b7a19d8ec834403"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:9860d455847cd98eb67897f5957b7cd69fbcb436dd3f06099230f16a66e66f79"},
+    {file = "aiohttp-3.9.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:69ff36d3f8f5652994e08bd22f093e11cfd0444cea310f92e01b45a4e46b624e"},
+    {file = "aiohttp-3.9.4-cp311-cp311-win32.whl", hash = "sha256:e27d3b5ed2c2013bce66ad67ee57cbf614288bda8cdf426c8d8fe548316f1b5f"},
+    {file = "aiohttp-3.9.4-cp311-cp311-win_amd64.whl", hash = "sha256:d6a67e26daa686a6fbdb600a9af8619c80a332556245fa8e86c747d226ab1a1e"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:c5ff8ff44825736a4065d8544b43b43ee4c6dd1530f3a08e6c0578a813b0aa35"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:d12a244627eba4e9dc52cbf924edef905ddd6cafc6513849b4876076a6f38b0e"},
+    {file = "aiohttp-3.9.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dcad56c8d8348e7e468899d2fb3b309b9bc59d94e6db08710555f7436156097f"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f7e69a7fd4b5ce419238388e55abd220336bd32212c673ceabc57ccf3d05b55"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4870cb049f10d7680c239b55428916d84158798eb8f353e74fa2c98980dcc0b"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2feaf1b7031ede1bc0880cec4b0776fd347259a723d625357bb4b82f62687b"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:939393e8c3f0a5bcd33ef7ace67680c318dc2ae406f15e381c0054dd658397de"},
+    {file = "aiohttp-3.9.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d2334e387b2adcc944680bebcf412743f2caf4eeebd550f67249c1c3696be04"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e0198ea897680e480845ec0ffc5a14e8b694e25b3f104f63676d55bf76a82f1a"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:e40d2cd22914d67c84824045861a5bb0fb46586b15dfe4f046c7495bf08306b2"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:aba80e77c227f4234aa34a5ff2b6ff30c5d6a827a91d22ff6b999de9175d71bd"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:fb68dc73bc8ac322d2e392a59a9e396c4f35cb6fdbdd749e139d1d6c985f2527"},
+    {file = "aiohttp-3.9.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f3460a92638dce7e47062cf088d6e7663adb135e936cb117be88d5e6c48c9d53"},
+    {file = "aiohttp-3.9.4-cp312-cp312-win32.whl", hash = "sha256:32dc814ddbb254f6170bca198fe307920f6c1308a5492f049f7f63554b88ef36"},
+    {file = "aiohttp-3.9.4-cp312-cp312-win_amd64.whl", hash = "sha256:63f41a909d182d2b78fe3abef557fcc14da50c7852f70ae3be60e83ff64edba5"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:c3770365675f6be220032f6609a8fbad994d6dcf3ef7dbcf295c7ee70884c9af"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:305edae1dea368ce09bcb858cf5a63a064f3bff4767dec6fa60a0cc0e805a1d3"},
+    {file = "aiohttp-3.9.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6f121900131d116e4a93b55ab0d12ad72573f967b100e49086e496a9b24523ea"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b71e614c1ae35c3d62a293b19eface83d5e4d194e3eb2fabb10059d33e6e8cbf"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:419f009fa4cfde4d16a7fc070d64f36d70a8d35a90d71aa27670bba2be4fd039"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7b39476ee69cfe64061fd77a73bf692c40021f8547cda617a3466530ef63f947"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b33f34c9c7decdb2ab99c74be6443942b730b56d9c5ee48fb7df2c86492f293c"},
+    {file = "aiohttp-3.9.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c78700130ce2dcebb1a8103202ae795be2fa8c9351d0dd22338fe3dac74847d9"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:268ba22d917655d1259af2d5659072b7dc11b4e1dc2cb9662fdd867d75afc6a4"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:17e7c051f53a0d2ebf33013a9cbf020bb4e098c4bc5bce6f7b0c962108d97eab"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:7be99f4abb008cb38e144f85f515598f4c2c8932bf11b65add0ff59c9c876d99"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:d58a54d6ff08d2547656356eea8572b224e6f9bbc0cf55fa9966bcaac4ddfb10"},
+    {file = "aiohttp-3.9.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:7673a76772bda15d0d10d1aa881b7911d0580c980dbd16e59d7ba1422b2d83cd"},
+    {file = "aiohttp-3.9.4-cp38-cp38-win32.whl", hash = "sha256:e4370dda04dc8951012f30e1ce7956a0a226ac0714a7b6c389fb2f43f22a250e"},
+    {file = "aiohttp-3.9.4-cp38-cp38-win_amd64.whl", hash = "sha256:eb30c4510a691bb87081192a394fb661860e75ca3896c01c6d186febe7c88530"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:84e90494db7df3be5e056f91412f9fa9e611fbe8ce4aaef70647297f5943b276"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7d4845f8501ab28ebfdbeab980a50a273b415cf69e96e4e674d43d86a464df9d"},
+    {file = "aiohttp-3.9.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:69046cd9a2a17245c4ce3c1f1a4ff8c70c7701ef222fce3d1d8435f09042bba1"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b73a06bafc8dcc508420db43b4dd5850e41e69de99009d0351c4f3007960019"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:418bb0038dfafeac923823c2e63226179976c76f981a2aaad0ad5d51f2229bca"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:71a8f241456b6c2668374d5d28398f8e8cdae4cce568aaea54e0f39359cd928d"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:935c369bf8acc2dc26f6eeb5222768aa7c62917c3554f7215f2ead7386b33748"},
+    {file = "aiohttp-3.9.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4e48c8752d14ecfb36d2ebb3d76d614320570e14de0a3aa7a726ff150a03c"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:916b0417aeddf2c8c61291238ce25286f391a6acb6f28005dd9ce282bd6311b6"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9b6787b6d0b3518b2ee4cbeadd24a507756ee703adbac1ab6dc7c4434b8c572a"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:221204dbda5ef350e8db6287937621cf75e85778b296c9c52260b522231940ed"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:10afd99b8251022ddf81eaed1d90f5a988e349ee7d779eb429fb07b670751e8c"},
+    {file = "aiohttp-3.9.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:2506d9f7a9b91033201be9ffe7d89c6a54150b0578803cce5cb84a943d075bc3"},
+    {file = "aiohttp-3.9.4-cp39-cp39-win32.whl", hash = "sha256:e571fdd9efd65e86c6af2f332e0e95dad259bfe6beb5d15b3c3eca3a6eb5d87b"},
+    {file = "aiohttp-3.9.4-cp39-cp39-win_amd64.whl", hash = "sha256:7d29dd5319d20aa3b7749719ac9685fbd926f71ac8c77b2477272725f882072d"},
+    {file = "aiohttp-3.9.4.tar.gz", hash = "sha256:6ff71ede6d9a5a58cfb7b6fffc83ab5d4a63138276c771ac91ceaaddf5459644"},
 ]

 [package.dependencies]
@@ -1191,13 +1191,13 @@ files = [

 [[package]]
 name = "idna"
-version = "3.3"
+version = "3.7"
 description = "Internationalized Domain Names in Applications (IDNA)"
 optional = false
 python-versions = ">=3.5"
 files = [
-    {file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
-    {file = "idna-3.3.tar.gz", hash = "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d"},
+    {file = "idna-3.7-py3-none-any.whl", hash = "sha256:82fee1fc78add43492d3a1898bfa6d8a904cc97d8427f683ed8e798d07761aa0"},
+    {file = "idna-3.7.tar.gz", hash = "sha256:028ff3aadf0609c1fd278d8ea3089299412a7a8b9bd005dd08b9f8285bcb5cfc"},
 ]

 [[package]]
@@ -2182,6 +2182,7 @@ files = [
    {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
    {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
+    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
    {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
    {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
    {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
@@ -2652,6 +2653,16 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2889,4 +2900,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "df7161da4fdc3cba0a445176fc9dda2a0e8a53e13a7aa8a864385ca259381b41"
+content-hash = "b3452b50901123fd5f2c385ce8a0c1c492296393b8a7926a322b6df0ea3ac572"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -44,6 +44,7 @@ ipnet.workspace = true
 itertools.workspace = true
 lasso = { workspace = true, features = ["multi-threaded"] }
 md5.workspace = true
+measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
 opentelemetry.workspace = true
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -2,8 +2,15 @@ mod classic;
 mod hacks;
 mod link;

+use std::net::IpAddr;
+use std::sync::Arc;
+use std::time::Duration;
+
+use ipnet::{Ipv4Net, Ipv6Net};
 pub use link::LinkAuthError;
+use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_postgres::config::AuthKeys;
+use tracing::{info, warn};

 use crate::auth::credentials::check_peer_addr_is_in_list;
 use crate::auth::validate_password_and_exchange;
@@ -13,9 +20,10 @@ use crate::console::provider::{CachedRoleSecret, ConsoleBackend};
 use crate::console::{AuthSecret, NodeInfo};
 use crate::context::RequestMonitoring;
 use crate::intern::EndpointIdInt;
-use crate::metrics::{AUTH_RATE_LIMIT_HITS, ENDPOINTS_AUTH_RATE_LIMITED};
+use crate::metrics::Metrics;
 use crate::proxy::connect_compute::ComputeConnectBackend;
 use crate::proxy::NeonOptions;
+use crate::rate_limiter::{BucketRateLimiter, RateBucketInfo};
 use crate::stream::Stream;
 use crate::{
    auth::{self, ComputeUserInfoMaybeEndpoint},
@@ -28,9 +36,6 @@ use crate::{
    stream, url,
 };
 use crate::{scram, EndpointCacheKey, EndpointId, Normalize, RoleName};
-use std::sync::Arc;
-use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};

 /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality
 pub enum MaybeOwned<'a, T> {
@@ -176,11 +181,45 @@ impl TryFrom<ComputeUserInfoMaybeEndpoint> for ComputeUserInfo {
    }
 }

+#[derive(PartialEq, PartialOrd, Hash, Eq, Ord, Debug, Copy, Clone)]
+pub struct MaskedIp(IpAddr); // an IP address reduced to its subnet by `MaskedIp::new`
+
+impl MaskedIp {
+    fn new(value: IpAddr, prefix: u8) -> Self { // `prefix` is the subnet prefix length handed to `Ipv4Net`/`Ipv6Net`
+        match value {
+            IpAddr::V4(v4) => Self(IpAddr::V4(
+                Ipv4Net::new(v4, prefix).map_or(v4, |x| x.trunc().addr()), // if `Ipv4Net::new` fails, keep the address as-is
+            )),
+            IpAddr::V6(v6) => Self(IpAddr::V6(
+                Ipv6Net::new(v6, prefix).map_or(v6, |x| x.trunc().addr()), // same fallback for IPv6
+            )),
+        }
+    }
+}
+
+// The rate-limit key can't be just the IP, because that would throttle users of PaaS providers that share IP addresses;
+// it is therefore keyed per (endpoint, masked IP) pair.
+pub type AuthRateLimiter = BucketRateLimiter<(EndpointIdInt, MaskedIp)>;
+
+impl RateBucketInfo {
+    /// Default auth rate-limit buckets. All of these are per endpoint-maskedip pair.
+    /// Context: 4096 rounds of pbkdf2 take about 1ms of cpu time to execute (1 milli-cpu-second or 1mcpus).
+    ///
+    /// First bucket: 1000mcpus total per endpoint-ip pair, i.e. `1000 * 4096` hash rounds per second:
+    /// * 4096000 requests per second with 1 hash round.
+    /// * 1000 requests per second with 4096 hash rounds.
+    /// * 6.8 requests per second with 600000 hash rounds.
+    pub const DEFAULT_AUTH_SET: [Self; 3] = [
+        Self::new(1000 * 4096, Duration::from_secs(1)),
+        Self::new(600 * 4096, Duration::from_secs(60)), // second bucket: 600 * 4096 hash rounds per second over a 60s interval
+        Self::new(300 * 4096, Duration::from_secs(600)), // third bucket: 300 * 4096 hash rounds per second over a 600s interval
+    ];
+}
+
 impl AuthenticationConfig {
    pub fn check_rate_limit(
        &self,
-
        ctx: &mut RequestMonitoring,
+        config: &AuthenticationConfig,
        secret: AuthSecret,
        endpoint: &EndpointId,
        is_cleartext: bool,
@@ -201,17 +240,25 @@ impl AuthenticationConfig {
            1
        };

-        let limit_not_exceeded = self
-            .rate_limiter
-            .check((endpoint_int, ctx.peer_addr), password_weight);
+        let limit_not_exceeded = self.rate_limiter.check(
+            (
+                endpoint_int,
+                MaskedIp::new(ctx.peer_addr, config.rate_limit_ip_subnet),
+            ),
+            password_weight,
+        );

        if !limit_not_exceeded {
            warn!(
                enabled = self.rate_limiter_enabled,
                "rate limiting authentication"
            );
-            AUTH_RATE_LIMIT_HITS.inc();
-            ENDPOINTS_AUTH_RATE_LIMITED.measure(endpoint);
+            Metrics::get().proxy.requests_auth_rate_limits_total.inc();
+            Metrics::get()
+                .proxy
+                .endpoints_auth_rate_limits
+                .get_metric()
+                .measure(endpoint);

            if self.rate_limiter_enabled {
                return Err(auth::AuthError::too_many_connections());
@@ -267,6 +314,7 @@ async fn auth_quirks(
    let secret = match secret {
        Some(secret) => config.check_rate_limit(
            ctx,
+            config,
            secret,
            &info.endpoint,
            unauthenticated_password.is_some() || allow_cleartext,
@@ -469,7 +517,7 @@ impl ComputeConnectBackend for BackendType<'_, ComputeCredentials, &()> {

 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
+    use std::{net::IpAddr, sync::Arc, time::Duration};

    use bytes::BytesMut;
    use fallible_iterator::FallibleIterator;
@@ -482,7 +530,7 @@ mod tests {
    use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt};

    use crate::{
-        auth::{ComputeUserInfoMaybeEndpoint, IpPattern},
+        auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern},
        config::AuthenticationConfig,
        console::{
            self,
@@ -491,12 +539,12 @@ mod tests {
        },
        context::RequestMonitoring,
        proxy::NeonOptions,
-        rate_limiter::{AuthRateLimiter, RateBucketInfo},
+        rate_limiter::RateBucketInfo,
        scram::ServerSecret,
        stream::{PqStream, Stream},
    };

-    use super::auth_quirks;
+    use super::{auth_quirks, AuthRateLimiter};

    struct Auth {
        ips: Vec<IpPattern>,
@@ -537,6 +585,7 @@ mod tests {
        scram_protocol_timeout: std::time::Duration::from_secs(5),
        rate_limiter_enabled: true,
        rate_limiter: AuthRateLimiter::new(&RateBucketInfo::DEFAULT_AUTH_SET),
+        rate_limit_ip_subnet: 64,
    });

    async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage {
@@ -548,6 +597,51 @@ mod tests {
        }
    }

+    #[test]
+    fn masked_ip() { // checks which address pairs collapse to the same `MaskedIp` for a given prefix length
+        let ip_a = IpAddr::V4([127, 0, 0, 1].into());
+        let ip_b = IpAddr::V4([127, 0, 0, 2].into());
+        let ip_c = IpAddr::V4([192, 168, 1, 101].into());
+        let ip_d = IpAddr::V4([192, 168, 1, 102].into());
+        let ip_e = IpAddr::V6("abcd:abcd:abcd:abcd:abcd:abcd:abcd:abcd".parse().unwrap());
+        let ip_f = IpAddr::V6("abcd:abcd:abcd:abcd:1234:abcd:abcd:abcd".parse().unwrap()); // differs from ip_e only in the fifth group
+
+        assert_ne!(MaskedIp::new(ip_a, 64), MaskedIp::new(ip_b, 64)); // 64 exceeds an IPv4 address's 32 bits; presumably the unmasked fallback applies — TODO confirm
+        assert_ne!(MaskedIp::new(ip_a, 32), MaskedIp::new(ip_b, 32)); // a /32 keeps every bit, so distinct addresses stay distinct
+        assert_eq!(MaskedIp::new(ip_a, 30), MaskedIp::new(ip_b, 30)); // a /30 drops the low two bits: .1 and .2 both become .0
+        assert_eq!(MaskedIp::new(ip_c, 30), MaskedIp::new(ip_d, 30)); // .101 and .102 both become .100
+
+        assert_ne!(MaskedIp::new(ip_e, 128), MaskedIp::new(ip_f, 128)); // a /128 keeps every bit
+        assert_eq!(MaskedIp::new(ip_e, 64), MaskedIp::new(ip_f, 64)); // a /64 keeps only the first four groups, which match
+    }
+
+    #[test]
+    fn test_default_auth_rate_limit_set() {
+        // Regression guard: these values used to exceed u32::MAX.
+        assert_eq!(
+            RateBucketInfo::DEFAULT_AUTH_SET,
+            [
+                RateBucketInfo {
+                    interval: Duration::from_secs(1),
+                    max_rpi: 1000 * 4096,
+                },
+                RateBucketInfo {
+                    interval: Duration::from_secs(60),
+                    max_rpi: 600 * 4096 * 60, // expected limit is the per-second rate times the interval in seconds
+                },
+                RateBucketInfo {
+                    interval: Duration::from_secs(600),
+                    max_rpi: 300 * 4096 * 600,
+                }
+            ]
+        );
+
+        for x in RateBucketInfo::DEFAULT_AUTH_SET { // each default must survive a `to_string` -> `parse` round trip
+            let y = x.to_string().parse().unwrap();
+            assert_eq!(x, y);
+        }
+    }
+
    #[tokio::test]
    async fn auth_quirks_scram() {
        let (mut client, server) = tokio::io::duplex(1024);
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -4,7 +4,7 @@ use crate::{
    auth::password_hack::parse_endpoint_param,
    context::RequestMonitoring,
    error::{ReportableError, UserFacingError},
-    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI,
+    metrics::{Metrics, SniKind},
    proxy::NeonOptions,
    serverless::SERVERLESS_DRIVER_SNI,
    EndpointId, RoleName,
@@ -144,21 +144,22 @@ impl ComputeUserInfoMaybeEndpoint {
            ctx.set_endpoint_id(ep.clone());
        }

+        let metrics = Metrics::get();
        info!(%user, "credentials");
        if sni.is_some() {
            info!("Connection with sni");
-            NUM_CONNECTION_ACCEPTED_BY_SNI
-                .with_label_values(&["sni"])
-                .inc();
+            metrics.proxy.accepted_connections_by_sni.inc(SniKind::Sni);
        } else if endpoint.is_some() {
-            NUM_CONNECTION_ACCEPTED_BY_SNI
-                .with_label_values(&["no_sni"])
-                .inc();
+            metrics
+                .proxy
+                .accepted_connections_by_sni
+                .inc(SniKind::NoSni);
            info!("Connection without sni");
        } else {
-            NUM_CONNECTION_ACCEPTED_BY_SNI
-                .with_label_values(&["password_hack"])
-                .inc();
+            metrics
+                .proxy
+                .accepted_connections_by_sni
+                .inc(SniKind::PasswordHack);
            info!("Connection with password hack");
        }

--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -9,15 +9,13 @@ use futures::future::Either;
 use itertools::Itertools;
 use proxy::config::TlsServerEndPoint;
 use proxy::context::RequestMonitoring;
-use proxy::proxy::run_until_cancelled;
-use proxy::{BranchId, EndpointId, ProjectId};
+use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled};
 use rustls::pki_types::PrivateKeyDer;
 use tokio::net::TcpListener;

 use anyhow::{anyhow, bail, ensure, Context};
 use clap::Arg;
 use futures::TryFutureExt;
-use proxy::console::messages::MetricsAuxInfo;
 use proxy::stream::{PqStream, Stream};

 use tokio::io::{AsyncRead, AsyncWrite};
@@ -176,7 +174,12 @@ async fn task_main(
                    .context("failed to set socket option")?;

                info!(%peer_addr, "serving");
-                let ctx = RequestMonitoring::new(session_id, peer_addr.ip(), "sni_router", "sni");
+                let ctx = RequestMonitoring::new(
+                    session_id,
+                    peer_addr.ip(),
+                    proxy::metrics::Protocol::SniRouter,
+                    "sni",
+                );
                handle_client(ctx, dest_suffix, tls_config, tls_server_end_point, socket).await
            }
            .unwrap_or_else(|e| {
@@ -199,6 +202,7 @@ async fn task_main(
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";

 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
+    ctx: &mut RequestMonitoring,
    raw_stream: S,
    tls_config: Arc<rustls::ServerConfig>,
    tls_server_end_point: TlsServerEndPoint,
@@ -228,7 +232,10 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
            }

            Ok(Stream::Tls {
-                tls: Box::new(raw.upgrade(tls_config).await?),
+                tls: Box::new(
+                    raw.upgrade(tls_config, !ctx.has_private_peer_addr())
+                        .await?,
+                ),
                tls_server_end_point,
            })
        }
@@ -251,7 +258,7 @@ async fn handle_client(
    tls_server_end_point: TlsServerEndPoint,
    stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
-    let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?;
+    let mut tls_stream = ssl_handshake(&mut ctx, stream, tls_config, tls_server_end_point).await?;

    // Cut off first part of the SNI domain
    // We receive required destination details in the format of
@@ -268,18 +275,15 @@ async fn handle_client(

    info!("destination: {}", destination);

-    let client = tokio::net::TcpStream::connect(destination).await?;
-
-    let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
-        endpoint_id: (&EndpointId::from("")).into(),
-        project_id: (&ProjectId::from("")).into(),
-        branch_id: (&BranchId::from("")).into(),
-        cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
-    };
+    let mut client = tokio::net::TcpStream::connect(destination).await?;

    // doesn't yet matter as pg-sni-router doesn't report analytics logs
    ctx.set_success();
    ctx.log();

-    proxy::proxy::passthrough::proxy_pass(tls_stream, client, metrics_aux).await
+    // Starting from here we only proxy the client's traffic.
+    info!("performing the proxy pass...");
+    let _ = copy_bidirectional_client_compute(&mut tls_stream, &mut client).await?;
+
+    Ok(())
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -7,6 +7,7 @@ use aws_config::provider_config::ProviderConfig;
 use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
 use futures::future::Either;
 use proxy::auth;
+use proxy::auth::backend::AuthRateLimiter;
 use proxy::auth::backend::MaybeOwned;
 use proxy::cancellation::CancelMap;
 use proxy::cancellation::CancellationHandler;
@@ -18,11 +19,10 @@ use proxy::config::ProjectInfoCacheOptions;
 use proxy::console;
 use proxy::context::parquet::ParquetUploadArgs;
 use proxy::http;
-use proxy::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT;
-use proxy::rate_limiter::AuthRateLimiter;
+use proxy::http::health_server::AppMetrics;
+use proxy::metrics::Metrics;
 use proxy::rate_limiter::EndpointRateLimiter;
 use proxy::rate_limiter::RateBucketInfo;
-use proxy::rate_limiter::RateLimiterConfig;
 use proxy::redis::cancellation_publisher::RedisPublisherClient;
 use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
 use proxy::redis::elasticache;
@@ -42,6 +42,7 @@ use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
+use tracing::Instrument;
 use utils::{project_build_tag, project_git_version, sentry_init::init_sentry};

 project_git_version!(GIT_VERSION);
@@ -131,14 +132,8 @@ struct ProxyCliArgs {
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_dynamic_rate_limiter: bool,
-    /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
-    #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
-    rate_limit_algorithm: proxy::rate_limiter::RateLimitAlgorithm,
-    /// Timeout for rate limiter. If it didn't manage to aquire a permit in this time, it will return an error.
-    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
-    rate_limiter_timeout: tokio::time::Duration,
    /// Endpoint rate limiter max number of requests per second.
    ///
    /// Provided in the form '<Requests Per Second>@<Bucket Duration Size>'.
@@ -151,14 +146,12 @@ struct ProxyCliArgs {
    /// Authentication rate limiter max number of hashes per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_AUTH_SET)]
    auth_rate_limit: Vec<RateBucketInfo>,
+    /// The IP subnet prefix length (in bits) used to decide whether two IP addresses count as the same for auth rate limiting.
+    #[clap(long, default_value_t = 64)]
+    auth_rate_limit_ip_subnet: u8,
    /// Redis rate limiter max number of requests per second.
    #[clap(long, default_values_t = RateBucketInfo::DEFAULT_ENDPOINT_SET)]
    redis_rps_limit: Vec<RateBucketInfo>,
-    /// Initial limit for dynamic rate limiter. Makes sense only if `rate_limit_algorithm` is *not* `None`.
-    #[clap(long, default_value_t = 100)]
-    initial_limit: usize,
-    #[clap(flatten)]
-    aimd_config: proxy::rate_limiter::AimdConfig,
    /// cache for `allowed_ips` (use `size=0` to disable)
    #[clap(long, default_value = config::CacheOptions::CACHE_DEFAULT_OPTIONS)]
    allowed_ips_cache: String,
@@ -207,6 +200,12 @@ struct ProxyCliArgs {
    /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression.
    #[clap(long, default_value = "4194304")]
    metric_backup_collection_chunk_size: usize,
+    /// Whether to retry the connection to the compute node
+    #[clap(long, default_value = config::RetryConfig::CONNECT_TO_COMPUTE_DEFAULT_VALUES)]
+    connect_to_compute_retry: String,
+    /// Whether to retry the wake_compute request
+    #[clap(long, default_value = config::RetryConfig::WAKE_COMPUTE_DEFAULT_VALUES)]
+    wake_compute_retry: String,
 }

 #[derive(clap::Args, Clone, Copy, Debug)]
@@ -251,14 +250,18 @@ async fn main() -> anyhow::Result<()> {

    info!("Version: {GIT_VERSION}");
    info!("Build_tag: {BUILD_TAG}");
-    ::metrics::set_build_info_metric(GIT_VERSION, BUILD_TAG);
+    let neon_metrics = ::metrics::NeonMetrics::new(::metrics::BuildInfo {
+        revision: GIT_VERSION,
+        build_tag: BUILD_TAG,
+    });

-    match proxy::jemalloc::MetricRecorder::new(prometheus::default_registry()) {
-        Ok(t) => {
-            t.start();
+    let jemalloc = match proxy::jemalloc::MetricRecorder::new() {
+        Ok(t) => Some(t),
+        Err(e) => {
+            tracing::error!(error = ?e, "could not start jemalloc metrics loop");
+            None
        }
-        Err(e) => tracing::error!(error = ?e, "could not start jemalloc metrics loop"),
-    }
+    };

    let args = ProxyCliArgs::parse();
    let config = build_config(&args)?;
@@ -298,27 +301,27 @@ async fn main() -> anyhow::Result<()> {
        ),
        aws_credentials_provider,
    ));
-    let redis_notifications_client =
-        match (args.redis_notifications, (args.redis_host, args.redis_port)) {
-            (Some(url), _) => {
-                info!("Starting redis notifications listener ({url})");
-                Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
-            }
-            (None, (Some(host), Some(port))) => Some(
-                ConnectionWithCredentialsProvider::new_with_credentials_provider(
-                    host,
-                    port,
-                    elasticache_credentials_provider.clone(),
-                ),
+    let regional_redis_client = match (args.redis_host, args.redis_port) {
+        (Some(host), Some(port)) => Some(
+            ConnectionWithCredentialsProvider::new_with_credentials_provider(
+                host,
+                port,
+                elasticache_credentials_provider.clone(),
            ),
-            (None, (None, None)) => {
-                warn!("Redis is disabled");
-                None
-            }
-            _ => {
-                bail!("redis-host and redis-port must be specified together");
-            }
-        };
+        ),
+        (None, None) => {
+            warn!("Redis events from console are disabled");
+            None
+        }
+        _ => {
+            bail!("redis-host and redis-port must be specified together");
+        }
+    };
+    let redis_notifications_client = if let Some(url) = args.redis_notifications {
+        Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url))
+    } else {
+        regional_redis_client.clone()
+    };

    // Check that we can bind to address before further initialization
    let http_address: SocketAddr = args.http.parse()?;
@@ -334,11 +337,9 @@ async fn main() -> anyhow::Result<()> {
    let proxy_listener = TcpListener::bind(proxy_address).await?;
    let cancellation_token = CancellationToken::new();

-    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
    let cancel_map = CancelMap::default();

-    // let redis_notifications_client = redis_notifications_client.map(|x| Box::leak(Box::new(x)));
-    let redis_publisher = match &redis_notifications_client {
+    let redis_publisher = match &regional_redis_client {
        Some(redis_publisher) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
            redis_publisher.clone(),
            args.region.clone(),
@@ -351,7 +352,7 @@ async fn main() -> anyhow::Result<()> {
    >::new(
        cancel_map.clone(),
        redis_publisher,
-        NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT,
+        proxy::metrics::CancellationSource::FromClient,
    ));

    // client facing tasks. these will exit on error or on cancellation
@@ -361,7 +362,6 @@ async fn main() -> anyhow::Result<()> {
        config,
        proxy_listener,
        cancellation_token.clone(),
-        endpoint_rate_limiter.clone(),
        cancellation_handler.clone(),
    ));

@@ -376,7 +376,6 @@ async fn main() -> anyhow::Result<()> {
            config,
            serverless_listener,
            cancellation_token.clone(),
-            endpoint_rate_limiter.clone(),
            cancellation_handler.clone(),
        ));
    }
@@ -389,7 +388,14 @@ async fn main() -> anyhow::Result<()> {
    // maintenance tasks. these never return unless there's an error
    let mut maintenance_tasks = JoinSet::new();
    maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone()));
-    maintenance_tasks.spawn(http::health_server::task_main(http_listener));
+    maintenance_tasks.spawn(http::health_server::task_main(
+        http_listener,
+        AppMetrics {
+            jemalloc,
+            neon_metrics,
+            proxy: proxy::metrics::Metrics::get(),
+        },
+    ));
    maintenance_tasks.spawn(console::mgmt::task_main(mgmt_listener));

    if let Some(metrics_config) = &config.metric_collection {
@@ -397,28 +403,45 @@ async fn main() -> anyhow::Result<()> {
        maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
        client_tasks.spawn(usage_metrics::task_backup(
            &metrics_config.backup_metric_collection_config,
-            cancellation_token,
+            cancellation_token.clone(),
        ));
    }

    if let auth::BackendType::Console(api, _) = &config.auth_backend {
        if let proxy::console::provider::ConsoleBackend::Console(api) = &**api {
-            maintenance_tasks.spawn(api.locks.garbage_collect_worker());
            if let Some(redis_notifications_client) = redis_notifications_client {
                let cache = api.caches.project_info.clone();
                maintenance_tasks.spawn(notifications::task_main(
-                    redis_notifications_client.clone(),
+                    redis_notifications_client,
                    cache.clone(),
                    cancel_map.clone(),
                    args.region.clone(),
                ));
                maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
+            }
+            if let Some(regional_redis_client) = regional_redis_client {
                let cache = api.caches.endpoints_cache.clone();
-                let con = redis_notifications_client.clone();
-                maintenance_tasks.spawn(async move { cache.do_read(con).await });
+                let con = regional_redis_client;
+                let span = tracing::info_span!("endpoints_cache");
+                maintenance_tasks.spawn(
+                    async move { cache.do_read(con, cancellation_token.clone()).await }
+                        .instrument(span),
+                );
            }
        }
    }
+    tokio::spawn(tokio::time::timeout( // NOTE(review): detached log-flooding task; looks like load-test scaffolding — confirm it is not meant to ship
+        args.scram_protocol_timeout, // the flooding future is cut off once this timeout elapses
+        async move {
+            let mut interval = tokio::time::interval(std::time::Duration::from_secs(1));
+            loop {
+                interval.tick().await;
+                for _ in 0..12000 { // 12000 info-level log lines per tick
+                    info!("I am writing really a lot of logs here!!!!!");
+                }
+            }
+        },
+    ));

    let maintenance = loop {
        // get one complete task
@@ -482,13 +505,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
             and metric-collection-interval must be specified"
        ),
    };
-    let rate_limiter_config = RateLimiterConfig {
-        disable: args.disable_dynamic_rate_limiter,
-        algorithm: args.rate_limit_algorithm,
-        timeout: args.rate_limiter_timeout,
-        initial_limit: args.initial_limit,
-        aimd_config: Some(args.aimd_config),
-    };
+    if !args.disable_dynamic_rate_limiter {
+        bail!("dynamic rate limiter should be disabled");
+    }

    let auth_backend = match &args.auth_backend {
        AuthBackend::Console => {
@@ -517,14 +536,26 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
            } = args.wake_compute_lock.parse()?;
            info!(permits, shards, ?epoch, "Using NodeLocks (wake_compute)");
            let locks = Box::leak(Box::new(
-                console::locks::ApiLocks::new("wake_compute_lock", permits, shards, timeout, epoch)
-                    .unwrap(),
+                console::locks::ApiLocks::new(
+                    "wake_compute_lock",
+                    permits,
+                    shards,
+                    timeout,
+                    epoch,
+                    &Metrics::get().wake_compute_lock,
+                )
+                .unwrap(),
            ));
+            tokio::spawn(locks.garbage_collect_worker());

            let url = args.auth_endpoint.parse()?;
-            let endpoint = http::Endpoint::new(url, http::new_client(rate_limiter_config));
+            let endpoint = http::Endpoint::new(url, http::new_client());

-            let api = console::provider::neon::Api::new(endpoint, caches, locks);
+            let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
+            RateBucketInfo::validate(&mut endpoint_rps_limit)?;
+            let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(endpoint_rps_limit));
+            let api =
+                console::provider::neon::Api::new(endpoint, caches, locks, endpoint_rate_limiter);
            let api = console::provider::ConsoleBackend::Console(api);
            auth::BackendType::Console(MaybeOwned::Owned(api), ())
        }
@@ -555,10 +586,9 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        scram_protocol_timeout: args.scram_protocol_timeout,
        rate_limiter_enabled: args.auth_rate_limit_enabled,
        rate_limiter: AuthRateLimiter::new(args.auth_rate_limit.clone()),
+        rate_limit_ip_subnet: args.auth_rate_limit_ip_subnet,
    };

-    let mut endpoint_rps_limit = args.endpoint_rps_limit.clone();
-    RateBucketInfo::validate(&mut endpoint_rps_limit)?;
    let mut redis_rps_limit = args.redis_rps_limit.clone();
    RateBucketInfo::validate(&mut redis_rps_limit)?;

@@ -571,11 +601,14 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        authentication_config,
        require_client_ip: args.require_client_ip,
        disable_ip_check_for_http: args.disable_ip_check_for_http,
-        endpoint_rps_limit,
        redis_rps_limit,
        handshake_timeout: args.handshake_timeout,
        region: args.region.clone(),
        aws_region: args.aws_region.clone(),
+        wake_compute_retry_config: config::RetryConfig::parse(&args.wake_compute_retry)?,
+        connect_to_compute_retry_config: config::RetryConfig::parse(
+            &args.connect_to_compute_retry,
+        )?,
    }));

    Ok(config)
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -4,6 +4,7 @@ use std::{
        atomic::{AtomicBool, Ordering},
        Arc,
    },
+    time::Duration,
 };

 use dashmap::DashSet;
@@ -13,23 +14,36 @@ use redis::{
 };
 use serde::Deserialize;
 use tokio::sync::Mutex;
+use tokio_util::sync::CancellationToken;
+use tracing::info;

 use crate::{
    config::EndpointCacheConfig,
    context::RequestMonitoring,
    intern::{BranchIdInt, EndpointIdInt, ProjectIdInt},
-    metrics::REDIS_BROKEN_MESSAGES,
+    metrics::{Metrics, RedisErrors},
    rate_limiter::GlobalRateLimiter,
    redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider,
-    EndpointId, Normalize,
+    EndpointId,
 };

 #[derive(Deserialize, Debug, Clone)]
-#[serde(rename_all(deserialize = "snake_case"))]
-pub enum ControlPlaneEventKey {
-    EndpointCreated,
-    BranchCreated,
-    ProjectCreated,
+pub struct ControlPlaneEventKey {
+    endpoint_created: Option<EndpointCreated>,
+    branch_created: Option<BranchCreated>,
+    project_created: Option<ProjectCreated>,
+}
+#[derive(Deserialize, Debug, Clone)]
+struct EndpointCreated {
+    endpoint_id: String,
+}
+#[derive(Deserialize, Debug, Clone)]
+struct BranchCreated {
+    branch_id: String,
+}
+#[derive(Deserialize, Debug, Clone)]
+struct ProjectCreated {
+    project_id: String,
 }

 pub struct EndpointsCache {
@@ -58,23 +72,19 @@ impl EndpointsCache {
        if !self.ready.load(Ordering::Acquire) {
            return true;
        }
-        // If cache is disabled, just collect the metrics and return.
-        if self.config.disable_cache {
-            ctx.set_rejected(self.should_reject(endpoint));
-            return true;
-        }
-        // If the limiter allows, we don't need to check the cache.
-        if self.limiter.lock().await.check() {
-            return true;
-        }
        let rejected = self.should_reject(endpoint);
        ctx.set_rejected(rejected);
+        info!(?rejected, "check endpoint is valid, disabled cache");
+        // If cache is disabled, just collect the metrics and return or
+        // If the limiter allows, we don't need to check the cache.
+        if self.config.disable_cache || self.limiter.lock().await.check() {
+            return true;
+        }
        !rejected
    }
    fn should_reject(&self, endpoint: &EndpointId) -> bool {
-        let endpoint = endpoint.normalize();
        if endpoint.is_endpoint() {
-            !self.endpoints.contains(&EndpointIdInt::from(&endpoint))
+            !self.endpoints.contains(&EndpointIdInt::from(endpoint))
        } else if endpoint.is_branch() {
            !self
                .branches
@@ -85,34 +95,42 @@ impl EndpointsCache {
                .contains(&ProjectIdInt::from(&endpoint.as_project()))
        }
    }
-    fn insert_event(&self, key: ControlPlaneEventKey, value: String) {
+    fn insert_event(&self, key: ControlPlaneEventKey) {
        // Do not do normalization here, we expect the events to be normalized.
-        match key {
-            ControlPlaneEventKey::EndpointCreated => {
-                self.endpoints.insert(EndpointIdInt::from(&value.into()));
-            }
-            ControlPlaneEventKey::BranchCreated => {
-                self.branches.insert(BranchIdInt::from(&value.into()));
-            }
-            ControlPlaneEventKey::ProjectCreated => {
-                self.projects.insert(ProjectIdInt::from(&value.into()));
-            }
+        if let Some(endpoint_created) = key.endpoint_created {
+            self.endpoints
+                .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into()));
+        }
+        if let Some(branch_created) = key.branch_created {
+            self.branches
+                .insert(BranchIdInt::from(&branch_created.branch_id.into()));
+        }
+        if let Some(project_created) = key.project_created {
+            self.projects
+                .insert(ProjectIdInt::from(&project_created.project_id.into()));
        }
    }
    pub async fn do_read(
        &self,
        mut con: ConnectionWithCredentialsProvider,
+        cancellation_token: CancellationToken,
    ) -> anyhow::Result<Infallible> {
        let mut last_id = "0-0".to_string();
        loop {
-            self.ready.store(false, Ordering::Release);
            if let Err(e) = con.connect().await {
                tracing::error!("error connecting to redis: {:?}", e);
-                continue;
+                self.ready.store(false, Ordering::Release);
            }
            if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
                tracing::error!("error reading from redis: {:?}", e);
+                self.ready.store(false, Ordering::Release);
            }
+            if cancellation_token.is_cancelled() {
+                info!("cancellation token is cancelled, exiting");
+                tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await;
+                // 1 week.
+            }
+            tokio::time::sleep(self.config.retry_interval).await;
        }
    }
    async fn read_from_stream(
@@ -133,15 +151,16 @@ impl EndpointsCache {
        self.batch_read(
            con,
            StreamReadOptions::default()
-                .count(self.config.initial_batch_size)
+                .count(self.config.default_batch_size)
                .block(self.config.xread_timeout.as_millis() as usize),
            last_id,
            false,
        )
        .await
    }
-    fn parse_key_value(key: &str, value: &Value) -> anyhow::Result<(ControlPlaneEventKey, String)> {
-        Ok((serde_json::from_str(key)?, String::from_redis_value(value)?))
+    fn parse_key_value(value: &Value) -> anyhow::Result<ControlPlaneEventKey> {
+        let s: String = FromRedisValue::from_redis_value(value)?;
+        Ok(serde_json::from_str(&s)?)
    }
    async fn batch_read(
        &self,
@@ -155,37 +174,62 @@ impl EndpointsCache {
            let mut res: StreamReadReply = conn
                .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts)
                .await?;
+
+            if res.keys.is_empty() {
+                if return_when_finish {
+                    if total != 0 {
+                        break;
+                    }
+                    anyhow::bail!(
+                        "Redis stream {} is empty, cannot be used to filter endpoints",
+                        self.config.stream_name
+                    );
+                }
+                // If we are not returning when finish, we should wait for more data.
+                continue;
+            }
            if res.keys.len() != 1 {
                anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
            }

            let res = res.keys.pop().expect("Checked length above");
-
-            if return_when_finish && res.ids.len() <= self.config.default_batch_size {
-                break;
-            }
+            let len = res.ids.len();
            for x in res.ids {
                total += 1;
-                for (k, v) in x.map {
-                    let (key, value) = match Self::parse_key_value(&k, &v) {
+                for (_, v) in x.map {
+                    let key = match Self::parse_key_value(&v) {
                        Ok(x) => x,
                        Err(e) => {
-                            REDIS_BROKEN_MESSAGES
-                                .with_label_values(&[&self.config.stream_name])
-                                .inc();
-                            tracing::error!("error parsing key-value {k}-{v:?}: {e:?}");
+                            Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
+                                channel: &self.config.stream_name,
+                            });
+                            tracing::error!("error parsing value {v:?}: {e:?}");
                            continue;
                        }
                    };
-                    self.insert_event(key, value);
+                    self.insert_event(key);
                }
                if total.is_power_of_two() {
                    tracing::debug!("endpoints read {}", total);
                }
                *last_id = x.id;
            }
+            if return_when_finish && len <= self.config.default_batch_size {
+                break;
+            }
        }
        tracing::info!("read {} endpoints/branches/projects from redis", total);
        Ok(())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::ControlPlaneEventKey;
+
+    #[test]
+    fn test() {
+        let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}";
+        let _: ControlPlaneEventKey = serde_json::from_str(s).unwrap();
+    }
+}
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -16,7 +16,7 @@ use crate::{
    config::ProjectInfoCacheOptions,
    console::AuthSecret,
    intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
-    EndpointCacheKey, EndpointId, RoleName,
+    EndpointId, RoleName,
 };

 use super::{Cache, Cached};
@@ -196,7 +196,7 @@ impl ProjectInfoCacheImpl {
    }
    pub fn get_allowed_ips(
        &self,
-        endpoint_id: &EndpointCacheKey,
+        endpoint_id: &EndpointId,
    ) -> Option<Cached<&Self, Arc<Vec<IpPattern>>>> {
        let endpoint_id = EndpointIdInt::get(endpoint_id)?;
        let (valid_since, ignore_cache_since) = self.get_cache_times();
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -10,7 +10,7 @@ use uuid::Uuid;

 use crate::{
    error::ReportableError,
-    metrics::NUM_CANCELLATION_REQUESTS,
+    metrics::{CancellationRequest, CancellationSource, Metrics},
    redis::cancellation_publisher::{
        CancellationPublisher, CancellationPublisherMut, RedisPublisherClient,
    },
@@ -28,7 +28,7 @@ pub struct CancellationHandler<P> {
    client: P,
    /// This field used for the monitoring purposes.
    /// Represents the source of the cancellation request.
-    from: &'static str,
+    from: CancellationSource,
 }

 #[derive(Debug, Error)]
@@ -89,9 +89,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
        // NB: we should immediately release the lock after cloning the token.
        let Some(cancel_closure) = self.map.get(&key).and_then(|x| x.clone()) else {
            tracing::warn!("query cancellation key not found: {key}");
-            NUM_CANCELLATION_REQUESTS
-                .with_label_values(&[self.from, "not_found"])
-                .inc();
+            Metrics::get()
+                .proxy
+                .cancellation_requests_total
+                .inc(CancellationRequest {
+                    source: self.from,
+                    kind: crate::metrics::CancellationOutcome::NotFound,
+                });
            match self.client.try_publish(key, session_id).await {
                Ok(()) => {} // do nothing
                Err(e) => {
@@ -103,9 +107,13 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
            }
            return Ok(());
        };
-        NUM_CANCELLATION_REQUESTS
-            .with_label_values(&[self.from, "found"])
-            .inc();
+        Metrics::get()
+            .proxy
+            .cancellation_requests_total
+            .inc(CancellationRequest {
+                source: self.from,
+                kind: crate::metrics::CancellationOutcome::Found,
+            });
        info!("cancelling query per user's request using key {key}");
        cancel_closure.try_cancel_query().await
    }
@@ -122,7 +130,7 @@ impl<P: CancellationPublisher> CancellationHandler<P> {
 }

 impl CancellationHandler<()> {
-    pub fn new(map: CancelMap, from: &'static str) -> Self {
+    pub fn new(map: CancelMap, from: CancellationSource) -> Self {
        Self {
            map,
            client: (),
@@ -132,7 +140,7 @@ impl CancellationHandler<()> {
 }

 impl<P: CancellationPublisherMut> CancellationHandler<Option<Arc<Mutex<P>>>> {
-    pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: &'static str) -> Self {
+    pub fn new(map: CancelMap, client: Option<Arc<Mutex<P>>>, from: CancellationSource) -> Self {
        Self { map, client, from }
    }
 }
@@ -192,15 +200,13 @@ impl<P> Drop for Session<P> {

 #[cfg(test)]
 mod tests {
-    use crate::metrics::NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS;
-
    use super::*;

    #[tokio::test]
    async fn check_session_drop() -> anyhow::Result<()> {
        let cancellation_handler = Arc::new(CancellationHandler::<()>::new(
            CancelMap::default(),
-            NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS,
+            CancellationSource::FromRedis,
        ));

        let session = cancellation_handler.clone().get_session();
@@ -214,7 +220,7 @@ mod tests {

    #[tokio::test]
    async fn cancel_session_noop_regression() {
-        let handler = CancellationHandler::<()>::new(Default::default(), "local");
+        let handler = CancellationHandler::<()>::new(Default::default(), CancellationSource::Local);
        handler
            .cancel_session(
                CancelKeyData {
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -4,12 +4,11 @@ use crate::{
    console::{errors::WakeComputeError, messages::MetricsAuxInfo},
    context::RequestMonitoring,
    error::{ReportableError, UserFacingError},
-    metrics::NUM_DB_CONNECTIONS_GAUGE,
+    metrics::{Metrics, NumDbConnectionsGuard},
    proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
-use metrics::IntCounterPairGuard;
 use pq_proto::StartupMessageParams;
 use std::{io, net::SocketAddr, time::Duration};
 use thiserror::Error;
@@ -249,7 +248,7 @@ pub struct PostgresConnection {
    /// Labels for proxy's metrics.
    pub aux: MetricsAuxInfo,

-    _guage: IntCounterPairGuard,
+    _guage: NumDbConnectionsGuard<'static>,
 }

 impl ConnCfg {
@@ -295,9 +294,7 @@ impl ConnCfg {
            params,
            cancel_closure,
            aux,
-            _guage: NUM_DB_CONNECTIONS_GAUGE
-                .with_label_values(&[ctx.protocol])
-                .guard(),
+            _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol),
        };

        Ok(connection)
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,6 +1,6 @@
 use crate::{
-    auth,
-    rate_limiter::{AuthRateLimiter, RateBucketInfo},
+    auth::{self, backend::AuthRateLimiter},
+    rate_limiter::RateBucketInfo,
    serverless::GlobalConnPoolOptions,
 };
 use anyhow::{bail, ensure, Context, Ok};
@@ -29,11 +29,12 @@ pub struct ProxyConfig {
    pub authentication_config: AuthenticationConfig,
    pub require_client_ip: bool,
    pub disable_ip_check_for_http: bool,
-    pub endpoint_rps_limit: Vec<RateBucketInfo>,
    pub redis_rps_limit: Vec<RateBucketInfo>,
    pub region: String,
    pub handshake_timeout: Duration,
    pub aws_region: String,
+    pub wake_compute_retry_config: RetryConfig,
+    pub connect_to_compute_retry_config: RetryConfig,
 }

 #[derive(Debug)]
@@ -58,6 +59,7 @@ pub struct AuthenticationConfig {
    pub scram_protocol_timeout: tokio::time::Duration,
    pub rate_limiter_enabled: bool,
    pub rate_limiter: AuthRateLimiter,
+    pub rate_limit_ip_subnet: u8,
 }

 impl TlsConfig {
@@ -328,13 +330,15 @@ pub struct EndpointCacheConfig {
    /// Disable cache.
    /// If true, cache is ignored, but reports all statistics.
    pub disable_cache: bool,
+    /// Retry interval for the stream read operation.
+    pub retry_interval: Duration,
 }

 impl EndpointCacheConfig {
    /// Default options for [`crate::console::provider::NodeInfoCache`].
    /// Notice that by default the limiter is empty, which means that cache is disabled.
    pub const CACHE_DEFAULT_OPTIONS: &'static str =
-        "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s";
+        "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";

    /// Parse cache options passed via cmdline.
    /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
@@ -345,6 +349,7 @@ impl EndpointCacheConfig {
        let mut stream_name = None;
        let mut limiter_info = vec![];
        let mut disable_cache = false;
+        let mut retry_interval = None;

        for option in options.split(',') {
            let (key, value) = option
@@ -358,6 +363,7 @@ impl EndpointCacheConfig {
                "stream_name" => stream_name = Some(value.to_string()),
                "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?),
                "disable_cache" => disable_cache = value.parse()?,
+                "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?),
                unknown => bail!("unknown key: {unknown}"),
            }
        }
@@ -370,6 +376,7 @@ impl EndpointCacheConfig {
            stream_name: stream_name.context("missing `stream_name`")?,
            disable_cache,
            limiter_info,
+            retry_interval: retry_interval.context("missing `retry_interval`")?,
        })
    }
 }
@@ -512,6 +519,59 @@ impl FromStr for ProjectInfoCacheOptions {
    }
 }

+/// This is a config for connect to compute and wake compute.
+#[derive(Clone, Copy, Debug)]
+pub struct RetryConfig {
+    /// Number of times we should retry.
+    pub max_retries: u32,
+    /// Retry duration is base_delay * backoff_factor ^ n, where n starts at 0
+    pub base_delay: tokio::time::Duration,
+    /// Exponential base for retry wait duration
+    pub backoff_factor: f64,
+}
+
+impl RetryConfig {
+    /// Default options for RetryConfig.
+
+    /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
+    pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str =
+        "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
+    /// Total delay for 4 retries with 1s base delay and 2.0 backoff factor is 7s.
+    /// Cplane has timeout of 60s on each request.
+    pub const WAKE_COMPUTE_DEFAULT_VALUES: &'static str =
+        "num_retries=4,base_retry_wait_duration=1s,retry_wait_exponent_base=2.0";
+
+    /// Parse retry options passed via cmdline.
+    /// Example: [`Self::CONNECT_TO_COMPUTE_DEFAULT_VALUES`].
+    pub fn parse(options: &str) -> anyhow::Result<Self> {
+        let mut num_retries = None;
+        let mut base_retry_wait_duration = None;
+        let mut retry_wait_exponent_base = None;
+
+        for option in options.split(',') {
+            let (key, value) = option
+                .split_once('=')
+                .with_context(|| format!("bad key-value pair: {option}"))?;
+
+            match key {
+                "num_retries" => num_retries = Some(value.parse()?),
+                "base_retry_wait_duration" => {
+                    base_retry_wait_duration = Some(humantime::parse_duration(value)?)
+                }
+                "retry_wait_exponent_base" => retry_wait_exponent_base = Some(value.parse()?),
+                unknown => bail!("unknown key: {unknown}"),
+            }
+        }
+
+        Ok(Self {
+            max_retries: num_retries.context("missing `num_retries`")?,
+            base_delay: base_retry_wait_duration.context("missing `base_retry_wait_duration`")?,
+            backoff_factor: retry_wait_exponent_base
+                .context("missing `retry_wait_exponent_base`")?,
+        })
+    }
+}
+
 /// Helper for cmdline cache options parsing.
 pub struct WakeComputeLockOptions {
    /// The number of shards the lock map should have
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,3 +1,4 @@
+use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
 use std::fmt;

@@ -102,7 +103,7 @@ pub struct MetricsAuxInfo {
    pub cold_start_info: ColdStartInfo,
 }

-#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
+#[derive(Debug, Default, Serialize, Deserialize, Clone, Copy, FixedCardinalityLabel)]
 #[serde(rename_all = "snake_case")]
 pub enum ColdStartInfo {
    #[default]
@@ -110,9 +111,11 @@ pub enum ColdStartInfo {
    /// Compute was already running
    Warm,
    #[serde(rename = "pool_hit")]
+    #[label(rename = "pool_hit")]
    /// Compute was not running but there was an available VM
    VmPoolHit,
    #[serde(rename = "pool_miss")]
+    #[label(rename = "pool_miss")]
    /// Compute was not running and there were no VMs available
    VmPoolMiss,

--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -13,10 +13,11 @@ use crate::{
    config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions},
    context::RequestMonitoring,
    intern::ProjectIdInt,
+    metrics::ApiLockMetrics,
    scram, EndpointCacheKey,
 };
 use dashmap::DashMap;
-use std::{convert::Infallible, sync::Arc, time::Duration};
+use std::{sync::Arc, time::Duration};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio::time::Instant;
 use tracing::info;
@@ -207,6 +208,9 @@ pub mod errors {
        #[error(transparent)]
        ApiError(ApiError),

+        #[error("Too many connections attempts")]
+        TooManyConnections,
+
        #[error("Timeout waiting to acquire wake compute lock")]
        TimeoutError,
    }
@@ -239,6 +243,8 @@ pub mod errors {
                // However, API might return a meaningful error.
                ApiError(e) => e.to_string_client(),

+                TooManyConnections => self.to_string(),
+
                TimeoutError => "timeout while acquiring the compute resource lock".to_owned(),
            }
        }
@@ -249,6 +255,7 @@ pub mod errors {
            match self {
                WakeComputeError::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane,
                WakeComputeError::ApiError(e) => e.get_error_kind(),
+                WakeComputeError::TooManyConnections => crate::error::ErrorKind::RateLimit,
                WakeComputeError::TimeoutError => crate::error::ErrorKind::ServiceRateLimit,
            }
        }
@@ -446,10 +453,7 @@ pub struct ApiLocks {
    permits: usize,
    timeout: Duration,
    epoch: std::time::Duration,
-    registered: prometheus::IntCounter,
-    unregistered: prometheus::IntCounter,
-    reclamation_lag: prometheus::Histogram,
-    lock_acquire_lag: prometheus::Histogram,
+    metrics: &'static ApiLockMetrics,
 }

 impl ApiLocks {
@@ -459,55 +463,15 @@ impl ApiLocks {
        shards: usize,
        timeout: Duration,
        epoch: std::time::Duration,
+        metrics: &'static ApiLockMetrics,
    ) -> prometheus::Result<Self> {
-        let registered = prometheus::IntCounter::with_opts(
-            prometheus::Opts::new(
-                "semaphores_registered",
-                "Number of semaphores registered in this api lock",
-            )
-            .namespace(name),
-        )?;
-        prometheus::register(Box::new(registered.clone()))?;
-        let unregistered = prometheus::IntCounter::with_opts(
-            prometheus::Opts::new(
-                "semaphores_unregistered",
-                "Number of semaphores unregistered in this api lock",
-            )
-            .namespace(name),
-        )?;
-        prometheus::register(Box::new(unregistered.clone()))?;
-        let reclamation_lag = prometheus::Histogram::with_opts(
-            prometheus::HistogramOpts::new(
-                "reclamation_lag_seconds",
-                "Time it takes to reclaim unused semaphores in the api lock",
-            )
-            .namespace(name)
-            // 1us -> 65ms
-            // benchmarks on my mac indicate it's usually in the range of 256us and 512us
-            .buckets(prometheus::exponential_buckets(1e-6, 2.0, 16)?),
-        )?;
-        prometheus::register(Box::new(reclamation_lag.clone()))?;
-        let lock_acquire_lag = prometheus::Histogram::with_opts(
-            prometheus::HistogramOpts::new(
-                "semaphore_acquire_seconds",
-                "Time it takes to reclaim unused semaphores in the api lock",
-            )
-            .namespace(name)
-            // 0.1ms -> 6s
-            .buckets(prometheus::exponential_buckets(1e-4, 2.0, 16)?),
-        )?;
-        prometheus::register(Box::new(lock_acquire_lag.clone()))?;
-
        Ok(Self {
            name,
            node_locks: DashMap::with_shard_amount(shards),
            permits,
            timeout,
            epoch,
-            lock_acquire_lag,
-            registered,
-            unregistered,
-            reclamation_lag,
+            metrics,
        })
    }

@@ -527,7 +491,7 @@ impl ApiLocks {
                self.node_locks
                    .entry(key.clone())
                    .or_insert_with(|| {
-                        self.registered.inc();
+                        self.metrics.semaphores_registered.inc();
                        Arc::new(Semaphore::new(self.permits))
                    })
                    .clone()
@@ -535,15 +499,19 @@ impl ApiLocks {
        };
        let permit = tokio::time::timeout_at(now + self.timeout, semaphore.acquire_owned()).await;

-        self.lock_acquire_lag
-            .observe((Instant::now() - now).as_secs_f64());
+        self.metrics
+            .semaphore_acquire_seconds
+            .observe(now.elapsed().as_secs_f64());

        Ok(WakeComputePermit {
            permit: Some(permit??),
        })
    }

-    pub async fn garbage_collect_worker(&self) -> anyhow::Result<Infallible> {
+    pub async fn garbage_collect_worker(&self) {
+        if self.permits == 0 {
+            return;
+        }
        let mut interval =
            tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32);
        loop {
@@ -558,13 +526,13 @@ impl ApiLocks {
                    "performing epoch reclamation on api lock"
                );
                let mut lock = shard.write();
-                let timer = self.reclamation_lag.start_timer();
+                let timer = self.metrics.reclamation_lag_seconds.start_timer();
                let count = lock
                    .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1)
                    .count();
                drop(lock);
-                self.unregistered.inc_by(count as u64);
-                timer.observe_duration()
+                self.metrics.semaphores_unregistered.inc_by(count as u64);
+                timer.observe();
            }
        }
    }
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -7,14 +7,15 @@ use super::{
    NodeInfo,
 };
 use crate::{
-    auth::backend::ComputeUserInfo, compute, console::messages::ColdStartInfo, http, scram,
-    Normalize,
-};
-use crate::{
-    cache::Cached,
-    context::RequestMonitoring,
-    metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER},
+    auth::backend::ComputeUserInfo,
+    compute,
+    console::messages::ColdStartInfo,
+    http,
+    metrics::{CacheOutcome, Metrics},
+    rate_limiter::EndpointRateLimiter,
+    scram, Normalize,
 };
+use crate::{cache::Cached, context::RequestMonitoring};
 use futures::TryFutureExt;
 use std::sync::Arc;
 use tokio::time::Instant;
@@ -25,6 +26,7 @@ pub struct Api {
    endpoint: http::Endpoint,
    pub caches: &'static ApiCaches,
    pub locks: &'static ApiLocks,
+    pub endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    jwt: String,
 }

@@ -34,6 +36,7 @@ impl Api {
        endpoint: http::Endpoint,
        caches: &'static ApiCaches,
        locks: &'static ApiLocks,
+        endpoint_rate_limiter: Arc<EndpointRateLimiter>,
    ) -> Self {
        let jwt: String = match std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") {
            Ok(v) => v,
@@ -43,6 +46,7 @@ impl Api {
            endpoint,
            caches,
            locks,
+            endpoint_rate_limiter,
            jwt,
        }
    }
@@ -59,7 +63,7 @@ impl Api {
        if !self
            .caches
            .endpoints_cache
-            .is_valid(ctx, &user_info.endpoint)
+            .is_valid(ctx, &user_info.endpoint.normalize())
            .await
        {
            info!("endpoint is not valid, skipping the request");
@@ -68,7 +72,7 @@ impl Api {
        let request_id = ctx.session_id.to_string();
        let application_name = ctx.console_application_name();
        async {
-            let mut request_builder = self
+            let request = self
                .endpoint
                .get("proxy_get_role_secret")
                .header("X-Request-ID", &request_id)
@@ -78,14 +82,8 @@ impl Api {
                    ("application_name", application_name.as_str()),
                    ("project", user_info.endpoint.as_str()),
                    ("role", user_info.user.as_str()),
-                ]);
-
-            let options = user_info.options.to_deep_object();
-            if !options.is_empty() {
-                request_builder = request_builder.query(&options);
-            }
-
-            let request = request_builder.build()?;
+                ])
+                .build()?;

            info!(url = request.url().as_str(), "sending http request");
            let start = Instant::now();
@@ -113,7 +111,10 @@ impl Api {
                Some(secret)
            };
            let allowed_ips = body.allowed_ips.unwrap_or_default();
-            ALLOWED_IPS_NUMBER.observe(allowed_ips.len() as f64);
+            Metrics::get()
+                .proxy
+                .allowed_ips_number
+                .observe(allowed_ips.len() as f64);
            Ok(AuthInfo {
                secret,
                allowed_ips,
@@ -192,23 +193,27 @@ impl super::Api for Api {
        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<CachedRoleSecret, GetAuthInfoError> {
-        let ep = &user_info.endpoint;
+        let normalized_ep = &user_info.endpoint.normalize();
        let user = &user_info.user;
-        if let Some(role_secret) = self.caches.project_info.get_role_secret(ep, user) {
+        if let Some(role_secret) = self
+            .caches
+            .project_info
+            .get_role_secret(normalized_ep, user)
+        {
            return Ok(role_secret);
        }
        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
        if let Some(project_id) = auth_info.project_id {
-            let ep_int = ep.normalize().into();
+            let normalized_ep_int = normalized_ep.into();
            self.caches.project_info.insert_role_secret(
                project_id,
-                ep_int,
+                normalized_ep_int,
                user.into(),
                auth_info.secret.clone(),
            );
            self.caches.project_info.insert_allowed_ips(
                project_id,
-                ep_int,
+                normalized_ep_int,
                Arc::new(auth_info.allowed_ips),
            );
            ctx.set_project_id(project_id);
@@ -222,30 +227,34 @@ impl super::Api for Api {
        ctx: &mut RequestMonitoring,
        user_info: &ComputeUserInfo,
    ) -> Result<(CachedAllowedIps, Option<CachedRoleSecret>), GetAuthInfoError> {
-        let cache_key = user_info.endpoint_cache_key();
-        if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(&cache_key) {
-            ALLOWED_IPS_BY_CACHE_OUTCOME
-                .with_label_values(&["hit"])
-                .inc();
+        let normalized_ep = &user_info.endpoint.normalize();
+        if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) {
+            Metrics::get()
+                .proxy
+                .allowed_ips_cache_misses
+                .inc(CacheOutcome::Hit);
            return Ok((allowed_ips, None));
        }
-        ALLOWED_IPS_BY_CACHE_OUTCOME
-            .with_label_values(&["miss"])
-            .inc();
+        Metrics::get()
+            .proxy
+            .allowed_ips_cache_misses
+            .inc(CacheOutcome::Miss);
        let auth_info = self.do_get_auth_info(ctx, user_info).await?;
        let allowed_ips = Arc::new(auth_info.allowed_ips);
        let user = &user_info.user;
        if let Some(project_id) = auth_info.project_id {
-            let ep_int = cache_key.normalize().into();
+            let normalized_ep_int = normalized_ep.into();
            self.caches.project_info.insert_role_secret(
                project_id,
-                ep_int,
+                normalized_ep_int,
                user.into(),
                auth_info.secret.clone(),
            );
-            self.caches
-                .project_info
-                .insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
+            self.caches.project_info.insert_allowed_ips(
+                project_id,
+                normalized_ep_int,
+                allowed_ips.clone(),
+            );
            ctx.set_project_id(project_id);
        }
        Ok((
@@ -272,6 +281,14 @@ impl super::Api for Api {
            return Ok(cached);
        }

+        // check rate limit
+        if !self
+            .endpoint_rate_limiter
+            .check(user_info.endpoint.normalize().into(), 1)
+        {
+            return Err(WakeComputeError::TooManyConnections);
+        }
+
        let permit = self.locks.get_wake_compute_permit(&key).await?;

        // after getting back a permit - it's possible the cache was filled
--- a/proxy/src/context.rs
+++ b/proxy/src/context.rs
@@ -5,16 +5,14 @@ use once_cell::sync::OnceCell;
 use smol_str::SmolStr;
 use std::net::IpAddr;
 use tokio::sync::mpsc;
-use tracing::{field::display, info_span, Span};
+use tracing::{field::display, info, info_span, Span};
 use uuid::Uuid;

 use crate::{
    console::messages::{ColdStartInfo, MetricsAuxInfo},
    error::ErrorKind,
    intern::{BranchIdInt, ProjectIdInt},
-    metrics::{
-        bool_to_str, LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND, NUM_INVALID_ENDPOINTS,
-    },
+    metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol},
    DbName, EndpointId, RoleName,
 };

@@ -31,7 +29,7 @@ static LOG_CHAN: OnceCell<mpsc::WeakUnboundedSender<RequestData>> = OnceCell::ne
 pub struct RequestMonitoring {
    pub peer_addr: IpAddr,
    pub session_id: Uuid,
-    pub protocol: &'static str,
+    pub protocol: Protocol,
    first_packet: chrono::DateTime<Utc>,
    region: &'static str,
    pub span: Span,
@@ -53,7 +51,7 @@ pub struct RequestMonitoring {
    sender: Option<mpsc::UnboundedSender<RequestData>>,
    pub latency_timer: LatencyTimer,
    // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
-    rejected: bool,
+    rejected: Option<bool>,
 }

 #[derive(Clone, Debug)]
@@ -69,7 +67,7 @@ impl RequestMonitoring {
    pub fn new(
        session_id: Uuid,
        peer_addr: IpAddr,
-        protocol: &'static str,
+        protocol: Protocol,
        region: &'static str,
    ) -> Self {
        let span = info_span!(
@@ -78,6 +76,7 @@ impl RequestMonitoring {
            ?session_id,
            %peer_addr,
            ep = tracing::field::Empty,
+            role = tracing::field::Empty,
        );

        Self {
@@ -97,7 +96,7 @@ impl RequestMonitoring {
            error_kind: None,
            auth_method: None,
            success: false,
-            rejected: false,
+            rejected: None,
            cold_start_info: ColdStartInfo::Unknown,

            sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
@@ -107,7 +106,7 @@ impl RequestMonitoring {

    #[cfg(test)]
    pub fn test() -> Self {
-        RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), "test", "test")
+        RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test")
    }

    pub fn console_application_name(&self) -> String {
@@ -119,7 +118,7 @@ impl RequestMonitoring {
    }

    pub fn set_rejected(&mut self, rejected: bool) {
-        self.rejected = rejected;
+        self.rejected = Some(rejected);
    }

    pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
@@ -143,9 +142,9 @@ impl RequestMonitoring {
    pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
        if self.endpoint_id.is_none() {
            self.span.record("ep", display(&endpoint_id));
-            crate::metrics::CONNECTING_ENDPOINTS
-                .with_label_values(&[self.protocol])
-                .measure(&endpoint_id);
+            let metric = &Metrics::get().proxy.connecting_endpoints;
+            let label = metric.with_labels(self.protocol);
+            metric.get_metric(label).measure(&endpoint_id);
            self.endpoint_id = Some(endpoint_id);
        }
    }
@@ -159,6 +158,7 @@ impl RequestMonitoring {
    }

    pub fn set_user(&mut self, user: RoleName) {
+        self.span.record("role", display(&user));
        self.user = Some(user);
    }

@@ -166,14 +166,22 @@ impl RequestMonitoring {
        self.auth_method = Some(auth_method);
    }

+    pub fn has_private_peer_addr(&self) -> bool { // true only for IPv4 peers in a private range
+        match self.peer_addr {
+            IpAddr::V4(ip) => ip.is_private(), // RFC 1918 ranges, per std's Ipv4Addr::is_private
+            _ => false, // every IPv6 peer is reported as non-private
+        }
+    }
+
    pub fn set_error_kind(&mut self, kind: ErrorKind) {
-        ERROR_BY_KIND
-            .with_label_values(&[kind.to_metric_label()])
-            .inc();
+        // Skip the `errors_total` counter for peers with a private IPv4 address.
+        if !self.has_private_peer_addr() {
+            Metrics::get().proxy.errors_total.inc(kind);
+        }
        if let Some(ep) = &self.endpoint_id {
-            ENDPOINT_ERRORS_BY_KIND
-                .with_label_values(&[kind.to_metric_label()])
-                .measure(ep);
+            let metric = &Metrics::get().proxy.endpoints_affected_by_errors;
+            let label = metric.with_labels(kind);
+            metric.get_metric(label).measure(ep);
        }
        self.error_kind = Some(kind);
    }
@@ -187,10 +195,33 @@ impl RequestMonitoring {

 impl Drop for RequestMonitoring {
    fn drop(&mut self) {
-        let outcome = if self.success { "success" } else { "failure" };
-        NUM_INVALID_ENDPOINTS
-            .with_label_values(&[self.protocol, bool_to_str(self.rejected), outcome])
-            .inc();
+        let outcome = if self.success {
+            ConnectOutcome::Success
+        } else {
+            ConnectOutcome::Failed
+        };
+        if let Some(rejected) = self.rejected {
+            let ep = self
+                .endpoint_id
+                .as_ref()
+                .map(|x| x.as_str())
+                .unwrap_or_default();
+            // This makes sense only if cache is disabled
+            info!(
+                ?outcome,
+                ?rejected,
+                ?ep,
+                "check endpoint is valid with outcome"
+            );
+            Metrics::get()
+                .proxy
+                .invalid_endpoints_total
+                .inc(InvalidEndpointsGroup {
+                    protocol: self.protocol,
+                    rejected: rejected.into(),
+                    outcome,
+                });
+        }
        if let Some(tx) = self.sender.take() {
            let _: Result<(), _> = tx.send(RequestData::from(&*self));
        }
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -111,7 +111,7 @@ impl From<&RequestMonitoring> for RequestData {
                super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus",
                super::AuthMethod::Cleartext => "cleartext",
            }),
-            protocol: value.protocol,
+            protocol: value.protocol.as_str(),
            region: value.region,
            error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
            success: value.success,
--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -1,5 +1,7 @@
 use std::{error::Error as StdError, fmt, io};

+use measured::FixedCardinalityLabel;
+
 /// Upcast (almost) any error into an opaque [`io::Error`].
 pub fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
    io::Error::new(io::ErrorKind::Other, e)
@@ -29,24 +31,29 @@ pub trait UserFacingError: ReportableError {
    }
 }

-#[derive(Copy, Clone, Debug, Eq, PartialEq)]
+#[derive(Copy, Clone, Debug, Eq, PartialEq, FixedCardinalityLabel)]
+#[label(singleton = "type")]
 pub enum ErrorKind {
    /// Wrong password, unknown endpoint, protocol violation, etc...
    User,

    /// Network error between user and proxy. Not necessarily user error
+    #[label(rename = "clientdisconnect")]
    ClientDisconnect,

    /// Proxy self-imposed user rate limits
+    #[label(rename = "ratelimit")]
    RateLimit,

    /// Proxy self-imposed service-wise rate limits
+    #[label(rename = "serviceratelimit")]
    ServiceRateLimit,

    /// internal errors
    Service,

    /// Error communicating with control plane
+    #[label(rename = "controlplane")]
    ControlPlane,

    /// Postgres error
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -13,13 +13,16 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio::time::Instant;
 use tracing::trace;

-use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
+use crate::{
+    metrics::{ConsoleRequest, Metrics},
+    url::ApiUrl,
+};
 use reqwest_middleware::RequestBuilder;

 /// This is the preferred way to create new http clients,
 /// because it takes care of observability (OpenTelemetry).
 /// We deliberately don't want to replace this with a public static.
-pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> ClientWithMiddleware {
+pub fn new_client() -> ClientWithMiddleware {
    let client = reqwest::ClientBuilder::new()
        .dns_resolver(Arc::new(GaiResolver::default()))
        .connection_verbose(true)
@@ -28,7 +31,6 @@ pub fn new_client(rate_limiter_config: rate_limiter::RateLimiterConfig) -> Clien

    reqwest_middleware::ClientBuilder::new(client)
        .with(reqwest_tracing::TracingMiddleware::default())
-        .with(rate_limiter::Limiter::new(rate_limiter_config))
        .build()
 }

@@ -90,13 +92,14 @@ impl Endpoint {

    /// Execute a [request](reqwest::Request).
    pub async fn execute(&self, request: Request) -> Result<Response, Error> {
-        let path = request.url().path().to_string();
-        let start = Instant::now();
-        let res = self.client.execute(request).await;
-        CONSOLE_REQUEST_LATENCY
-            .with_label_values(&[&path])
-            .observe(start.elapsed().as_secs_f64());
-        res
+        let _timer = Metrics::get()
+            .proxy
+            .console_request_latency
+            .start_timer(ConsoleRequest {
+                request: request.url().path(),
+            });
+
+        self.client.execute(request).await
    }
 }

--- a/proxy/src/http/health_server.rs
+++ b/proxy/src/http/health_server.rs
@@ -1,30 +1,49 @@
 use anyhow::{anyhow, bail};
-use hyper::{Body, Request, Response, StatusCode};
-use std::{convert::Infallible, net::TcpListener};
-use tracing::info;
+use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode};
+use measured::{text::BufferedTextEncoder, MetricGroup};
+use metrics::NeonMetrics;
+use std::{
+    convert::Infallible,
+    net::TcpListener,
+    sync::{Arc, Mutex},
+};
+use tracing::{info, info_span};
 use utils::http::{
-    endpoint::{self, prometheus_metrics_handler, request_span},
+    endpoint::{self, request_span},
    error::ApiError,
    json::json_response,
    RouterBuilder, RouterService,
 };

+use crate::jemalloc;
+
 async fn status_handler(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, "")
 }

-fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
+fn make_router(metrics: AppMetrics) -> RouterBuilder<hyper::Body, ApiError> {
+    let state = Arc::new(Mutex::new(PrometheusHandler {
+        encoder: BufferedTextEncoder::new(),
+        metrics,
+    }));
+
    endpoint::make_router()
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| {
+            let state = state.clone();
+            request_span(r, move |b| prometheus_metrics_handler(b, state))
+        })
        .get("/v1/status", status_handler)
 }

-pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible> {
+pub async fn task_main(
+    http_listener: TcpListener,
+    metrics: AppMetrics,
+) -> anyhow::Result<Infallible> {
    scopeguard::defer! {
        info!("http has shut down");
    }

-    let service = || RouterService::new(make_router().build()?);
+    let service = || RouterService::new(make_router(metrics).build()?);

    hyper::Server::from_tcp(http_listener)?
        .serve(service().map_err(|e| anyhow!(e))?)
@@ -32,3 +51,57 @@ pub async fn task_main(http_listener: TcpListener) -> anyhow::Result<Infallible>

    bail!("hyper server without shutdown handling cannot shutdown successfully");
 }
+
+struct PrometheusHandler {
+    encoder: BufferedTextEncoder,
+    metrics: AppMetrics,
+}
+
+#[derive(MetricGroup)]
+pub struct AppMetrics {
+    #[metric(namespace = "jemalloc")]
+    pub jemalloc: Option<jemalloc::MetricRecorder>,
+    #[metric(flatten)]
+    pub neon_metrics: NeonMetrics,
+    #[metric(flatten)]
+    pub proxy: &'static crate::metrics::Metrics,
+}
+
+async fn prometheus_metrics_handler( // serves `/metrics`: encodes every metric group in `state` as text
+    _req: Request<Body>,
+    state: Arc<Mutex<PrometheusHandler>>,
+) -> Result<Response<Body>, ApiError> {
+    let started_at = std::time::Instant::now();
+
+    let span = info_span!("blocking");
+    let body = tokio::task::spawn_blocking(move || { // encode on tokio's blocking thread pool
+        let _span = span.entered();
+
+        let mut state = state.lock().unwrap(); // one scrape encodes at a time; panics if the mutex is poisoned
+        let PrometheusHandler { encoder, metrics } = &mut *state;
+
+        metrics
+            .collect_group_into(&mut *encoder)
+            .unwrap_or_else(|infallible| match infallible {}); // error type is uninhabited, so this never runs
+
+        let body = encoder.finish();
+
+        tracing::info!(
+            bytes = body.len(),
+            elapsed_ms = started_at.elapsed().as_millis(),
+            "responded /metrics"
+        );
+
+        body
+    })
+    .await
+    .unwrap(); // panics if the blocking task panicked or was cancelled
+
+    let response = Response::builder()
+        .status(200)
+        .header(CONTENT_TYPE, "text/plain; version=0.0.4") // Prometheus text exposition format
+        .body(Body::from(body))
+        .unwrap();
+
+    Ok(response)
+}
--- a/proxy/src/intern.rs
+++ b/proxy/src/intern.rs
@@ -5,7 +5,7 @@ use std::{
 use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo};
 use rustc_hash::FxHasher;

-use crate::{BranchId, EndpointCacheKey, EndpointId, ProjectId, RoleName};
+use crate::{BranchId, EndpointId, ProjectId, RoleName};

 pub trait InternId: Sized + 'static {
    fn get_interner() -> &'static StringInterner<Self>;
@@ -165,11 +165,6 @@ impl From<EndpointId> for EndpointIdInt {
        EndpointIdTag::get_interner().get_or_intern(&value)
    }
 }
-impl From<EndpointCacheKey> for EndpointIdInt {
-    fn from(value: EndpointCacheKey) -> Self {
-        EndpointIdTag::get_interner().get_or_intern(&value)
-    }
-}

 #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)]
 pub struct BranchIdTag;
--- a/proxy/src/jemalloc.rs
+++ b/proxy/src/jemalloc.rs
@@ -1,27 +1,45 @@
-use std::time::Duration;
+use std::marker::PhantomData;

-use metrics::IntGauge;
-use prometheus::{register_int_gauge_with_registry, Registry};
+use measured::{
+    label::NoLabels,
+    metric::{
+        gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder,
+        MetricEncoding, MetricFamilyEncoding, MetricType,
+    },
+    text::TextEncoder,
+    LabelGroup, MetricGroup,
+};
 use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version};

 pub struct MetricRecorder {
    epoch: epoch_mib,
-    active: stats::active_mib,
-    active_gauge: IntGauge,
-    allocated: stats::allocated_mib,
-    allocated_gauge: IntGauge,
-    mapped: stats::mapped_mib,
-    mapped_gauge: IntGauge,
-    metadata: stats::metadata_mib,
-    metadata_gauge: IntGauge,
-    resident: stats::resident_mib,
-    resident_gauge: IntGauge,
-    retained: stats::retained_mib,
-    retained_gauge: IntGauge,
+    inner: Metrics,
+}
+
+#[derive(MetricGroup)]
+struct Metrics {
+    active_bytes: JemallocGaugeFamily<stats::active_mib>,
+    allocated_bytes: JemallocGaugeFamily<stats::allocated_mib>,
+    mapped_bytes: JemallocGaugeFamily<stats::mapped_mib>,
+    metadata_bytes: JemallocGaugeFamily<stats::metadata_mib>,
+    resident_bytes: JemallocGaugeFamily<stats::resident_mib>,
+    retained_bytes: JemallocGaugeFamily<stats::retained_mib>,
+}
+
+impl<Enc: Encoding> MetricGroup<Enc> for MetricRecorder
+where
+    Metrics: MetricGroup<Enc>,
+{
+    fn collect_group_into(&self, enc: &mut Enc) -> Result<(), Enc::Err> {
+        if self.epoch.advance().is_ok() { // advancing the epoch refreshes jemalloc's cached stats
+            self.inner.collect_group_into(enc)?;
+        }
+        Ok(()) // a failed epoch advance is swallowed: the group is skipped and no error is returned
+    }
+}

 impl MetricRecorder {
-    pub fn new(registry: &Registry) -> Result<Self, anyhow::Error> {
+    pub fn new() -> Result<Self, anyhow::Error> {
        tracing::info!(
            config = config::malloc_conf::read()?,
            version = version::read()?,
@@ -30,71 +48,69 @@ impl MetricRecorder {

        Ok(Self {
            epoch: epoch::mib()?,
-            active: stats::active::mib()?,
-            active_gauge: register_int_gauge_with_registry!(
-                "jemalloc_active_bytes",
-                "Total number of bytes in active pages allocated by the process",
-                registry
-            )?,
-            allocated: stats::allocated::mib()?,
-            allocated_gauge: register_int_gauge_with_registry!(
-                "jemalloc_allocated_bytes",
-                "Total number of bytes allocated by the process",
-                registry
-            )?,
-            mapped: stats::mapped::mib()?,
-            mapped_gauge: register_int_gauge_with_registry!(
-                "jemalloc_mapped_bytes",
-                "Total number of bytes in active extents mapped by the allocator",
-                registry
-            )?,
-            metadata: stats::metadata::mib()?,
-            metadata_gauge: register_int_gauge_with_registry!(
-                "jemalloc_metadata_bytes",
-                "Total number of bytes dedicated to jemalloc metadata",
-                registry
-            )?,
-            resident: stats::resident::mib()?,
-            resident_gauge: register_int_gauge_with_registry!(
-                "jemalloc_resident_bytes",
-                "Total number of bytes in physically resident data pages mapped by the allocator",
-                registry
-            )?,
-            retained: stats::retained::mib()?,
-            retained_gauge: register_int_gauge_with_registry!(
-                "jemalloc_retained_bytes",
-                "Total number of bytes in virtual memory mappings that were retained rather than being returned to the operating system",
-                registry
-            )?,
-        })
-    }
-
-    fn _poll(&self) -> Result<(), anyhow::Error> {
-        self.epoch.advance()?;
-        self.active_gauge.set(self.active.read()? as i64);
-        self.allocated_gauge.set(self.allocated.read()? as i64);
-        self.mapped_gauge.set(self.mapped.read()? as i64);
-        self.metadata_gauge.set(self.metadata.read()? as i64);
-        self.resident_gauge.set(self.resident.read()? as i64);
-        self.retained_gauge.set(self.retained.read()? as i64);
-        Ok(())
-    }
-
-    #[inline]
-    pub fn poll(&self) {
-        if let Err(error) = self._poll() {
-            tracing::warn!(%error, "Failed to poll jemalloc stats");
-        }
-    }
-
-    pub fn start(self) -> tokio::task::JoinHandle<()> {
-        tokio::task::spawn(async move {
-            let mut interval = tokio::time::interval(Duration::from_secs(15));
-            interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Skip);
-            loop {
-                self.poll();
-                interval.tick().await;
-            }
+            inner: Metrics {
+                active_bytes: JemallocGaugeFamily(stats::active::mib()?),
+                allocated_bytes: JemallocGaugeFamily(stats::allocated::mib()?),
+                mapped_bytes: JemallocGaugeFamily(stats::mapped::mib()?),
+                metadata_bytes: JemallocGaugeFamily(stats::metadata::mib()?),
+                resident_bytes: JemallocGaugeFamily(stats::resident::mib()?),
+                retained_bytes: JemallocGaugeFamily(stats::retained::mib()?),
+            },
        })
    }
 }
+
+struct JemallocGauge<T>(PhantomData<T>);
+
+impl<T> Default for JemallocGauge<T> {
+    fn default() -> Self {
+        JemallocGauge(PhantomData)
+    }
+}
+impl<T> MetricType for JemallocGauge<T> {
+    type Metadata = T;
+}
+
+struct JemallocGaugeFamily<T>(T);
+impl<M, T: Encoding> MetricFamilyEncoding<T> for JemallocGaugeFamily<M>
+where
+    JemallocGauge<M>: MetricEncoding<T, Metadata = M>,
+{
+    fn collect_family_into(&self, name: impl MetricNameEncoder, enc: &mut T) -> Result<(), T::Err> {
+        JemallocGauge::write_type(&name, enc)?;
+        JemallocGauge(PhantomData).collect_into(&self.0, NoLabels, name, enc)
+    }
+}
+
+macro_rules! jemalloc_gauge {
+    ($stat:ident, $mib:ident) => {
+        impl<W: std::io::Write> MetricEncoding<TextEncoder<W>> for JemallocGauge<stats::$mib> {
+            fn write_type(
+                name: impl MetricNameEncoder,
+                enc: &mut TextEncoder<W>,
+            ) -> Result<(), std::io::Error> {
+                GaugeState::write_type(name, enc)
+            }
+
+            fn collect_into(
+                &self,
+                mib: &stats::$mib,
+                labels: impl LabelGroup,
+                name: impl MetricNameEncoder,
+                enc: &mut TextEncoder<W>,
+            ) -> Result<(), std::io::Error> {
+                if let Ok(v) = mib.read() {
+                    enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?;
+                }
+                Ok(())
+            }
+        }
+    };
+}
+
+jemalloc_gauge!(active, active_mib);
+jemalloc_gauge!(allocated, allocated_mib);
+jemalloc_gauge!(mapped, mapped_mib);
+jemalloc_gauge!(metadata, metadata_mib);
+jemalloc_gauge!(resident, resident_mib);
+jemalloc_gauge!(retained, retained_mib);
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,188 +1,352 @@
-use ::metrics::{
-    exponential_buckets, register_histogram, register_histogram_vec, register_hll_vec,
-    register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge,
-    register_int_gauge_vec, Histogram, HistogramVec, HyperLogLogVec, IntCounterPairVec,
-    IntCounterVec, IntGauge, IntGaugeVec,
-};
-use metrics::{
-    register_hll, register_int_counter, register_int_counter_pair, HyperLogLog, IntCounter,
-    IntCounterPair,
-};
+use std::sync::OnceLock;
+
+use lasso::ThreadedRodeo;
+use measured::{
+    label::StaticLabelSet,
+    metric::{histogram::Thresholds, name::MetricName},
+    Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
+    MetricGroup,
+};
+use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};

-use once_cell::sync::Lazy;
 use tokio::time::{self, Instant};

 use crate::console::messages::ColdStartInfo;

-pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_opened_db_connections_total",
-        "Number of opened connections to a database.",
-        "proxy_closed_db_connections_total",
-        "Number of closed connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
+#[derive(MetricGroup)]
+pub struct Metrics {
+    #[metric(namespace = "proxy")]
+    pub proxy: ProxyMetrics,

-pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_opened_client_connections_total",
-        "Number of opened connections from a client.",
-        "proxy_closed_client_connections_total",
-        "Number of closed connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
+    #[metric(namespace = "wake_compute_lock")]
+    pub wake_compute_lock: ApiLockMetrics,
+}

-pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_accepted_connections_total",
-        "Number of client connections accepted.",
-        "proxy_closed_connections_total",
-        "Number of client connections closed.",
-        &["protocol"],
-    )
-    .unwrap()
-});
+impl Metrics {
+    pub fn get() -> &'static Self {
+        static SELF: OnceLock<Metrics> = OnceLock::new();
+        SELF.get_or_init(|| Metrics {
+            proxy: ProxyMetrics::default(),
+            wake_compute_lock: ApiLockMetrics::new(),
+        })
+    }
+}

-pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_compute_connection_latency_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
-        // 3 * 6 * 2 * 2 = 72 counters
-        &["protocol", "cold_start_info", "outcome", "excluded"],
-        // largest bucket = 2^16 * 0.5ms = 32s
-        exponential_buckets(0.0005, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
+#[derive(MetricGroup)]
+#[metric(new())]
+pub struct ProxyMetrics {
+    #[metric(flatten)]
+    pub db_connections: CounterPairVec<NumDbConnectionsGauge>,
+    #[metric(flatten)]
+    pub client_connections: CounterPairVec<NumClientConnectionsGauge>,
+    #[metric(flatten)]
+    pub connection_requests: CounterPairVec<NumConnectionRequestsGauge>,
+    #[metric(flatten)]
+    pub http_endpoint_pools: HttpEndpointPools,

-pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_console_request_latency",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // proxy_wake_compute/proxy_get_role_info
-        &["request"],
+    /// Time it took for proxy to establish a connection to the compute endpoint.
+    // largest bucket = 2^16 * 0.5ms = 32s
+    #[metric(metadata = Thresholds::exponential_buckets(0.0005, 2.0))]
+    pub compute_connection_latency_seconds: HistogramVec<ComputeConnectionLatencySet, 16>,
+
+    /// Time it took for proxy to receive a response from control plane.
+    #[metric(
        // largest bucket = 2^16 * 0.2ms = 13s
-        exponential_buckets(0.0002, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
+        metadata = Thresholds::exponential_buckets(0.0002, 2.0),
+    )]
+    pub console_request_latency: HistogramVec<ConsoleRequestSet, 16>,

-pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_allowed_ips_cache_misses",
-        "Number of cache hits/misses for allowed ips",
-        // hit/miss
-        &["outcome"],
-    )
-    .unwrap()
-});
+    /// Time it takes to acquire a token to call console plane.
+    // largest bucket = 3^16 * 0.05ms = 2152s (~36min)
+    #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))]
+    pub control_plane_token_acquire_seconds: Histogram<16>,

-pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_control_plane_token_acquire_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
-        exponential_buckets(0.00005, 3.0, 16).unwrap(),
-    )
-    .unwrap()
-});
+    /// Size of the HTTP request body lengths.
+    // smallest bucket = 16 bytes
+    // largest bucket = 4^12 * 16 bytes = 256MB
+    #[metric(metadata = Thresholds::exponential_buckets(16.0, 4.0))]
+    pub http_conn_content_length_bytes: HistogramVec<StaticLabelSet<HttpDirection>, 12>,

-pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "semaphore_control_plane_limit",
-        "Current limit of the semaphore control plane",
-        &["limit"], // 2 counters
-    )
-    .unwrap()
-});
+    /// Time it takes to reclaim unused connection pools.
+    #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))]
+    pub http_pool_reclaimation_lag_seconds: Histogram<16>,

-pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_accepted_connections_by_sni",
-        "Number of connections (per sni).",
-        &["kind"],
-    )
-    .unwrap()
-});
+    /// Number of opened connections to a database.
+    pub http_pool_opened_connections: Gauge,

-pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_allowed_ips_number",
-        "Number of allowed ips",
-        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
-    )
-    .unwrap()
-});
+    /// Number of cache hits/misses for allowed ips.
+    pub allowed_ips_cache_misses: CounterVec<StaticLabelSet<CacheOutcome>>,

-pub static HTTP_CONTENT_LENGTH: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_http_conn_content_length_bytes",
-        "Number of bytes the HTTP response content consumes",
-        // request/response
-        &["direction"],
-        // smallest bucket = 16 bytes
-        // largest bucket = 4^12 * 16 bytes = 256MB
-        exponential_buckets(16.0, 4.0, 12).unwrap()
-    )
-    .unwrap()
-});
+    /// Number of allowed ips
+    #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
+    pub allowed_ips_number: Histogram<10>,

-pub static GC_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_http_pool_reclaimation_lag_seconds",
-        "Time it takes to reclaim unused connection pools",
-        // 1us -> 65ms
-        exponential_buckets(1e-6, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
+    /// Number of connections (per sni).
+    pub accepted_connections_by_sni: CounterVec<StaticLabelSet<SniKind>>,

-pub static ENDPOINT_POOLS: Lazy<IntCounterPair> = Lazy::new(|| {
-    register_int_counter_pair!(
-        "proxy_http_pool_endpoints_registered_total",
-        "Number of endpoints we have registered pools for",
-        "proxy_http_pool_endpoints_unregistered_total",
-        "Number of endpoints we have unregistered pools for",
-    )
-    .unwrap()
-});
+    /// Number of connection failures (per kind).
+    pub connection_failures_total: CounterVec<StaticLabelSet<ConnectionFailureKind>>,

-pub static NUM_OPEN_CLIENTS_IN_HTTP_POOL: Lazy<IntGauge> = Lazy::new(|| {
-    register_int_gauge!(
-        "proxy_http_pool_opened_connections",
-        "Number of opened connections to a database.",
-    )
-    .unwrap()
-});
+    /// Number of wake-up failures (per kind).
+    pub connection_failures_breakdown: CounterVec<ConnectionFailuresBreakdownSet>,

-pub static NUM_CANCELLATION_REQUESTS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_cancellation_requests_total",
-        "Number of cancellation requests (per found/not_found).",
-        &["source", "kind"],
-    )
-    .unwrap()
-});
+    /// Number of bytes sent/received between all clients and backends.
+    pub io_bytes: CounterVec<StaticLabelSet<Direction>>,

-pub static NUM_INVALID_ENDPOINTS: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_invalid_endpoints_total",
-        "Number of invalid endpoints (per protocol, per rejected).",
-        // http/ws/tcp, true/false, success/failure
-        // TODO(anna): the last dimension is just a proxy to what we actually want to measure.
-        // We need to measure whether the endpoint was found by cplane or not.
-        &["protocol", "rejected", "outcome"],
-    )
-    .unwrap()
-});
+    /// Number of errors by a given classification.
+    pub errors_total: CounterVec<StaticLabelSet<crate::error::ErrorKind>>,

-pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_CLIENT: &str = "from_client";
-pub const NUM_CANCELLATION_REQUESTS_SOURCE_FROM_REDIS: &str = "from_redis";
+    /// Number of cancellation requests (per found/not_found).
+    pub cancellation_requests_total: CounterVec<CancellationRequestSet>,
+
+    /// Number of errors by a given classification
+    pub redis_errors_total: CounterVec<RedisErrorsSet>,
+
+    /// Number of TLS handshake failures
+    pub tls_handshake_failures: Counter,
+
+    /// Number of connection requests affected by authentication rate limits
+    pub requests_auth_rate_limits_total: Counter,
+
+    /// HLL approximate cardinality of endpoints that are connecting
+    pub connecting_endpoints: HyperLogLogVec<StaticLabelSet<Protocol>, 32>,
+
+    /// Number of endpoints affected by errors of a given classification
+    pub endpoints_affected_by_errors: HyperLogLogVec<StaticLabelSet<crate::error::ErrorKind>, 32>,
+
+    /// Number of endpoints affected by authentication rate limits
+    pub endpoints_auth_rate_limits: HyperLogLog<32>,
+
+    /// Number of invalid endpoints (per protocol, per rejected).
+    pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
+
+    /// Number of retries (per outcome, per retry_type).
+    #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))]
+    pub retries_metric: HistogramVec<RetriesMetricSet, 9>,
+}
+
+#[derive(MetricGroup)]
+#[metric(new())]
+pub struct ApiLockMetrics { // metric group with two counters and two 16-bucket histograms
+    /// Number of semaphores registered in this api lock
+    pub semaphores_registered: Counter,
+    /// Number of semaphores unregistered in this api lock
+    pub semaphores_unregistered: Counter, // NOTE(review): presumably registered - unregistered = live semaphores; confirm
+    /// Time it takes to reclaim unused semaphores in the api lock
+    #[metric(metadata = Thresholds::exponential_buckets(1e-6, 2.0))] // NOTE(review): presumably buckets start at 1e-6 s and double; confirm
+    pub reclamation_lag_seconds: Histogram<16>,
+    /// Time it takes to acquire a semaphore lock
+    #[metric(metadata = Thresholds::exponential_buckets(1e-4, 2.0))] // NOTE(review): presumably buckets start at 1e-4 s and double; confirm
+    pub semaphore_acquire_seconds: Histogram<16>,
+}
+
+impl Default for ProxyMetrics {
+    fn default() -> Self { // `default()` is just an alias for the `new()` constructor
+        ProxyMetrics::new()
+    }
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[label(singleton = "direction")] // same label name as `Direction`, but a separate type
+pub enum HttpDirection { // two-valued label: request or response
+    Request,
+    Response,
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[label(singleton = "direction")]
+pub enum Direction { // label of `io_bytes` (`CounterVec<StaticLabelSet<Direction>>`)
+    Tx, // NOTE(review): presumably "sent"; confirm which side is the reference point
+    Rx, // NOTE(review): presumably "received"; confirm
+}
+
+#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
+#[label(singleton = "protocol")]
+pub enum Protocol { // label of `connecting_endpoints` and of the three connection counter pairs
+    Http,
+    Ws,
+    Tcp,
+    SniRouter, // rendered as "sni_router" by `as_str`
+}
+
+impl Protocol {
+    pub fn as_str(&self) -> &'static str { // fixed lowercase name for each variant
+        match *self {
+            Self::Http => "http",
+            Self::Ws => "ws",
+            Self::Tcp => "tcp",
+            Self::SniRouter => "sni_router",
+        }
+    }
+}
+
+impl std::fmt::Display for Protocol {
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { // writes exactly the `as_str()` text
+        out.write_str(Protocol::as_str(self))
+    }
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+pub enum Bool { // label form of `bool`; built via `From<bool>`, used as a field of label groups
+    True,
+    False,
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[label(singleton = "outcome")]
+pub enum Outcome { // distinct from `ConnectOutcome`, which has the same variants but no `singleton` label
+    Success,
+    Failed,
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[label(singleton = "outcome")] // shares the "outcome" label name with `Outcome`
+pub enum CacheOutcome { // two-valued label: hit or miss
+    Hit,
+    Miss,
+}
+
+#[derive(LabelGroup)]
+#[label(set = ConsoleRequestSet)] // names the companion label-set type
+pub struct ConsoleRequest<'a> { // label group with a single string label, `request`
+    #[label(dynamic_with = ThreadedRodeo, default)] // NOTE(review): presumably interns arbitrary strings via `ThreadedRodeo`; confirm cardinality is bounded
+    pub request: &'a str,
+}
+
+#[derive(MetricGroup, Default)]
+pub struct HttpEndpointPools { // pair of counters; both are driven through `guard()`
+    /// Number of endpoints we have registered pools for
+    pub http_pool_endpoints_registered_total: Counter, // incremented when `guard()` is called
+    /// Number of endpoints we have unregistered pools for
+    pub http_pool_endpoints_unregistered_total: Counter, // incremented when the guard is dropped
+}
+
+pub struct HttpEndpointPoolsGuard<'a> { // handed out by `HttpEndpointPools::guard`; acts on drop
+    dec: &'a Counter, // counter that `Drop` increments; `guard()` points it at the unregistered total
+}
+
+impl Drop for HttpEndpointPoolsGuard<'_> {
+    fn drop(&mut self) {
+        self.dec.inc(); // an increment, not a decrement: `dec` is itself a counter of unregistrations
+    }
+}
+
+impl HttpEndpointPools {
+    pub fn guard(&self) -> HttpEndpointPoolsGuard<'_> { // `'_` makes the borrow of `self` visible in the signature
+        self.http_pool_endpoints_registered_total.inc(); // count the registration immediately
+        HttpEndpointPoolsGuard {
+            dec: &self.http_pool_endpoints_unregistered_total, // incremented when the returned guard is dropped
+        }
+    }
+}
+pub struct NumDbConnectionsGauge; // unit struct: type-level key for the constants below
+impl CounterPairAssoc for NumDbConnectionsGauge {
+    const INC_NAME: &'static MetricName = MetricName::from_str("opened_db_connections_total");
+    const DEC_NAME: &'static MetricName = MetricName::from_str("closed_db_connections_total");
+    const INC_HELP: &'static str = "Number of opened connections to a database.";
+    const DEC_HELP: &'static str = "Number of closed connections to a database.";
+    type LabelGroupSet = StaticLabelSet<Protocol>; // both counters are labelled by `Protocol`
+}
+pub type NumDbConnectionsGuard<'a> = metrics::MeasuredCounterPairGuard<'a, NumDbConnectionsGauge>; // NOTE(review): presumably bumps the DEC counter on drop; confirm in `metrics`
+
+pub struct NumClientConnectionsGauge; // unit struct: type-level key for the constants below
+impl CounterPairAssoc for NumClientConnectionsGauge {
+    const INC_NAME: &'static MetricName = MetricName::from_str("opened_client_connections_total");
+    const DEC_NAME: &'static MetricName = MetricName::from_str("closed_client_connections_total");
+    const INC_HELP: &'static str = "Number of opened connections from a client.";
+    const DEC_HELP: &'static str = "Number of closed connections from a client.";
+    type LabelGroupSet = StaticLabelSet<Protocol>; // both counters are labelled by `Protocol`
+}
+pub type NumClientConnectionsGuard<'a> =
+    metrics::MeasuredCounterPairGuard<'a, NumClientConnectionsGauge>; // NOTE(review): presumably bumps the DEC counter on drop; confirm in `metrics`
+
+pub struct NumConnectionRequestsGauge; // unit struct: type-level key for the constants below
+impl CounterPairAssoc for NumConnectionRequestsGauge {
+    const INC_NAME: &'static MetricName = MetricName::from_str("accepted_connections_total");
+    const DEC_NAME: &'static MetricName = MetricName::from_str("closed_connections_total");
+    const INC_HELP: &'static str = "Number of client connections accepted.";
+    const DEC_HELP: &'static str = "Number of client connections closed.";
+    type LabelGroupSet = StaticLabelSet<Protocol>; // both counters are labelled by `Protocol`
+}
+pub type NumConnectionRequestsGuard<'a> =
+    metrics::MeasuredCounterPairGuard<'a, NumConnectionRequestsGauge>; // NOTE(review): presumably bumps the DEC counter on drop; confirm in `metrics`
+
+#[derive(LabelGroup)]
+#[label(set = ComputeConnectionLatencySet)]
+pub struct ComputeConnectionLatencyGroup { // fields are private; built in `Drop for LatencyTimer`
+    protocol: Protocol,
+    cold_start_info: ColdStartInfo,
+    outcome: ConnectOutcome,
+    excluded: LatencyExclusions, // which waits were subtracted from the observed duration
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+pub enum LatencyExclusions { // value of the `excluded` label on compute connection latency
+    Client, // sample has `accumulated.client` subtracted
+    ClientAndCplane, // sample has `accumulated.client + accumulated.cplane` subtracted
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[label(singleton = "kind")]
+pub enum SniKind { // three-valued label named "kind"
+    Sni,
+    NoSni,
+    PasswordHack, // NOTE(review): presumably endpoint passed via the password field; confirm against caller
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[label(singleton = "kind")]
+pub enum ConnectionFailureKind { // two-valued label named "kind"
+    ComputeCached, // NOTE(review): presumably the compute info came from cache; confirm
+    ComputeUncached,
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[label(singleton = "kind")]
+pub enum WakeupFailureKind { // also the `kind` field of `ConnectionFailuresBreakdownGroup`
+    BadComputeAddress,
+    ApiTransportError,
+    QuotaExceeded,
+    ApiConsoleLocked,
+    ApiConsoleBadRequest,
+    ApiConsoleOtherServerError,
+    ApiConsoleOtherError,
+    TimeoutError,
+}
+
+#[derive(LabelGroup)]
+#[label(set = ConnectionFailuresBreakdownSet)]
+pub struct ConnectionFailuresBreakdownGroup { // two labels: `kind` and `retry`
+    pub kind: WakeupFailureKind,
+    pub retry: Bool, // convert from `bool` with `.into()` (see `From<bool> for Bool`)
+}
+
+#[derive(LabelGroup, Copy, Clone)]
+#[label(set = RedisErrorsSet)] // `RedisErrorsSet` is the label set of `redis_errors_total`
+pub struct RedisErrors<'a> { // label group with a single string label, `channel`
+    #[label(dynamic_with = ThreadedRodeo, default)] // NOTE(review): presumably interns channel names via `ThreadedRodeo`; confirm
+    pub channel: &'a str,
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+pub enum CancellationSource { // type of the `source` field in `CancellationRequest`
+    FromClient,
+    FromRedis,
+    Local,
+}
+
+#[derive(FixedCardinalityLabel, Copy, Clone)]
+pub enum CancellationOutcome { // type of the `kind` field in `CancellationRequest`
+    NotFound,
+    Found,
+}
+
+#[derive(LabelGroup)]
+#[label(set = CancellationRequestSet)] // `CancellationRequestSet` is the label set of `cancellation_requests_total`
+pub struct CancellationRequest {
+    pub source: CancellationSource,
+    pub kind: CancellationOutcome, // note: the label is named `kind` but carries found / not-found
+}

 pub enum Waiting {
    Cplane,
@@ -197,20 +361,6 @@ struct Accumulated {
    compute: time::Duration,
 }

-enum Outcome {
-    Success,
-    Failed,
-}
-
-impl Outcome {
-    fn as_str(&self) -> &'static str {
-        match self {
-            Outcome::Success => "success",
-            Outcome::Failed => "failed",
-        }
-    }
-}
-
 pub struct LatencyTimer {
    // time since the stopwatch was started
    start: time::Instant,
@@ -219,9 +369,9 @@ pub struct LatencyTimer {
    // accumulated time on the stopwatch
    accumulated: Accumulated,
    // label data
-    protocol: &'static str,
+    protocol: Protocol,
    cold_start_info: ColdStartInfo,
-    outcome: Outcome,
+    outcome: ConnectOutcome,
 }

 pub struct LatencyTimerPause<'a> {
@@ -231,7 +381,7 @@ pub struct LatencyTimerPause<'a> {
 }

 impl LatencyTimer {
-    pub fn new(protocol: &'static str) -> Self {
+    pub fn new(protocol: Protocol) -> Self {
        Self {
            start: time::Instant::now(),
            stop: None,
@@ -239,7 +389,7 @@ impl LatencyTimer {
            protocol,
            cold_start_info: ColdStartInfo::Unknown,
            // assume failed unless otherwise specified
-            outcome: Outcome::Failed,
+            outcome: ConnectOutcome::Failed,
        }
    }

@@ -260,7 +410,7 @@ impl LatencyTimer {
        self.stop = Some(time::Instant::now());

        // success
-        self.outcome = Outcome::Success;
+        self.outcome = ConnectOutcome::Success;
    }
 }

@@ -275,128 +425,75 @@ impl Drop for LatencyTimerPause<'_> {
    }
 }

+#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
+pub enum ConnectOutcome { // `LatencyTimer` starts as `Failed` and flips to `Success` explicitly
+    Success,
+    Failed,
+}
+
 impl Drop for LatencyTimer {
    fn drop(&mut self) {
        let duration = self
            .stop
            .unwrap_or_else(time::Instant::now)
            .duration_since(self.start);
-        // Excluding cplane communication from the accumulated time.
-        COMPUTE_CONNECTION_LATENCY
-            .with_label_values(&[
-                self.protocol,
-                self.cold_start_info.as_str(),
-                self.outcome.as_str(),
-                "client",
-            ])
-            .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
+
+        let metric = &Metrics::get().proxy.compute_connection_latency_seconds; // one histogram, observed twice below with different `excluded` labels
+
+        // Excluding client communication from the accumulated time.
+        metric.observe(
+            ComputeConnectionLatencyGroup {
+                protocol: self.protocol,
+                cold_start_info: self.cold_start_info,
+                outcome: self.outcome,
+                excluded: LatencyExclusions::Client,
+            },
+            duration
+                .saturating_sub(self.accumulated.client) // clamps at zero instead of panicking on underflow
+                .as_secs_f64(),
+        );
+
        // Exclude client and cplane communication from the accumulated time.
        let accumulated_total = self.accumulated.client + self.accumulated.cplane;
-        COMPUTE_CONNECTION_LATENCY
-            .with_label_values(&[
-                self.protocol,
-                self.cold_start_info.as_str(),
-                self.outcome.as_str(),
-                "client_and_cplane",
-            ])
-            .observe((duration.saturating_sub(accumulated_total)).as_secs_f64());
+        metric.observe(
+            ComputeConnectionLatencyGroup {
+                protocol: self.protocol,
+                cold_start_info: self.cold_start_info,
+                outcome: self.outcome,
+                excluded: LatencyExclusions::ClientAndCplane,
+            },
+            duration.saturating_sub(accumulated_total).as_secs_f64(), // clamps at zero, same as above
+        );
    }
 }

-pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_total",
-        "Number of connection failures (per kind).",
-        &["kind"],
-    )
-    .unwrap()
-});
-
-pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_breakdown",
-        "Number of wake-up failures (per kind).",
-        &["retry", "kind"],
-    )
-    .unwrap()
-});
-
-pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes",
-        "Number of bytes sent/received between all clients and backends.",
-        &["direction"],
-    )
-    .unwrap()
-});
-
-pub const fn bool_to_str(x: bool) -> &'static str {
-    if x {
-        "true"
-    } else {
-        "false"
+impl From<bool> for Bool {
+    fn from(value: bool) -> Self { // maps `true` to `Bool::True` and `false` to `Bool::False`
+        if value {
+            Bool::True
+        } else {
+            Bool::False
+        }
    }
 }

-pub static CONNECTING_ENDPOINTS: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
-    register_hll_vec!(
-        32,
-        "proxy_connecting_endpoints",
-        "HLL approximate cardinality of endpoints that are connecting",
-        &["protocol"],
-    )
-    .unwrap()
-});
+#[derive(LabelGroup)]
+#[label(set = InvalidEndpointsSet)] // `InvalidEndpointsSet` is the label set of `invalid_endpoints_total`
+pub struct InvalidEndpointsGroup { // three labels: protocol, rejected, outcome
+    pub protocol: Protocol,
+    pub rejected: Bool,
+    pub outcome: ConnectOutcome,
+}

-pub static ERROR_BY_KIND: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_errors_total",
-        "Number of errors by a given classification",
-        &["type"],
-    )
-    .unwrap()
-});
+#[derive(LabelGroup)]
+#[label(set = RetriesMetricSet)] // `RetriesMetricSet` is the label set of the `retries_metric` histogram
+pub struct RetriesMetricGroup { // two labels: outcome and retry_type
+    pub outcome: ConnectOutcome,
+    pub retry_type: RetryType,
+}

-pub static ENDPOINT_ERRORS_BY_KIND: Lazy<HyperLogLogVec<32>> = Lazy::new(|| {
-    register_hll_vec!(
-        32,
-        "proxy_endpoints_affected_by_errors",
-        "Number of endpoints affected by errors of a given classification",
-        &["type"],
-    )
-    .unwrap()
-});
-
-pub static REDIS_BROKEN_MESSAGES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_redis_errors_total",
-        "Number of errors by a given classification",
-        &["channel"],
-    )
-    .unwrap()
-});
-
-pub static TLS_HANDSHAKE_FAILURES: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "proxy_tls_handshake_failures",
-        "Number of TLS handshake failures",
-    )
-    .unwrap()
-});
-
-pub static ENDPOINTS_AUTH_RATE_LIMITED: Lazy<HyperLogLog<32>> = Lazy::new(|| {
-    register_hll!(
-        32,
-        "proxy_endpoints_auth_rate_limits",
-        "Number of endpoints affected by authentication rate limits",
-    )
-    .unwrap()
-});
-
-pub static AUTH_RATE_LIMIT_HITS: Lazy<IntCounter> = Lazy::new(|| {
-    register_int_counter!(
-        "proxy_requests_auth_rate_limits_total",
-        "Number of connection requests affected by authentication rate limits",
-    )
-    .unwrap()
-});
+#[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
+pub enum RetryType { // type of the `retry_type` field in `RetriesMetricGroup`
+    WakeCompute,
+    ConnectToCompute,
+}
--- a/Show More
+++ b/Show More