Move saving of stdin descriptor

Check if walredo pipe was recreated by some other backend before killing walredo process
Fix case for array of jsons (#5523)
2026-03-14 05:40:38 +00:00 · 2023-10-13 09:16:52 +03:00 · 2023-10-12 22:53:27 +03:00 · 2023-10-12 14:32:49 +02:00 · 2023-10-12 11:41:07 +02:00 · 2023-10-11 16:24:36 +01:00
152 changed files with 8202 additions and 4771 deletions
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -76,8 +76,8 @@ runs:
          rm -f ${ALLURE_ZIP}
        fi
      env:
-        ALLURE_VERSION: 2.23.1
-        ALLURE_ZIP_SHA256: 11141bfe727504b3fd80c0f9801eb317407fd0ac983ebb57e671f14bac4bcd86
+        ALLURE_VERSION: 2.24.0
+        ALLURE_ZIP_SHA256: 60b1d6ce65d9ef24b23cf9c2c19fd736a123487c38e54759f1ed1a7a77353c90

    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
    - name: Acquire lock
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -834,7 +834,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.17.11
+      VM_BUILDER_VERSION: v0.17.12

    steps:
      - name: Checkout
@@ -1092,8 +1092,10 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
-          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
+
            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging supports different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
+          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
--- a/2
+++ b/2
@@ -5,7 +5,7 @@
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling @neondatabase/compute
-/pageserver/ @neondatabase/compute @neondatabase/storage
+/pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -798,6 +798,22 @@ dependencies = [
 "either",
 ]

+[[package]]
+name = "camino"
+version = "1.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c"
+
+[[package]]
+name = "camino-tempfile"
+version = "1.0.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d2ab15a83d13f75dbd86f082bdefd160b628476ef58d3b900a0ef74e001bb097"
+dependencies = [
+ "camino",
+ "tempfile",
+]
+
 [[package]]
 name = "cast"
 version = "0.3.0"
@@ -1053,6 +1069,7 @@ name = "control_plane"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "camino",
 "clap",
 "comfy-table",
 "compute_api",
@@ -1780,18 +1797,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"

 [[package]]
 name = "hermit-abi"
-version = "0.2.6"
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ee512640fe35acbfb4bb779db6f0d80704c2cacfa2e39b601ef3e3f47d1ae4c7"
-dependencies = [
- "libc",
-]
-
-[[package]]
-name = "hermit-abi"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286"
+checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7"

 [[package]]
 name = "hex"
@@ -2053,7 +2061,7 @@ version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2"
 dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
 "libc",
 "windows-sys 0.48.0",
 ]
@@ -2070,7 +2078,7 @@ version = "0.4.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "adcf93614601c8129ddf72e2d5633df827ba6551541c6d8c59520a371475be1f"
 dependencies = [
- "hermit-abi 0.3.1",
+ "hermit-abi",
 "io-lifetimes",
 "rustix 0.37.19",
 "windows-sys 0.48.0",
@@ -2444,11 +2452,11 @@ dependencies = [

 [[package]]
 name = "num_cpus"
-version = "1.15.0"
+version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fac9e2da13b5eb447a6ce3d392f23a29d8694bff781bf03a16cd9ac8697593b"
+checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43"
 dependencies = [
- "hermit-abi 0.2.6",
+ "hermit-abi",
 "libc",
 ]

@@ -2650,6 +2658,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "bytes",
+ "camino",
 "clap",
 "git-version",
 "pageserver",
@@ -2670,6 +2679,8 @@ dependencies = [
 "async-trait",
 "byteorder",
 "bytes",
+ "camino",
+ "camino-tempfile",
 "chrono",
 "clap",
 "close_fds",
@@ -2721,7 +2732,6 @@ dependencies = [
 "strum_macros",
 "svg_fmt",
 "sync_wrapper",
- "tempfile",
 "tenant_size_model",
 "thiserror",
 "tokio",
@@ -3246,6 +3256,7 @@ dependencies = [
 "reqwest-tracing",
 "routerify",
 "rstest",
+ "rustc-hash",
 "rustls",
 "rustls-pemfile",
 "scopeguard",
@@ -3413,6 +3424,8 @@ dependencies = [
 "aws-sdk-s3",
 "aws-smithy-http",
 "aws-types",
+ "camino",
+ "camino-tempfile",
 "hyper",
 "metrics",
 "once_cell",
@@ -3421,7 +3434,6 @@ dependencies = [
 "scopeguard",
 "serde",
 "serde_json",
- "tempfile",
 "test-context",
 "tokio",
 "tokio-util",
@@ -3773,6 +3785,8 @@ dependencies = [
 "async-trait",
 "byteorder",
 "bytes",
+ "camino",
+ "camino-tempfile",
 "chrono",
 "clap",
 "const_format",
@@ -3801,7 +3815,6 @@ dependencies = [
 "serde_with",
 "signal-hook",
 "storage_broker",
- "tempfile",
 "thiserror",
 "tokio",
 "tokio-io-timeout",
@@ -5100,6 +5113,8 @@ dependencies = [
 "bincode",
 "byteorder",
 "bytes",
+ "camino",
+ "camino-tempfile",
 "chrono",
 "const_format",
 "criterion",
@@ -5125,7 +5140,6 @@ dependencies = [
 "signal-hook",
 "strum",
 "strum_macros",
- "tempfile",
 "thiserror",
 "tokio",
 "tokio-stream",
@@ -5199,6 +5213,7 @@ name = "wal_craft"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "camino-tempfile",
 "clap",
 "env_logger",
 "log",
@@ -5206,7 +5221,6 @@ dependencies = [
 "postgres",
 "postgres_ffi",
 "regex",
- "tempfile",
 "utils",
 "workspace_hack",
 ]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -51,6 +51,7 @@ bindgen = "0.65"
 bstr = "1.0"
 byteorder = "1.4"
 bytes = "1.0"
+camino = "1.1.6"
 cfg-if = "1.0.0"
 chrono = { version = "0.4", default-features = false, features = ["clock"] }
 clap = { version = "4.0", features = ["derive"] }
@@ -107,6 +108,7 @@ reqwest-middleware = "0.2.0"
 reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
+rustc-hash = "1.1.0"
 rustls = "0.21"
 rustls-pemfile = "1"
 rustls-split = "0.3"
@@ -186,7 +188,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" }
 criterion = "0.5.1"
 rcgen = "0.11"
 rstest = "0.18"
-tempfile = "3.4"
+camino-tempfile = "1.0.2"
 tonic-build = "0.9"

 [patch.crates-io]
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -224,8 +224,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.0.tar.gz -O pgvector.tar.gz && \
-    echo "d8aa3504b215467ca528525a6de12c3f85f9891b091ce0e5864dd8a9b757f77b pgvector.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.5.1.tar.gz -O pgvector.tar.gz && \
+    echo "cc7a8e034a96e30a819911ac79d32f6bc47bdd1aa2de4d7d4904e26b83209dc8 pgvector.tar.gz" | sha256sum --check && \
    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
@@ -368,8 +368,8 @@ RUN wget https://github.com/citusdata/postgresql-hll/archive/refs/tags/v2.18.tar
 FROM build-deps AS plpgsql-check-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/

-RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.4.0.tar.gz -O plpgsql_check.tar.gz && \
-    echo "9ba58387a279b35a3bfa39ee611e5684e6cddb2ba046ddb2c5190b3bd2ca254a plpgsql_check.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \
+    echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \
    mkdir plpgsql_check-src && cd plpgsql_check-src && tar xvzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \
@@ -615,11 +615,7 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
 #########################################################################################
 #
 # Layer "rust extensions"
-# This layer is used to build `pgx` deps
-#
-# FIXME: This needs to be updated to latest version of 'pgrx' (it was renamed from
-# 'pgx' to 'pgrx') for PostgreSQL 16. And that in turn requires bumping the pgx
-# dependency on all the rust extension that depend on it, too.
+# This layer is used to build `pgrx` deps
 #
 #########################################################################################
 FROM build-deps AS rust-extensions-build
@@ -635,22 +631,12 @@ USER nonroot
 WORKDIR /home/nonroot
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version ${PG_VERSION}" && exit 1 \
-        ;; \
-    esac && \
-    curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
+RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \
    chmod +x rustup-init && \
    ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \
    rm rustup-init && \
-    cargo install --locked --version 0.7.3 cargo-pgx && \
-    /bin/bash -c 'cargo pgx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'
+    cargo install --locked --version 0.10.2 cargo-pgrx && \
+    /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config'

 USER root

@@ -664,23 +650,11 @@ USER root
 FROM rust-extensions-build AS pg-jsonschema-pg-build
 ARG PG_VERSION

-# caeab60d70b2fd3ae421ec66466a3abbb37b7ee6 made on 06/03/2023
-# there is no release tag yet, but we need it due to the superuser fix in the control file, switch to git tag after release >= 0.1.5
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version \"${PG_VERSION}\"" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/supabase/pg_jsonschema/archive/caeab60d70b2fd3ae421ec66466a3abbb37b7ee6.tar.gz -O pg_jsonschema.tar.gz && \
-    echo "54129ce2e7ee7a585648dbb4cef6d73f795d94fe72f248ac01119992518469a4 pg_jsonschema.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.2.0.tar.gz -O pg_jsonschema.tar.gz && \
+    echo "9118fc508a6e231e7a39acaa6f066fcd79af17a5db757b47d2eefbe14f7794f0 pg_jsonschema.tar.gz" | sha256sum --check && \
    mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xvzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control

 #########################################################################################
@@ -693,26 +667,11 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-graphql-pg-build
 ARG PG_VERSION

-# b4988843647450a153439be367168ed09971af85 made on 22/02/2023 (from remove-pgx-contrib-spiext branch)
-# Currently pgx version bump to >= 0.7.2  causes "call to unsafe function" compliation errors in
-# pgx-contrib-spiext. There is a branch that removes that dependency, so use it. It is on the
-# same 1.1 version we've used before.
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/yrashk/pg_graphql/archive/b4988843647450a153439be367168ed09971af85.tar.gz -O pg_graphql.tar.gz && \
-    echo "0c7b0e746441b2ec24187d0e03555faf935c2159e2839bddd14df6dafbc8c9bd pg_graphql.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.4.0.tar.gz -O pg_graphql.tar.gz && \
+    echo "bd8dc7230282b3efa9ae5baf053a54151ed0e66881c7c53750e2d0c765776edc pg_graphql.tar.gz" | sha256sum --check && \
    mkdir pg_graphql-src && cd pg_graphql-src && tar xvzf ../pg_graphql.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx = "~0.7.1"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    sed -i 's/pgx-tests = "~0.7.1"/pgx-tests = "0.7.3"/g' Cargo.toml && \
-    cargo pgx install --release && \
+    sed -i 's/pgrx = "=0.10.2"/pgrx = { version = "0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    # it's needed to enable extension because it uses untrusted C language
    sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_graphql.control
@@ -727,21 +686,11 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-tiktoken-pg-build
 ARG PG_VERSION

-# 801f84f08c6881c8aa30f405fafbf00eec386a72 made on 10/03/2023
-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/kelvich/pg_tiktoken/archive/801f84f08c6881c8aa30f405fafbf00eec386a72.tar.gz -O pg_tiktoken.tar.gz && \
-    echo "52f60ac800993a49aa8c609961842b611b6b1949717b69ce2ec9117117e16e4a pg_tiktoken.tar.gz" | sha256sum --check && \
+# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023
+RUN wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \
+    echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \
    mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xvzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \
-    cargo pgx install --release && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control

 #########################################################################################
@@ -754,21 +703,15 @@ RUN case "${PG_VERSION}" in \
 FROM rust-extensions-build AS pg-pgx-ulid-build
 ARG PG_VERSION

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        ;; \
-      "v16") \
-        echo "TODO: Not yet supported for PostgreSQL 16. Need to update pgrx dependencies" && exit 0 \
-	;; \
-      *) \
-        echo "unexpected PostgreSQL version" && exit 1 \
-        ;; \
-    esac && \
-    wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -O pgx_ulid.tar.gz && \
-    echo "908b7358e6f846e87db508ae5349fb56a88ee6305519074b12f3d5b0ff09f791 pgx_ulid.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -O pgx_ulid.tar.gz && \
+    echo "ee5db82945d2d9f2d15597a80cf32de9dca67b897f605beb830561705f12683c pgx_ulid.tar.gz" | sha256sum --check && \
    mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
-    sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
-    cargo pgx install --release && \
+    echo "******************* Apply a patch for Postgres 16 support; delete in the next release ******************" && \
+    wget https://github.com/pksunkara/pgx_ulid/commit/f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    patch -p1 < f84954cf63fc8c80d964ac970d9eceed3c791196.patch && \
+    echo "********************************************************************************************************" && \
+    sed -i 's/pgrx       = "=0.10.2"/pgrx = { version = "=0.10.2", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
+    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

 #########################################################################################
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1039,7 +1039,7 @@ LIMIT 100",
        let remote_extensions = spec
            .remote_extensions
            .as_ref()
-            .ok_or(anyhow::anyhow!("Remote extensions are not configured",))?;
+            .ok_or(anyhow::anyhow!("Remote extensions are not configured"))?;

        info!("parse shared_preload_libraries from spec.cluster.settings");
        let mut libs_vec = Vec::new();
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -1,5 +1,5 @@
 use std::sync::Arc;
-use std::{thread, time};
+use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
@@ -7,7 +7,7 @@ use tracing::{debug, info};

 use crate::compute::ComputeNode;

-const MONITOR_CHECK_INTERVAL: u64 = 500; // milliseconds
+const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
@@ -17,13 +17,12 @@ fn watch_compute_activity(compute: &ComputeNode) {
    let connstr = compute.connstr.as_str();
    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = Client::connect(connstr, NoTls);
-    let timeout = time::Duration::from_millis(MONITOR_CHECK_INTERVAL);

    info!("watching Postgres activity at {}", connstr);

    loop {
        // Should be outside of the write lock to allow others to read while we sleep.
-        thread::sleep(timeout);
+        thread::sleep(MONITOR_CHECK_INTERVAL);

        match &mut client {
            Ok(cli) => {
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+camino.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
 git-version.workspace = true
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,5 +1,6 @@
 use crate::{background_process, local_env::LocalEnv};
 use anyhow::anyhow;
+use camino::Utf8PathBuf;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 use std::{path::PathBuf, process::Child};
@@ -47,8 +48,9 @@ impl AttachmentService {
        }
    }

-    fn pid_file(&self) -> PathBuf {
-        self.env.base_data_dir.join("attachment_service.pid")
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("attachment_service.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self) -> anyhow::Result<Child> {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -16,12 +16,13 @@ use std::ffi::OsStr;
 use std::io::Write;
 use std::os::unix::prelude::AsRawFd;
 use std::os::unix::process::CommandExt;
-use std::path::{Path, PathBuf};
+use std::path::Path;
 use std::process::{Child, Command};
 use std::time::Duration;
 use std::{fs, io, thread};

 use anyhow::Context;
+use camino::{Utf8Path, Utf8PathBuf};
 use nix::errno::Errno;
 use nix::fcntl::{FcntlArg, FdFlag};
 use nix::sys::signal::{kill, Signal};
@@ -45,9 +46,9 @@ const NOTICE_AFTER_RETRIES: u64 = 50;
 /// it itself.
 pub enum InitialPidFile<'t> {
    /// Create a pidfile, to allow future CLI invocations to manipulate the process.
-    Create(&'t Path),
+    Create(&'t Utf8Path),
    /// The process will create the pidfile itself, need to wait for that event.
-    Expect(&'t Path),
+    Expect(&'t Utf8Path),
 }

 /// Start a background child process using the parameters given.
@@ -137,7 +138,11 @@ where
 }

 /// Stops the process, using the pid file given. Returns Ok also if the process is already not running.
-pub fn stop_process(immediate: bool, process_name: &str, pid_file: &Path) -> anyhow::Result<()> {
+pub fn stop_process(
+    immediate: bool,
+    process_name: &str,
+    pid_file: &Utf8Path,
+) -> anyhow::Result<()> {
    let pid = match pid_file::read(pid_file)
        .with_context(|| format!("read pid_file {pid_file:?}"))?
    {
@@ -252,9 +257,9 @@ fn fill_aws_secrets_vars(mut cmd: &mut Command) -> &mut Command {
 ///    will remain held until the cmd exits.
 fn pre_exec_create_pidfile<P>(cmd: &mut Command, path: P) -> &mut Command
 where
-    P: Into<PathBuf>,
+    P: Into<Utf8PathBuf>,
 {
-    let path: PathBuf = path.into();
+    let path: Utf8PathBuf = path.into();
    // SAFETY
    // pre_exec is marked unsafe because it runs between fork and exec.
    // Why is that dangerous in various ways?
@@ -311,7 +316,7 @@ where

 fn process_started<F>(
    pid: Pid,
-    pid_file_to_check: Option<&Path>,
+    pid_file_to_check: Option<&Utf8Path>,
    status_check: &F,
 ) -> anyhow::Result<bool>
 where
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -116,6 +116,7 @@ fn main() -> Result<()> {
            "attachment_service" => handle_attachment_service(sub_args, &env),
            "safekeeper" => handle_safekeeper(sub_args, &env),
            "endpoint" => handle_endpoint(sub_args, &env),
+            "mappings" => handle_mappings(sub_args, &mut env),
            "pg" => bail!("'pg' subcommand has been renamed to 'endpoint'"),
            _ => bail!("unexpected subcommand {sub_name}"),
        };
@@ -816,6 +817,38 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    Ok(())
 }

+fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> {
+    let (sub_name, sub_args) = match sub_match.subcommand() {
+        Some(ep_subcommand_data) => ep_subcommand_data,
+        None => bail!("no mappings subcommand provided"),
+    };
+
+    match sub_name {
+        "map" => {
+            let branch_name = sub_args
+                .get_one::<String>("branch-name")
+                .expect("branch-name argument missing");
+
+            let tenant_id = sub_args
+                .get_one::<String>("tenant-id")
+                .map(|x| TenantId::from_str(x))
+                .expect("tenant-id argument missing")
+                .expect("malformed tenant-id arg");
+
+            let timeline_id = sub_args
+                .get_one::<String>("timeline-id")
+                .map(|x| TimelineId::from_str(x))
+                .expect("timeline-id argument missing")
+                .expect("malformed timeline-id arg");
+
+            env.register_branch_mapping(branch_name.to_owned(), tenant_id, timeline_id)?;
+
+            Ok(())
+        }
+        other => unimplemented!("mappings subcommand {other}"),
+    }
+}
+
 fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageServerNode> {
        let node_id = if let Some(id_str) = args.get_one::<String>("pageserver-id") {
@@ -1084,6 +1117,7 @@ fn cli() -> Command {
    // --id, when using a pageserver command
    let pageserver_id_arg = Arg::new("pageserver-id")
        .long("id")
+        .global(true)
        .help("pageserver id")
        .required(false);
    // --pageserver-id when using a non-pageserver command
@@ -1254,17 +1288,20 @@ fn cli() -> Command {
            Command::new("pageserver")
                .arg_required_else_help(true)
                .about("Manage pageserver")
+                .arg(pageserver_id_arg)
                .subcommand(Command::new("status"))
-                .arg(pageserver_id_arg.clone())
-                .subcommand(Command::new("start").about("Start local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
-                .subcommand(Command::new("stop").about("Stop local pageserver")
-                .arg(pageserver_id_arg.clone())
-                            .arg(stop_mode_arg.clone()))
-                .subcommand(Command::new("restart").about("Restart local pageserver")
-                .arg(pageserver_id_arg.clone())
-                .arg(pageserver_config_args.clone()))
+                .subcommand(Command::new("start")
+                    .about("Start local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
+                .subcommand(Command::new("stop")
+                    .about("Stop local pageserver")
+                    .arg(stop_mode_arg.clone())
+                )
+                .subcommand(Command::new("restart")
+                    .about("Restart local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
        )
        .subcommand(
            Command::new("attachment_service")
@@ -1321,8 +1358,8 @@ fn cli() -> Command {
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg)
-                    .arg(timeline_id_arg)
+                    .arg(branch_name_arg.clone())
+                    .arg(timeline_id_arg.clone())
                    .arg(lsn_arg)
                    .arg(pg_port_arg)
                    .arg(http_port_arg)
@@ -1335,7 +1372,7 @@ fn cli() -> Command {
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg)
+                    .arg(tenant_id_arg.clone())
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
@@ -1346,6 +1383,18 @@ fn cli() -> Command {
                )

        )
+        .subcommand(
+            Command::new("mappings")
+                .arg_required_else_help(true)
+                .about("Manage neon_local branch name mappings")
+                .subcommand(
+                    Command::new("map")
+                        .about("Create new mapping which cannot exist already")
+                        .arg(branch_name_arg.clone())
+                        .arg(tenant_id_arg.clone())
+                        .arg(timeline_id_arg.clone())
+                )
+        )
        // Obsolete old name for 'endpoint'. We now just print an error if it's used.
        .subcommand(
            Command::new("pg")
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -7,7 +7,7 @@
 //! ```
 use anyhow::Context;

-use std::path::PathBuf;
+use camino::Utf8PathBuf;

 use crate::{background_process, local_env};

@@ -30,7 +30,7 @@ pub fn start_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
        || {
            let url = broker.client_url();
            let status_url = url.join("status").with_context(|| {
-                format!("Failed to append /status path to broker endpoint {url}",)
+                format!("Failed to append /status path to broker endpoint {url}")
            })?;
            let request = client
                .get(status_url)
@@ -50,6 +50,7 @@ pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
    background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
 }

-fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> PathBuf {
-    env.base_data_dir.join("storage_broker.pid")
+fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> Utf8PathBuf {
+    Utf8PathBuf::from_path_buf(env.base_data_dir.join("storage_broker.pid"))
+        .expect("non-Unicode path")
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -14,6 +14,7 @@ use std::process::{Child, Command};
 use std::{io, result};

 use anyhow::{bail, Context};
+use camino::Utf8PathBuf;
 use pageserver_api::models::{self, TenantInfo, TimelineInfo};
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
@@ -144,7 +145,7 @@ impl PageServerNode {
    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
        self.pageserver_init(config_overrides)
-            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id,))
+            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
    }

    pub fn repo_path(&self) -> PathBuf {
@@ -154,8 +155,9 @@ impl PageServerNode {
    /// The pid file is created by the pageserver process, with its pid stored inside.
    /// Other pageservers cannot lock the same file and overwrite it for as long as the current
    /// pageserver runs. (Unless someone removes the file manually; never do that!)
-    fn pid_file(&self) -> PathBuf {
-        self.repo_path().join("pageserver.pid")
+    fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.repo_path().join("pageserver.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self, config_overrides: &[&str]) -> anyhow::Result<Child> {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -11,6 +11,7 @@ use std::process::Child;
 use std::{io, result};

 use anyhow::Context;
+use camino::Utf8PathBuf;
 use postgres_connection::PgConnectionConfig;
 use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
@@ -97,8 +98,9 @@ impl SafekeeperNode {
        SafekeeperNode::datadir_path_by_id(&self.env, self.id)
    }

-    pub fn pid_file(&self) -> PathBuf {
-        self.datadir_path().join("safekeeper.pid")
+    pub fn pid_file(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.datadir_path().join("safekeeper.pid"))
+            .expect("non-Unicode path")
    }

    pub fn start(&self, extra_opts: Vec<String>) -> anyhow::Result<Child> {
--- a/deny.toml
+++ b/deny.toml
@@ -23,7 +23,7 @@ vulnerability = "deny"
 unmaintained = "warn"
 yanked = "warn"
 notice = "warn"
-ignore = ["RUSTSEC-2023-0052"]
+ignore = []

 # This section is considered when running `cargo deny check licenses`
 # More documentation for the licenses section can be found here:
--- a/docs/rfcs/028-pageserver-migration.md
+++ b/docs/rfcs/028-pageserver-migration.md
@@ -0,0 +1,599 @@
+# Seamless tenant migration
+
+- Author: john@neon.tech
+- Created on 2023-08-11
+- Implemented on ..
+
+## Summary
+
+The preceding [generation numbers RFC](025-generation-numbers.md) may be thought of as "making tenant
+migration safe". Following that,
+this RFC is about how those migrations are to be done:
+
+1. Seamlessly (without interruption to client availability)
+2. Quickly (enabling faster operations)
+3. Efficiently (minimizing I/O and $ cost)
+
+These points are in priority order: if we have to sacrifice
+efficiency to make a migration seamless for clients, we will
+do so, etc.
+
+This is accomplished by introducing two high level changes:
+
+- A dual-attached state for tenants, used in a control-plane-orchestrated
+  migration procedure that preserves availability during a migration.
+- Warm secondary locations for tenants, where on-disk content is primed
+  for a fast migration of the tenant from its current attachment to this
+  secondary location.
+
+## Motivation
+
+Migrating tenants between pageservers is essential to operating a service
+at scale, in several contexts:
+
+1. Responding to a pageserver node failure by migrating tenants to other pageservers
+2. Balancing load and capacity across pageservers, for example when a user expands their
+   database and they need to migrate to a pageserver with more capacity.
+3. Restarting pageservers for upgrades and maintenance
+
+Currently, the steps for a migration are:
+
+- detach from old node; skip if old node is dead; (the [skip part is still WIP](https://github.com/neondatabase/cloud/issues/5426)).
+- attach to new node
+- re-configure endpoints to use the new node
+
+Once [generation numbers](025-generation-numbers.md) are implemented,
+the detach step is no longer critical for correctness. So, we can
+
+- attach to a new node,
+- re-configure endpoints to use the new node, and then
+- detach from the old node.
+
+However, this still does not meet our seamless/fast/efficient goals:
+
+- Not fast: The new node will have to download potentially large amounts
+  of data from S3, which may take many minutes.
+- Not seamless: If we attach to a new pageserver before detaching an old one,
+  the new one might delete some objects that interrupt availability of reads on the old one.
+- Not efficient: the old pageserver will continue uploading
+  S3 content during the migration that will never be read.
+
+The user expectations for availability are:
+
+- For planned maintenance, there should be zero availability
+  gap. This expectation is fulfilled by this RFC.
+- For unplanned changes (e.g. node failures), there should be
+  minimal availability gap. This RFC provides the _mechanism_
+  to fail over quickly, but does not provide the failure _detection_
+  nor failover _policy_.
+
+## Non Goals
+
+- Defining service tiers with different storage strategies: the same
+  level of HA & overhead will apply to all tenants. This doesn't rule out
+  adding such tiers in future.
+- Enabling pageserver failover in the absence of a control plane: the control
+  plane will remain the source of truth for what should be attached where.
+- Totally avoiding availability gaps on unplanned migrations during
+  a failure (we expect a small, bounded window of
+  read unavailability of very recent LSNs)
+- Workload balancing: this RFC defines the mechanism for moving tenants
+  around, not the higher level logic for deciding who goes where.
+- Defining all possible configuration flows for tenants: the migration process
+  defined in this RFC demonstrates the sufficiency of the pageserver API, but
+  is not the only kind of configuration change the control plane will ever do.
+  The APIs defined here should let the control plane move tenants around in
+  whatever way is needed while preserving data safety and read availability.
+
+## Impacted components
+
+Pageserver, control plane
+
+## Terminology
+
+- **Attachment**: a tenant is _attached_ to a pageserver if it has
+  been issued a generation number, and is running an instance of
+  the `Tenant` type, ingesting the WAL, and available to serve
+  page reads.
+- **Location**: locations are a superset of attachments. A location
+  is a combination of a tenant and a pageserver. We may _attach_ at a _location_.
+
+- **Secondary location**: a location which is not currently attached.
+- **Warm secondary location**: a location which is not currently attached, but is endeavoring to maintain a warm local cache of layers. We avoid calling this a _warm standby_ to avoid confusion with similar postgres features.
+
+## Implementation (high level)
+
+### Warm secondary locations
+
+To enable faster migrations, we will identify at least one _secondary location_
+for each tenant. This secondary location will keep a warm cache of layers
+for the tenant, so that if it is later attached, it can catch up with the
+latest LSN quickly: rather than downloading everything, it only has to replay
+the recent part of the WAL to advance from the remote_consistent_offset to the
+most recent LSN in the WAL.
+
+The control plane is responsible for selecting secondary locations, and
+calling into pageservers to configure tenants into a secondary mode at this
+new location, as well as attaching the tenant in its existing primary location.
+
+The attached pageserver for a tenant will publish a [layer heatmap](#layer-heatmap)
+to advise secondaries of which layers should be downloaded.
+
+### Location modes
+
+Currently, we consider a tenant to be in one of two states on a pageserver:
+
+- Attached: active `Tenant` object, and layers on local disk
+- Detached: no layers on local disk, no runtime state.
+
+We will extend this with finer-grained modes, whose purpose will become
+clear in later sections:
+
+- **AttachedSingle**: equivalent to the existing attached state.
+- **AttachedMulti**: like AttachedSingle, holds an up to date generation, but
+  does not do deletions.
+- **AttachedStale**: like AttachedSingle, but holds a stale generation
+  and does not do any remote storage operations.
+- **Secondary**: keep local state on disk, periodically update from S3.
+- **Detached**: equivalent to existing detached state.
+
+To control these finer grained states, a new pageserver API endpoint will be added.
+
+### Cutover procedure
+
+Define old location and new location as "Node A" and "Node B". Consider
+the case where both nodes are available, and Node B was previously configured
+as a secondary location for the tenant we are migrating.
+
+The cutover procedure is orchestrated by the control plane, calling into
+the pageservers' APIs:
+
+1. Call to Node A requesting it to flush to S3 and enter AttachedStale state
+2. Increment generation, and call to Node B requesting it to enter AttachedMulti
+   state with the new generation.
+3. Call to Node B, requesting it to download the latest hot layers from remote storage,
+   according to the latest heatmap flushed by Node A.
+4. Wait for Node B's WAL ingestion to catch up with node A's
+5. Update endpoints to use node B instead of node A
+6. Call to node B requesting it to enter state AttachedSingle.
+7. Call to node A requesting it to enter state Secondary
+
+The following table summarizes how the state of the system advances:
+
+|     Step      |     Node A     |     Node B     | Node used by endpoints |
+| :-----------: | :------------: | :------------: | :--------------------: |
+| 1 (_initial_) | AttachedSingle |   Secondary    |           A            |
+|       2       | AttachedStale  | AttachedMulti  |           A            |
+|       3       | AttachedStale  | AttachedMulti  |           A            |
+|       4       | AttachedStale  | AttachedMulti  |           A            |
+| 5 (_cutover_) | AttachedStale  | AttachedMulti  |           B            |
+|       6       | AttachedStale  | AttachedSingle |           B            |
+|  7 (_final_)  |   Secondary    | AttachedSingle |           B            |
+
+The procedure described for a clean handover from a live node to a secondary
+is also used for failure cases and for migrations to a location that is not
+configured as a secondary, by simply skipping irrelevant steps, as described in
+the following sections.
+
+#### Migration from an unresponsive node
+
+If node A is unavailable, then all calls into
+node A are skipped and we don't wait for B to catch up before
+updating the endpoints to use B.
+
+#### Migration to a location that is not a secondary
+
+If node B is initially in Detached state, the procedure is identical. Since Node B
+is coming from a Detached state rather than Secondary, the download of layers and
+catch up with WAL will take much longer.
+
+We might do this if:
+
+- Attached and secondary locations are both critically low on disk, and we need
+  to migrate to a third node with more resources available.
+- We are migrating a tenant which does not use secondary locations to save on cost.
+
+#### Permanent migration away from a node
+
+In the final step of the migration, we generally request the original node to enter a Secondary
+state. This is typical if we are doing a planned migration during maintenance, or to
+balance CPU/network load away from a node.
+
+One might also want to permanently migrate away: this can be done by simply removing the secondary
+location after the migration is complete, or as an optimization by substituting the Detached state
+for the Secondary state in the final step.
+
+#### Cutover diagram
+
+```mermaid
+sequenceDiagram
+participant CP as Control plane
+participant A as Node A
+participant B as Node B
+participant E as Endpoint
+
+CP->>A: PUT Flush & go to AttachedStale
+note right of A: A continues to ingest WAL
+CP->>B: PUT AttachedMulti
+CP->>B: PUT Download layers from latest heatmap
+note right of B: B downloads from S3
+loop Poll until download complete
+CP->>B: GET download status
+end
+activate B
+note right of B: B ingests WAL
+loop Poll until catch up
+CP->>B: GET visible WAL
+CP->>A: GET visible WAL
+end
+deactivate B
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+CP->>B: PUT AttachedSingle
+CP->>A: PUT Secondary
+```
+
+#### Cutover from an unavailable pageserver
+
+This case is far simpler: we may skip straight to our intended
+end state.
+
+```mermaid
+sequenceDiagram
+participant A as Node A
+participant CP as Control plane
+participant B as Node B
+participant E as Endpoint
+
+note right of A: Node A offline
+activate A
+CP->>B: PUT AttachedSingle
+CP->>E: Configure to use Node B
+E->>B: Connect for reads
+deactivate A
+```
+
+## Implementation (detail)
+
+### Purpose of AttachedMulti, AttachedStale
+
+#### AttachedMulti
+
+Ordinarily, an attached pageserver whose generation is the latest may delete
+layers at will (e.g. during compaction). If a previous generation pageserver
+is also still attached, and in use by endpoints, then this layer deletion could
+lead to a loss of availability for the endpoint when reading from the previous
+generation pageserver.
+
+The _AttachedMulti_ state simply disables deletions. These will be enqueued
+in `RemoteTimelineClient` until the control plane transitions the
+node into AttachedSingle, which unblocks deletions.  Other remote storage operations
+such as uploads are not blocked.
+
+AttachedMulti is not required for data safety, only to preserve availability
+on pageservers running with stale generations.
+
+A node enters AttachedMulti only when explicitly asked to by the control plane. It should
+only remain in this state for the duration of a migration.
+
+If a control plane bug leaves
+the node in AttachedMulti for a long time, then we must avoid unbounded memory use from enqueued
+deletions. This may be accomplished simply, by dropping enqueued deletions when some modest
+threshold of delayed deletions (e.g. 10k layers per tenant) is reached. As with all deletions,
+it is safe to skip them, and the leaked objects will be eventually cleaned up by scrub or
+by timeline deletion.
+
+During AttachedMulti, the Tenant is free to drop layers from local disk in response to
+disk pressure: only the deletion of remote layers is blocked.
+
+#### AttachedStale
+
+Currently, a pageserver with a stale generation number will continue to
+upload layers, but be prevented from completing deletions. This is safe, but inefficient: layers uploaded by this stale generation
+will not be read back by future generations of pageservers.
+
+The _AttachedStale_ state disables S3 uploads. The stale pageserver
+will continue to ingest the WAL and write layers to local disk, but will not
+do any uploads to S3.
+
+A node may enter AttachedStale in two ways:
+
+- Explicitly, when control plane calls into the node at the start of a migration.
+- Implicitly, when the node tries to validate some deletions and discovers
+  that its generation is stale.
+
+The AttachedStale state also disables sending consumption metrics from
+that location: it is interpreted as an indication that some other pageserver
+is already attached or is about to be attached, and that new pageserver will
+be responsible for sending consumption metrics.
+
+#### Disk Pressure & AttachedStale
+
+Over long periods of time, a tenant location in AttachedStale will accumulate data
+on local disk, as it cannot evict any layers written since it entered the
+AttachedStale state. We rely on the control plane to revert the location to
+Secondary or Detached at the end of a migration.
+
+This scenario is particularly noteworthy when evacuating all tenants on a pageserver:
+since _all_ the attached tenants will go into AttachedStale, we will be doing no
+uploads at all, therefore ingested data will cause disk usage to increase continuously.
+Under nominal conditions, the available disk space on pageservers should be sufficient
+to complete the evacuation before this becomes a problem, but we must also handle
+the case where we hit a low disk situation while in this state.
+
+The concept of disk pressure already exists in the pageserver: the `disk_usage_eviction_task`
+touches each Tenant when it determines that a low-disk condition requires
+some layer eviction. Having selected layers for eviction, the eviction
+task calls `Timeline::evict_layers`.
+
+**Safety**: If evict_layers is called while in AttachedStale state, and some of the to-be-evicted
+layers are not yet uploaded to S3, then the block on uploads will be lifted. This
+will result in leaking some objects once a migration is complete, but will enable
+the node to manage its disk space properly: if a node is left with some tenants
+in AttachedStale indefinitely due to a network partition or control plane bug,
+these tenants will not cause a full disk condition.
+
+### Warm secondary updates
+
+#### Layer heatmap
+
+The secondary location's job is to serve reads **with the same quality of service as the original location
+was serving them around the time of a migration**. This does not mean the secondary
+location needs the whole set of layers: inactive layers that might soon
+be evicted on the attached pageserver need not be downloaded by the
+secondary. A totally idle tenant only needs to maintain enough on-disk
+state to enable a fast cold start (i.e. the most recent image layers are
+typically sufficient).
+
+To enable this, we introduce the concept of a _layer heatmap_, which
+acts as an advisory input to secondary locations to decide which
+layers to download from S3.
+
+#### Attached pageserver
+
+The attached pageserver, if in state AttachedSingle, periodically
+uploads a serialized heat map to S3. It may skip this if there
+is no change since the last time it uploaded (e.g. if the tenant
+is totally idle).
+
+Additionally, when the tenant is flushed to remote storage prior to a migration
+(the first step in [cutover procedure](#cutover-procedure)), 
+the heatmap is written out. This enables a future attached pageserver
+to get an up to date view when deciding which layers to download.
+
+#### Secondary location behavior
+
+Secondary warm locations run a simple loop, implemented separately from
+the main `Tenant` type, which represents attached tenants:
+
+- Download the layer heatmap
+- Select any "hot enough" layers to download, if there is sufficient
+  free disk space.
+- Download layers, if they were not previously evicted (see below)
+- Download the latest index_part.json
+- Check if any layers currently on disk are no longer referenced by
+  IndexPart & delete them
+
+Note that the heatmap is only advisory: if a secondary location has plenty
+of disk space, it may choose to retain layers that aren't referenced
+by the heatmap, as long as they are still referenced by the IndexPart. Conversely,
+if a node is very low on disk space, it might opt to raise the heat threshold required
+to download a layer, until more disk space is available.
+
+#### Secondary locations & disk pressure
+
+Secondary locations are subject to eviction on disk pressure, just as
+attached locations are.  For eviction purposes, the access time of a
+layer in a secondary location will be the access time given in the heatmap,
+rather than the literal time at which the local layer file was accessed.
+
+The heatmap will indicate which layers are in local storage on the attached
+location.  The secondary will always attempt to get back to having that
+set of layers on disk, but to avoid flapping, it will remember the access
+time of the layer it was most recently asked to evict, and layers whose
+access time is below that will not be re-downloaded.
+
+The resulting behavior is that after a layer is evicted from a secondary
+location, it is only re-downloaded once the attached pageserver accesses
+the layer and uploads a heatmap reflecting that access time.  On a pageserver
+restart, the secondary location will attempt to download all layers in
+the heatmap again, if they are not on local disk.
+
+This behavior will be slightly different when secondary locations are
+used for "low energy tenants", but that is beyond the scope of this RFC.
+
+### Location configuration API
+
+Currently, the `/tenant/<tenant_id>/config` API defines various
+tunables like compaction settings, which apply to the tenant irrespective
+of which pageserver it is running on.
+
+A new "location config" structure will be introduced, which defines
+configuration which is per-tenant, but local to a particular pageserver,
+such as the attachment mode and whether it is a secondary.
+
+The pageserver will expose a new per-tenant API for setting
+the state: `/tenant/<tenant_id>/location/config`.
+
+Body content:
+
+```
+{
+  state: 'enum{Detached, Secondary, AttachedSingle, AttachedMulti, AttachedStale}',
+  generation: Option<u32>,
+  configuration: Option<TenantConfig>,
+  flush: bool
+}
+```
+
+The existing `/attach` and `/detach` endpoints will have the same
+behavior as calling `/location/config` with `AttachedSingle` and `Detached`
+states respectively. These endpoints will be deprecated and later
+removed.
+
+The generation attribute is mandatory for entering `AttachedSingle` or
+`AttachedMulti`.
+
+The configuration attribute is mandatory when entering any state other
+than `Detached`. This configuration is the same as the body for
+the existing `/tenant/<tenant_id>/config` endpoint.
+
+The `flush` argument indicates whether the pageservers should flush
+to S3 before proceeding: this only has any effect if the node is
+currently in AttachedSingle or AttachedMulti. This is used
+during the first phase of migration, when transitioning the
+old pageserver to AttachedStale.
+
+The `/re-attach` API response will be extended to include a `state` as
+well as a `generation`, enabling the pageserver to enter the
+correct state for each tenant on startup.
+
+### Database schema for locations
+
+A new table `ProjectLocation`:
+
+- pageserver_id: int
+- tenant_id: TenantId
+- generation: Option<int>
+- state: `enum(Secondary, AttachedSingle, AttachedMulti)`
+
+Notes:
+
+- It is legal for a Project to have zero `ProjectLocation`s
+- The `pageserver` column in `Project` now means "to which pageserver should
+  endpoints connect", rather than simply which pageserver is attached.
+- The `generation` column in `Project` remains, and is incremented and used
+  to set the generation of `ProjectLocation` rows when they are set into
+  an attached state.
+- The `Detached` state is implicitly represented as the absence of
+  a `ProjectLocation`.
+
+### Executing migrations
+
+Migrations will be implemented as Go functions, within the
+existing `Operation` framework in the control plane. These
+operations are persistent, such that they will always keep
+trying until completion: this property is important to avoid
+leaving garbage behind on pageservers, such as AttachedStale
+locations.
+
+### Recovery from failures during migration
+
+During migration, the control plane may encounter failures of either
+the original or new pageserver, or both:
+
+- If the original fails, skip past waiting for the new pageserver
+  to catch up, and put it into AttachedSingle immediately.
+- If the new node fails, put the old pageserver into Secondary
+  and then back into AttachedSingle (this has the effect of
+  retaining on-disk state and granting it a fresh generation number).
+- If both nodes fail, keep trying until one of them is available
+  again.
+
+### Control plane -> Pageserver reconciliation
+
+A migration may be done while the old node is unavailable,
+in which case the old node may still be running in an AttachedStale
+state.
+
+In this case, it is undesirable to have the migration `Operation`
+stay alive until the old node eventually comes back online
+and can be cleaned up. To handle this, the control plane
+should run a background reconciliation process to compare
+a pageserver's attachments with the database, and clean up
+any that shouldn't be there any more.
+
+Note that there will be no work to do if the old node was really
+offline, as during startup it will call into `/re-attach` and
+be updated that way. The reconciliation will only be needed
+if the node was unavailable but still running.
+
+## Alternatives considered
+
+### Only enabling secondary locations for tenants on a higher service tier
+
+This will make sense in future, especially for tiny databases that may be
+downloaded from S3 in milliseconds when needed.
+
+However, it is not wise to do it immediately, because pageservers contain
+a mixture of higher and lower tier workloads. If we had 1 tenant with
+a secondary location and 9 without, then those other 9 tenants will do
+a lot of I/O as they try to recover from S3, which may degrade the
+service of the tenant which had a secondary location.
+
+Until we segregate tenants of different service tiers onto different pageserver
+nodes, or implement & test QoS to ensure that tenants with secondaries are
+not harmed by tenants without, we should use the same failover approach
+for all the tenants.
+
+### Hot secondary locations (continuous WAL replay)
+
+Instead of secondary locations populating their caches from S3, we could
+have them consume the WAL from safekeepers. The downsides of this would be:
+
+- Double load on safekeepers, which are a less scalable service than S3
+- Secondary locations' on-disk state would end up subtly different to
+  the remote state, which would make synchronizing with S3 more complex/expensive
+  when going into attached state.
+
+The downside of only updating secondary locations from S3 is that we will
+have a delay during migration from replaying the LSN range between what's
+in S3 and what's in the pageserver. This range will be very small on
+planned migrations, as we have the old pageserver flush to S3 immediately
+before attaching the new pageserver. On unplanned migrations (old pageserver
+is unavailable), the range of LSNs to replay is bounded by the flush frequency
+on the old pageserver. However, the migration doesn't have to wait for the
+replay: it's just that not-yet-replayed LSNs will be unavailable for read
+until the new pageserver catches up.
+
+We expect that pageserver reads of the most recent LSNs will be relatively
+rare, as for an active endpoint those pages will usually still be in the postgres
+page cache: this leads us to prefer synchronizing from S3 on secondary
+locations, rather than consuming the WAL from safekeepers.
+
+### Cold secondary locations
+
+It is not functionally necessary to keep warm caches on secondary locations at all. However, if we do not, then
+we would experience a de-facto availability loss in unplanned migrations, as reads to the new node would take an extremely long time (many seconds, perhaps minutes).
+
+Warm caches on secondary locations are necessary to meet
+our availability goals.
+
+### Pageserver-granularity failover
+
+Instead of migrating tenants individually, we could have entire spare nodes,
+and on a node death, move all its work to one of these spares.
+
+This approach is avoided for several reasons:
+
+- we would still need fine-grained tenant migration for other
+  purposes such as balancing load
+- by sharing the spare capacity over many peers rather than one spare node,
+  these peers may use the capacity for other purposes, until it is needed
+  to handle migrated tenants. e.g. for keeping a deeper cache of their
+  attached tenants.
+
+### Readonly during migration
+
+We could simplify migrations by making both previous and new nodes go into a
+readonly state, then flush remote content from the previous node, then activate
+attachment on the secondary node.
+
+The downside to this approach is a potentially large gap in readability of
+recent LSNs while loading data onto the new node. To avoid this, it is worthwhile
+to incur the extra cost of double-replaying the WAL onto old and new nodes' local
+storage during a migration.
+
+### Peer-to-peer pageserver communication
+
+Rather than uploading the heatmap to S3, attached pageservers could make it
+available to peers.
+
+Currently, pageservers have no peer to peer communication, so adding this
+for heatmaps would incur significant overhead in deployment and configuration
+of the service, and ensuring that when a new pageserver is deployed, other
+pageservers are updated to be aware of it.
+
+As well as simplifying implementation, putting heatmaps in S3 will be useful
+for future analytics purposes -- gathering aggregated statistics on activity
+patterns across many tenants may be done directly from data in S3.
--- a/libs/consumption_metrics/src/lib.rs
+++ b/libs/consumption_metrics/src/lib.rs
@@ -107,7 +107,7 @@ pub const CHUNK_SIZE: usize = 1000;

 // Just a wrapper around a slice of events
 // to serialize it as `{"events" : [ ] }
-#[derive(serde::Serialize)]
+#[derive(serde::Serialize, serde::Deserialize)]
 pub struct EventChunk<'a, T: Clone> {
    pub events: std::borrow::Cow<'a, [T]>,
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,6 +10,7 @@ use serde_with::{serde_as, DisplayFromStr};
 use strum_macros;
 use utils::{
    completion,
+    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -218,6 +219,8 @@ impl std::ops::Deref for TenantCreateRequest {
    }
 }

+/// An alternative representation of `pageserver::tenant::TenantConf` with
+/// simpler types.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
@@ -243,6 +246,39 @@ pub struct TenantConfig {
    pub gc_feedback: Option<bool>,
 }

+/// A flattened analog of a `pageserver::tenant::LocationMode`, which
+/// lists out all possible states (and the virtual "Detached" state)
+/// in a flat form rather than using rust-style enums.
+#[derive(Serialize, Deserialize, Debug)]
+pub enum LocationConfigMode {
+    AttachedSingle,
+    AttachedMulti,
+    AttachedStale,
+    Secondary,
+    Detached,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfigSecondary {
+    pub warm: bool,
+}
+
+/// An alternative representation of `pageserver::tenant::LocationConf`,
+/// for use in external-facing APIs.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct LocationConfig {
+    pub mode: LocationConfigMode,
+    /// If attaching, in what generation?
+    #[serde(default)]
+    pub generation: Option<Generation>,
+    #[serde(default)]
+    pub secondary_conf: Option<LocationConfigSecondary>,
+
+    // If requesting mode `Secondary`, configuration for that.
+    // Custom storage configuration for the tenant, if any
+    pub tenant_conf: TenantConfig,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
@@ -253,6 +289,16 @@ pub struct StatusResponse {
    pub id: NodeId,
 }

+#[serde_as]
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct TenantLocationConfigRequest {
+    #[serde_as(as = "DisplayFromStr")]
+    pub tenant_id: TenantId,
+    #[serde(flatten)]
+    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -442,10 +442,20 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
            trace!("got message {:?}", msg);

            let result = self.process_message(handler, msg, &mut query_string).await;
-            self.flush().await?;
+            tokio::select!(
+                biased;
+                _ = shutdown_watcher() => {
+                    // We were requested to shut down.
+                    tracing::info!("shutdown request received during response flush");
+                    return Ok(())
+                },
+                flush_r = self.flush() => {
+                    flush_r?;
+                }
+            );
+
            match result? {
                ProcessMsgResult::Continue => {
-                    self.flush().await?;
                    continue;
                }
                ProcessMsgResult::Break => break,
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -12,7 +12,7 @@ log.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 postgres_ffi.workspace = true
-tempfile.workspace = true
+camino-tempfile.workspace = true

 workspace_hack.workspace = true

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -1,4 +1,5 @@
 use anyhow::{bail, ensure};
+use camino_tempfile::{tempdir, Utf8TempDir};
 use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
@@ -8,7 +9,6 @@ use std::cmp::Ordering;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
-use tempfile::{tempdir, TempDir};

 macro_rules! xlog_utils_test {
    ($version:ident) => {
@@ -33,7 +33,7 @@ pub struct Conf {

 pub struct PostgresServer {
    process: std::process::Child,
-    _unix_socket_dir: TempDir,
+    _unix_socket_dir: Utf8TempDir,
    client_config: postgres::Config,
 }

--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,6 +13,7 @@ aws-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
+camino.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 serde.workspace = true
 serde_json.workspace = true
@@ -27,6 +28,6 @@ pin-project-lite.workspace = true
 workspace_hack.workspace = true

 [dev-dependencies]
-tempfile.workspace = true
+camino-tempfile.workspace = true
 test-context.workspace = true
 rand.workspace = true
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -13,23 +13,19 @@ use std::{
    collections::HashMap,
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
-    path::{Path, PathBuf},
    pin::Pin,
    sync::Arc,
 };

 use anyhow::{bail, Context};
+use camino::{Utf8Path, Utf8PathBuf};

 use serde::{Deserialize, Serialize};
 use tokio::io;
 use toml_edit::Item;
 use tracing::info;

-pub use self::{
-    local_fs::LocalFs,
-    s3_bucket::S3Bucket,
-    simulate_failures::{SimulatedError, UnreliableWrapper},
-};
+pub use self::{local_fs::LocalFs, s3_bucket::S3Bucket, simulate_failures::UnreliableWrapper};

 /// How many different timelines can be processed simultaneously when synchronizing layers with the remote storage.
 /// During regular work, pageserver produces one layer file per timeline checkpoint, with bursts of concurrency
@@ -56,7 +52,7 @@ const REMOTE_STORAGE_PREFIX_SEPARATOR: char = '/';
 /// The prefix is an implementation detail, that allows representing local paths
 /// as the remote ones, stripping the local storage prefix away.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
-pub struct RemotePath(PathBuf);
+pub struct RemotePath(Utf8PathBuf);

 impl Serialize for RemotePath {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
@@ -73,18 +69,18 @@ impl<'de> Deserialize<'de> for RemotePath {
        D: serde::Deserializer<'de>,
    {
        let str = String::deserialize(deserializer)?;
-        Ok(Self(PathBuf::from(&str)))
+        Ok(Self(Utf8PathBuf::from(&str)))
    }
 }

 impl std::fmt::Display for RemotePath {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.0.display())
+        std::fmt::Display::fmt(&self.0, f)
    }
 }

 impl RemotePath {
-    pub fn new(relative_path: &Path) -> anyhow::Result<Self> {
+    pub fn new(relative_path: &Utf8Path) -> anyhow::Result<Self> {
        anyhow::ensure!(
            relative_path.is_relative(),
            "Path {relative_path:?} is not relative"
@@ -93,30 +89,30 @@ impl RemotePath {
    }

    pub fn from_string(relative_path: &str) -> anyhow::Result<Self> {
-        Self::new(Path::new(relative_path))
+        Self::new(Utf8Path::new(relative_path))
    }

-    pub fn with_base(&self, base_path: &Path) -> PathBuf {
+    pub fn with_base(&self, base_path: &Utf8Path) -> Utf8PathBuf {
        base_path.join(&self.0)
    }

    pub fn object_name(&self) -> Option<&str> {
-        self.0.file_name().and_then(|os_str| os_str.to_str())
+        self.0.file_name()
    }

-    pub fn join(&self, segment: &Path) -> Self {
+    pub fn join(&self, segment: &Utf8Path) -> Self {
        Self(self.0.join(segment))
    }

-    pub fn get_path(&self) -> &PathBuf {
+    pub fn get_path(&self) -> &Utf8PathBuf {
        &self.0
    }

    pub fn extension(&self) -> Option<&str> {
-        self.0.extension()?.to_str()
+        self.0.extension()
    }

-    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Path, std::path::StripPrefixError> {
+    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
 }
@@ -194,44 +190,26 @@ impl Debug for Download {
 #[derive(Debug)]
 pub enum DownloadError {
    /// Validation or other error happened due to user input.
-    ///
-    /// This is only used by LOCAL_FS.
    BadInput(anyhow::Error),
-
    /// The file was not found in the remote storage.
-    ///
-    /// This can only happen during download, never during delete.
    NotFound,
-
-    /// The file was found in the remote storage, but the operation failed.
-    ///
-    /// The error should have context already describing the real failed operation.
+    /// The file was found in the remote storage, but the download failed.
    Other(anyhow::Error),
 }

 impl std::fmt::Display for DownloadError {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use DownloadError::*;
         match self {
-            NotFound => write!(f, "No file found for the remote object id given"),
-            // this is same as thiserror error(transparent); it handles {} and {:#}
-            Other(e) | BadInput(e) => std::fmt::Display::fmt(e, f),
+            DownloadError::BadInput(e) => {
+                write!(f, "Failed to download a remote file due to user input: {e}")
+            }
+            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
+            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:#}"),
         }
     }
 }

-impl std::error::Error for DownloadError {
-    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
-        use DownloadError::*;
-        match self {
-            NotFound => None,
-            Other(_) | BadInput(_) => {
-                // TODO: these are anyhow, cannot return here
-                None
-            }
-        }
-    }
-}
+impl std::error::Error for DownloadError {}

 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
@@ -333,7 +311,7 @@ impl GenericRemoteStorage {
    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
        Ok(match &storage_config.storage {
            RemoteStorageKind::LocalFs(root) => {
-                info!("Using fs root '{}' as a remote storage", root.display());
+                info!("Using fs root '{root}' as a remote storage");
                Self::LocalFs(LocalFs::new(root.clone())?)
            }
            RemoteStorageKind::AwsS3(s3_config) => {
@@ -401,7 +379,7 @@ pub struct RemoteStorageConfig {
 pub enum RemoteStorageKind {
    /// Storage based on local file system.
    /// Specify a root folder to place all stored files into.
-    LocalFs(PathBuf),
+    LocalFs(Utf8PathBuf),
    /// AWS S3 based storage, storing all files in the S3 bucket
    /// specified by the config
    AwsS3(S3Config),
@@ -496,7 +474,7 @@ impl RemoteStorageConfig {
                concurrency_limit,
                max_keys_per_list_response,
            }),
-            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(PathBuf::from(
+            (Some(local_path), None, None) => RemoteStorageKind::LocalFs(Utf8PathBuf::from(
                parse_toml_string("local_path", local_path)?,
            )),
            (Some(_), Some(_), _) => bail!("local_path and bucket_name are mutually exclusive"),
@@ -541,23 +519,23 @@ mod tests {

    #[test]
    fn test_object_name() {
-        let k = RemotePath::new(Path::new("a/b/c")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

-        let k = RemotePath::new(Path::new("a/b/c/")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/b/c/")).unwrap();
        assert_eq!(k.object_name(), Some("c"));

-        let k = RemotePath::new(Path::new("a/")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("a/")).unwrap();
        assert_eq!(k.object_name(), Some("a"));

        // XXX is it impossible to have an empty key?
-        let k = RemotePath::new(Path::new("")).unwrap();
+        let k = RemotePath::new(Utf8Path::new("")).unwrap();
        assert_eq!(k.object_name(), None);
    }

    #[test]
    fn rempte_path_cannot_be_created_from_absolute_ones() {
-        let err = RemotePath::new(Path::new("/")).expect_err("Should fail on absolute paths");
+        let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
        assert_eq!(err.to_string(), "Path \"/\" is not relative");
    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -4,15 +4,10 @@
 //! This storage used in tests, but can also be used in cases when a certain persistent
 //! volume is mounted to the local FS.

-use std::{
-    borrow::Cow,
-    future::Future,
-    io::ErrorKind,
-    path::{Path, PathBuf},
-    pin::Pin,
-};
+use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

 use anyhow::{bail, ensure, Context};
+use camino::{Utf8Path, Utf8PathBuf};
 use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
@@ -28,20 +23,20 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";

 #[derive(Debug, Clone)]
 pub struct LocalFs {
-    storage_root: PathBuf,
+    storage_root: Utf8PathBuf,
 }

 impl LocalFs {
    /// Attempts to create local FS storage, along with its root directory.
    /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
-    pub fn new(mut storage_root: PathBuf) -> anyhow::Result<Self> {
+    pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
        if !storage_root.exists() {
            std::fs::create_dir_all(&storage_root).with_context(|| {
                format!("Failed to create all directories in the given root path {storage_root:?}")
            })?;
        }
        if !storage_root.is_absolute() {
-            storage_root = storage_root.canonicalize().with_context(|| {
+            storage_root = storage_root.canonicalize_utf8().with_context(|| {
                format!("Failed to represent path {storage_root:?} as an absolute path")
            })?;
        }
@@ -50,7 +45,7 @@ impl LocalFs {
    }

    // mirrors S3Bucket::s3_object_to_relative_path
-    fn local_file_to_relative_path(&self, key: PathBuf) -> RemotePath {
+    fn local_file_to_relative_path(&self, key: Utf8PathBuf) -> RemotePath {
        let relative_path = key
            .strip_prefix(&self.storage_root)
            .expect("relative path must contain storage_root as prefix");
@@ -59,22 +54,18 @@ impl LocalFs {

    async fn read_storage_metadata(
        &self,
-        file_path: &Path,
+        file_path: &Utf8Path,
    ) -> anyhow::Result<Option<StorageMetadata>> {
        let metadata_path = storage_metadata_path(file_path);
        if metadata_path.exists() && metadata_path.is_file() {
            let metadata_string = fs::read_to_string(&metadata_path).await.with_context(|| {
-                format!(
-                    "Failed to read metadata from the local storage at '{}'",
-                    metadata_path.display()
-                )
+                format!("Failed to read metadata from the local storage at '{metadata_path}'")
            })?;

            serde_json::from_str(&metadata_string)
                .with_context(|| {
                    format!(
-                        "Failed to deserialize metadata from the local storage at '{}'",
-                        metadata_path.display()
+                        "Failed to deserialize metadata from the local storage at '{metadata_path}'",
                    )
                })
                .map(|metadata| Some(StorageMetadata(metadata)))
@@ -171,25 +162,21 @@ impl RemoteStorage for LocalFs {
            }
        }

-        // Note that PathBuf starts_with only considers full path segments, but
+        // Note that Utf8PathBuf starts_with only considers full path segments, but
        // object prefixes are arbitrary strings, so we need the strings for doing
        // starts_with later.
-        let prefix = full_path.to_string_lossy();
+        let prefix = full_path.as_str();

        let mut files = vec![];
-        let mut directory_queue = vec![initial_dir.clone()];
+        let mut directory_queue = vec![initial_dir];
        while let Some(cur_folder) = directory_queue.pop() {
-            let mut entries = fs::read_dir(cur_folder.clone()).await?;
-            while let Some(entry) = entries.next_entry().await? {
-                let file_name: PathBuf = entry.file_name().into();
-                let full_file_name = cur_folder.clone().join(&file_name);
-                if full_file_name
-                    .to_str()
-                    .map(|s| s.starts_with(prefix.as_ref()))
-                    .unwrap_or(false)
-                {
+            for entry in cur_folder.read_dir_utf8()? {
+                let entry = entry?; // propagate I/O errors rather than truncating the listing
+                let file_name = entry.file_name();
+                let full_file_name = cur_folder.join(file_name);
+                if full_file_name.as_str().starts_with(prefix) {
                    let file_remote_path = self.local_file_to_relative_path(full_file_name.clone());
-                    files.push(file_remote_path.clone());
+                    files.push(file_remote_path);
                    if full_file_name.is_dir() {
                        directory_queue.push(full_file_name);
                    }
@@ -230,10 +217,7 @@ impl RemoteStorage for LocalFs {
                .open(&temp_file_path)
                .await
                .with_context(|| {
-                    format!(
-                        "Failed to open target fs destination at '{}'",
-                        target_file_path.display()
-                    )
+                    format!("Failed to open target fs destination at '{target_file_path}'")
                })?,
        );

@@ -244,8 +228,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to upload file (write temp) to the local storage at '{}'",
-                    temp_file_path.display()
+                    "Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
                )
            })?;

@@ -262,8 +245,7 @@ impl RemoteStorage for LocalFs {

        destination.flush().await.with_context(|| {
            format!(
-                "Failed to upload (flush temp) file to the local storage at '{}'",
-                temp_file_path.display()
+                "Failed to upload (flush temp) file to the local storage at '{temp_file_path}'",
            )
        })?;

@@ -271,8 +253,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to upload (rename) file to the local storage at '{}'",
-                    target_file_path.display()
+                    "Failed to upload (rename) file to the local storage at '{target_file_path}'",
                )
            })?;

@@ -286,8 +267,7 @@ impl RemoteStorage for LocalFs {
            .await
            .with_context(|| {
                format!(
-                    "Failed to write metadata to the local storage at '{}'",
-                    storage_metadata_path.display()
+                    "Failed to write metadata to the local storage at '{storage_metadata_path}'",
                )
            })?;
        }
@@ -393,16 +373,16 @@ impl RemoteStorage for LocalFs {
    }
 }

-fn storage_metadata_path(original_path: &Path) -> PathBuf {
+fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }

 fn get_all_files<'a, P>(
    directory_path: P,
    recursive: bool,
-) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<PathBuf>>> + Send + Sync + 'a>>
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
 where
-    P: AsRef<Path> + Send + Sync + 'a,
+    P: AsRef<Utf8Path> + Send + Sync + 'a,
 {
    Box::pin(async move {
        let directory_path = directory_path.as_ref();
@@ -412,7 +392,13 @@ where
                let mut dir_contents = fs::read_dir(directory_path).await?;
                while let Some(dir_entry) = dir_contents.next_entry().await? {
                    let file_type = dir_entry.file_type().await?;
-                    let entry_path = dir_entry.path();
+                    let entry_path =
+                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                            anyhow::Error::msg(format!(
+                                "non-Unicode path: {}",
+                                pb.to_string_lossy()
+                            ))
+                        })?;
                    if file_type.is_symlink() {
                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
@@ -435,13 +421,10 @@ where
    })
 }

-async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()> {
+async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
-        None => bail!(
-            "File path '{}' has no parent directory",
-            target_file_path.display()
-        ),
+        None => bail!("File path '{target_file_path}' has no parent directory"),
    };
    if !target_dir.exists() {
        fs::create_dir_all(target_dir).await?;
@@ -449,13 +432,9 @@ async fn create_target_directory(target_file_path: &Path) -> anyhow::Result<()>
    Ok(())
 }

-fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
+fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
    if file_path.exists() {
-        ensure!(
-            file_path.is_file(),
-            "file path '{}' is not a file",
-            file_path.display()
-        );
+        ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
        Ok(true)
    } else {
        Ok(false)
@@ -466,13 +445,13 @@ fn file_exists(file_path: &Path) -> anyhow::Result<bool> {
 mod fs_tests {
    use super::*;

+    use camino_tempfile::tempdir;
    use std::{collections::HashMap, io::Write};
-    use tempfile::tempdir;

    async fn read_and_assert_remote_file_contents(
        storage: &LocalFs,
        #[allow(clippy::ptr_arg)]
-        // have to use &PathBuf due to `storage.local_path` parameter requirements
+        // have to use &Utf8PathBuf due to `storage.local_path` parameter requirements
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
@@ -519,7 +498,7 @@ mod fs_tests {
    async fn upload_file_negatives() -> anyhow::Result<()> {
        let storage = create_storage()?;

-        let id = RemotePath::new(Path::new("dummy"))?;
+        let id = RemotePath::new(Utf8Path::new("dummy"))?;
        let content = std::io::Cursor::new(b"12345");

        // Check that you get an error if the size parameter doesn't match the actual
@@ -544,7 +523,8 @@ mod fs_tests {
    }

    fn create_storage() -> anyhow::Result<LocalFs> {
-        LocalFs::new(tempdir()?.path().to_owned())
+        let storage_root = tempdir()?.path().to_path_buf();
+        LocalFs::new(storage_root)
    }

    #[tokio::test]
@@ -561,7 +541,7 @@ mod fs_tests {
        );

        let non_existing_path = "somewhere/else";
-        match storage.download(&RemotePath::new(Path::new(non_existing_path))?).await {
+        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
        }
@@ -775,7 +755,7 @@ mod fs_tests {
    }

    async fn create_file_for_upload(
-        path: &Path,
+        path: &Utf8Path,
        contents: &str,
    ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -4,7 +4,7 @@
 //! allowing multiple api users to independently work with the same S3 bucket, if
 //! their bucket prefixes are both specified and different.

-use std::sync::Arc;
+use std::{borrow::Cow, sync::Arc};

 use anyhow::Context;
 use aws_config::{
@@ -47,10 +47,47 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
+    concurrency_limiter: ConcurrencyLimiter,
+}
+
+struct ConcurrencyLimiter {
    // Every request to S3 can be throttled or cancelled, if a certain number of requests per second is exceeded.
    // Same goes to IAM, which is queried before every S3 request, if enabled. IAM has even lower RPS threshold.
    // The helps to ensure we don't exceed the thresholds.
-    concurrency_limiter: Arc<Semaphore>,
+    write: Arc<Semaphore>,
+    read: Arc<Semaphore>,
+}
+
+impl ConcurrencyLimiter {
+    fn for_kind(&self, kind: RequestKind) -> &Arc<Semaphore> {
+        match kind {
+            RequestKind::Get => &self.read,
+            RequestKind::Put => &self.write,
+            RequestKind::List => &self.read,
+            RequestKind::Delete => &self.write,
+        }
+    }
+
+    async fn acquire(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, tokio::sync::AcquireError> {
+        self.for_kind(kind).acquire().await
+    }
+
+    async fn acquire_owned(
+        &self,
+        kind: RequestKind,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, tokio::sync::AcquireError> {
+        Arc::clone(self.for_kind(kind)).acquire_owned().await
+    }
+
+    fn new(limit: usize) -> ConcurrencyLimiter {
+        Self {
+            read: Arc::new(Semaphore::new(limit)),
+            write: Arc::new(Semaphore::new(limit)),
+        }
+    }
 }

 #[derive(Default)]
@@ -117,7 +154,7 @@ impl S3Bucket {
            bucket_name: aws_config.bucket_name.clone(),
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
-            concurrency_limiter: Arc::new(Semaphore::new(aws_config.concurrency_limit.get())),
+            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
        })
    }

@@ -143,12 +180,11 @@ impl S3Bucket {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
        let path_string = path
            .get_path()
-            .to_string_lossy()
-            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR)
-            .to_string();
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
-            Some(prefix) => prefix.clone() + "/" + &path_string,
-            None => path_string,
+            Some(prefix) => prefix.clone() + "/" + path_string,
+            None => path_string.to_string(),
        }
    }

@@ -156,7 +192,7 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .acquire()
+            .acquire(kind)
            .await
            .expect("semaphore is never closed");

@@ -172,8 +208,7 @@ impl S3Bucket {
        let started_at = start_counting_cancelled_wait(kind);
        let permit = self
            .concurrency_limiter
-            .clone()
-            .acquire_owned()
+            .acquire_owned(kind)
            .await
            .expect("semaphore is never closed");

@@ -345,7 +380,7 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .context("list S3 prefixes")
+                .context("Failed to list S3 prefixes")
                .map_err(DownloadError::Other);

            let started_at = ScopeGuard::into_inner(started_at);
@@ -397,7 +432,7 @@ impl RemoteStorage for S3Bucket {
                .set_max_keys(self.max_keys_per_list_response)
                .send()
                .await
-                .context("list files in S3 bucket");
+                .context("Failed to list files in S3 bucket");

            let started_at = ScopeGuard::into_inner(started_at);
            metrics::BUCKET_METRICS
@@ -521,7 +556,24 @@ impl RemoteStorage for S3Bucket {
                        .deleted_objects_total
                        .inc_by(chunk.len() as u64);
                    if let Some(errors) = resp.errors {
-                        return Err(anyhow::anyhow!("delete {} objects", errors.len()));
+                        // Log a bounded number of the errors within the response:
+                        // these requests can carry 1000 keys so logging each one
+                        // would be too verbose, especially as errors may lead us
+                        // to retry repeatedly.
+                        const LOG_UP_TO_N_ERRORS: usize = 10;
+                        for e in errors.iter().take(LOG_UP_TO_N_ERRORS) {
+                            tracing::warn!(
+                                "DeleteObjects key {} failed: {}: {}",
+                                e.key.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.code.as_ref().map(Cow::from).unwrap_or("".into()),
+                                e.message.as_ref().map(Cow::from).unwrap_or("".into())
+                            );
+                        }
+
+                        return Err(anyhow::format_err!(
+                            "Failed to delete {} objects",
+                            errors.len()
+                        ));
                    }
                }
                Err(e) => {
@@ -562,8 +614,8 @@ fn start_measuring_requests(

 #[cfg(test)]
 mod tests {
+    use camino::Utf8Path;
    use std::num::NonZeroUsize;
-    use std::path::Path;

    use crate::{RemotePath, S3Bucket, S3Config};

@@ -572,7 +624,7 @@ mod tests {
        let all_paths = ["", "some/path", "some/path/"];
        let all_paths: Vec<RemotePath> = all_paths
            .iter()
-            .map(|x| RemotePath::new(Path::new(x)).expect("bad path"))
+            .map(|x| RemotePath::new(Utf8Path::new(x)).expect("bad path"))
            .collect();
        let prefixes = [
            None,
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -18,7 +18,7 @@ pub struct UnreliableWrapper {
 }

 /// Used to identify retries of different unique operation.
-#[derive(Hash, Eq, PartialEq)]
+#[derive(Debug, Hash, Eq, PartialEq)]
 enum RemoteOp {
    ListPrefixes(Option<RemotePath>),
    Upload(RemotePath),
@@ -27,22 +27,6 @@ enum RemoteOp {
    DeleteObjects(Vec<RemotePath>),
 }

-impl std::fmt::Debug for RemoteOp {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        use RemoteOp::*;
-        match self {
-            ListPrefixes(arg0) => f.debug_tuple("ListPrefixes").field(arg0).finish(),
-            Upload(arg0) => f.debug_tuple("Upload").field(arg0).finish(),
-            Download(arg0) => f.debug_tuple("Download").field(arg0).finish(),
-            Delete(arg0) => f.debug_tuple("Delete").field(arg0).finish(),
-            DeleteObjects(many) if many.len() > 3 => {
-                write!(f, "DeleteObjects({} paths)", many.len())
-            }
-            DeleteObjects(few) => f.debug_tuple("DeleteObjects").field(few).finish(),
-        }
-    }
-}
-
 impl UnreliableWrapper {
    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
        assert!(attempts_to_fail > 0);
@@ -75,12 +59,13 @@ impl UnreliableWrapper {
                    e.remove();
                    Ok(attempts_before_this)
                } else {
-                    let error = anyhow::anyhow!(SimulatedError::from(e.key()));
+                    let error =
+                        anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                    Err(DownloadError::Other(error))
                }
            }
            Entry::Vacant(e) => {
-                let error = anyhow::anyhow!(SimulatedError::from(e.key()));
+                let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                e.insert(1);
                Err(DownloadError::Other(error))
            }
@@ -95,26 +80,6 @@ impl UnreliableWrapper {
    }
 }

-/// `pub` type for checking if this is the root cause around logging.
-///
-/// This is just a string to avoid cloning a huge number of paths a second time.
-#[derive(Debug)]
-pub struct SimulatedError(String);
-
-impl<'a> From<&'a RemoteOp> for SimulatedError {
-    fn from(value: &'_ RemoteOp) -> Self {
-        SimulatedError(format!("simulated failure of remote operation {:?}", value))
-    }
-}
-
-impl std::fmt::Display for SimulatedError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.write_str(&self.0)
-    }
-}
-
-impl std::error::Error for SimulatedError {}
-
 #[async_trait::async_trait]
 impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -2,11 +2,12 @@ use std::collections::HashSet;
 use std::env;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::ops::ControlFlow;
-use std::path::{Path, PathBuf};
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
+use camino::Utf8Path;
 use once_cell::sync::OnceCell;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -55,7 +56,7 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> any
    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

-    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
        .list_prefixes(None)
@@ -108,7 +109,7 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
    };
    let test_client = Arc::clone(&ctx.enabled.client);
    let base_prefix =
-        RemotePath::new(Path::new("folder1")).context("common_prefix construction")?;
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
        .list_files(None)
        .await
@@ -129,9 +130,9 @@ async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> any
    let trim_remote_blobs: HashSet<_> = ctx
        .remote_blobs
        .iter()
-        .map(|x| x.get_path().to_str().expect("must be valid name"))
+        .map(|x| x.get_path())
        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(Path::new(x)).expect("must be valid name"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
@@ -148,10 +149,9 @@ async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result
        MaybeEnabledS3::Disabled => return Ok(()),
    };

-    let path = RemotePath::new(&PathBuf::from(format!(
-        "{}/for_sure_there_is_nothing_there_really",
-        ctx.base_prefix,
-    )))
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
    .with_context(|| "RemotePath conversion")?;

    ctx.client.delete(&path).await.expect("should succeed");
@@ -167,13 +167,13 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
        MaybeEnabledS3::Disabled => return Ok(()),
    };

-    let path1 = RemotePath::new(&PathBuf::from(format!("{}/path1", ctx.base_prefix,)))
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let path2 = RemotePath::new(&PathBuf::from(format!("{}/path2", ctx.base_prefix,)))
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

-    let path3 = RemotePath::new(&PathBuf::from(format!("{}/path3", ctx.base_prefix,)))
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

    let data1 = "remote blob data1".as_bytes();
@@ -427,10 +427,10 @@ async fn upload_s3_data(
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
-            let prefix = PathBuf::from(format!("{base_prefix_str}/sub_prefix_{i}/"));
-            let blob_prefix = RemotePath::new(&prefix)
+            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
+            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
                .with_context(|| format!("{prefix:?} to RemotePath conversion"))?;
-            let blob_path = blob_prefix.join(Path::new(&format!("blob_{i}")));
+            let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
            debug!("Creating remote item {i} at path {blob_path:?}");

            let data = format!("remote blob data {i}").into_bytes();
@@ -512,8 +512,10 @@ async fn upload_simple_s3_data(
        let task_client = Arc::clone(client);
        upload_tasks.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
-            let blob_path = RemotePath::new(&blob_path)
-                .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
+            let blob_path = RemotePath::new(
+                Utf8Path::from_path(blob_path.as_path()).expect("must be valid blob path"),
+            )
+            .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
            debug!("Creating remote item {i} at path {blob_path:?}");

            let data = format!("remote blob data {i}").into_bytes();
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -10,6 +10,7 @@ async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
 bytes.workspace = true
+camino.workspace = true
 chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
@@ -53,7 +54,7 @@ byteorder.workspace = true
 bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
-tempfile.workspace = true
+camino-tempfile.workspace = true

 [[bench]]
 name = "benchmarks"
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -2,9 +2,9 @@

 use serde;
 use std::fs;
-use std::path::Path;

 use anyhow::Result;
+use camino::Utf8Path;
 use jsonwebtoken::{
    decode, encode, Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation,
 };
@@ -65,7 +65,7 @@ impl JwtAuth {
        }
    }

-    pub fn from_key_path(key_path: &Path) -> Result<Self> {
+    pub fn from_key_path(key_path: &Utf8Path) -> Result<Self> {
        let public_key = fs::read(key_path)?;
        Ok(Self::new(DecodingKey::from_ed_pem(&public_key)?))
    }
--- a/libs/utils/src/crashsafe.rs
+++ b/libs/utils/src/crashsafe.rs
@@ -1,14 +1,14 @@
 use std::{
    borrow::Cow,
-    ffi::OsStr,
    fs::{self, File},
    io,
-    path::{Path, PathBuf},
 };

+use camino::{Utf8Path, Utf8PathBuf};
+
 /// Similar to [`std::fs::create_dir`], except we fsync the
 /// created directory and its parent.
-pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {
+pub fn create_dir(path: impl AsRef<Utf8Path>) -> io::Result<()> {
    let path = path.as_ref();

    fs::create_dir(path)?;
@@ -18,7 +18,7 @@ pub fn create_dir(path: impl AsRef<Path>) -> io::Result<()> {

 /// Similar to [`std::fs::create_dir_all`], except we fsync all
 /// newly created directories and the pre-existing parent.
-pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
+pub fn create_dir_all(path: impl AsRef<Utf8Path>) -> io::Result<()> {
    let mut path = path.as_ref();

    let mut dirs_to_create = Vec::new();
@@ -30,7 +30,7 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
            Ok(_) => {
                return Err(io::Error::new(
                    io::ErrorKind::AlreadyExists,
-                    format!("non-directory found in path: {}", path.display()),
+                    format!("non-directory found in path: {path}"),
                ));
            }
            Err(ref e) if e.kind() == io::ErrorKind::NotFound => {}
@@ -44,7 +44,7 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {
            None => {
                return Err(io::Error::new(
                    io::ErrorKind::InvalidInput,
-                    format!("can't find parent of path '{}'", path.display()).as_str(),
+                    format!("can't find parent of path '{path}'"),
                ));
            }
        }
@@ -70,21 +70,18 @@ pub fn create_dir_all(path: impl AsRef<Path>) -> io::Result<()> {

 /// Adds a suffix to the file(directory) name, either appending the suffix to the end of its extension,
 /// or if there's no extension, creates one and puts a suffix there.
-pub fn path_with_suffix_extension(original_path: impl AsRef<Path>, suffix: &str) -> PathBuf {
-    let new_extension = match original_path
-        .as_ref()
-        .extension()
-        .map(OsStr::to_string_lossy)
-    {
+pub fn path_with_suffix_extension(
+    original_path: impl AsRef<Utf8Path>,
+    suffix: &str,
+) -> Utf8PathBuf {
+    let new_extension = match original_path.as_ref().extension() {
        Some(extension) => Cow::Owned(format!("{extension}.{suffix}")),
        None => Cow::Borrowed(suffix),
    };
-    original_path
-        .as_ref()
-        .with_extension(new_extension.as_ref())
+    original_path.as_ref().with_extension(new_extension)
 }

-pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
+pub fn fsync_file_and_parent(file_path: &Utf8Path) -> io::Result<()> {
    let parent = file_path.parent().ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::Other,
@@ -97,7 +94,7 @@ pub fn fsync_file_and_parent(file_path: &Path) -> io::Result<()> {
    Ok(())
 }

-pub fn fsync(path: &Path) -> io::Result<()> {
+pub fn fsync(path: &Utf8Path) -> io::Result<()> {
    File::open(path)
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to open the file {path:?}: {e}")))
        .and_then(|file| {
@@ -111,19 +108,18 @@ pub fn fsync(path: &Path) -> io::Result<()> {
        .map_err(|e| io::Error::new(e.kind(), format!("Failed to fsync file {path:?}: {e}")))
 }

-pub async fn fsync_async(path: impl AsRef<std::path::Path>) -> Result<(), std::io::Error> {
-    tokio::fs::File::open(path).await?.sync_all().await
+pub async fn fsync_async(path: impl AsRef<Utf8Path>) -> Result<(), std::io::Error> {
+    tokio::fs::File::open(path.as_ref()).await?.sync_all().await
 }

 #[cfg(test)]
 mod tests {
-    use tempfile::tempdir;

    use super::*;

    #[test]
    fn test_create_dir_fsyncd() {
-        let dir = tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();

        let existing_dir_path = dir.path();
        let err = create_dir(existing_dir_path).unwrap_err();
@@ -139,7 +135,7 @@ mod tests {

    #[test]
    fn test_create_dir_all_fsyncd() {
-        let dir = tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();

        let existing_dir_path = dir.path();
        create_dir_all(existing_dir_path).unwrap();
@@ -166,29 +162,29 @@ mod tests {

    #[test]
    fn test_path_with_suffix_extension() {
-        let p = PathBuf::from("/foo/bar");
+        let p = Utf8PathBuf::from("/foo/bar");
        assert_eq!(
-            &path_with_suffix_extension(p, "temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp").to_string(),
            "/foo/bar.temp"
        );
-        let p = PathBuf::from("/foo/bar");
+        let p = Utf8PathBuf::from("/foo/bar");
        assert_eq!(
-            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string(),
            "/foo/bar.temp.temp"
        );
-        let p = PathBuf::from("/foo/bar.baz");
+        let p = Utf8PathBuf::from("/foo/bar.baz");
        assert_eq!(
-            &path_with_suffix_extension(p, "temp.temp").to_string_lossy(),
+            &path_with_suffix_extension(p, "temp.temp").to_string(),
            "/foo/bar.baz.temp.temp"
        );
-        let p = PathBuf::from("/foo/bar.baz");
+        let p = Utf8PathBuf::from("/foo/bar.baz");
        assert_eq!(
-            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string(),
            "/foo/bar.baz..temp"
        );
-        let p = PathBuf::from("/foo/bar/dir/");
+        let p = Utf8PathBuf::from("/foo/bar/dir/");
        assert_eq!(
-            &path_with_suffix_extension(p, ".temp").to_string_lossy(),
+            &path_with_suffix_extension(p, ".temp").to_string(),
            "/foo/bar/dir..temp"
        );
    }
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -55,8 +55,6 @@ where

 #[cfg(test)]
 mod test {
-    use std::path::PathBuf;
-
    use crate::fs_ext::{is_directory_empty, list_dir};

    use super::ignore_absent_files;
@@ -65,7 +63,7 @@ mod test {
    fn is_empty_dir() {
        use super::PathExt;

-        let dir = tempfile::tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        // test positive case
@@ -75,7 +73,7 @@ mod test {
        );

        // invoke on a file to ensure it returns an error
-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir_path.join("testfile");
        let f = std::fs::File::create(&file_path).unwrap();
        drop(f);
        assert!(file_path.is_empty_dir().is_err());
@@ -87,7 +85,7 @@ mod test {

    #[tokio::test]
    async fn is_empty_dir_async() {
-        let dir = tempfile::tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        // test positive case
@@ -97,7 +95,7 @@ mod test {
        );

        // invoke on a file to ensure it returns an error
-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir_path.join("testfile");
        let f = std::fs::File::create(&file_path).unwrap();
        drop(f);
        assert!(is_directory_empty(&file_path).await.is_err());
@@ -109,10 +107,9 @@ mod test {

    #[test]
    fn ignore_absent_files_works() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
+        let dir = camino_tempfile::tempdir().unwrap();

-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir.path().join("testfile");

        ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally");

@@ -126,17 +123,17 @@ mod test {

    #[tokio::test]
    async fn list_dir_works() {
-        let dir = tempfile::tempdir().unwrap();
+        let dir = camino_tempfile::tempdir().unwrap();
        let dir_path = dir.path();

        assert!(list_dir(dir_path).await.unwrap().is_empty());

-        let file_path: PathBuf = dir_path.join("testfile");
+        let file_path = dir_path.join("testfile");
        let _ = std::fs::File::create(&file_path).unwrap();

        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);

-        let another_dir_path: PathBuf = dir_path.join("testdir");
+        let another_dir_path = dir_path.join("testdir");
        std::fs::create_dir(another_dir_path).unwrap();

        let expected = &["testdir", "testfile"];
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -1,8 +1,9 @@
 use hyper::{header, Body, Response, StatusCode};
 use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
 use std::error::Error as StdError;
 use thiserror::Error;
-use tracing::error;
+use tracing::{error, info};

 #[derive(Debug, Error)]
 pub enum ApiError {
@@ -24,6 +25,9 @@ pub enum ApiError {
    #[error("Precondition failed: {0}")]
    PreconditionFailed(Box<str>),

+    #[error("Resource temporarily unavailable: {0}")]
+    ResourceUnavailable(Cow<'static, str>),
+
    #[error("Shutting down")]
    ShuttingDown,

@@ -59,6 +63,10 @@ impl ApiError {
                "Shutting down".to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
+            ApiError::ResourceUnavailable(err) => HttpErrorBody::response_from_msg_and_status(
+                err.to_string(),
+                StatusCode::SERVICE_UNAVAILABLE,
+            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
@@ -108,10 +116,12 @@ pub async fn route_error_handler(err: routerify::RouteError) -> Response<Body> {

 pub fn api_error_handler(api_error: ApiError) -> Response<Body> {
    // Print a stack trace for Internal Server errors
-    if let ApiError::InternalServerError(_) = api_error {
-        error!("Error processing HTTP request: {api_error:?}");
-    } else {
-        error!("Error processing HTTP request: {api_error:#}");
+
+    match api_error {
+        ApiError::ResourceUnavailable(_) => info!("Error processing HTTP request: {api_error:#}"),
+        ApiError::NotFound(_) => info!("Error processing HTTP request: {api_error:#}"),
+        ApiError::InternalServerError(_) => error!("Error processing HTTP request: {api_error:?}"),
+        _ => error!("Error processing HTTP request: {api_error:#}"),
    }

    api_error.into_response()
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,4 +1,3 @@
-use std::ffi::OsStr;
 use std::{fmt, str::FromStr};

 use anyhow::Context;
@@ -215,12 +214,11 @@ pub struct TimelineId(Id);

 id_newtype!(TimelineId);

-impl TryFrom<Option<&OsStr>> for TimelineId {
+impl TryFrom<Option<&str>> for TimelineId {
    type Error = anyhow::Error;

-    fn try_from(value: Option<&OsStr>) -> Result<Self, Self::Error> {
+    fn try_from(value: Option<&str>) -> Result<Self, Self::Error> {
        value
-            .and_then(OsStr::to_str)
            .unwrap_or_default()
            .parse::<TimelineId>()
            .with_context(|| format!("Could not parse timeline id from {:?}", value))
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -11,10 +11,10 @@ use std::{
    io::{Read, Write},
    ops::Deref,
    os::unix::prelude::AsRawFd,
-    path::{Path, PathBuf},
 };

 use anyhow::Context;
+use camino::{Utf8Path, Utf8PathBuf};
 use nix::{errno::Errno::EAGAIN, fcntl};

 use crate::crashsafe;
@@ -23,7 +23,7 @@ use crate::crashsafe;
 /// Returned by [`create_exclusive`].
 #[must_use]
 pub struct UnwrittenLockFile {
-    path: PathBuf,
+    path: Utf8PathBuf,
    file: fs::File,
 }

@@ -60,7 +60,7 @@ impl UnwrittenLockFile {
 ///
 /// It is not an error if the file already exists.
 /// It is an error if the file is already locked.
-pub fn create_exclusive(lock_file_path: &Path) -> anyhow::Result<UnwrittenLockFile> {
+pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
        .write(true)
@@ -101,7 +101,7 @@ pub enum LockFileRead {
 /// Open & try to lock the lock file at the given `path`, returning a [handle][`LockFileRead`] to
 /// inspect its content. It is not an `Err(...)` if the file does not exist or is already locked.
 /// Check the [`LockFileRead`] variants for details.
-pub fn read_and_hold_lock_file(path: &Path) -> anyhow::Result<LockFileRead> {
+pub fn read_and_hold_lock_file(path: &Utf8Path) -> anyhow::Result<LockFileRead> {
    let res = fs::OpenOptions::new().read(true).open(path);
    let mut lock_file = match res {
        Ok(f) => f,
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -216,6 +216,30 @@ impl std::fmt::Debug for PrettyLocation<'_, '_> {
    }
 }

+/// When you need to store a secret but want to make sure it won't
+/// be accidentally logged, wrap it in a SecretString, whose Debug
+/// implementation does not expose the contents.
+#[derive(Clone, Eq, PartialEq)]
+pub struct SecretString(String);
+
+impl SecretString {
+    pub fn get_contents(&self) -> &str {
+        self.0.as_str()
+    }
+}
+
+impl From<String> for SecretString {
+    fn from(s: String) -> Self {
+        Self(s)
+    }
+}
+
+impl std::fmt::Debug for SecretString {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "[SECRET]")
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use metrics::{core::Opts, IntCounterVec};
--- a/libs/utils/src/lsn.rs
+++ b/libs/utils/src/lsn.rs
@@ -1,9 +1,9 @@
 #![warn(missing_docs)]

+use camino::Utf8Path;
 use serde::{Deserialize, Serialize};
 use std::fmt;
 use std::ops::{Add, AddAssign};
-use std::path::Path;
 use std::str::FromStr;
 use std::sync::atomic::{AtomicU64, Ordering};

@@ -44,11 +44,9 @@ impl Lsn {
    /// Parse an LSN from a filename in the form `0000000000000000`
    pub fn from_filename<F>(filename: F) -> Result<Self, LsnParseError>
    where
-        F: AsRef<Path>,
+        F: AsRef<Utf8Path>,
    {
-        let filename: &Path = filename.as_ref();
-        let filename = filename.to_str().ok_or(LsnParseError)?;
-        Lsn::from_hex(filename)
+        Lsn::from_hex(filename.as_ref().as_str())
    }

    /// Parse an LSN from a string in the form `0000000000000000`
--- a/libs/utils/src/pid_file.rs
+++ b/libs/utils/src/pid_file.rs
@@ -49,9 +49,10 @@
 //! At this point, `B` and `C` are running, which is hazardous.
 //! Morale of the story: don't unlink pidfiles, ever.

-use std::{ops::Deref, path::Path};
+use std::ops::Deref;

 use anyhow::Context;
+use camino::Utf8Path;
 use nix::unistd::Pid;

 use crate::lock_file::{self, LockFileRead};
@@ -84,7 +85,7 @@ impl Deref for PidFileGuard {
 /// The claim ends as soon as the returned guard object is dropped.
 /// To maintain the claim for the remaining lifetime of the current process,
 /// use [`std::mem::forget`] or similar.
-pub fn claim_for_current_process(path: &Path) -> anyhow::Result<PidFileGuard> {
+pub fn claim_for_current_process(path: &Utf8Path) -> anyhow::Result<PidFileGuard> {
    let unwritten_lock_file = lock_file::create_exclusive(path).context("lock file")?;
    // if any of the next steps fail, we drop the file descriptor and thereby release the lock
    let guard = unwritten_lock_file
@@ -132,7 +133,7 @@ pub enum PidFileRead {
 ///
 /// On success, this function returns a [`PidFileRead`].
 /// Check its docs for a description of the meaning of its different variants.
-pub fn read(pidfile: &Path) -> anyhow::Result<PidFileRead> {
+pub fn read(pidfile: &Utf8Path) -> anyhow::Result<PidFileRead> {
    let res = lock_file::read_and_hold_lock_file(pidfile).context("read and hold pid file")?;
    let ret = match res {
        LockFileRead::NotExist => PidFileRead::NotExist,
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -58,7 +58,7 @@ where
 // to get that.
 impl<T: Ord> PartialOrd for Waiter<T> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        other.wake_num.partial_cmp(&self.wake_num)
+        Some(self.cmp(other))
    }
 }

--- a/libs/vm_monitor/src/cgroup.rs
+++ b/libs/vm_monitor/src/cgroup.rs
@@ -431,14 +431,14 @@ impl CgroupWatcher {
                            .context("failed to request upscale")?;

                        let memory_high =
-                            self.get_high_bytes().context("failed to get memory.high")?;
+                            self.get_memory_high_bytes().context("failed to get memory.high")?;
                        let new_high = memory_high + self.config.memory_high_increase_by_bytes;
                        info!(
                            current_high_bytes = memory_high,
                            new_high_bytes = new_high,
                            "updating memory.high"
                        );
-                        self.set_high_bytes(new_high)
+                        self.set_memory_high_bytes(new_high)
                            .context("failed to set memory.high")?;
                        last_memory_high_increase_at = Some(Instant::now());
                        continue;
@@ -556,14 +556,6 @@ impl CgroupWatcher {
    }
 }

-/// Represents a set of limits we apply to a cgroup to control memory usage.
-///
-/// Setting these values also affects the thresholds for receiving usage alerts.
-#[derive(Debug)]
-pub struct MemoryLimits {
-    pub high: u64,
-}
-
 // Methods for manipulating the actual cgroup
 impl CgroupWatcher {
    /// Get a handle on the freezer subsystem.
@@ -624,50 +616,29 @@ impl CgroupWatcher {
    }

    /// Set cgroup memory.high threshold.
-    pub fn set_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+    pub fn set_memory_high_bytes(&self, bytes: u64) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64))
+    }
+
+    /// Set the cgroup's memory.high to 'max', disabling it.
+    pub fn unset_memory_high(&self) -> anyhow::Result<()> {
+        self.set_memory_high_internal(MaxValue::Max)
+    }
+
+    fn set_memory_high_internal(&self, value: MaxValue) -> anyhow::Result<()> {
        self.memory()
            .context("failed to get memory subsystem")?
            .set_mem(cgroups_rs::memory::SetMemory {
                low: None,
-                high: Some(MaxValue::Value(u64::min(bytes, i64::MAX as u64) as i64)),
+                high: Some(value),
                min: None,
                max: None,
            })
-            .context("failed to set memory.high")
-    }
-
-    /// Set cgroup memory.high and memory.max.
-    pub fn set_limits(&self, limits: &MemoryLimits) -> anyhow::Result<()> {
-        info!(limits.high, path = self.path(), "writing new memory limits",);
-        self.memory()
-            .context("failed to get memory subsystem while setting memory limits")?
-            .set_mem(cgroups_rs::memory::SetMemory {
-                min: None,
-                low: None,
-                high: Some(MaxValue::Value(
-                    u64::min(limits.high, i64::MAX as u64) as i64
-                )),
-                max: None,
-            })
-            .context("failed to set memory limits")
-    }
-
-    /// Given some amount of available memory, set the desired cgroup memory limits
-    pub fn set_memory_limits(&mut self, available_memory: u64) -> anyhow::Result<()> {
-        let new_high = self.config.calculate_memory_high_value(available_memory);
-        let limits = MemoryLimits { high: new_high };
-        info!(
-            path = self.path(),
-            memory = ?limits,
-            "setting cgroup memory",
-        );
-        self.set_limits(&limits)
-            .context("failed to set cgroup memory limits")?;
-        Ok(())
+            .map_err(anyhow::Error::from)
    }

    /// Get memory.high threshold.
-    pub fn get_high_bytes(&self) -> anyhow::Result<u64> {
+    pub fn get_memory_high_bytes(&self) -> anyhow::Result<u64> {
        let high = self
            .memory()
            .context("failed to get memory subsystem while getting memory statistics")?
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -4,9 +4,9 @@
 //! This is the "Monitor" part of the monitor binary and is the main entrypoint for
 //! all functionality.

+use std::fmt::Debug;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
-use std::{fmt::Debug, mem};

 use anyhow::{bail, Context};
 use axum::extract::ws::{Message, WebSocket};
@@ -16,7 +16,7 @@ use tokio::sync::mpsc;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info, warn};

-use crate::cgroup::{CgroupWatcher, MemoryLimits, Sequenced};
+use crate::cgroup::{CgroupWatcher, Sequenced};
 use crate::dispatcher::Dispatcher;
 use crate::filecache::{FileCacheConfig, FileCacheState};
 use crate::protocol::{InboundMsg, InboundMsgKind, OutboundMsg, OutboundMsgKind, Resources};
@@ -106,6 +106,43 @@ impl Runner {
            kill,
        };

+        // If we have both the cgroup and file cache integrations enabled, it's possible for
+        // temporary failures to result in cgroup throttling (from memory.high), that in turn makes
+        // it near-impossible to connect to the file cache (because it times out). Unfortunately,
+        // we *do* still want to determine the file cache size before setting the cgroup's
+        // memory.high, so it's not as simple as just swapping the order.
+        //
+        // Instead, the resolution here is that on vm-monitor startup (note: happens on each
+        // connection from autoscaler-agent, possibly multiple times per compute_ctl lifecycle), we
+        // temporarily unset memory.high, to allow any existing throttling to dissipate. It's a bit
+        // of a hacky solution, but helps with reliability.
+        if let Some(name) = &args.cgroup {
+            // Best not to set up cgroup stuff more than once, so we'll initialize cgroup state
+            // now, and then set limits later.
+            info!("initializing cgroup");
+
+            let (cgroup, cgroup_event_stream) = CgroupWatcher::new(name.clone(), requesting_send)
+                .context("failed to create cgroup manager")?;
+
+            info!("temporarily unsetting memory.high");
+
+            // Temporarily un-set cgroup memory.high; see above.
+            cgroup
+                .unset_memory_high()
+                .context("failed to unset memory.high")?;
+
+            let cgroup = Arc::new(cgroup);
+
+            let cgroup_clone = Arc::clone(&cgroup);
+            spawn_with_cancel(
+                token.clone(),
+                |_| error!("cgroup watcher terminated"),
+                async move { cgroup_clone.watch(notified_recv, cgroup_event_stream).await },
+            );
+
+            state.cgroup = Some(cgroup);
+        }
+
        let mut file_cache_reserved_bytes = 0;
        let mem = get_total_system_memory();

@@ -119,7 +156,7 @@ impl Runner {
                false => FileCacheConfig::default_in_memory(),
            };

-            let mut file_cache = FileCacheState::new(connstr, config, token.clone())
+            let mut file_cache = FileCacheState::new(connstr, config, token)
                .await
                .context("failed to create file cache")?;

@@ -152,35 +189,15 @@ impl Runner {
            state.filecache = Some(file_cache);
        }

-        if let Some(name) = &args.cgroup {
-            let (mut cgroup, cgroup_event_stream) =
-                CgroupWatcher::new(name.clone(), requesting_send)
-                    .context("failed to create cgroup manager")?;
-
+        if let Some(cgroup) = &state.cgroup {
            let available = mem - file_cache_reserved_bytes;
+            let value = cgroup.config.calculate_memory_high_value(available);
+
+            info!(value, "setting memory.high");

            cgroup
-                .set_memory_limits(available)
-                .context("failed to set cgroup memory limits")?;
-
-            let cgroup = Arc::new(cgroup);
-
-            // Some might call this . . . cgroup v2
-            let cgroup_clone = Arc::clone(&cgroup);
-
-            spawn_with_cancel(token, |_| error!("cgroup watcher terminated"), async move {
-                cgroup_clone.watch(notified_recv, cgroup_event_stream).await
-            });
-
-            state.cgroup = Some(cgroup);
-        } else {
-            // *NOTE*: We need to forget the sender so that its drop impl does not get ran.
-            // This allows us to poll it in `Monitor::run` regardless of whether we
-            // are managing a cgroup or not. If we don't forget it, all receives will
-            // immediately return an error because the sender is droped and it will
-            // claim all select! statements, effectively turning `Monitor::run` into
-            // `loop { fail to receive }`.
-            mem::forget(requesting_send);
+                .set_memory_high_bytes(value)
+                .context("failed to set cgroup memory.high")?;
        }

        Ok(state)
@@ -257,14 +274,11 @@ impl Runner {
                new_cgroup_mem_high = cgroup.config.calculate_memory_high_value(available_memory);
            }

-            let limits = MemoryLimits {
-                // new_cgroup_mem_high is initialized to 0 but it is guarancontextd to not be here
-                // since it is properly initialized in the previous cgroup if let block
-                high: new_cgroup_mem_high,
-            };
+            // new_cgroup_mem_high is initialized to 0, but it is guaranteed to have been overwritten
+            // by this point, since it is assigned in the previous cgroup `if let` block
            cgroup
-                .set_limits(&limits)
-                .context("failed to set cgroup memory limits")?;
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;

            let message = format!(
                "set cgroup memory.high to {} MiB, of new max {} MiB",
@@ -327,12 +341,9 @@ impl Runner {
                name = cgroup.path(),
                "updating cgroup memory.high",
            );
-            let limits = MemoryLimits {
-                high: new_cgroup_mem_high,
-            };
            cgroup
-                .set_limits(&limits)
-                .context("failed to set file cache size")?;
+                .set_memory_high_bytes(new_cgroup_mem_high)
+                .context("failed to set cgroup memory.high")?;
        }

        Ok(())
@@ -398,7 +409,7 @@ impl Runner {
                    }
                }
                // we need to propagate an upscale request
-                request = self.dispatcher.request_upscale_events.recv() => {
+                request = self.dispatcher.request_upscale_events.recv(), if self.cgroup.is_some() => {
                    if request.is_none() {
                        bail!("failed to listen for upscale event from cgroup")
                    }
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -17,6 +17,8 @@ async-stream.workspace = true
 async-trait.workspace = true
 byteorder.workspace = true
 bytes.workspace = true
+camino.workspace = true
+camino-tempfile.workspace = true
 chrono = { workspace = true, features = ["serde"] }
 clap = { workspace = true, features = ["string"] }
 close_fds.workspace = true
@@ -80,7 +82,6 @@ enum-map.workspace = true
 enumset.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
-tempfile.workspace = true

 [dev-dependencies]
 criterion.workspace = true
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -25,7 +25,7 @@ fn redo_scenarios(c: &mut Criterion) {
    // input to the stderr.
    // utils::logging::init(utils::logging::LogFormat::Plain).unwrap();

-    let repo_dir = tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();
+    let repo_dir = camino_tempfile::tempdir_in(env!("CARGO_TARGET_TMPDIR")).unwrap();

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -9,6 +9,7 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 bytes.workspace = true
+camino.workspace = true
 clap = { workspace = true, features = ["string"] }
 git-version.workspace = true
 pageserver = { path = ".." }
--- a/pageserver/ctl/src/layer_map_analyzer.rs
+++ b/pageserver/ctl/src/layer_map_analyzer.rs
@@ -3,13 +3,14 @@
 //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data.

 use anyhow::Result;
+use camino::{Utf8Path, Utf8PathBuf};
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 use std::ops::Range;
-use std::{fs, path::Path, str};
+use std::{fs, str};

 use pageserver::page_cache::PAGE_SZ;
 use pageserver::repository::{Key, KEY_SIZE};
@@ -98,7 +99,7 @@ pub(crate) fn parse_filename(name: &str) -> Option<LayerFile> {
 }

 // Finds the max_holes largest holes, ignoring any that are smaller than MIN_HOLE_LENGTH"
-async fn get_holes(path: &Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
+async fn get_holes(path: &Utf8Path, max_holes: usize, ctx: &RequestContext) -> Result<Vec<Hole>> {
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
    let summary_blk = file.read_blk(0, ctx).await?;
    let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -167,7 +168,9 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
                    parse_filename(&layer.file_name().into_string().unwrap())
                {
                    if layer_file.is_delta {
-                        layer_file.holes = get_holes(&layer.path(), max_holes, &ctx).await?;
+                        let layer_path =
+                            Utf8PathBuf::from_path_buf(layer.path()).expect("non-Unicode path");
+                        layer_file.holes = get_holes(&layer_path, max_holes, &ctx).await?;
                        n_deltas += 1;
                    }
                    layers.push(layer_file);
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -1,6 +1,7 @@
 use std::path::{Path, PathBuf};

 use anyhow::Result;
+use camino::Utf8Path;
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
@@ -47,7 +48,7 @@ pub(crate) enum LayerCmd {
 }

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
-    let path = path.as_ref();
+    let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
    virtual_file::init(10);
    page_cache::init(100);
    let file = FileBlockReader::new(VirtualFile::open(path).await?);
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -8,6 +8,7 @@ mod draw_timeline_dir;
 mod layer_map_analyzer;
 mod layers;

+use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
 use layers::LayerCmd;
 use pageserver::{
@@ -18,7 +19,6 @@ use pageserver::{
    virtual_file,
 };
 use postgres_ffi::ControlFileData;
-use std::path::{Path, PathBuf};
 use utils::{lsn::Lsn, project_git_version};

 project_git_version!(GIT_VERSION);
@@ -49,7 +49,7 @@ enum Commands {
 #[derive(Parser)]
 struct MetadataCmd {
    /// Input metadata file path
-    metadata_path: PathBuf,
+    metadata_path: Utf8PathBuf,
    /// Replace disk consistent Lsn
    disk_consistent_lsn: Option<Lsn>,
    /// Replace previous record Lsn
@@ -61,13 +61,13 @@ struct MetadataCmd {
 #[derive(Parser)]
 struct PrintLayerFileCmd {
    /// Pageserver data path
-    path: PathBuf,
+    path: Utf8PathBuf,
 }

 #[derive(Parser)]
 struct AnalyzeLayerMapCmd {
    /// Pageserver data path
-    path: PathBuf,
+    path: Utf8PathBuf,
    /// Max holes
    max_holes: Option<usize>,
 }
@@ -102,7 +102,7 @@ async fn main() -> anyhow::Result<()> {
    Ok(())
 }

-fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
+fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> {
    let control_file = ControlFileData::decode(&std::fs::read(control_file_path)?)?;
    println!("{control_file:?}");
    let control_file_initdb = Lsn(control_file.checkPoint);
@@ -114,7 +114,7 @@ fn read_pg_control_file(control_file_path: &Path) -> anyhow::Result<()> {
    Ok(())
 }

-async fn print_layerfile(path: &Path) -> anyhow::Result<()> {
+async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
    // Basic initialization of things that don't change after startup
    virtual_file::init(10);
    page_cache::init(100);
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -2,9 +2,10 @@

 use std::env::{var, VarError};
 use std::sync::Arc;
-use std::{env, ops::ControlFlow, path::Path, str::FromStr};
+use std::{env, ops::ControlFlow, str::FromStr};

 use anyhow::{anyhow, Context};
+use camino::Utf8Path;
 use clap::{Arg, ArgAction, Command};

 use metrics::launch_timestamp::{set_launch_timestamp_metric, LaunchTimestamp};
@@ -65,21 +66,17 @@ fn main() -> anyhow::Result<()> {

    let workdir = arg_matches
        .get_one::<String>("workdir")
-        .map(Path::new)
-        .unwrap_or_else(|| Path::new(".neon"));
+        .map(Utf8Path::new)
+        .unwrap_or_else(|| Utf8Path::new(".neon"));
    let workdir = workdir
-        .canonicalize()
-        .with_context(|| format!("Error opening workdir '{}'", workdir.display()))?;
+        .canonicalize_utf8()
+        .with_context(|| format!("Error opening workdir '{workdir}'"))?;

    let cfg_file_path = workdir.join("pageserver.toml");

    // Set CWD to workdir for non-daemon modes
-    env::set_current_dir(&workdir).with_context(|| {
-        format!(
-            "Failed to set application's current dir to '{}'",
-            workdir.display()
-        )
-    })?;
+    env::set_current_dir(&workdir)
+        .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;

    let conf = match initialize_config(&cfg_file_path, arg_matches, &workdir)? {
        ControlFlow::Continue(conf) => conf,
@@ -115,12 +112,8 @@ fn main() -> anyhow::Result<()> {

    let tenants_path = conf.tenants_path();
    if !tenants_path.exists() {
-        utils::crashsafe::create_dir_all(conf.tenants_path()).with_context(|| {
-            format!(
-                "Failed to create tenants root dir at '{}'",
-                tenants_path.display()
-            )
-        })?;
+        utils::crashsafe::create_dir_all(conf.tenants_path())
+            .with_context(|| format!("Failed to create tenants root dir at '{tenants_path}'"))?;
    }

    // Initialize up failpoints support
@@ -137,9 +130,9 @@ fn main() -> anyhow::Result<()> {
 }

 fn initialize_config(
-    cfg_file_path: &Path,
+    cfg_file_path: &Utf8Path,
    arg_matches: clap::ArgMatches,
-    workdir: &Path,
+    workdir: &Utf8Path,
 ) -> anyhow::Result<ControlFlow<(), &'static PageServerConf>> {
    let init = arg_matches.get_flag("init");
    let update_config = init || arg_matches.get_flag("update-config");
@@ -147,33 +140,22 @@ fn initialize_config(
    let (mut toml, config_file_exists) = if cfg_file_path.is_file() {
        if init {
            anyhow::bail!(
-                "Config file '{}' already exists, cannot init it, use --update-config to update it",
-                cfg_file_path.display()
+                "Config file '{cfg_file_path}' already exists, cannot init it, use --update-config to update it",
            );
        }
        // Supplement the CLI arguments with the config file
-        let cfg_file_contents = std::fs::read_to_string(cfg_file_path).with_context(|| {
-            format!(
-                "Failed to read pageserver config at '{}'",
-                cfg_file_path.display()
-            )
-        })?;
+        let cfg_file_contents = std::fs::read_to_string(cfg_file_path)
+            .with_context(|| format!("Failed to read pageserver config at '{cfg_file_path}'"))?;
        (
            cfg_file_contents
                .parse::<toml_edit::Document>()
                .with_context(|| {
-                    format!(
-                        "Failed to parse '{}' as pageserver config",
-                        cfg_file_path.display()
-                    )
+                    format!("Failed to parse '{cfg_file_path}' as pageserver config")
                })?,
            true,
        )
    } else if cfg_file_path.exists() {
-        anyhow::bail!(
-            "Config file '{}' exists but is not a regular file",
-            cfg_file_path.display()
-        );
+        anyhow::bail!("Config file '{cfg_file_path}' exists but is not a regular file");
    } else {
        // We're initializing the tenant, so there's no config file yet
        (
@@ -192,7 +174,7 @@ fn initialize_config(

            for (key, item) in doc.iter() {
                if config_file_exists && update_config && key == "id" && toml.contains_key(key) {
-                    anyhow::bail!("Pageserver config file exists at '{}' and has node id already, it cannot be overridden", cfg_file_path.display());
+                    anyhow::bail!("Pageserver config file exists at '{cfg_file_path}' and has node id already, it cannot be overridden");
                }
                toml.insert(key, item.clone());
            }
@@ -204,18 +186,11 @@ fn initialize_config(
        .context("Failed to parse pageserver configuration")?;

    if update_config {
-        info!("Writing pageserver config to '{}'", cfg_file_path.display());
+        info!("Writing pageserver config to '{cfg_file_path}'");

-        std::fs::write(cfg_file_path, toml.to_string()).with_context(|| {
-            format!(
-                "Failed to write pageserver config to '{}'",
-                cfg_file_path.display()
-            )
-        })?;
-        info!(
-            "Config successfully written to '{}'",
-            cfg_file_path.display()
-        )
+        std::fs::write(cfg_file_path, toml.to_string())
+            .with_context(|| format!("Failed to write pageserver config to '{cfg_file_path}'"))?;
+        info!("Config successfully written to '{cfg_file_path}'")
    }

    Ok(if init {
@@ -644,7 +619,7 @@ fn create_remote_storage_client(
    let config = if let Some(config) = &conf.remote_storage_config {
        config
    } else {
-        // No remote storage configured.
+        tracing::warn!("no remote storage configured, this is a deprecated configuration");
        return Ok(None);
    };

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -11,17 +11,18 @@ use std::env;
 use storage_broker::Uri;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::ConnectionId;
+use utils::logging::SecretString;

 use once_cell::sync::OnceCell;
 use reqwest::Url;
 use std::num::NonZeroUsize;
-use std::path::{Path, PathBuf};
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 use toml_edit;
 use toml_edit::{Document, Item};

+use camino::{Utf8Path, Utf8PathBuf};
 use postgres_backend::AuthType;
 use utils::{
    id::{NodeId, TenantId, TimelineId},
@@ -36,8 +37,8 @@ use crate::tenant::{
    TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
-    TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
+    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 pub mod defaults {
@@ -152,9 +153,9 @@ pub struct PageServerConf {
    // that during unit testing, because the current directory is global
    // to the process but different unit tests work on different
    // repositories.
-    pub workdir: PathBuf,
+    pub workdir: Utf8PathBuf,

-    pub pg_distrib_dir: PathBuf,
+    pub pg_distrib_dir: Utf8PathBuf,

    // Authentication
    /// authentication method for the HTTP mgmt API
@@ -163,7 +164,7 @@ pub struct PageServerConf {
    pub pg_auth_type: AuthType,
    /// Path to a file containing public key for verifying JWT tokens.
    /// Used for both mgmt and compute auth, if enabled.
-    pub auth_validation_public_key_path: Option<PathBuf>,
+    pub auth_validation_public_key_path: Option<Utf8PathBuf>,

    pub remote_storage_config: Option<RemoteStorageConfig>,

@@ -207,6 +208,13 @@ pub struct PageServerConf {
    pub background_task_maximum_delay: Duration,

    pub control_plane_api: Option<Url>,
+
+    /// JWT token for use with the control plane API.
+    pub control_plane_api_token: Option<SecretString>,
+
+    /// If true, pageserver will make a best-effort attempt to operate without a control plane:
+    /// only for use in major incidents.
+    pub control_plane_emergency_mode: bool,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -249,15 +257,15 @@ struct PageServerConfigBuilder {
    page_cache_size: BuilderValue<usize>,
    max_file_descriptors: BuilderValue<usize>,

-    workdir: BuilderValue<PathBuf>,
+    workdir: BuilderValue<Utf8PathBuf>,

-    pg_distrib_dir: BuilderValue<PathBuf>,
+    pg_distrib_dir: BuilderValue<Utf8PathBuf>,

    http_auth_type: BuilderValue<AuthType>,
    pg_auth_type: BuilderValue<AuthType>,

    //
-    auth_validation_public_key_path: BuilderValue<Option<PathBuf>>,
+    auth_validation_public_key_path: BuilderValue<Option<Utf8PathBuf>>,
    remote_storage_config: BuilderValue<Option<RemoteStorageConfig>>,

    id: BuilderValue<NodeId>,
@@ -283,6 +291,8 @@ struct PageServerConfigBuilder {
    background_task_maximum_delay: BuilderValue<Duration>,

    control_plane_api: BuilderValue<Option<Url>>,
+    control_plane_api_token: BuilderValue<Option<SecretString>>,
+    control_plane_emergency_mode: BuilderValue<bool>,
 }

 impl Default for PageServerConfigBuilder {
@@ -300,10 +310,12 @@ impl Default for PageServerConfigBuilder {
            superuser: Set(DEFAULT_SUPERUSER.to_string()),
            page_cache_size: Set(DEFAULT_PAGE_CACHE_SIZE),
            max_file_descriptors: Set(DEFAULT_MAX_FILE_DESCRIPTORS),
-            workdir: Set(PathBuf::new()),
-            pg_distrib_dir: Set(env::current_dir()
-                .expect("cannot access current directory")
-                .join("pg_install")),
+            workdir: Set(Utf8PathBuf::new()),
+            pg_distrib_dir: Set(Utf8PathBuf::from_path_buf(
+                env::current_dir().expect("cannot access current directory"),
+            )
+            .expect("non-Unicode path")
+            .join("pg_install")),
            http_auth_type: Set(AuthType::Trust),
            pg_auth_type: Set(AuthType::Trust),
            auth_validation_public_key_path: Set(None),
@@ -347,6 +359,8 @@ impl Default for PageServerConfigBuilder {
            .unwrap()),

            control_plane_api: Set(None),
+            control_plane_api_token: Set(None),
+            control_plane_emergency_mode: Set(false),
        }
    }
 }
@@ -384,11 +398,11 @@ impl PageServerConfigBuilder {
        self.max_file_descriptors = BuilderValue::Set(max_file_descriptors)
    }

-    pub fn workdir(&mut self, workdir: PathBuf) {
+    pub fn workdir(&mut self, workdir: Utf8PathBuf) {
        self.workdir = BuilderValue::Set(workdir)
    }

-    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: PathBuf) {
+    pub fn pg_distrib_dir(&mut self, pg_distrib_dir: Utf8PathBuf) {
        self.pg_distrib_dir = BuilderValue::Set(pg_distrib_dir)
    }

@@ -402,7 +416,7 @@ impl PageServerConfigBuilder {

    pub fn auth_validation_public_key_path(
        &mut self,
-        auth_validation_public_key_path: Option<PathBuf>,
+        auth_validation_public_key_path: Option<Utf8PathBuf>,
    ) {
        self.auth_validation_public_key_path = BuilderValue::Set(auth_validation_public_key_path)
    }
@@ -479,6 +493,14 @@ impl PageServerConfigBuilder {
        self.control_plane_api = BuilderValue::Set(api)
    }

+    pub fn control_plane_api_token(&mut self, token: Option<SecretString>) {
+        self.control_plane_api_token = BuilderValue::Set(token)
+    }
+
+    pub fn control_plane_emergency_mode(&mut self, enabled: bool) {
+        self.control_plane_emergency_mode = BuilderValue::Set(enabled)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_size_logical_size_queries = self
            .concurrent_tenant_size_logical_size_queries
@@ -567,6 +589,12 @@ impl PageServerConfigBuilder {
            control_plane_api: self
                .control_plane_api
                .ok_or(anyhow!("missing control_plane_api"))?,
+            control_plane_api_token: self
+                .control_plane_api_token
+                .ok_or(anyhow!("missing control_plane_api_token"))?,
+            control_plane_emergency_mode: self
+                .control_plane_emergency_mode
+                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
        })
    }
 }
@@ -576,15 +604,15 @@ impl PageServerConf {
    // Repository paths, relative to workdir.
    //

-    pub fn tenants_path(&self) -> PathBuf {
+    pub fn tenants_path(&self) -> Utf8PathBuf {
        self.workdir.join(TENANTS_SEGMENT_NAME)
    }

-    pub fn deletion_prefix(&self) -> PathBuf {
+    pub fn deletion_prefix(&self) -> Utf8PathBuf {
        self.workdir.join("deletion")
    }

-    pub fn deletion_list_path(&self, sequence: u64) -> PathBuf {
+    pub fn deletion_list_path(&self, sequence: u64) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
        const VERSION: u8 = 1;
@@ -593,7 +621,7 @@ impl PageServerConf {
            .join(format!("{sequence:016x}-{VERSION:02x}.list"))
    }

-    pub fn deletion_header_path(&self) -> PathBuf {
+    pub fn deletion_header_path(&self) -> Utf8PathBuf {
        // Encode a version in the filename, so that if we ever switch away from JSON we can
        // increment this.
        const VERSION: u8 = 1;
@@ -601,30 +629,38 @@ impl PageServerConf {
        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
    }

-    pub fn tenant_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenants_path().join(tenant_id.to_string())
    }

-    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_attaching_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id)
            .join(TENANT_ATTACHING_MARKER_FILENAME)
    }

-    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
    }

    /// Points to a place in pageserver's local directory,
    /// where certain tenant's tenantconf file should be located.
-    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> PathBuf {
+    ///
+    /// Legacy: superseded by tenant_location_config_path.  Eventually
+    /// remove this function.
+    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
    }

-    pub fn timelines_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
+        self.tenant_path(tenant_id)
+            .join(TENANT_LOCATION_CONFIG_NAME)
+    }
+
+    pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
    }

-    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
+    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
        self.timelines_path(tenant_id).join(timeline_id.to_string())
    }

@@ -632,7 +668,7 @@ impl PageServerConf {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        path_with_suffix_extension(
            self.timeline_path(&tenant_id, &timeline_id),
            TIMELINE_UNINIT_MARK_SUFFIX,
@@ -643,19 +679,19 @@ impl PageServerConf {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        path_with_suffix_extension(
            self.timeline_path(&tenant_id, &timeline_id),
            TIMELINE_DELETE_MARK_SUFFIX,
        )
    }

-    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
+    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
        self.tenant_path(tenant_id)
            .join(TENANT_DELETED_MARKER_FILE_NAME)
    }

-    pub fn traces_path(&self) -> PathBuf {
+    pub fn traces_path(&self) -> Utf8PathBuf {
        self.workdir.join("traces")
    }

@@ -664,7 +700,7 @@ impl PageServerConf {
        tenant_id: &TenantId,
        timeline_id: &TimelineId,
        connection_id: &ConnectionId,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        self.traces_path()
            .join(tenant_id.to_string())
            .join(timeline_id.to_string())
@@ -673,20 +709,20 @@ impl PageServerConf {

    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> PathBuf {
+    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
        self.timeline_path(tenant_id, timeline_id)
            .join(METADATA_FILE_NAME)
    }

    /// Turns storage remote path of a file into its local path.
-    pub fn local_path(&self, remote_path: &RemotePath) -> PathBuf {
+    pub fn local_path(&self, remote_path: &RemotePath) -> Utf8PathBuf {
        remote_path.with_base(&self.workdir)
    }

    //
    // Postgres distribution paths
    //
-    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_distrib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        let path = self.pg_distrib_dir.clone();

        #[allow(clippy::manual_range_patterns)]
@@ -696,10 +732,10 @@ impl PageServerConf {
        }
    }

-    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("bin"))
    }
-    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<PathBuf> {
+    pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result<Utf8PathBuf> {
        Ok(self.pg_distrib_dir(pg_version)?.join("lib"))
    }

@@ -707,7 +743,7 @@ impl PageServerConf {
    /// validating the input and failing on errors.
    ///
    /// This leaves any options not present in the file in the built-in defaults.
-    pub fn parse_and_validate(toml: &Document, workdir: &Path) -> anyhow::Result<Self> {
+    pub fn parse_and_validate(toml: &Document, workdir: &Utf8Path) -> anyhow::Result<Self> {
        let mut builder = PageServerConfigBuilder::default();
        builder.workdir(workdir.to_owned());

@@ -726,10 +762,10 @@ impl PageServerConf {
                    builder.max_file_descriptors(parse_toml_u64(key, item)? as usize)
                }
                "pg_distrib_dir" => {
-                    builder.pg_distrib_dir(PathBuf::from(parse_toml_string(key, item)?))
+                    builder.pg_distrib_dir(Utf8PathBuf::from(parse_toml_string(key, item)?))
                }
                "auth_validation_public_key_path" => builder.auth_validation_public_key_path(Some(
-                    PathBuf::from(parse_toml_string(key, item)?),
+                    Utf8PathBuf::from(parse_toml_string(key, item)?),
                )),
                "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?),
                "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?),
@@ -776,6 +812,18 @@ impl PageServerConf {
                        builder.control_plane_api(Some(parsed.parse().context("failed to parse control plane URL")?))
                    }
                },
+                "control_plane_api_token" => {
+                    let parsed = parse_toml_string(key, item)?;
+                    if parsed.is_empty() {
+                        builder.control_plane_api_token(None)
+                    } else {
+                        builder.control_plane_api_token(Some(parsed.into()))
+                    }
+                },
+                "control_plane_emergency_mode" => {
+                    builder.control_plane_emergency_mode(parse_toml_bool(key, item)?)
+
+                },
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -789,8 +837,7 @@ impl PageServerConf {
            ensure!(
                auth_validation_public_key_path.exists(),
                format!(
-                    "Can't find auth_validation_public_key at '{}'",
-                    auth_validation_public_key_path.display()
+                    "Can't find auth_validation_public_key at '{auth_validation_public_key_path}'",
                )
            );
        }
@@ -906,12 +953,12 @@ impl PageServerConf {
    }

    #[cfg(test)]
-    pub fn test_repo_dir(test_name: &str) -> PathBuf {
-        PathBuf::from(format!("../tmp_check/test_{test_name}"))
+    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
+        Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

-    pub fn dummy_conf(repo_dir: PathBuf) -> Self {
-        let pg_distrib_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
+    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
+        let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");

        PageServerConf {
            id: NodeId(0),
@@ -945,6 +992,8 @@ impl PageServerConf {
            ondemand_download_behavior_treat_error_as_warn: false,
            background_task_maximum_delay: Duration::ZERO,
            control_plane_api: None,
+            control_plane_api_token: None,
+            control_plane_emergency_mode: false,
        }
    }
 }
@@ -1077,8 +1126,8 @@ mod tests {
        num::{NonZeroU32, NonZeroUsize},
    };

+    use camino_tempfile::{tempdir, Utf8TempDir};
    use remote_storage::{RemoteStorageKind, S3Config};
-    use tempfile::{tempdir, TempDir};
    use utils::serde_percent::Percent;

    use super::*;
@@ -1117,8 +1166,7 @@ background_task_maximum_delay = '334 s'
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;
        // we have to create dummy values to overcome the validation errors
        let config_string = format!(
-            "pg_distrib_dir='{}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
-            pg_distrib_dir.display()
+            "pg_distrib_dir='{pg_distrib_dir}'\nid=10\nbroker_endpoint = '{broker_endpoint}'",
        );
        let toml = config_string.parse()?;

@@ -1168,7 +1216,9 @@ background_task_maximum_delay = '334 s'
                background_task_maximum_delay: humantime::parse_duration(
                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
                )?,
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1183,8 +1233,7 @@ background_task_maximum_delay = '334 s'
        let broker_endpoint = storage_broker::DEFAULT_ENDPOINT;

        let config_string = format!(
-            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{}'\nbroker_endpoint = '{broker_endpoint}'",
-            pg_distrib_dir.display()
+            "{ALL_BASE_VALUES_TOML}pg_distrib_dir='{pg_distrib_dir}'\nbroker_endpoint = '{broker_endpoint}'",
        );
        let toml = config_string.parse()?;

@@ -1224,7 +1273,9 @@ background_task_maximum_delay = '334 s'
                test_remote_failures: 0,
                ondemand_download_behavior_treat_error_as_warn: false,
                background_task_maximum_delay: Duration::from_secs(334),
-                control_plane_api: None
+                control_plane_api: None,
+                control_plane_api_token: None,
+                control_plane_emergency_mode: false
            },
            "Should be able to parse all basic config values correctly"
        );
@@ -1243,23 +1294,18 @@ background_task_maximum_delay = '334 s'
        let identical_toml_declarations = &[
            format!(
                r#"[remote_storage]
-local_path = '{}'"#,
-                local_storage_path.display()
-            ),
-            format!(
-                "remote_storage={{local_path='{}'}}",
-                local_storage_path.display()
+local_path = '{local_storage_path}'"#,
            ),
+            format!("remote_storage={{local_path='{local_storage_path}'}}"),
        ];

        for remote_storage_config_str in identical_toml_declarations {
            let config_string = format!(
                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{}'
+pg_distrib_dir='{pg_distrib_dir}'
 broker_endpoint = '{broker_endpoint}'

 {remote_storage_config_str}"#,
-                pg_distrib_dir.display(),
            );

            let toml = config_string.parse()?;
@@ -1322,11 +1368,10 @@ concurrency_limit = {s3_concurrency_limit}"#
        for remote_storage_config_str in identical_toml_declarations {
            let config_string = format!(
                r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{}'
+pg_distrib_dir='{pg_distrib_dir}'
 broker_endpoint = '{broker_endpoint}'

 {remote_storage_config_str}"#,
-                pg_distrib_dir.display(),
            );

            let toml = config_string.parse()?;
@@ -1368,12 +1413,11 @@ broker_endpoint = '{broker_endpoint}'

        let config_string = format!(
            r#"{ALL_BASE_VALUES_TOML}
-pg_distrib_dir='{}'
+pg_distrib_dir='{pg_distrib_dir}'
 broker_endpoint = '{broker_endpoint}'

 [tenant_config]
 trace_read_requests = {trace_read_requests}"#,
-            pg_distrib_dir.display(),
        );

        let toml = config_string.parse()?;
@@ -1393,7 +1437,7 @@ trace_read_requests = {trace_read_requests}"#,
        let (workdir, pg_distrib_dir) = prepare_fs(&tempdir)?;

        let pageserver_conf_toml = format!(
-            r#"pg_distrib_dir = "{}"
+            r#"pg_distrib_dir = "{pg_distrib_dir}"
 metric_collection_endpoint = "http://sample.url"
 metric_collection_interval = "10min"
 id = 222
@@ -1411,7 +1455,6 @@ kind = "LayerAccessThreshold"
 period = "20m"
 threshold = "20m"
 "#,
-            pg_distrib_dir.display(),
        );
        let toml: Document = pageserver_conf_toml.parse()?;
        let conf = PageServerConf::parse_and_validate(&toml, &workdir)?;
@@ -1452,7 +1495,7 @@ threshold = "20m"
        Ok(())
    }

-    fn prepare_fs(tempdir: &TempDir) -> anyhow::Result<(PathBuf, PathBuf)> {
+    fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> {
        let tempdir_path = tempdir.path();

        let workdir = tempdir_path.join("workdir");
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -3,11 +3,11 @@
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::{mgr, LogicalSizeCalculationCause};
+use camino::Utf8PathBuf;
 use consumption_metrics::EventType;
 use pageserver_api::models::TenantState;
 use reqwest::Url;
 use std::collections::HashMap;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{Duration, SystemTime};
 use tracing::*;
@@ -41,7 +41,7 @@ pub async fn collect_metrics(
    _cached_metric_collection_interval: Duration,
    synthetic_size_calculation_interval: Duration,
    node_id: NodeId,
-    local_disk_storage: PathBuf,
+    local_disk_storage: Utf8PathBuf,
    ctx: RequestContext,
 ) -> anyhow::Result<()> {
    if _cached_metric_collection_interval != Duration::ZERO {
@@ -68,7 +68,7 @@ pub async fn collect_metrics(
        },
    );

-    let path: Arc<PathBuf> = Arc::new(local_disk_storage);
+    let path: Arc<Utf8PathBuf> = Arc::new(local_disk_storage);

    let cancel = task_mgr::shutdown_token();

@@ -153,7 +153,7 @@ pub async fn collect_metrics(
 ///
 /// Cancellation safe.
 async fn restore_and_reschedule(
-    path: &Arc<PathBuf>,
+    path: &Arc<Utf8PathBuf>,
    metric_collection_interval: Duration,
 ) -> Cache {
    let (cached, earlier_metric_at) = match disk_cache::read_metrics_from_disk(path.clone()).await {
--- a/pageserver/src/consumption_metrics/disk_cache.rs
+++ b/pageserver/src/consumption_metrics/disk_cache.rs
@@ -1,10 +1,12 @@
 use anyhow::Context;
-use std::path::PathBuf;
+use camino::{Utf8Path, Utf8PathBuf};
 use std::sync::Arc;

 use super::RawMetric;

-pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result<Vec<RawMetric>> {
+pub(super) async fn read_metrics_from_disk(
+    path: Arc<Utf8PathBuf>,
+) -> anyhow::Result<Vec<RawMetric>> {
    // do not add context to each error, callsite will log with full path
    let span = tracing::Span::current();
    tokio::task::spawn_blocking(move || {
@@ -25,10 +27,10 @@ pub(super) async fn read_metrics_from_disk(path: Arc<PathBuf>) -> anyhow::Result
    .and_then(|x| x)
 }

-fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<()> {
+fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> {
    let it = std::fs::read_dir(path.parent().expect("caller checked"))?;

-    let prefix = path.file_name().expect("caller checked").to_string_lossy();
+    let prefix = path.file_name().expect("caller checked").to_string();

    for entry in it {
        let entry = entry?;
@@ -62,7 +64,7 @@ fn scan_and_delete_with_same_prefix(path: &std::path::Path) -> std::io::Result<(

 pub(super) async fn flush_metrics_to_disk(
    current_metrics: &Arc<Vec<RawMetric>>,
-    path: &Arc<PathBuf>,
+    path: &Arc<Utf8PathBuf>,
 ) -> anyhow::Result<()> {
    use std::io::Write;

@@ -81,7 +83,7 @@ pub(super) async fn flush_metrics_to_disk(

            let parent = path.parent().expect("existence checked");
            let file_name = path.file_name().expect("existence checked");
-            let mut tempfile = tempfile::Builder::new()
+            let mut tempfile = camino_tempfile::Builder::new()
                .prefix(file_name)
                .suffix(".tmp")
                .tempfile_in(parent)?;
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -53,12 +53,16 @@ impl ControlPlaneClient {
            segs.pop_if_empty().push("");
        }

-        let client = reqwest::ClientBuilder::new()
-            .build()
-            .expect("Failed to construct http client");
+        let mut client = reqwest::ClientBuilder::new();
+
+        if let Some(jwt) = &conf.control_plane_api_token {
+            let mut headers = hyper::HeaderMap::new();
+            headers.insert("Authorization", jwt.get_contents().parse().unwrap());
+            client = client.default_headers(headers);
+        }

        Some(Self {
-            http_client: client,
+            http_client: client.build().expect("Failed to construct HTTP client"),
            base_url: url,
            node_id: conf.id,
            cancel: cancel.clone(),
@@ -129,6 +133,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
            node_id: self.node_id,
        };

+        fail::fail_point!("control-plane-client-re-attach");
+
        let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?;
        tracing::info!(
            "Received re-attach response with {} tenants",
@@ -164,6 +170,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
                .collect(),
        };

+        fail::fail_point!("control-plane-client-validate");
+
        let response: ValidateResponse = self.retry_http_forever(&re_attach_path, request).await?;

        Ok(response
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -3,7 +3,6 @@ mod list_writer;
 mod validator;

 use std::collections::HashMap;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Duration;

@@ -13,6 +12,7 @@ use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
+use camino::Utf8PathBuf;
 use hex::FromHex;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
@@ -40,7 +40,6 @@ use validator::ValidatorQueueMessage;

 use crate::{config::PageServerConf, tenant::storage_layer::LayerFileName};

-// TODO: adminstrative "panic button" config property to disable all deletions
 // TODO: configurable for how long to wait before executing deletions

 /// We aggregate object deletions from many tenants in one place, for several reasons:
@@ -154,7 +153,7 @@ impl FlushOp {

 #[derive(Clone, Debug)]
 pub struct DeletionQueueClient {
-    tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+    tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
    executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,

    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
@@ -186,7 +185,7 @@ where
    V: Serialize,
    I: AsRef<[u8]>,
 {
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
+    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));

    transformed
        .collect::<HashMap<String, &V>>()
@@ -213,7 +212,7 @@ where

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
-const TEMP_SUFFIX: &str = ".tmp";
+const TEMP_SUFFIX: &str = "tmp";

 #[serde_as]
 #[derive(Debug, Serialize, Deserialize)]
@@ -325,10 +324,7 @@ impl DeletionList {
            return false;
        }

-        let timeline_entry = tenant_entry
-            .timelines
-            .entry(*timeline)
-            .or_insert_with(Vec::new);
+        let timeline_entry = tenant_entry.timelines.entry(*timeline).or_default();

        let timeline_remote_path = remote_timeline_path(tenant, timeline);

@@ -336,7 +332,6 @@ impl DeletionList {
        timeline_entry.extend(objects.drain(..).map(|p| {
            p.strip_prefix(&timeline_remote_path)
                .expect("Timeline paths always start with the timeline prefix")
-                .to_string_lossy()
                .to_string()
        }));
        true
@@ -350,7 +345,7 @@ impl DeletionList {
                result.extend(
                    timeline_layers
                        .into_iter()
-                        .map(|l| timeline_remote_path.join(&PathBuf::from(l))),
+                        .map(|l| timeline_remote_path.join(&Utf8PathBuf::from(l))),
                );
            }
        }
@@ -421,7 +416,7 @@ pub enum DeletionQueueError {
 impl DeletionQueueClient {
    pub(crate) fn broken() -> Self {
        // Channels whose receivers are immediately dropped.
-        let (tx, _rx) = tokio::sync::mpsc::channel(1);
+        let (tx, _rx) = tokio::sync::mpsc::unbounded_channel();
        let (executor_tx, _executor_rx) = tokio::sync::mpsc::channel(1);
        Self {
            tx,
@@ -433,12 +428,12 @@ impl DeletionQueueClient {
    /// This is cancel-safe.  If you drop the future before it completes, the message
    /// is not pushed, although in the context of the deletion queue it doesn't matter: once
    /// we decide to do a deletion the decision is always final.
-    async fn do_push<T>(
+    fn do_push<T>(
        &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
        msg: T,
    ) -> Result<(), DeletionQueueError> {
-        match queue.send(msg).await {
+        match queue.send(msg) {
            Ok(_) => Ok(()),
            Err(e) => {
                // This shouldn't happen, we should shut down all tenants before
@@ -450,7 +445,7 @@ impl DeletionQueueClient {
        }
    }

-    pub(crate) async fn recover(
+    pub(crate) fn recover(
        &self,
        attached_tenants: HashMap<TenantId, Generation>,
    ) -> Result<(), DeletionQueueError> {
@@ -458,7 +453,6 @@ impl DeletionQueueClient {
            &self.tx,
            ListWriterQueueMessage::Recover(RecoverOp { attached_tenants }),
        )
-        .await
    }

    /// When a Timeline wishes to update the remote_consistent_lsn that it exposes to the outside
@@ -531,6 +525,21 @@ impl DeletionQueueClient {
            return self.flush_immediate().await;
        }

+        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+    }
+
+    /// When a Tenant has a generation, push_layers is always synchronous because
+    /// the ListWriter channel is an unbounded channel.
+    ///
+    /// This can be merged into push_layers when we remove the Generation-less mode
+    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
+    pub(crate) fn push_layers_sync(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        current_generation: Generation,
+        layers: Vec<(LayerFileName, Generation)>,
+    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
            .inc_by(layers.len() as u64);
@@ -544,17 +553,16 @@ impl DeletionQueueClient {
                objects: Vec::new(),
            }),
        )
-        .await
    }

    /// This is cancel-safe.  If you drop the future the flush may still happen in the background.
    async fn do_flush<T>(
        &self,
-        queue: &tokio::sync::mpsc::Sender<T>,
+        queue: &tokio::sync::mpsc::UnboundedSender<T>,
        msg: T,
        rx: tokio::sync::oneshot::Receiver<()>,
    ) -> Result<(), DeletionQueueError> {
-        self.do_push(queue, msg).await?;
+        self.do_push(queue, msg)?;
        if rx.await.is_err() {
            // This shouldn't happen if tenants are shut down before deletion queue.  If we
            // encounter a bug like this, then a flusher will incorrectly believe it has flushed
@@ -575,6 +583,18 @@ impl DeletionQueueClient {
            .await
    }

+    /// Fire-and-forget flush request: enqueue a flush message on the ListWriter channel and
+    /// return immediately instead of waiting for the backlog to drain.  Intended for callers
+    /// such as tenant detach, where a flush is helpful but not required.
+    ///
+    /// No guarantee is made that any work is actually carried out.
+    pub fn flush_advisory(&self) {
+        let (op, _) = FlushOp::new();
+        let request = ListWriterQueueMessage::FlushExecute(op);
+        // Best effort: a send error (e.g. channel closed during shutdown) is deliberately ignored.
+        let _ = self.tx.send(request);
+    }
+
    // Wait until all previous deletions are executed
    pub(crate) async fn flush_execute(&self) -> Result<(), DeletionQueueError> {
        debug!("flush_execute: flushing to deletion lists...");
@@ -591,9 +611,7 @@ impl DeletionQueueClient {
        // Flush any immediate-mode deletions (the above backend flush will only flush
        // the executor if deletions had flowed through the backend)
        debug!("flush_execute: flushing execution...");
-        let (flush_op, rx) = FlushOp::new();
-        self.do_flush(&self.executor_tx, DeleterMessage::Flush(flush_op), rx)
-            .await?;
+        self.flush_immediate().await?;
        debug!("flush_execute: finished flushing execution...");
        Ok(())
    }
@@ -648,8 +666,10 @@ impl DeletionQueue {
    where
        C: ControlPlaneGenerationsApi + Send + Sync,
    {
-        // Deep channel: it consumes deletions from all timelines and we do not want to block them
-        let (tx, rx) = tokio::sync::mpsc::channel(16384);
+        // Unbounded channel: enables non-async functions to submit deletions.  The actual length is
+        // constrained by how promptly the ListWriter wakes up and drains it, which should be frequent
+        // enough to avoid this taking a pathologically large amount of memory.
+        let (tx, rx) = tokio::sync::mpsc::unbounded_channel();

        // Shallow channel: it carries DeletionLists which each contain up to thousands of deletions
        let (backend_tx, backend_rx) = tokio::sync::mpsc::channel(16);
@@ -727,12 +747,9 @@ impl DeletionQueue {

 #[cfg(test)]
 mod test {
+    use camino::Utf8Path;
    use hex_literal::hex;
-    use std::{
-        io::ErrorKind,
-        path::{Path, PathBuf},
-        time::Duration,
-    };
+    use std::{io::ErrorKind, time::Duration};
    use tracing::info;

    use remote_storage::{RemoteStorageConfig, RemoteStorageKind};
@@ -764,7 +781,7 @@ mod test {

    struct TestSetup {
        harness: TenantHarness,
-        remote_fs_dir: PathBuf,
+        remote_fs_dir: Utf8PathBuf,
        storage: GenericRemoteStorage,
        mock_control_plane: MockControlPlane,
        deletion_queue: DeletionQueue,
@@ -873,7 +890,7 @@ mod test {
        // Set up a GenericRemoteStorage targetting a directory
        let remote_fs_dir = harness.conf.workdir.join("remote_fs");
        std::fs::create_dir_all(remote_fs_dir)?;
-        let remote_fs_dir = std::fs::canonicalize(harness.conf.workdir.join("remote_fs"))?;
+        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
            max_concurrent_syncs: std::num::NonZeroUsize::new(
                remote_storage::DEFAULT_REMOTE_STORAGE_MAX_CONCURRENT_SYNCS,
@@ -909,7 +926,7 @@ mod test {
    }

    // TODO: put this in a common location so that we can share with remote_timeline_client's tests
-    fn assert_remote_files(expected: &[&str], remote_path: &Path) {
+    fn assert_remote_files(expected: &[&str], remote_path: &Utf8Path) {
        let mut expected: Vec<String> = expected.iter().map(|x| String::from(*x)).collect();
        expected.sort();

@@ -926,10 +943,7 @@ mod test {
                        unreachable!();
                    }
                } else {
-                    panic!(
-                        "Unexpected error listing {}: {e}",
-                        remote_path.to_string_lossy()
-                    );
+                    panic!("Unexpected error listing {remote_path}: {e}");
                }
            }
        };
@@ -944,7 +958,7 @@ mod test {
        assert_eq!(expected, found);
    }

-    fn assert_local_files(expected: &[&str], directory: &Path) {
+    fn assert_local_files(expected: &[&str], directory: &Utf8Path) {
        let dir = match std::fs::read_dir(directory) {
            Ok(d) => d,
            Err(_) => {
@@ -968,7 +982,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let ctx = setup("deletion_queue_smoke").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
        let tenant_id = ctx.harness.tenant_id;
@@ -1036,7 +1050,7 @@ mod test {
    async fn deletion_queue_validation() -> anyhow::Result<()> {
        let ctx = setup("deletion_queue_validation").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        // Generation that the control plane thinks is current
        let latest_generation = Generation::new(0xdeadbeef);
@@ -1093,7 +1107,7 @@ mod test {
        // Basic test that the deletion queue processes the deletions we pass into it
        let mut ctx = setup("deletion_queue_recovery").expect("Failed test setup");
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::new()).await?;
+        client.recover(HashMap::new())?;

        let tenant_id = ctx.harness.tenant_id;

@@ -1156,9 +1170,7 @@ mod test {
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
-        client
-            .recover(HashMap::from([(tenant_id, now_generation)]))
-            .await?;
+        client.recover(HashMap::from([(tenant_id, now_generation)]))?;

        info!("Flush-executing");
        client.flush_execute().await?;
@@ -1184,7 +1196,7 @@ pub(crate) mod mock {
    };

    pub struct ConsumerState {
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
    }

@@ -1261,7 +1273,7 @@ pub(crate) mod mock {
    }

    pub struct MockDeletionQueue {
-        tx: tokio::sync::mpsc::Sender<ListWriterQueueMessage>,
+        tx: tokio::sync::mpsc::UnboundedSender<ListWriterQueueMessage>,
        executor_tx: tokio::sync::mpsc::Sender<DeleterMessage>,
        executed: Arc<AtomicUsize>,
        remote_storage: Option<GenericRemoteStorage>,
@@ -1271,7 +1283,7 @@ pub(crate) mod mock {

    impl MockDeletionQueue {
        pub fn new(remote_storage: Option<GenericRemoteStorage>) -> Self {
-            let (tx, rx) = tokio::sync::mpsc::channel(16384);
+            let (tx, rx) = tokio::sync::mpsc::unbounded_channel();
            let (executor_tx, executor_rx) = tokio::sync::mpsc::channel(16384);

            let executed = Arc::new(AtomicUsize::new(0));
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -13,6 +13,7 @@ use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use tracing::info;
 use tracing::warn;
+use utils::backoff;

 use crate::metrics;

@@ -63,7 +64,19 @@ impl Deleter {
            Err(anyhow::anyhow!("failpoint hit"))
        });

-        self.remote_storage.delete_objects(&self.accumulator).await
+        // A backoff::retry is used here for two reasons:
+        // - To provide a backoff rather than busy-polling the API on errors
+        // - To absorb transient 429/503 conditions without hitting our error
+        //   logging path for issues deleting objects.
+        backoff::retry(
+            || async { self.remote_storage.delete_objects(&self.accumulator).await },
+            |_| false,
+            3,
+            10,
+            "executing deletion batch",
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Shutting down")),
+        )
+        .await
    }

    /// Block until everything in accumulator has been executed
@@ -88,7 +101,10 @@ impl Deleter {
                    self.accumulator.clear();
                }
                Err(e) => {
-                    warn!("DeleteObjects request failed: {e:#}, will retry");
+                    if self.cancel.is_cancelled() {
+                        return Err(DeletionQueueError::ShuttingDown);
+                    }
+                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
                    metrics::DELETION_QUEUE
                        .remote_errors
                        .with_label_values(&["execute"])
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -85,7 +85,7 @@ pub(super) struct ListWriter {
    conf: &'static PageServerConf,

    // Incoming frontend requests to delete some keys
-    rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+    rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,

    // Outbound requests to the backend to execute deletion lists we have composed.
    tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
@@ -111,7 +111,7 @@ impl ListWriter {

    pub(super) fn new(
        conf: &'static PageServerConf,
-        rx: tokio::sync::mpsc::Receiver<ListWriterQueueMessage>,
+        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        tx: tokio::sync::mpsc::Sender<ValidatorQueueMessage>,
        cancel: CancellationToken,
    ) -> Self {
@@ -180,8 +180,7 @@ impl ListWriter {
                    Ok(h) => Ok(Some(h.validated_sequence)),
                    Err(e) => {
                        warn!(
-                            "Failed to deserialize deletion header, ignoring {}: {e:#}",
-                            header_path.display()
+                            "Failed to deserialize deletion header, ignoring {header_path}: {e:#}",
                        );
                        // This should never happen unless we make a mistake with our serialization.
                        // Ignoring a deletion header is not consequential for correctnes because all deletions
@@ -193,10 +192,7 @@ impl ListWriter {
            }
            Err(e) => {
                if e.kind() == std::io::ErrorKind::NotFound {
-                    debug!(
-                        "Deletion header {} not found, first start?",
-                        header_path.display()
-                    );
+                    debug!("Deletion header {header_path} not found, first start?");
                    Ok(None)
                } else {
                    Err(anyhow::anyhow!(e))
@@ -223,10 +219,7 @@ impl ListWriter {
        let mut dir = match tokio::fs::read_dir(&deletion_directory).await {
            Ok(d) => d,
            Err(e) => {
-                warn!(
-                    "Failed to open deletion list directory {}: {e:#}",
-                    deletion_directory.display(),
-                );
+                warn!("Failed to open deletion list directory {deletion_directory}: {e:#}");

                // Give up: if we can't read the deletion list directory, we probably can't
                // write lists into it later, so the queue won't work.
@@ -237,27 +230,26 @@ impl ListWriter {
        let list_name_pattern =
            Regex::new("(?<sequence>[a-zA-Z0-9]{16})-(?<version>[a-zA-Z0-9]{2}).list").unwrap();

+        let temp_extension = format!(".{TEMP_SUFFIX}");
        let header_path = self.conf.deletion_header_path();
        let mut seqs: Vec<u64> = Vec::new();
        while let Some(dentry) = dir.next_entry().await? {
            let file_name = dentry.file_name();
            let dentry_str = file_name.to_string_lossy();

-            if Some(file_name.as_os_str()) == header_path.file_name() {
+            if file_name == header_path.file_name().unwrap_or("") {
                // Don't try and parse the header's name like a list
                continue;
            }

-            if dentry_str.ends_with(TEMP_SUFFIX) {
+            if dentry_str.ends_with(&temp_extension) {
                info!("Cleaning up temporary file {dentry_str}");
-                let absolute_path = deletion_directory.join(dentry.file_name());
+                let absolute_path =
+                    deletion_directory.join(dentry.file_name().to_str().expect("non-Unicode path"));
                if let Err(e) = tokio::fs::remove_file(&absolute_path).await {
                    // Non-fatal error: we will just leave the file behind but not
                    // try and load it.
-                    warn!(
-                        "Failed to clean up temporary file {}: {e:#}",
-                        absolute_path.display()
-                    );
+                    warn!("Failed to clean up temporary file {absolute_path}: {e:#}");
                }

                continue;
@@ -360,7 +352,7 @@ impl ListWriter {
        if let Err(e) = create_dir_all(&self.conf.deletion_prefix()) {
            tracing::error!(
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
-                self.conf.deletion_prefix().display()
+                self.conf.deletion_prefix(),
            );
            metrics::DELETION_QUEUE.unexpected_errors.inc();
            return;
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -15,10 +15,10 @@
 //! Deletions are passed onward to the Deleter.

 use std::collections::HashMap;
-use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Duration;

+use camino::Utf8PathBuf;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use tracing::info;
@@ -220,6 +220,8 @@ where
                    warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
+                } else {
+                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
                }
                this_list_valid
            });
@@ -282,16 +284,16 @@ where
        Ok(())
    }

-    async fn cleanup_lists(&mut self, list_paths: Vec<PathBuf>) {
+    async fn cleanup_lists(&mut self, list_paths: Vec<Utf8PathBuf>) {
        for list_path in list_paths {
-            debug!("Removing deletion list {}", list_path.display());
+            debug!("Removing deletion list {list_path}");

            if let Err(e) = tokio::fs::remove_file(&list_path).await {
                // Unexpected: we should have permissions and nothing else should
                // be touching these files.  We will leave the file behind.  Subsequent
                // pageservers will try and load it again: hopefully whatever storage
                // issue (probably permissions) has been fixed by then.
-                tracing::error!("Failed to delete {}: {e:#}", list_path.display());
+                tracing::error!("Failed to delete {list_path}: {e:#}");
                metrics::DELETION_QUEUE.unexpected_errors.inc();
                break;
            }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -43,12 +43,12 @@

 use std::{
    collections::HashMap,
-    path::Path,
    sync::Arc,
    time::{Duration, SystemTime},
 };

 use anyhow::Context;
+use camino::Utf8Path;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
@@ -122,7 +122,7 @@ async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: GenericRemoteStorage,
-    tenants_dir: &Path,
+    tenants_dir: &Utf8Path,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
@@ -184,7 +184,7 @@ async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenants_dir: &Path,
+    tenants_dir: &Utf8Path,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
@@ -620,9 +620,8 @@ impl std::ops::Deref for TimelineKey {
 }

 mod filesystem_level_usage {
-    use std::path::Path;
-
    use anyhow::Context;
+    use camino::Utf8Path;

    use crate::statvfs::Statvfs;

@@ -664,7 +663,7 @@ mod filesystem_level_usage {
    }

    pub fn get<'a>(
-        tenants_dir: &Path,
+        tenants_dir: &Utf8Path,
        config: &'a DiskUsageEvictionTaskConfig,
    ) -> anyhow::Result<Usage<'a>> {
        let mock_config = {
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,9 +93,16 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    delete:
      description: |
-        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
+        Attempts to delete specified tenant. 500, 503 and 409 errors should be retried until 404 is retrieved.
        404 means that deletion successfully finished"
      responses:
        "400":
@@ -134,6 +141,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -178,6 +192,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/{timeline_id}:
    parameters:
@@ -226,6 +247,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    delete:
      description: "Attempts to delete specified timeline. 500 and 409 errors should be retried"
      responses:
@@ -265,13 +293,19 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/PreconditionFailedError"
-
        "500":
          description: Generic operation error
          content:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/{timeline_id}/get_lsn_by_timestamp:
    parameters:
@@ -328,6 +362,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/timeline/{timeline_id}/do_gc:
    parameters:
      - name: tenant_id
@@ -375,6 +416,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/attach:
    parameters:
      - name: tenant_id
@@ -465,6 +513,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/detach:
    parameters:
@@ -518,6 +573,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/ignore:
    parameters:
@@ -560,6 +622,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/load:
    parameters:
@@ -604,6 +673,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/synthetic_size:
    parameters:
@@ -641,6 +717,12 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"

  /v1/tenant/{tenant_id}/size:
    parameters:
@@ -704,6 +786,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/{tenant_id}/timeline/:
    parameters:
@@ -780,6 +869,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/:
    get:
      description: Get tenants list
@@ -810,6 +906,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
    post:
      description: |
        Create a tenant. Returns new tenant id on success.
@@ -860,6 +963,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+

  /v1/tenant/config:
    put:
@@ -905,6 +1015,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
  /v1/tenant/{tenant_id}/config/:
    parameters:
      - name: tenant_id
@@ -954,6 +1071,13 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
+        "503":
+          description: Temporarily unavailable, please retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ServiceUnavailableError"
+
 components:
  securitySchemes:
    JWT:
@@ -1220,6 +1344,13 @@ components:
      properties:
        msg:
          type: string
+    ServiceUnavailableError:
+      type: object
+      required:
+        - msg
+      properties:
+        msg:
+          type: string
    NotFoundError:
      type: object
      required:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -6,11 +6,13 @@ use std::sync::Arc;

 use anyhow::{anyhow, Context, Result};
 use futures::TryFutureExt;
+use hyper::header::CONTENT_TYPE;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::{
-    DownloadRemoteLayersTaskSpawnRequest, TenantAttachRequest, TenantLoadRequest,
+    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
+    TenantLoadRequest, TenantLocationConfigRequest,
 };
 use remote_storage::GenericRemoteStorage;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -29,7 +31,7 @@ use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
-use crate::tenant::config::TenantConfOpt;
+use crate::tenant::config::{LocationConf, TenantConfOpt};
 use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantMapInsertError, TenantStateError,
 };
@@ -75,7 +77,7 @@ impl State {
        disk_usage_eviction_state: Arc<disk_usage_eviction_task::State>,
        deletion_queue_client: DeletionQueueClient,
    ) -> anyhow::Result<Self> {
-        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml"]
+        let allowlist_routes = ["/v1/status", "/v1/doc", "/swagger.yml", "/metrics"]
            .iter()
            .map(|v| v.parse().unwrap())
            .collect::<Vec<_>>();
@@ -132,7 +134,7 @@ impl From<PageReconstructError> for ApiError {
                ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
            }
            PageReconstructError::AncestorStopping(_) => {
-                ApiError::InternalServerError(anyhow::Error::new(pre))
+                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
            PageReconstructError::WalRedo(pre) => {
                ApiError::InternalServerError(anyhow::Error::new(pre))
@@ -145,12 +147,15 @@ impl From<TenantMapInsertError> for ApiError {
    fn from(tmie: TenantMapInsertError) -> ApiError {
        match tmie {
            TenantMapInsertError::StillInitializing | TenantMapInsertError::ShuttingDown => {
-                ApiError::InternalServerError(anyhow::Error::new(tmie))
+                ApiError::ResourceUnavailable(format!("{tmie}").into())
            }
            TenantMapInsertError::TenantAlreadyExists(id, state) => {
                ApiError::Conflict(format!("tenant {id} already exists, state: {state:?}"))
            }
-            TenantMapInsertError::Closure(e) => ApiError::InternalServerError(e),
+            TenantMapInsertError::TenantExistsSecondary(id) => {
+                ApiError::Conflict(format!("tenant {id} already exists as secondary"))
+            }
+            TenantMapInsertError::Other(e) => ApiError::InternalServerError(e),
        }
    }
 }
@@ -159,6 +164,9 @@ impl From<TenantStateError> for ApiError {
    fn from(tse: TenantStateError) -> ApiError {
        match tse {
            TenantStateError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
+            TenantStateError::IsStopping(_) => {
+                ApiError::ResourceUnavailable("Tenant is stopping".into())
+            }
            _ => ApiError::InternalServerError(anyhow::Error::new(tse)),
        }
    }
@@ -168,14 +176,17 @@ impl From<GetTenantError> for ApiError {
    fn from(tse: GetTenantError) -> ApiError {
        match tse {
            GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()),
-            e @ GetTenantError::NotActive(_) => {
+            GetTenantError::Broken(reason) => {
+                ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason))
+            }
+            GetTenantError::NotActive(_) => {
                // Why is this not `ApiError::NotFound`?
                // Because we must be careful to never return 404 for a tenant if it does
                // in fact exist locally. If we did, the caller could draw the conclusion
                // that it can attach the tenant to another PS and we'd be in split-brain.
                //
                // (We can produce this variant only in `mgr::get_tenant(..., active=true)` calls).
-                ApiError::InternalServerError(anyhow::Error::new(e))
+                ApiError::ResourceUnavailable("Tenant not yet active".into())
            }
        }
    }
@@ -382,6 +393,9 @@ async fn timeline_create_handler(
                    format!("{err:#}")
                ))
            }
+            Err(e @ tenant::CreateTimelineError::AncestorNotActive) => {
+                json_response(StatusCode::SERVICE_UNAVAILABLE, HttpErrorBody::from_msg(e.to_string()))
+            }
            Err(tenant::CreateTimelineError::Other(err)) => Err(ApiError::InternalServerError(err)),
        }
    }
@@ -558,9 +572,14 @@ async fn tenant_detach_handler(

    let state = get_state(&request);
    let conf = state.conf;
-    mgr::detach_tenant(conf, tenant_id, detach_ignored.unwrap_or(false))
-        .instrument(info_span!("tenant_detach", %tenant_id))
-        .await?;
+    mgr::detach_tenant(
+        conf,
+        tenant_id,
+        detach_ignored.unwrap_or(false),
+        &state.deletion_queue_client,
+    )
+    .instrument(info_span!("tenant_detach", %tenant_id))
+    .await?;

    json_response(StatusCode::OK, ())
 }
@@ -622,8 +641,9 @@ async fn tenant_list_handler(
    let response_data = mgr::list_tenants()
        .instrument(info_span!("tenant_list"))
        .await
-        .map_err(anyhow::Error::new)
-        .map_err(ApiError::InternalServerError)?
+        .map_err(|_| {
+            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
+        })?
        .iter()
        .map(|(id, state)| TenantInfo {
            id: *id,
@@ -1001,6 +1021,48 @@ async fn update_tenant_config_handler(
    json_response(StatusCode::OK, ())
 }

+async fn put_tenant_location_config_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
+    let tenant_id = request_data.tenant_id;
+    check_permission(&request, Some(tenant_id))?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);
+    let conf = state.conf;
+
+    // The `Detached` mode is special: it doesn't upsert a tenant, but instead removes
+    // its local disk content and drops it from memory.
+    if let LocationConfigMode::Detached = request_data.config.mode {
+        mgr::detach_tenant(conf, tenant_id, true, &state.deletion_queue_client)
+            .instrument(info_span!("tenant_detach", %tenant_id))
+            .await?;
+        return json_response(StatusCode::OK, ());
+    }
+
+    let location_conf =
+        LocationConf::try_from(&request_data.config).map_err(ApiError::BadRequest)?;
+
+    mgr::upsert_location(
+        state.conf,
+        tenant_id,
+        location_conf,
+        state.broker_client.clone(),
+        state.remote_storage.clone(),
+        state.deletion_queue_client.clone(),
+        &ctx,
+    )
+    .await
+    // TODO: badrequest assumes the caller was asking for something unreasonable, but in
+    // principle we might have hit something like concurrent API calls to the same tenant,
+    // which is not a 400 but a 409.
+    .map_err(ApiError::BadRequest)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
    r: Request<Body>,
@@ -1180,6 +1242,136 @@ async fn deletion_queue_flush(
    }
 }

+/// Check whether `GetPage@Lsn` succeeds for a given key and LSN; useful for manual debugging.
+async fn getpage_at_lsn_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    struct Key(crate::repository::Key);
+
+    impl std::str::FromStr for Key {
+        type Err = anyhow::Error;
+
+        fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+            crate::repository::Key::from_hex(s).map(Key)
+        }
+    }
+
+    let key: Key = parse_query_param(&request, "key")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
+    let lsn: Lsn = parse_query_param(&request, "lsn")?
+        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
+
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+
+        let page = timeline.get(key.0, lsn, &ctx).await?;
+
+        Result::<_, ApiError>::Ok(
+            Response::builder()
+                .status(StatusCode::OK)
+                .header(CONTENT_TYPE, "application/octet-stream")
+                .body(hyper::Body::from(page))
+                .unwrap(),
+        )
+    }
+    .instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
+    .await
+}
+
+async fn timeline_collect_keyspace(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+    check_permission(&request, Some(tenant_id))?;
+
+    struct Partitioning {
+        keys: crate::keyspace::KeySpace,
+
+        at_lsn: Lsn,
+    }
+
+    impl serde::Serialize for Partitioning {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeMap;
+            let mut map = serializer.serialize_map(Some(2))?;
+            map.serialize_key("keys")?;
+            map.serialize_value(&KeySpace(&self.keys))?;
+            map.serialize_key("at_lsn")?;
+            map.serialize_value(&WithDisplay(&self.at_lsn))?;
+            map.end()
+        }
+    }
+
+    struct WithDisplay<'a, T>(&'a T);
+
+    impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            serializer.collect_str(&self.0)
+        }
+    }
+
+    struct KeySpace<'a>(&'a crate::keyspace::KeySpace);
+
+    impl<'a> serde::Serialize for KeySpace<'a> {
+        fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeSeq;
+            let mut seq = serializer.serialize_seq(Some(self.0.ranges.len()))?;
+            for kr in &self.0.ranges {
+                seq.serialize_element(&KeyRange(kr))?;
+            }
+            seq.end()
+        }
+    }
+
+    struct KeyRange<'a>(&'a std::ops::Range<crate::repository::Key>);
+
+    impl<'a> serde::Serialize for KeyRange<'a> {
+        fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+        where
+            S: serde::Serializer,
+        {
+            use serde::ser::SerializeTuple;
+            let mut t = serializer.serialize_tuple(2)?;
+            t.serialize_element(&WithDisplay(&self.0.start))?;
+            t.serialize_element(&WithDisplay(&self.0.end))?;
+            t.end()
+        }
+    }
+
+    let at_lsn: Option<Lsn> = parse_query_param(&request, "at_lsn")?;
+
+    async {
+        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
+        let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
+        let keys = timeline
+            .collect_keyspace(at_lsn, &ctx)
+            .await
+            .map_err(ApiError::InternalServerError)?;
+
+        json_response(StatusCode::OK, Partitioning { keys, at_lsn })
+    }
+    .instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
+    .await
+}
+
 async fn active_timeline_of_active_tenant(
    tenant_id: TenantId,
    timeline_id: TimelineId,
@@ -1454,6 +1646,9 @@ pub fn make_router(
        .get("/v1/tenant/:tenant_id/config", |r| {
            api_handler(r, get_tenant_config_handler)
        })
+        .put("/v1/tenant/:tenant_id/location_config", |r| {
+            api_handler(r, put_tenant_location_config_handler)
+        })
        .get("/v1/tenant/:tenant_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
@@ -1524,5 +1719,12 @@ pub fn make_router(
        .post("/v1/tracing/event", |r| {
            testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
        })
+        .get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
+            testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
+        })
+        .get(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
+            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
+        )
        .any(handler_404))
 }
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -6,6 +6,7 @@ use std::path::{Path, PathBuf};

 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
+use camino::Utf8Path;
 use futures::StreamExt;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
@@ -29,7 +30,7 @@ use postgres_ffi::{BLCKSZ, WAL_SEGMENT_SIZE};
 use utils::lsn::Lsn;

 // Returns checkpoint LSN from controlfile
-pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
+pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
    // Read control file to extract the LSN
    let controlfile_path = path.join("global").join("pg_control");
    let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
@@ -46,7 +47,7 @@ pub fn get_lsn_from_controlfile(path: &Path) -> Result<Lsn> {
 /// cluster was not shut down cleanly.
 pub async fn import_timeline_from_postgres_datadir(
    tline: &Timeline,
-    pgdata_path: &Path,
+    pgdata_path: &Utf8Path,
    pgdata_lsn: Lsn,
    ctx: &RequestContext,
 ) -> Result<()> {
@@ -256,7 +257,7 @@ async fn import_slru(
 /// Scan PostgreSQL WAL files in given directory and load all records between
 /// 'startpoint' and 'endpoint' into the repository.
 async fn import_wal(
-    walpath: &Path,
+    walpath: &Utf8Path,
    tline: &Timeline,
    startpoint: Lsn,
    endpoint: Lsn,
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -25,9 +25,8 @@ pub mod walredo;

 pub mod failpoint_support;

-use std::path::Path;
-
 use crate::task_mgr::TaskKind;
+use camino::Utf8Path;
 use deletion_queue::DeletionQueue;
 use tracing::info;

@@ -113,6 +112,10 @@ pub const METADATA_FILE_NAME: &str = "metadata";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_CONFIG_NAME: &str = "config";

+/// Per-tenant location configuration file.
+/// Full path: `tenants/<tenant_id>/config-v1`.
+pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";
+
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
@@ -132,25 +135,25 @@ pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete";
 /// Full path: `tenants/<tenant_id>/___ignored_tenant`.
 pub const IGNORED_TENANT_FILE_NAME: &str = "___ignored_tenant";

-pub fn is_temporary(path: &Path) -> bool {
+pub fn is_temporary(path: &Utf8Path) -> bool {
    match path.file_name() {
-        Some(name) => name.to_string_lossy().ends_with(TEMP_FILE_SUFFIX),
+        Some(name) => name.ends_with(TEMP_FILE_SUFFIX),
        None => false,
    }
 }

-fn ends_with_suffix(path: &Path, suffix: &str) -> bool {
+fn ends_with_suffix(path: &Utf8Path, suffix: &str) -> bool {
    match path.file_name() {
-        Some(name) => name.to_string_lossy().ends_with(suffix),
+        Some(name) => name.ends_with(suffix),
        None => false,
    }
 }

-pub fn is_uninit_mark(path: &Path) -> bool {
+pub fn is_uninit_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX)
 }

-pub fn is_delete_mark(path: &Path) -> bool {
+pub fn is_delete_mark(path: &Utf8Path) -> bool {
    ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX)
 }

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -94,15 +94,35 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy<Histogram> = Lazy::new(|| {
 });

 // Metrics collected on operations on the storage repository.
-pub(crate) static RECONSTRUCT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
+
+pub(crate) struct ReconstructTimeMetrics {
+    ok: Histogram,
+    err: Histogram,
+}
+
+pub(crate) static RECONSTRUCT_TIME: Lazy<ReconstructTimeMetrics> = Lazy::new(|| {
+    let inner = register_histogram_vec!(
        "pageserver_getpage_reconstruct_seconds",
        "Time spent in reconstruct_value (reconstruct a page from deltas)",
+        &["result"],
        CRITICAL_OP_BUCKETS.into(),
    )
-    .expect("failed to define a metric")
+    .expect("failed to define a metric");
+    ReconstructTimeMetrics {
+        ok: inner.get_metric_with_label_values(&["ok"]).unwrap(),
+        err: inner.get_metric_with_label_values(&["err"]).unwrap(),
+    }
 });

+impl ReconstructTimeMetrics {
+    pub(crate) fn for_result<T, E>(&self, result: &Result<T, E>) -> &Histogram {
+        match result {
+            Ok(_) => &self.ok,
+            Err(_) => &self.err,
+        }
+    }
+}
+
 pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_materialized_cache_hits_direct_total",
@@ -264,6 +284,46 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
    },
 });

+pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_page_cache_acquire_pinned_slot_seconds",
+        "Time spent acquiring a pinned slot in the page cache",
+        CRITICAL_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_page_cache_find_victim_iters_total",
+        "Counter for the number of iterations in the find_victim loop",
+    )
+    .expect("failed to define a metric")
+});
+
+static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "page_cache_errors_total",
+        "Number of timeouts while acquiring a pinned slot in the page cache",
+        &["error_kind"]
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(IntoStaticStr)]
+#[strum(serialize_all = "kebab_case")]
+pub(crate) enum PageCacheErrorKind {
+    AcquirePinnedSlotTimeout,
+    EvictIterLimit,
+}
+
+pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
+    PAGE_CACHE_ERRORS
+        .get_metric_with_label_values(&[error_kind.into()])
+        .unwrap()
+        .inc();
+}
+
 pub(crate) static WAIT_LSN_TIME: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
        "pageserver_wait_lsn_seconds",
@@ -291,6 +351,14 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_resident_physical_size_global",
+        "Like `pageserver_resident_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_remote_physical_size",
@@ -301,6 +369,14 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+static REMOTE_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
+    register_uint_gauge!(
+        "pageserver_remote_physical_size_global",
+        "Like `pageserver_remote_physical_size`, but without tenant/timeline dimensions."
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy<IntCounter> = Lazy::new(|| {
    register_int_counter!(
        "pageserver_remote_ondemand_downloaded_layers_total",
@@ -615,10 +691,9 @@ impl StorageIoTime {
        .expect("failed to define a metric");
        let metrics = std::array::from_fn(|i| {
            let op = StorageIoOperation::from_repr(i).unwrap();
-            let metric = storage_io_histogram_vec
+            storage_io_histogram_vec
                .get_metric_with_label_values(&[op.as_str()])
-                .unwrap();
-            metric
+                .unwrap()
        });
        Self { metrics }
    }
@@ -891,6 +966,7 @@ pub(crate) struct DeletionQueueMetrics {
    pub(crate) keys_submitted: IntCounter,
    pub(crate) keys_dropped: IntCounter,
    pub(crate) keys_executed: IntCounter,
+    pub(crate) keys_validated: IntCounter,
    pub(crate) dropped_lsn_updates: IntCounter,
    pub(crate) unexpected_errors: IntCounter,
    pub(crate) remote_errors: IntCounterVec,
@@ -912,7 +988,13 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {

    keys_executed: register_int_counter!(
        "pageserver_deletion_queue_executed_total",
-        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed."
+        "Number of objects deleted. Only includes objects that we actually deleted, sum with pageserver_deletion_queue_dropped_total for the total number of keys processed to completion"
+    )
+    .expect("failed to define a metric"),
+
+    keys_validated: register_int_counter!(
+        "pageserver_deletion_queue_validated_total",
+        "Number of keys validated for deletion.  Sum with pageserver_deletion_queue_dropped_total for the total number of keys that have passed through the validation stage."
    )
    .expect("failed to define a metric"),

@@ -1209,7 +1291,7 @@ pub struct TimelineMetrics {
    pub load_layer_map_histo: StorageTimeMetrics,
    pub garbage_collect_histo: StorageTimeMetrics,
    pub last_record_gauge: IntGauge,
-    pub resident_physical_size_gauge: UIntGauge,
+    resident_physical_size_gauge: UIntGauge,
    /// copy of LayeredTimeline.current_logical_size
    pub current_logical_size_gauge: UIntGauge,
    pub num_persistent_files_created: IntCounter,
@@ -1287,10 +1369,29 @@ impl TimelineMetrics {
    }

    pub fn record_new_file_metrics(&self, sz: u64) {
-        self.resident_physical_size_gauge.add(sz);
+        self.resident_physical_size_add(sz);
        self.num_persistent_files_created.inc_by(1);
        self.persistent_bytes_written.inc_by(sz);
    }
+
+    pub fn resident_physical_size_sub(&self, sz: u64) {
+        self.resident_physical_size_gauge.sub(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(sz);
+    }
+
+    pub fn resident_physical_size_add(&self, sz: u64) {
+        self.resident_physical_size_gauge.add(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.add(sz);
+    }
+
+    pub fn resident_physical_size_set(&self, sz: u64) {
+        self.resident_physical_size_gauge.set(sz);
+        crate::metrics::RESIDENT_PHYSICAL_SIZE_GLOBAL.set(sz);
+    }
+
+    pub fn resident_physical_size_get(&self) -> u64 {
+        self.resident_physical_size_gauge.get()
+    }
 }

 impl Drop for TimelineMetrics {
@@ -1298,7 +1399,10 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        {
+            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
+            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+        }
        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
@@ -1352,10 +1456,43 @@ use std::time::{Duration, Instant};
 use crate::context::{PageContentKind, RequestContext};
 use crate::task_mgr::TaskKind;

+/// Maintain a per-timeline gauge in addition to the global gauge.
+struct PerTimelineRemotePhysicalSizeGauge {
+    last_set: u64,
+    gauge: UIntGauge,
+}
+
+impl PerTimelineRemotePhysicalSizeGauge {
+    fn new(per_timeline_gauge: UIntGauge) -> Self {
+        Self {
+            last_set: per_timeline_gauge.get(),
+            gauge: per_timeline_gauge,
+        }
+    }
+    fn set(&mut self, sz: u64) {
+        self.gauge.set(sz);
+        if sz < self.last_set {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set - sz);
+        } else {
+            REMOTE_PHYSICAL_SIZE_GLOBAL.add(sz - self.last_set);
+        };
+        self.last_set = sz;
+    }
+    fn get(&self) -> u64 {
+        self.gauge.get()
+    }
+}
+
+impl Drop for PerTimelineRemotePhysicalSizeGauge {
+    fn drop(&mut self) {
+        REMOTE_PHYSICAL_SIZE_GLOBAL.sub(self.last_set);
+    }
+}
+
 pub struct RemoteTimelineClientMetrics {
    tenant_id: String,
    timeline_id: String,
-    remote_physical_size_gauge: Mutex<Option<UIntGauge>>,
+    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
    bytes_started_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
    bytes_finished_counter: Mutex<HashMap<(&'static str, &'static str), IntCounter>>,
@@ -1373,18 +1510,24 @@ impl RemoteTimelineClientMetrics {
        }
    }

-    pub fn remote_physical_size_gauge(&self) -> UIntGauge {
+    pub(crate) fn remote_physical_size_set(&self, sz: u64) {
        let mut guard = self.remote_physical_size_gauge.lock().unwrap();
-        guard
-            .get_or_insert_with(|| {
+        let gauge = guard.get_or_insert_with(|| {
+            PerTimelineRemotePhysicalSizeGauge::new(
                REMOTE_PHYSICAL_SIZE
                    .get_metric_with_label_values(&[
                        &self.tenant_id.to_string(),
                        &self.timeline_id.to_string(),
                    ])
-                    .unwrap()
-            })
-            .clone()
+                    .unwrap(),
+            )
+        });
+        gauge.set(sz);
+    }
+
+    pub(crate) fn remote_physical_size_get(&self) -> u64 {
+        let guard = self.remote_physical_size_gauge.lock().unwrap();
+        guard.as_ref().map(|gauge| gauge.get()).unwrap_or(0)
    }

    pub fn remote_operation_time(
@@ -1739,7 +1882,6 @@ pub fn preinitialize_metrics() {
    // histograms
    [
        &READ_NUM_FS_LAYERS,
-        &RECONSTRUCT_TIME,
        &WAIT_LSN_TIME,
        &WAL_REDO_TIME,
        &WAL_REDO_WAIT_TIME,
@@ -1750,4 +1892,7 @@ pub fn preinitialize_metrics() {
    .for_each(|h| {
        Lazy::force(h);
    });
+
+    // Custom metric types that are not a plain `Lazy<Histogram>` and so cannot be listed in the array above.
+    Lazy::force(&RECONSTRUCT_TIME);
 }
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -66,8 +66,7 @@
 //! inserted to the mapping, but you must hold the write-lock on the slot until
 //! the contents are valid. If you need to release the lock without initializing
 //! the contents, you must remove the mapping first. We make that easy for the
-//! callers with PageWriteGuard: when lock_for_write() returns an uninitialized
-//! page, the caller must explicitly call guard.mark_valid() after it has
+//! callers with PageWriteGuard: the caller must explicitly call guard.mark_valid() after it has
 //! initialized it. If the guard is dropped without calling mark_valid(), the
 //! mapping is automatically removed and the slot is marked free.
 //!
@@ -75,7 +74,11 @@
 use std::{
    collections::{hash_map::Entry, HashMap},
    convert::TryInto,
-    sync::atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+    sync::{
+        atomic::{AtomicU64, AtomicU8, AtomicUsize, Ordering},
+        Arc, Weak,
+    },
+    time::Duration,
 };

 use anyhow::Context;
@@ -165,6 +168,8 @@ struct Slot {

 struct SlotInner {
    key: Option<CacheKey>,
+    // for `coalesce_readers_permit`
+    permit: std::sync::Mutex<Weak<PinnedSlotsPermit>>,
    buf: &'static mut [u8; PAGE_SZ],
 }

@@ -207,6 +212,22 @@ impl Slot {
    }
 }

+impl SlotInner {
+    /// If there is already a reader, drop our permit and share its permit, just like we share read access.
+    fn coalesce_readers_permit(&self, permit: PinnedSlotsPermit) -> Arc<PinnedSlotsPermit> {
+        let mut guard = self.permit.lock().unwrap();
+        if let Some(existing_permit) = guard.upgrade() {
+            drop(guard);
+            drop(permit);
+            existing_permit
+        } else {
+            let permit = Arc::new(permit);
+            *guard = Arc::downgrade(&permit);
+            permit
+        }
+    }
+}
+
 pub struct PageCache {
    /// This contains the mapping from the cache key to buffer slot that currently
    /// contains the page, if any.
@@ -224,6 +245,8 @@ pub struct PageCache {
    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,

+    pinned_slots: Arc<tokio::sync::Semaphore>,
+
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
@@ -231,23 +254,28 @@ pub struct PageCache {
    size_metrics: &'static PageCacheSizeMetrics,
 }

+struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
+
 ///
 /// PageReadGuard is a "lease" on a buffer, for reading. The page is kept locked
 /// until the guard is dropped.
 ///
-pub struct PageReadGuard<'i>(tokio::sync::RwLockReadGuard<'i, SlotInner>);
+pub struct PageReadGuard<'i> {
+    _permit: Arc<PinnedSlotsPermit>,
+    slot_guard: tokio::sync::RwLockReadGuard<'i, SlotInner>,
+}

 impl std::ops::Deref for PageReadGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.0.buf
+        self.slot_guard.buf
    }
 }

 impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
    fn as_ref(&self) -> &[u8; PAGE_SZ] {
-        self.0.buf
+        self.slot_guard.buf
    }
 }

@@ -257,21 +285,25 @@ impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> {
 ///
 /// Counterintuitively, this is used even for a read, if the requested page is not
 /// currently found in the page cache. In that case, the caller of lock_for_read()
-/// is expected to fill in the page contents and call mark_valid(). Similarly
-/// lock_for_write() can return an invalid buffer that the caller is expected to
-/// to initialize.
-///
+/// is expected to fill in the page contents and call mark_valid().
 pub struct PageWriteGuard<'i> {
-    inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+    state: PageWriteGuardState<'i>,
+}

-    // Are the page contents currently valid?
-    // Used to mark pages as invalid that are assigned but not yet filled with data.
-    valid: bool,
+enum PageWriteGuardState<'i> {
+    Invalid {
+        inner: tokio::sync::RwLockWriteGuard<'i, SlotInner>,
+        _permit: PinnedSlotsPermit,
+    },
+    Downgraded,
 }

 impl std::ops::DerefMut for PageWriteGuard<'_> {
    fn deref_mut(&mut self) -> &mut Self::Target {
-        self.inner.buf
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

@@ -279,25 +311,37 @@ impl std::ops::Deref for PageWriteGuard<'_> {
    type Target = [u8; PAGE_SZ];

    fn deref(&self) -> &Self::Target {
-        self.inner.buf
+        match &self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

 impl AsMut<[u8; PAGE_SZ]> for PageWriteGuard<'_> {
    fn as_mut(&mut self) -> &mut [u8; PAGE_SZ] {
-        self.inner.buf
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => inner.buf,
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

-impl PageWriteGuard<'_> {
+impl<'a> PageWriteGuard<'a> {
    /// Mark that the buffer contents are now valid.
-    pub fn mark_valid(&mut self) {
-        assert!(self.inner.key.is_some());
-        assert!(
-            !self.valid,
-            "mark_valid called on a buffer that was already valid"
-        );
-        self.valid = true;
+    #[must_use]
+    pub fn mark_valid(mut self) -> PageReadGuard<'a> {
+        let prev = std::mem::replace(&mut self.state, PageWriteGuardState::Downgraded);
+        match prev {
+            PageWriteGuardState::Invalid { inner, _permit } => {
+                assert!(inner.key.is_some());
+                PageReadGuard {
+                    _permit: Arc::new(_permit),
+                    slot_guard: inner.downgrade(),
+                }
+            }
+            PageWriteGuardState::Downgraded => unreachable!(),
+        }
    }
 }

@@ -308,11 +352,14 @@ impl Drop for PageWriteGuard<'_> {
    /// initializing it, remove the mapping from the page cache.
    ///
    fn drop(&mut self) {
-        assert!(self.inner.key.is_some());
-        if !self.valid {
-            let self_key = self.inner.key.as_ref().unwrap();
-            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
-            self.inner.key = None;
+        match &mut self.state {
+            PageWriteGuardState::Invalid { inner, _permit } => {
+                assert!(inner.key.is_some());
+                let self_key = inner.key.as_ref().unwrap();
+                PAGE_CACHE.get().unwrap().remove_mapping(self_key);
+                inner.key = None;
+            }
+            PageWriteGuardState::Downgraded => {}
        }
    }
 }
@@ -323,12 +370,6 @@ pub enum ReadBufResult<'a> {
    NotFound(PageWriteGuard<'a>),
 }

-/// lock_for_write() return value
-pub enum WriteBufResult<'a> {
-    Found(PageWriteGuard<'a>),
-    NotFound(PageWriteGuard<'a>),
-}
-
 impl PageCache {
    //
    // Section 1.1: Public interface functions for looking up and memorizing materialized page
@@ -348,6 +389,10 @@ impl PageCache {
        lsn: Lsn,
        ctx: &RequestContext,
    ) -> Option<(Lsn, PageReadGuard)> {
+        let Ok(permit) = self.try_get_pinned_slot_permit().await else {
+            return None;
+        };
+
        crate::metrics::PAGE_CACHE
            .for_ctx(ctx)
            .read_accesses_materialized_page
@@ -362,7 +407,10 @@ impl PageCache {
            lsn,
        };

-        if let Some(guard) = self.try_lock_for_read(&mut cache_key).await {
+        if let Some(guard) = self
+            .try_lock_for_read(&mut cache_key, &mut Some(permit))
+            .await
+        {
            if let CacheKey::MaterializedPage {
                hash_key: _,
                lsn: available_lsn,
@@ -408,20 +456,77 @@ impl PageCache {
            lsn,
        };

-        match self.lock_for_write(&cache_key).await? {
-            WriteBufResult::Found(write_guard) => {
-                // We already had it in cache. Another thread must've put it there
-                // concurrently. Check that it had the same contents that we
-                // replayed.
-                assert!(*write_guard == img);
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+        loop {
+            // First check if the key already exists in the cache.
+            if let Some(slot_idx) = self.search_mapping_exact(&cache_key) {
+                // The page was found in the mapping. Lock the slot, and re-check
+                // that it's still what we expected (because we released the mapping
+                // lock already, another thread could have evicted the page)
+                let slot = &self.slots[slot_idx];
+                let inner = slot.inner.write().await;
+                if inner.key.as_ref() == Some(&cache_key) {
+                    slot.inc_usage_count();
+                    debug_assert!(
+                        {
+                            let guard = inner.permit.lock().unwrap();
+                            guard.upgrade().is_none()
+                        },
+                        "we hold a write lock, so, no one else should have a permit"
+                    );
+                    debug_assert_eq!(inner.buf.len(), img.len());
+                    // We already had it in cache. Another thread must've put it there
+                    // concurrently. Check that it had the same contents that we
+                    // replayed.
+                    assert!(inner.buf == img);
+                    return Ok(());
+                }
            }
-            WriteBufResult::NotFound(mut write_guard) => {
-                write_guard.copy_from_slice(img);
-                write_guard.mark_valid();
-            }
-        }
+            debug_assert!(permit.is_some());

-        Ok(())
+            // Not found. Find a victim buffer
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;
+
+            // Insert mapping for this. At this point, we may find that another
+            // thread did the same thing concurrently. In that case, we evicted
+            // our victim buffer unnecessarily. Put it into the free list and
+            // continue with the slot that the other thread chose.
+            if let Some(_existing_slot_idx) = self.try_insert_mapping(&cache_key, slot_idx) {
+                // TODO: put to free list
+
+                // We now just loop back to start from beginning. This is not
+                // optimal, we'll perform the lookup in the mapping again, which
+                // is not really necessary because we already got
+                // 'existing_slot_idx'.  But this shouldn't happen often enough
+                // to matter much.
+                continue;
+            }
+
+            // Make the slot ready
+            let slot = &self.slots[slot_idx];
+            inner.key = Some(cache_key.clone());
+            slot.set_usage_count(1);
+            // Create a write guard for the slot so we go through the expected motions.
+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+            let mut write_guard = PageWriteGuard {
+                state: PageWriteGuardState::Invalid {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                },
+            };
+            write_guard.copy_from_slice(img);
+            let _ = write_guard.mark_valid();
+            return Ok(());
+        }
    }

    // Section 1.2: Public interface functions for working with immutable file pages.
@@ -445,6 +550,29 @@ impl PageCache {
    // "mappings" after this section. But the routines in this section should
    // not require changes.

+    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
+        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
+        match tokio::time::timeout(
+            // Choose small timeout, neon_smgr does its own retries.
+            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
+            Duration::from_secs(10),
+            Arc::clone(&self.pinned_slots).acquire_owned(),
+        )
+        .await
+        {
+            Ok(res) => Ok(PinnedSlotsPermit(
+                res.expect("this semaphore is never closed"),
+            )),
+            Err(_timeout) => {
+                timer.stop_and_discard();
+                crate::metrics::page_cache_errors_inc(
+                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
+                );
+                anyhow::bail!("timeout: there were page guards alive for all page cache slots")
+            }
+        }
+    }
+
    /// Look up a page in the cache.
    ///
    /// If the search criteria is not exact, *cache_key is updated with the key
@@ -454,7 +582,11 @@ impl PageCache {
    ///
    /// If no page is found, returns None and *cache_key is left unmodified.
    ///
-    async fn try_lock_for_read(&self, cache_key: &mut CacheKey) -> Option<PageReadGuard> {
+    async fn try_lock_for_read(
+        &self,
+        cache_key: &mut CacheKey,
+        permit: &mut Option<PinnedSlotsPermit>,
+    ) -> Option<PageReadGuard> {
        let cache_key_orig = cache_key.clone();
        if let Some(slot_idx) = self.search_mapping(cache_key) {
            // The page was found in the mapping. Lock the slot, and re-check
@@ -464,7 +596,10 @@ impl PageCache {
            let inner = slot.inner.read().await;
            if inner.key.as_ref() == Some(cache_key) {
                slot.inc_usage_count();
-                return Some(PageReadGuard(inner));
+                return Some(PageReadGuard {
+                    _permit: inner.coalesce_readers_permit(permit.take().unwrap()),
+                    slot_guard: inner,
+                });
            } else {
                // search_mapping might have modified the search key; restore it.
                *cache_key = cache_key_orig;
@@ -507,6 +642,8 @@ impl PageCache {
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
+        let mut permit = Some(self.try_get_pinned_slot_permit().await?);
+
        let (read_access, hit) = match cache_key {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
@@ -523,17 +660,21 @@ impl PageCache {
        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
-            if let Some(read_guard) = self.try_lock_for_read(cache_key).await {
+            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
+                debug_assert!(permit.is_none());
                if is_first_iteration {
                    hit.inc();
                }
                return Ok(ReadBufResult::Found(read_guard));
            }
+            debug_assert!(permit.is_some());
            is_first_iteration = false;

            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
+            let (slot_idx, mut inner) = self
+                .find_victim(permit.as_ref().unwrap())
+                .await
+                .context("Failed to find evict victim")?;

            // Insert mapping for this. At this point, we may find that another
            // thread did the same thing concurrently. In that case, we evicted
@@ -555,70 +696,19 @@ impl PageCache {
            inner.key = Some(cache_key.clone());
            slot.set_usage_count(1);

+            debug_assert!(
+                {
+                    let guard = inner.permit.lock().unwrap();
+                    guard.upgrade().is_none()
+                },
+                "we hold a write lock, so, no one else should have a permit"
+            );
+
            return Ok(ReadBufResult::NotFound(PageWriteGuard {
-                inner,
-                valid: false,
-            }));
-        }
-    }
-
-    /// Look up a page in the cache and lock it in write mode. If it's not
-    /// found, returns None.
-    ///
-    /// When locking a page for writing, the search criteria is always "exact".
-    async fn try_lock_for_write(&self, cache_key: &CacheKey) -> Option<PageWriteGuard> {
-        if let Some(slot_idx) = self.search_mapping_for_write(cache_key) {
-            // The page was found in the mapping. Lock the slot, and re-check
-            // that it's still what we expected (because we don't released the mapping
-            // lock already, another thread could have evicted the page)
-            let slot = &self.slots[slot_idx];
-            let inner = slot.inner.write().await;
-            if inner.key.as_ref() == Some(cache_key) {
-                slot.inc_usage_count();
-                return Some(PageWriteGuard { inner, valid: true });
-            }
-        }
-        None
-    }
-
-    /// Return a write-locked buffer for given block.
-    ///
-    /// Similar to lock_for_read(), but the returned buffer is write-locked and
-    /// may be modified by the caller even if it's already found in the cache.
-    async fn lock_for_write(&self, cache_key: &CacheKey) -> anyhow::Result<WriteBufResult> {
-        loop {
-            // First check if the key already exists in the cache.
-            if let Some(write_guard) = self.try_lock_for_write(cache_key).await {
-                return Ok(WriteBufResult::Found(write_guard));
-            }
-
-            // Not found. Find a victim buffer
-            let (slot_idx, mut inner) =
-                self.find_victim().context("Failed to find evict victim")?;
-
-            // Insert mapping for this. At this point, we may find that another
-            // thread did the same thing concurrently. In that case, we evicted
-            // our victim buffer unnecessarily. Put it into the free list and
-            // continue with the slot that the other thread chose.
-            if let Some(_existing_slot_idx) = self.try_insert_mapping(cache_key, slot_idx) {
-                // TODO: put to free list
-
-                // We now just loop back to start from beginning. This is not
-                // optimal, we'll perform the lookup in the mapping again, which
-                // is not really necessary because we already got
-                // 'existing_slot_idx'.  But this shouldn't happen often enough
-                // to matter much.
-                continue;
-            }
-
-            // Make the slot ready
-            let slot = &self.slots[slot_idx];
-            inner.key = Some(cache_key.clone());
-            slot.set_usage_count(1);
-
-            return Ok(WriteBufResult::NotFound(PageWriteGuard {
-                inner,
-                valid: false,
+                state: PageWriteGuardState::Invalid {
+                    _permit: permit.take().unwrap(),
+                    inner,
+                },
            }));
        }
    }
@@ -663,7 +753,7 @@ impl PageCache {
    ///
    /// Like 'search_mapping, but performs an "exact" search. Used for
    /// allocating a new buffer.
-    fn search_mapping_for_write(&self, key: &CacheKey) -> Option<usize> {
+    fn search_mapping_exact(&self, key: &CacheKey) -> Option<usize> {
        match key {
            CacheKey::MaterializedPage { hash_key, lsn } => {
                let map = self.materialized_page_map.read().unwrap();
@@ -769,7 +859,10 @@ impl PageCache {
    /// Find a slot to evict.
    ///
    /// On return, the slot is empty and write-locked.
-    fn find_victim(&self) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
+    async fn find_victim(
+        &self,
+        _permit_witness: &PinnedSlotsPermit,
+    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
@@ -782,13 +875,40 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        // If we have looped through the whole buffer pool 10 times
-                        // and still haven't found a victim buffer, something's wrong.
-                        // Maybe all the buffers were in locked. That could happen in
-                        // theory, if you have more threads holding buffers locked than
-                        // there are buffers in the pool. In practice, with a reasonably
-                        // large buffer pool it really shouldn't happen.
                        if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
@@ -799,6 +919,7 @@ impl PageCache {
                    self.remove_mapping(old_key);
                    inner.key = None;
                }
+                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
                return Ok((slot_idx, inner));
            }
        }
@@ -826,7 +947,11 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner { key: None, buf }),
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -838,6 +963,7 @@ impl PageCache {
            slots,
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -35,6 +35,7 @@ use std::time::Duration;
 use tokio::io::AsyncWriteExt;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio_util::io::StreamReader;
+use tokio_util::sync::CancellationToken;
 use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
@@ -64,69 +65,6 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-fn copyin_stream<IO>(pgb: &mut PostgresBackend<IO>) -> impl Stream<Item = io::Result<Bytes>> + '_
-where
-    IO: AsyncRead + AsyncWrite + Unpin,
-{
-    async_stream::try_stream! {
-        loop {
-            let msg = tokio::select! {
-                biased;
-
-                _ = task_mgr::shutdown_watcher() => {
-                    // We were requested to shut down.
-                    let msg = "pageserver is shutting down";
-                    let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
-                    Err(QueryError::Other(anyhow::anyhow!(msg)))
-                }
-
-                msg = pgb.read_message() => { msg.map_err(QueryError::from)}
-            };
-
-            match msg {
-                Ok(Some(message)) => {
-                    let copy_data_bytes = match message {
-                        FeMessage::CopyData(bytes) => bytes,
-                        FeMessage::CopyDone => { break },
-                        FeMessage::Sync => continue,
-                        FeMessage::Terminate => {
-                            let msg = "client terminated connection with Terminate message during COPY";
-                            let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                            // error can't happen here, ErrorResponse serialization should be always ok
-                            pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                            Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                            break;
-                        }
-                        m => {
-                            let msg = format!("unexpected message {m:?}");
-                            // error can't happen here, ErrorResponse serialization should be always ok
-                            pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
-                            Err(io::Error::new(io::ErrorKind::Other, msg))?;
-                            break;
-                        }
-                    };
-
-                    yield copy_data_bytes;
-                }
-                Ok(None) => {
-                    let msg = "client closed connection during COPY";
-                    let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
-                    // error can't happen here, ErrorResponse serialization should be always ok
-                    pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
-                    pgb.flush().await?;
-                    Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
-                }
-                Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
-                    Err(io_error)?;
-                }
-                Err(other) => {
-                    Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
-                }
-            };
-        }
-    }
-}
-
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -284,7 +222,13 @@ async fn page_service_conn_main(
    // and create a child per-query context when it invokes process_query.
    // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler
    // and create the per-query context in process_query ourselves.
-    let mut conn_handler = PageServerHandler::new(conf, broker_client, auth, connection_ctx);
+    let mut conn_handler = PageServerHandler::new(
+        conf,
+        broker_client,
+        auth,
+        connection_ctx,
+        task_mgr::shutdown_token(),
+    );
    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;

    match pgbackend
@@ -318,6 +262,10 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
+
+    /// A token that should fire when the tenant transitions from
+    /// attached state, or when the pageserver is shutting down.
+    cancel: CancellationToken,
 }

 impl PageServerHandler {
@@ -326,6 +274,7 @@ impl PageServerHandler {
        broker_client: storage_broker::BrokerClientChannel,
        auth: Option<Arc<JwtAuth>>,
        connection_ctx: RequestContext,
+        cancel: CancellationToken,
    ) -> Self {
        PageServerHandler {
            _conf: conf,
@@ -333,6 +282,91 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            cancel,
+        }
+    }
+
+    /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
+    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
+    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
+    /// in the flush.
+    async fn flush_cancellable<IO>(&self, pgb: &mut PostgresBackend<IO>) -> Result<(), QueryError>
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        tokio::select!(
+            flush_r = pgb.flush() => {
+                Ok(flush_r?)
+            },
+            _ = self.cancel.cancelled() => {
+                Err(QueryError::Other(anyhow::anyhow!("Shutting down")))
+            }
+        )
+    }
+
+    fn copyin_stream<'a, IO>(
+        &'a self,
+        pgb: &'a mut PostgresBackend<IO>,
+    ) -> impl Stream<Item = io::Result<Bytes>> + 'a
+    where
+        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin,
+    {
+        async_stream::try_stream! {
+            loop {
+                let msg = tokio::select! {
+                    biased;
+
+                    _ = task_mgr::shutdown_watcher() => {
+                        // We were requested to shut down.
+                        let msg = "pageserver is shutting down";
+                        let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None));
+                        Err(QueryError::Other(anyhow::anyhow!(msg)))
+                    }
+
+                    msg = pgb.read_message() => { msg.map_err(QueryError::from)}
+                };
+
+                match msg {
+                    Ok(Some(message)) => {
+                        let copy_data_bytes = match message {
+                            FeMessage::CopyData(bytes) => bytes,
+                            FeMessage::CopyDone => { break },
+                            FeMessage::Sync => continue,
+                            FeMessage::Terminate => {
+                                let msg = "client terminated connection with Terminate message during COPY";
+                                let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                                break;
+                            }
+                            m => {
+                                let msg = format!("unexpected message {m:?}");
+                                // error can't happen here, ErrorResponse serialization should be always ok
+                                pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?;
+                                Err(io::Error::new(io::ErrorKind::Other, msg))?;
+                                break;
+                            }
+                        };
+
+                        yield copy_data_bytes;
+                    }
+                    Ok(None) => {
+                        let msg = "client closed connection during COPY";
+                        let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg)));
+                        // error can't happen here, ErrorResponse serialization should be always ok
+                        pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?;
+                        self.flush_cancellable(pgb).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?;
+                        Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?;
+                    }
+                    Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => {
+                        Err(io_error)?;
+                    }
+                    Err(other) => {
+                        Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?;
+                    }
+                };
+            }
        }
    }

@@ -372,7 +406,7 @@ impl PageServerHandler {

        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);

@@ -412,38 +446,60 @@ impl PageServerHandler {
            // TODO: We could create a new per-request context here, with unique ID.
            // Currently we use the same per-timeline context for all requests

-            let response = match neon_fe_msg {
+            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
-                    self.handle_get_rel_exists_request(&timeline, &req, &ctx)
-                        .await
+                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_rel_exists_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
-                    self.handle_get_nblocks_request(&timeline, &req, &ctx).await
+                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_nblocks_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
                PagestreamFeMessage::GetPage(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
-                    self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
-                        .await
+                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
+                    (
+                        self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
                PagestreamFeMessage::DbSize(req) => {
                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
-                    self.handle_db_size_request(&timeline, &req, &ctx).await
+                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
+                    (
+                        self.handle_db_size_request(&timeline, &req, &ctx)
+                            .instrument(span.clone())
+                            .await,
+                        span,
+                    )
                }
            };

            let response = response.unwrap_or_else(|e| {
                // print the all details to the log with {:#}, but for the client the
                // error message is enough
-                error!("error reading relation or page version: {:?}", e);
+                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
                PagestreamBeMessage::Error(PagestreamErrorResponse {
                    message: e.to_string(),
                })
            });

            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            pgb.flush().await?;
+            self.flush_cancellable(pgb).await?;
        }
        Ok(())
    }
@@ -486,9 +542,9 @@ impl PageServerHandler {
        // Import basebackup provided via CopyData
        info!("importing basebackup");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

-        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        timeline
            .import_basebackup_from_tar(
                &mut copyin_reader,
@@ -541,8 +597,8 @@ impl PageServerHandler {
        // Import wal provided via CopyData
        info!("importing wal");
        pgb.write_message_noflush(&BeMessage::CopyInResponse)?;
-        pgb.flush().await?;
-        let mut copyin_reader = pin!(StreamReader::new(copyin_stream(pgb)));
+        self.flush_cancellable(pgb).await?;
+        let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb)));
        import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?;
        info!("wal import complete");

@@ -627,7 +683,6 @@ impl PageServerHandler {
        Ok(lsn)
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
    async fn handle_get_rel_exists_request(
        &self,
        timeline: &Timeline,
@@ -648,7 +703,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, req_lsn = %req.lsn))]
    async fn handle_get_nblocks_request(
        &self,
        timeline: &Timeline,
@@ -667,7 +721,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(dbnode = %req.dbnode, req_lsn = %req.lsn))]
    async fn handle_db_size_request(
        &self,
        timeline: &Timeline,
@@ -689,7 +742,6 @@ impl PageServerHandler {
        }))
    }

-    #[instrument(skip(self, timeline, req, ctx), fields(rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn))]
    async fn handle_get_page_at_lsn_request(
        &self,
        timeline: &Timeline,
@@ -754,7 +806,7 @@ impl PageServerHandler {

        // switch client to COPYOUT
        pgb.write_message_noflush(&BeMessage::CopyOutResponse)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        // Send a tarball of the latest layer on the timeline. Compress if not
        // fullbackup. TODO Compress in that case too (tests need to be updated)
@@ -806,7 +858,7 @@ impl PageServerHandler {
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)?;
-        pgb.flush().await?;
+        self.flush_cancellable(pgb).await?;

        let basebackup_after = started
            .elapsed()
@@ -1265,7 +1317,10 @@ async fn get_active_tenant_with_timeout(
        Ok(tenant) => tenant,
        Err(e @ GetTenantError::NotFound(_)) => return Err(GetActiveTenantError::NotFound(e)),
        Err(GetTenantError::NotActive(_)) => {
-            unreachable!("we're calling get_tenant with active=false")
+            unreachable!("we're calling get_tenant with active_only=false")
+        }
+        Err(GetTenantError::Broken(_)) => {
+            unreachable!("we're calling get_tenant with active_only=false")
        }
    };
    let wait_time = Duration::from_secs(30);
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -1,6 +1,6 @@
 //! Wrapper around nix::sys::statvfs::Statvfs that allows for mocking.

-use std::path::Path;
+use camino::Utf8Path;

 pub enum Statvfs {
    Real(nix::sys::statvfs::Statvfs),
@@ -12,11 +12,13 @@ pub enum Statvfs {
 // Sincce it should only be a problem on > 2TiB disks, let's ignore
 // the problem for now and upcast to u64.
 impl Statvfs {
-    pub fn get(tenants_dir: &Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
+    pub fn get(tenants_dir: &Utf8Path, mocked: Option<&mock::Behavior>) -> nix::Result<Self> {
        if let Some(mocked) = mocked {
            Ok(Statvfs::Mock(mock::get(tenants_dir, mocked)?))
        } else {
-            Ok(Statvfs::Real(nix::sys::statvfs::statvfs(tenants_dir)?))
+            Ok(Statvfs::Real(nix::sys::statvfs::statvfs(
+                tenants_dir.as_std_path(),
+            )?))
        }
    }

@@ -55,8 +57,8 @@ impl Statvfs {

 pub mod mock {
    use anyhow::Context;
+    use camino::Utf8Path;
    use regex::Regex;
-    use std::path::Path;
    use tracing::log::info;

    #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -86,7 +88,7 @@ pub mod mock {
        }
    }

-    pub fn get(tenants_dir: &Path, behavior: &Behavior) -> nix::Result<Statvfs> {
+    pub fn get(tenants_dir: &Utf8Path, behavior: &Behavior) -> nix::Result<Statvfs> {
        info!("running mocked statvfs");

        match behavior {
@@ -119,7 +121,7 @@ pub mod mock {
        }
    }

-    fn walk_dir_disk_usage(path: &Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
+    fn walk_dir_disk_usage(path: &Utf8Path, name_filter: Option<&Regex>) -> anyhow::Result<u64> {
        let mut total = 0;
        for entry in walkdir::WalkDir::new(path) {
            let entry = entry?;
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,6 +12,7 @@
 //!

 use anyhow::{bail, Context};
+use camino::{Utf8Path, Utf8PathBuf};
 use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
@@ -34,8 +35,6 @@ use std::fs;
 use std::fs::File;
 use std::io;
 use std::ops::Bound::Included;
-use std::path::Path;
-use std::path::PathBuf;
 use std::process::Command;
 use std::process::Stdio;
 use std::sync::atomic::AtomicU64;
@@ -45,6 +44,9 @@ use std::sync::MutexGuard;
 use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

+use self::config::AttachedLocationConfig;
+use self::config::AttachmentMode;
+use self::config::LocationConf;
 use self::config::TenantConf;
 use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
@@ -65,6 +67,7 @@ use crate::metrics::{remove_tenant_metrics, TENANT_STATE_METRIC, TENANT_SYNTHETI
 use crate::repository::GcResult;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
+use crate::tenant::config::LocationMode;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
@@ -161,6 +164,28 @@ pub struct TenantSharedResources {
    pub deletion_queue_client: DeletionQueueClient,
 }

+/// A [`Tenant`] object always represents an _attached_ tenant, so it only needs
+/// the attached-mode subset of [`LocationConf`]: the tenant config overrides
+/// (`tenant_conf`) and the [`AttachedLocationConfig`] (`location`).
+pub(super) struct AttachedTenantConf {
+    tenant_conf: TenantConfOpt,
+    location: AttachedLocationConfig,
+}
+
+impl AttachedTenantConf {
+    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
+        match &location_conf.mode { // borrow `mode`; `tenant_conf` is still read from `location_conf` below
+            LocationMode::Attached(attach_conf) => Ok(Self {
+                tenant_conf: location_conf.tenant_conf,
+                location: attach_conf.clone(),
+            }),
+            LocationMode::Secondary(_) => { // only attached mode can be converted; secondary mode is an error
+                anyhow::bail!("Attempted to construct AttachedTenantConf from a LocationConf in secondary mode")
+            }
+        }
+    }
+}
+
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -178,12 +203,15 @@ pub struct Tenant {
    // We keep TenantConfOpt sturct here to preserve the information
    // about parameters that are not set.
    // This is necessary to allow global config updates.
-    tenant_conf: Arc<RwLock<TenantConfOpt>>,
+    tenant_conf: Arc<RwLock<AttachedTenantConf>>,

    tenant_id: TenantId,

    /// The remote storage generation, used to protect S3 objects from split-brain.
    /// Does not change over the lifetime of the [`Tenant`] object.
+    ///
+    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
+    /// this copy enforces the invariant that the generation doesn't change during a Tenant's lifetime.
    generation: Generation,

    timelines: Mutex<HashMap<TimelineId, Arc<Timeline>>>,
@@ -379,6 +407,8 @@ pub enum CreateTimelineError {
    AlreadyExists,
    #[error(transparent)]
    AncestorLsn(anyhow::Error),
+    #[error("ancestor timeline is not active")]
+    AncestorNotActive,
    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }
@@ -527,14 +557,13 @@ impl Tenant {
    pub(crate) fn spawn_attach(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
-        generation: Generation,
        resources: TenantSharedResources,
+        attached_conf: AttachedTenantConf,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
        // TODO dedup with spawn_load
-        let tenant_conf =
-            Self::load_tenant_config(conf, &tenant_id).context("load tenant config")?;
+        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));

        let TenantSharedResources {
            broker_client,
@@ -542,14 +571,12 @@ impl Tenant {
            deletion_queue_client,
        } = resources;

-        let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
        let tenant = Arc::new(Tenant::new(
            TenantState::Attaching,
            conf,
-            tenant_conf,
+            attached_conf,
            wal_redo_manager,
            tenant_id,
-            generation,
            remote_storage.clone(),
            deletion_queue_client,
        ));
@@ -772,7 +799,7 @@ impl Tenant {
        }

        std::fs::remove_file(&marker_file)
-            .with_context(|| format!("unlink attach marker file {}", marker_file.display()))?;
+            .with_context(|| format!("unlink attach marker file {marker_file}"))?;
        crashsafe::fsync(marker_file.parent().expect("marker file has parent dir"))
            .context("fsync tenant directory after unlinking attach marker file")?;

@@ -860,10 +887,9 @@ impl Tenant {
                backtrace: String::new(),
            },
            conf,
-            TenantConfOpt::default(),
+            AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
            wal_redo_manager,
            tenant_id,
-            Generation::broken(),
            None,
            DeletionQueueClient::broken(),
        ))
@@ -882,7 +908,7 @@ impl Tenant {
    pub(crate) fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
-        generation: Generation,
+        attached_conf: AttachedTenantConf,
        resources: TenantSharedResources,
        init_order: Option<InitializationOrder>,
        tenants: &'static tokio::sync::RwLock<TenantsMap>,
@@ -890,14 +916,6 @@ impl Tenant {
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();

-        let tenant_conf = match Self::load_tenant_config(conf, &tenant_id) {
-            Ok(conf) => conf,
-            Err(e) => {
-                error!("load tenant config failed: {:?}", e);
-                return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
-            }
-        };
-
        let broker_client = resources.broker_client;
        let remote_storage = resources.remote_storage;

@@ -905,10 +923,9 @@ impl Tenant {
        let tenant = Tenant::new(
            TenantState::Loading,
            conf,
-            tenant_conf,
+            attached_conf,
            wal_redo_manager,
            tenant_id,
-            generation,
            remote_storage.clone(),
            resources.deletion_queue_client.clone(),
        );
@@ -1024,58 +1041,47 @@ impl Tenant {

        let timelines_dir = self.conf.timelines_path(&self.tenant_id);

-        for entry in
-            std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")?
+        for entry in timelines_dir
+            .read_dir_utf8()
+            .context("list timelines directory for tenant")?
        {
            let entry = entry.context("read timeline dir entry")?;
            let timeline_dir = entry.path();

-            if crate::is_temporary(&timeline_dir) {
-                info!(
-                    "Found temporary timeline directory, removing: {}",
-                    timeline_dir.display()
-                );
-                if let Err(e) = std::fs::remove_dir_all(&timeline_dir) {
-                    error!(
-                        "Failed to remove temporary directory '{}': {:?}",
-                        timeline_dir.display(),
-                        e
-                    );
+            if crate::is_temporary(timeline_dir) {
+                info!("Found temporary timeline directory, removing: {timeline_dir}");
+                if let Err(e) = std::fs::remove_dir_all(timeline_dir) {
+                    error!("Failed to remove temporary directory '{timeline_dir}': {e:?}");
                }
-            } else if is_uninit_mark(&timeline_dir) {
+            } else if is_uninit_mark(timeline_dir) {
                if !timeline_dir.exists() {
-                    warn!(
-                        "Timeline dir entry become invalid: {}",
-                        timeline_dir.display()
-                    );
+                    warn!("Timeline dir entry become invalid: {timeline_dir}");
                    continue;
                }

                let timeline_uninit_mark_file = &timeline_dir;
                info!(
-                    "Found an uninit mark file {}, removing the timeline and its uninit mark",
-                    timeline_uninit_mark_file.display()
+                    "Found an uninit mark file {timeline_uninit_mark_file}, removing the timeline and its uninit mark",
                );
-                let timeline_id = TimelineId::try_from(timeline_uninit_mark_file.file_stem())
-                    .with_context(|| {
-                        format!(
-                            "Could not parse timeline id out of the timeline uninit mark name {}",
-                            timeline_uninit_mark_file.display()
+                let timeline_id =
+                    TimelineId::try_from(timeline_uninit_mark_file.file_stem())
+                        .with_context(|| {
+                            format!(
+                            "Could not parse timeline id out of the timeline uninit mark name {timeline_uninit_mark_file}",
                        )
-                    })?;
+                        })?;
                let timeline_dir = self.conf.timeline_path(&self.tenant_id, &timeline_id);
                if let Err(e) =
                    remove_timeline_and_uninit_mark(&timeline_dir, timeline_uninit_mark_file)
                {
                    error!("Failed to clean up uninit marked timeline: {e:?}");
                }
-            } else if crate::is_delete_mark(&timeline_dir) {
+            } else if crate::is_delete_mark(timeline_dir) {
                // If metadata exists, load as usual, continue deletion
-                let timeline_id =
-                    TimelineId::try_from(timeline_dir.file_stem()).with_context(|| {
+                let timeline_id = TimelineId::try_from(timeline_dir.file_stem())
+                    .with_context(|| {
                        format!(
-                            "Could not parse timeline id out of the timeline uninit mark name {}",
-                            timeline_dir.display()
+                            "Could not parse timeline id out of the timeline uninit mark name {timeline_dir}",
                        )
                    })?;

@@ -1114,17 +1120,13 @@ impl Tenant {
                }
            } else {
                if !timeline_dir.exists() {
-                    warn!(
-                        "Timeline dir entry become invalid: {}",
-                        timeline_dir.display()
-                    );
+                    warn!("Timeline dir entry become invalid: {timeline_dir}");
                    continue;
                }
-                let timeline_id =
-                    TimelineId::try_from(timeline_dir.file_name()).with_context(|| {
+                let timeline_id = TimelineId::try_from(timeline_dir.file_name())
+                    .with_context(|| {
                        format!(
-                            "Could not parse timeline id out of the timeline dir name {}",
-                            timeline_dir.display()
+                            "Could not parse timeline id out of the timeline dir name {timeline_dir}",
                        )
                    })?;
                let timeline_uninit_mark_file = self
@@ -1136,7 +1138,7 @@ impl Tenant {
                        "Found an uninit mark file, removing the timeline and its uninit mark",
                    );
                    if let Err(e) =
-                        remove_timeline_and_uninit_mark(&timeline_dir, &timeline_uninit_mark_file)
+                        remove_timeline_and_uninit_mark(timeline_dir, &timeline_uninit_mark_file)
                    {
                        error!("Failed to clean up uninit marked timeline: {e:?}");
                    }
@@ -1152,18 +1154,13 @@ impl Tenant {
                }

                let file_name = entry.file_name();
-                if let Ok(timeline_id) =
-                    file_name.to_str().unwrap_or_default().parse::<TimelineId>()
-                {
+                if let Ok(timeline_id) = file_name.parse::<TimelineId>() {
                    let metadata = load_metadata(self.conf, &self.tenant_id, &timeline_id)
                        .context("failed to load metadata")?;
                    timelines_to_load.insert(timeline_id, metadata);
                } else {
                    // A file or directory that doesn't look like a timeline ID
-                    warn!(
-                        "unexpected file or directory in timelines directory: {}",
-                        file_name.to_string_lossy()
-                    );
+                    warn!("unexpected file or directory in timelines directory: {file_name}");
                }
            }
        }
@@ -1593,6 +1590,12 @@ impl Tenant {
                    .get_timeline(ancestor_timeline_id, false)
                    .context("Cannot branch off the timeline that's not present in pageserver")?;

+                // instead of waiting around, just deny the request because ancestor is not yet
+                // ready for other purposes either.
+                if !ancestor_timeline.is_active() {
+                    return Err(CreateTimelineError::AncestorNotActive);
+                }
+
                if let Some(lsn) = ancestor_start_lsn.as_mut() {
                    *lsn = lsn.align();

@@ -1625,8 +1628,6 @@ impl Tenant {
            }
        };

-        loaded_timeline.activate(broker_client, None, ctx);
-
        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
            // Ok, the timeline is durable in remote storage.
@@ -1638,6 +1639,8 @@ impl Tenant {
            })?;
        }

+        loaded_timeline.activate(broker_client, None, ctx);
+
        Ok(loaded_timeline)
    }

@@ -1667,6 +1670,15 @@ impl Tenant {
            "Cannot run GC iteration on inactive tenant"
        );

+        {
+            let conf = self.tenant_conf.read().unwrap();
+
+            if !conf.location.may_delete_layers_hint() {
+                info!("Skipping GC in location state {:?}", conf.location);
+                return Ok(GcResult::default());
+            }
+        }
+
        self.gc_iteration_internal(target_timeline_id, horizon, pitr, ctx)
            .await
    }
@@ -1685,6 +1697,14 @@ impl Tenant {
            "Cannot run compaction iteration on inactive tenant"
        );

+        {
+            let conf = self.tenant_conf.read().unwrap();
+            if !conf.location.may_delete_layers_hint() || !conf.location.may_upload_layers_hint() {
+                info!("Skipping compaction in location state {:?}", conf.location);
+                return Ok(());
+            }
+        }
+
        // Scan through the hashmap and collect a list of all the timelines,
        // while holding the lock. Then drop the lock and actually perform the
        // compactions.  We don't want to block everything else while the
@@ -2057,6 +2077,15 @@ impl Tenant {
            }
        }
    }
+
+    pub(crate) fn get_attach_mode(&self) -> AttachmentMode {
+        self.tenant_conf
+            .read()
+            .unwrap() // panics if the RwLock was poisoned by a panicking writer
+            .location
+            .attach_mode
+            .clone() // hand back an owned copy; the read guard is dropped when this expression ends
+    }
 }

 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
@@ -2110,7 +2139,7 @@ where

 impl Tenant {
    pub fn tenant_specific_overrides(&self) -> TenantConfOpt {
-        *self.tenant_conf.read().unwrap()
+        self.tenant_conf.read().unwrap().tenant_conf
    }

    pub fn effective_config(&self) -> TenantConf {
@@ -2119,84 +2148,95 @@ impl Tenant {
    }

    pub fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    pub fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    pub fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    pub fn get_compaction_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .compaction_period
            .unwrap_or(self.conf.default_tenant_conf.compaction_period)
    }

    pub fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    pub fn get_gc_horizon(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .gc_horizon
            .unwrap_or(self.conf.default_tenant_conf.gc_horizon)
    }

    pub fn get_gc_period(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .gc_period
            .unwrap_or(self.conf.default_tenant_conf.gc_period)
    }

    pub fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    pub fn get_pitr_interval(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .pitr_interval
            .unwrap_or(self.conf.default_tenant_conf.pitr_interval)
    }

    pub fn get_trace_read_requests(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .trace_read_requests
            .unwrap_or(self.conf.default_tenant_conf.trace_read_requests)
    }

    pub fn get_min_resident_size_override(&self) -> Option<u64> {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .min_resident_size_override
            .or(self.conf.default_tenant_conf.min_resident_size_override)
    }

    pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) {
-        *self.tenant_conf.write().unwrap() = new_tenant_conf;
+        self.tenant_conf.write().unwrap().tenant_conf = new_tenant_conf;
+        // Don't hold self.timelines.lock() during the notifies.
+        // There's no risk of deadlock right now, but there could be if we consolidate
+        // mutexes in struct Timeline in the future.
+        let timelines = self.list_timelines();
+        for timeline in timelines {
+            timeline.tenant_conf_updated();
+        }
+    }
+
+    pub(crate) fn set_new_location_config(&self, new_conf: AttachedTenantConf) {
+        *self.tenant_conf.write().unwrap() = new_conf;
        // Don't hold self.timelines.lock() during the notifies.
        // There's no risk of deadlock right now, but there could be if we consolidate
        // mutexes in struct Timeline in the future.
@@ -2266,10 +2306,9 @@ impl Tenant {
    fn new(
        state: TenantState,
        conf: &'static PageServerConf,
-        tenant_conf: TenantConfOpt,
+        attached_conf: AttachedTenantConf,
        walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
        tenant_id: TenantId,
-        generation: Generation,
        remote_storage: Option<GenericRemoteStorage>,
        deletion_queue_client: DeletionQueueClient,
    ) -> Tenant {
@@ -2329,12 +2368,12 @@ impl Tenant {

        Tenant {
            tenant_id,
-            generation,
+            generation: attached_conf.location.generation,
            conf,
            // using now here is good enough approximation to catch tenants with really long
            // activation times.
            loading_started_at: Instant::now(),
-            tenant_conf: Arc::new(RwLock::new(tenant_conf)),
+            tenant_conf: Arc::new(RwLock::new(attached_conf)),
            timelines: Mutex::new(HashMap::new()),
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
@@ -2352,54 +2391,124 @@ impl Tenant {
    pub(super) fn load_tenant_config(
        conf: &'static PageServerConf,
        tenant_id: &TenantId,
-    ) -> anyhow::Result<TenantConfOpt> {
-        let target_config_path = conf.tenant_config_path(tenant_id);
-        let target_config_display = target_config_path.display();
+    ) -> anyhow::Result<LocationConf> {
+        let legacy_config_path = conf.tenant_config_path(tenant_id);
+        let config_path = conf.tenant_location_config_path(tenant_id);

-        info!("loading tenantconf from {target_config_display}");
+        if config_path.exists() {
+            // New-style config takes precedence
+            let deserialized = Self::read_config(&config_path)?;
+            Ok(toml_edit::de::from_document::<LocationConf>(deserialized)?)
+        } else if legacy_config_path.exists() {
+            // Upgrade path: found an old-style configuration only
+            let deserialized = Self::read_config(&legacy_config_path)?;

-        // FIXME If the config file is not found, assume that we're attaching
-        // a detached tenant and config is passed via attach command.
-        // https://github.com/neondatabase/neon/issues/1555
-        // OR: we're loading after incomplete deletion that managed to remove config.
-        if !target_config_path.exists() {
-            info!("tenant config not found in {target_config_display}");
-            return Ok(TenantConfOpt::default());
+            let mut tenant_conf = TenantConfOpt::default();
+            for (key, item) in deserialized.iter() {
+                match key {
+                    "tenant_config" => {
+                        tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| {
+                            format!("Failed to parse config from file '{legacy_config_path}' as pageserver config")
+                        })?;
+                    }
+                    _ => bail!(
+                        "config file {legacy_config_path} has unrecognized pageserver option '{key}'"
+                    ),
+                }
+            }
+
+            // Legacy configs are implicitly in attached state
+            Ok(LocationConf::attached_single(
+                tenant_conf,
+                Generation::none(),
+            ))
+        } else {
+            // FIXME If the config file is not found, assume that we're attaching
+            // a detached tenant and config is passed via attach command.
+            // https://github.com/neondatabase/neon/issues/1555
+            // OR: we're loading after incomplete deletion that managed to remove config.
+            info!(
+                "tenant config not found in {} or {}",
+                config_path, legacy_config_path
+            );
+            Ok(LocationConf::default())
        }
+    }
+
+    fn read_config(path: &Utf8Path) -> anyhow::Result<toml_edit::Document> {
+        info!("loading tenant configuration from {path}");

        // load and parse file
-        let config = fs::read_to_string(&target_config_path).with_context(|| {
-            format!("Failed to load config from path '{target_config_display}'")
-        })?;
+        let config = fs::read_to_string(path)
+            .with_context(|| format!("Failed to load config from path '{path}'"))?;

-        let toml = config.parse::<toml_edit::Document>().with_context(|| {
-            format!("Failed to parse config from file '{target_config_display}' as toml file")
-        })?;
-
-        let mut tenant_conf = TenantConfOpt::default();
-        for (key, item) in toml.iter() {
-            match key {
-                "tenant_config" => {
-                    tenant_conf = PageServerConf::parse_toml_tenant_conf(item).with_context(|| {
-                        format!("Failed to parse config from file '{target_config_display}' as pageserver config")
-                    })?;
-                }
-                _ => bail!("config file {target_config_display} has unrecognized pageserver option '{key}'"),
-
-            }
-        }
-
-        Ok(tenant_conf)
+        config
+            .parse::<toml_edit::Document>()
+            .with_context(|| format!("Failed to parse config from file '{path}' as toml file"))
    }

    #[tracing::instrument(skip_all, fields(%tenant_id))]
    pub(super) async fn persist_tenant_config(
+        conf: &'static PageServerConf,
        tenant_id: &TenantId,
-        target_config_path: &Path,
-        tenant_conf: TenantConfOpt,
+        location_conf: &LocationConf,
    ) -> anyhow::Result<()> {
-        // imitate a try-block with a closure
-        info!("persisting tenantconf to {}", target_config_path.display());
+        let legacy_config_path = conf.tenant_config_path(tenant_id);
+        let config_path = conf.tenant_location_config_path(tenant_id);
+        Self::persist_tenant_config_at(tenant_id, &config_path, &legacy_config_path, location_conf)
+            .await
+    }
+
+    /// Persist `location_conf` for `tenant_id` at explicitly supplied paths.
+    ///
+    /// The legacy-format file at `legacy_config_path` is always written first, from
+    /// `location_conf.tenant_conf` only.  The new-style file at `config_path` holds the
+    /// whole serialized `LocationConf`; it is skipped when the location is attached
+    /// with a generation for which `is_none()` is true.  A `Secondary` location never
+    /// takes that early return, so it always gets the new-style file.
+    ///
+    /// Returns an error if the legacy write, the TOML serialization, or the overwrite
+    /// of `config_path` fails.
+    #[tracing::instrument(skip_all, fields(%tenant_id))]
+    pub(super) async fn persist_tenant_config_at(
+        tenant_id: &TenantId,
+        config_path: &Utf8Path,
+        legacy_config_path: &Utf8Path,
+        location_conf: &LocationConf,
+    ) -> anyhow::Result<()> {
+        // Forward compat: write out an old-style configuration that old versions can read, in case we roll back
+        Self::persist_tenant_config_legacy(
+            tenant_id,
+            legacy_config_path,
+            &location_conf.tenant_conf,
+        )
+        .await?;
+
+        if let LocationMode::Attached(attach_conf) = &location_conf.mode {
+            // Once we use LocationMode, generations are mandatory.  If we aren't using generations,
+            // then drop out after writing legacy-style config.
+            if attach_conf.generation.is_none() {
+                tracing::debug!("Running without generations, not writing new-style LocationConf");
+                return Ok(());
+            }
+        }
+
+        info!("persisting tenantconf to {config_path}");
+
+        // Fixed comment header, followed by the serialized configuration.
+        let mut conf_content = r#"# This file contains a specific per-tenant's config.
+#  It is read in case of pageserver restart.
+"#
+        .to_string();
+
+        // Convert the config to a toml file.
+        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;
+
+        let conf_content = conf_content.as_bytes();
+
+        // The content goes through a sibling temp file named with TEMP_FILE_SUFFIX.
+        // NOTE(review): presumably crashsafe_overwrite replaces `config_path` with it
+        // atomically -- confirm against VirtualFile::crashsafe_overwrite.
+        let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX);
+        VirtualFile::crashsafe_overwrite(config_path, &temp_path, conf_content)
+            .await
+            .with_context(|| format!("write tenant {tenant_id} config to {config_path}"))?;
+        Ok(())
+    }
+
+    #[tracing::instrument(skip_all, fields(%tenant_id))]
+    async fn persist_tenant_config_legacy(
+        tenant_id: &TenantId,
+        target_config_path: &Utf8Path,
+        tenant_conf: &TenantConfOpt,
+    ) -> anyhow::Result<()> {
+        info!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2416,12 +2525,7 @@ impl Tenant {
        let temp_path = path_with_suffix_extension(target_config_path, TEMP_FILE_SUFFIX);
        VirtualFile::crashsafe_overwrite(target_config_path, &temp_path, conf_content)
            .await
-            .with_context(|| {
-                format!(
-                    "write tenant {tenant_id} config to {}",
-                    target_config_path.display()
-                )
-            })?;
+            .with_context(|| format!("write tenant {tenant_id} config to {target_config_path}"))?;
        Ok(())
    }

@@ -2651,6 +2755,11 @@ impl Tenant {
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
        let src_id = src_timeline.timeline_id;

+        // First acquire the GC lock so that another task cannot advance the GC
+        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
+        // creating the branch.
+        let _gc_cs = self.gc_cs.lock().await;
+
        // If no start LSN is specified, we branch the new timeline from the source timeline's last record LSN
        let start_lsn = start_lsn.unwrap_or_else(|| {
            let lsn = src_timeline.get_last_record_lsn();
@@ -2658,11 +2767,6 @@ impl Tenant {
            lsn
        });

-        // First acquire the GC lock so that another task cannot advance the GC
-        // cutoff in 'gc_info', and make 'start_lsn' invalid, while we are
-        // creating the branch.
-        let _gc_cs = self.gc_cs.lock().await;
-
        // Create a placeholder for the new branch. This will error
        // out if the new timeline ID is already in use.
        let timeline_uninit_mark = {
@@ -2788,10 +2892,7 @@ impl Tenant {
        // current initdb was not run yet, so remove whatever was left from the previous runs
        if initdb_path.exists() {
            fs::remove_dir_all(&initdb_path).with_context(|| {
-                format!(
-                    "Failed to remove already existing initdb directory: {}",
-                    initdb_path.display()
-                )
+                format!("Failed to remove already existing initdb directory: {initdb_path}")
            })?;
        }
        // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
@@ -2800,7 +2901,7 @@ impl Tenant {
        scopeguard::defer! {
            if let Err(e) = fs::remove_dir_all(&initdb_path) {
                // this is unlikely, but we will remove the directory on pageserver restart or another bootstrap call
-                error!("Failed to remove temporary initdb directory '{}': {}", initdb_path.display(), e);
+                error!("Failed to remove temporary initdb directory '{initdb_path}': {e}");
            }
        }
        let pgdata_path = &initdb_path;
@@ -2950,7 +3051,7 @@ impl Tenant {

    async fn create_timeline_files(
        &self,
-        timeline_path: &Path,
+        timeline_path: &Utf8Path,
        new_timeline_id: &TimelineId,
        new_metadata: &TimelineMetadata,
    ) -> anyhow::Result<()> {
@@ -2984,8 +3085,7 @@ impl Tenant {
        let timeline_path = self.conf.timeline_path(&tenant_id, &timeline_id);
        anyhow::ensure!(
            !timeline_path.exists(),
-            "Timeline {} already exists, cannot create its uninit mark file",
-            timeline_path.display()
+            "Timeline {timeline_path} already exists, cannot create its uninit mark file",
        );

        let uninit_mark_path = self
@@ -3077,7 +3177,10 @@ impl Tenant {
    }
 }

-fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> anyhow::Result<()> {
+fn remove_timeline_and_uninit_mark(
+    timeline_dir: &Utf8Path,
+    uninit_mark: &Utf8Path,
+) -> anyhow::Result<()> {
    fs::remove_dir_all(timeline_dir)
        .or_else(|e| {
            if e.kind() == std::io::ErrorKind::NotFound {
@@ -3089,17 +3192,10 @@ fn remove_timeline_and_uninit_mark(timeline_dir: &Path, uninit_mark: &Path) -> a
            }
        })
        .with_context(|| {
-            format!(
-                "Failed to remove unit marked timeline directory {}",
-                timeline_dir.display()
-            )
+            format!("Failed to remove unit marked timeline directory {timeline_dir}")
        })?;
-    fs::remove_file(uninit_mark).with_context(|| {
-        format!(
-            "Failed to remove timeline uninit mark file {}",
-            uninit_mark.display()
-        )
-    })?;
+    fs::remove_file(uninit_mark)
+        .with_context(|| format!("Failed to remove timeline uninit mark file {uninit_mark}"))?;

    Ok(())
 }
@@ -3111,10 +3207,10 @@ pub(crate) enum CreateTenantFilesMode {

 pub(crate) async fn create_tenant_files(
    conf: &'static PageServerConf,
-    tenant_conf: TenantConfOpt,
+    location_conf: &LocationConf,
    tenant_id: &TenantId,
    mode: CreateTenantFilesMode,
-) -> anyhow::Result<PathBuf> {
+) -> anyhow::Result<Utf8PathBuf> {
    let target_tenant_directory = conf.tenant_path(tenant_id);
    anyhow::ensure!(
        !target_tenant_directory
@@ -3125,22 +3221,16 @@ pub(crate) async fn create_tenant_files(

    let temporary_tenant_dir =
        path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX);
-    debug!(
-        "Creating temporary directory structure in {}",
-        temporary_tenant_dir.display()
-    );
+    debug!("Creating temporary directory structure in {temporary_tenant_dir}");

    // top-level dir may exist if we are creating it through CLI
    crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
-        format!(
-            "could not create temporary tenant directory {}",
-            temporary_tenant_dir.display()
-        )
+        format!("could not create temporary tenant directory {temporary_tenant_dir}")
    })?;

    let creation_result = try_create_target_tenant_dir(
        conf,
-        tenant_conf,
+        location_conf,
        tenant_id,
        mode,
        &temporary_tenant_dir,
@@ -3166,11 +3256,11 @@ pub(crate) async fn create_tenant_files(

 async fn try_create_target_tenant_dir(
    conf: &'static PageServerConf,
-    tenant_conf: TenantConfOpt,
+    location_conf: &LocationConf,
    tenant_id: &TenantId,
    mode: CreateTenantFilesMode,
-    temporary_tenant_dir: &Path,
-    target_tenant_directory: &Path,
+    temporary_tenant_dir: &Utf8Path,
+    target_tenant_directory: &Utf8Path,
 ) -> Result<(), anyhow::Error> {
    match mode {
        CreateTenantFilesMode::Create => {} // needs no attach marker, writing tenant conf + atomic rename of dir is good enough
@@ -3196,20 +3286,31 @@ async fn try_create_target_tenant_dir(
        temporary_tenant_dir,
    )
    .with_context(|| format!("resolve tenant {tenant_id} temporary timelines dir"))?;
-    let temporary_tenant_config_path = rebase_directory(
+    let temporary_legacy_tenant_config_path = rebase_directory(
        &conf.tenant_config_path(tenant_id),
        target_tenant_directory,
        temporary_tenant_dir,
    )
    .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;
+    let temporary_tenant_config_path = rebase_directory(
+        &conf.tenant_location_config_path(tenant_id),
+        target_tenant_directory,
+        temporary_tenant_dir,
+    )
+    .with_context(|| format!("resolve tenant {tenant_id} temporary config path"))?;

-    Tenant::persist_tenant_config(tenant_id, &temporary_tenant_config_path, tenant_conf).await?;
+    Tenant::persist_tenant_config_at(
+        tenant_id,
+        &temporary_tenant_config_path,
+        &temporary_legacy_tenant_config_path,
+        location_conf,
+    )
+    .await?;

    crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
        format!(
            "create tenant {} temporary timelines directory {}",
-            tenant_id,
-            temporary_tenant_timelines_dir.display()
+            tenant_id, temporary_tenant_timelines_dir,
        )
    })?;
    fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
@@ -3224,35 +3325,34 @@ async fn try_create_target_tenant_dir(
    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
        format!(
            "move tenant {} temporary directory {} into the permanent one {}",
-            tenant_id,
-            temporary_tenant_dir.display(),
-            target_tenant_directory.display()
+            tenant_id, temporary_tenant_dir, target_tenant_directory
        )
    })?;
    let target_dir_parent = target_tenant_directory.parent().with_context(|| {
        format!(
            "get tenant {} dir parent for {}",
-            tenant_id,
-            target_tenant_directory.display()
+            tenant_id, target_tenant_directory,
        )
    })?;
    crashsafe::fsync(target_dir_parent).with_context(|| {
        format!(
            "fsync renamed directory's parent {} for tenant {}",
-            target_dir_parent.display(),
-            tenant_id,
+            target_dir_parent, tenant_id,
        )
    })?;

    Ok(())
 }

-fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyhow::Result<PathBuf> {
+fn rebase_directory(
+    original_path: &Utf8Path,
+    base: &Utf8Path,
+    new_base: &Utf8Path,
+) -> anyhow::Result<Utf8PathBuf> {
    let relative_path = original_path.strip_prefix(base).with_context(|| {
        format!(
            "Failed to strip base prefix '{}' off path '{}'",
-            base.display(),
-            original_path.display()
+            base, original_path
        )
    })?;
    Ok(new_base.join(relative_path))
@@ -3262,20 +3362,18 @@ fn rebase_directory(original_path: &Path, base: &Path, new_base: &Path) -> anyho
 /// to get bootstrap data for timeline initialization.
 fn run_initdb(
    conf: &'static PageServerConf,
-    initdb_target_dir: &Path,
+    initdb_target_dir: &Utf8Path,
    pg_version: u32,
 ) -> anyhow::Result<()> {
    let initdb_bin_path = conf.pg_bin_dir(pg_version)?.join("initdb");
    let initdb_lib_dir = conf.pg_lib_dir(pg_version)?;
    info!(
        "running {} in {}, libdir: {}",
-        initdb_bin_path.display(),
-        initdb_target_dir.display(),
-        initdb_lib_dir.display(),
+        initdb_bin_path, initdb_target_dir, initdb_lib_dir,
    );

    let initdb_output = Command::new(&initdb_bin_path)
-        .args(["-D", &initdb_target_dir.to_string_lossy()])
+        .args(["-D", initdb_target_dir.as_ref()])
        .args(["-U", &conf.superuser])
        .args(["-E", "utf8"])
        .arg("--no-instructions")
@@ -3290,8 +3388,7 @@ fn run_initdb(
        .with_context(|| {
            format!(
                "failed to execute {} at target dir {}",
-                initdb_bin_path.display(),
-                initdb_target_dir.display()
+                initdb_bin_path, initdb_target_dir,
            )
        })?;
    if !initdb_output.status.success() {
@@ -3311,7 +3408,7 @@ impl Drop for Tenant {
 }
 /// Dump contents of a layer file to stdout.
 pub async fn dump_layerfile_from_path(
-    path: &Path,
+    path: &Utf8Path,
    verbose: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<()> {
@@ -3344,8 +3441,8 @@ pub async fn dump_layerfile_from_path(
 pub mod harness {
    use bytes::{Bytes, BytesMut};
    use once_cell::sync::OnceCell;
+    use std::fs;
    use std::sync::Arc;
-    use std::{fs, path::PathBuf};
    use utils::logging;
    use utils::lsn::Lsn;

@@ -3410,7 +3507,7 @@ pub mod harness {
        pub tenant_id: TenantId,
        pub generation: Generation,
        pub remote_storage: GenericRemoteStorage,
-        pub remote_fs_dir: PathBuf,
+        pub remote_fs_dir: Utf8PathBuf,
        pub deletion_queue: MockDeletionQueue,
    }

@@ -3489,10 +3586,13 @@ pub mod harness {
            let tenant = Arc::new(Tenant::new(
                TenantState::Loading,
                self.conf,
-                TenantConfOpt::from(self.tenant_conf),
+                AttachedTenantConf::try_from(LocationConf::attached_single(
+                    TenantConfOpt::from(self.tenant_conf),
+                    self.generation,
+                ))
+                .unwrap(),
                walredo_mgr,
                self.tenant_id,
-                self.generation,
                Some(self.remote_storage.clone()),
                self.deletion_queue.new_client(),
            ));
@@ -3509,7 +3609,7 @@ pub mod harness {
            Ok(tenant)
        }

-        pub fn timeline_path(&self, timeline_id: &TimelineId) -> PathBuf {
+        pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
            self.conf.timeline_path(&self.tenant_id, timeline_id)
        }
    }
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -238,14 +238,14 @@ mod tests {
    use rand::{Rng, SeedableRng};

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
-        let temp_dir = tempfile::tempdir()?;
-        let path = temp_dir.path().join("file");
+        let temp_dir = camino_tempfile::tempdir()?;
+        let pathbuf = temp_dir.path().join("file");
        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
-            let file = VirtualFile::create(&path).await?;
+            let file = VirtualFile::create(pathbuf.as_path()).await?;
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0);
            for blob in blobs.iter() {
                let offs = wtr.write_blob(blob).await?;
@@ -258,7 +258,7 @@ mod tests {
            wtr.flush_buffer().await?;
        }

-        let file = VirtualFile::open(&path).await?;
+        let file = VirtualFile::open(pathbuf.as_path()).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new(rdr);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -186,26 +186,21 @@ impl FileBlockReader {
        ctx: &RequestContext,
    ) -> Result<BlockLease, std::io::Error> {
        let cache = page_cache::get();
-        loop {
-            match cache
-                .read_immutable_buf(self.file_id, blknum, ctx)
-                .await
-                .map_err(|e| {
-                    std::io::Error::new(
-                        std::io::ErrorKind::Other,
-                        format!("Failed to read immutable buf: {e:#}"),
-                    )
-                })? {
-                ReadBufResult::Found(guard) => break Ok(guard.into()),
-                ReadBufResult::NotFound(mut write_guard) => {
-                    // Read the page from disk into the buffer
-                    self.fill_buffer(write_guard.deref_mut(), blknum).await?;
-                    write_guard.mark_valid();
-
-                    // Swap for read lock
-                    continue;
-                }
-            };
+        match cache
+            .read_immutable_buf(self.file_id, blknum, ctx)
+            .await
+            .map_err(|e| {
+                std::io::Error::new(
+                    std::io::ErrorKind::Other,
+                    format!("Failed to read immutable buf: {e:#}"),
+                )
+            })? {
+            ReadBufResult::Found(guard) => Ok(guard.into()),
+            ReadBufResult::NotFound(mut write_guard) => {
+                // Read the page from disk into the buffer
+                self.fill_buffer(write_guard.deref_mut(), blknum).await?;
+                Ok(write_guard.mark_valid().into())
+            }
        }
    }
 }
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -13,6 +13,7 @@ use pageserver_api::models;
 use serde::{Deserialize, Serialize};
 use std::num::NonZeroU64;
 use std::time::Duration;
+use utils::generation::Generation;

 pub mod defaults {
    // FIXME: This current value is very low. I would imagine something like 1 GB or 10 GB
@@ -44,7 +45,211 @@ pub mod defaults {
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
 }

-/// Per-tenant configuration options
+/// Advisory operating mode of an attached location.  It is consulted by
+/// `AttachedLocationConfig::may_delete_layers_hint` and
+/// `AttachedLocationConfig::may_upload_layers_hint`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) enum AttachmentMode {
+    /// Our generation is current as far as we know, and as far as we know we are the only attached
+    /// pageserver.  This is the "normal" attachment mode.
+    Single,
+    /// Our generation number is current as far as we know, but we are advised that another
+    /// pageserver is still attached, and therefore to avoid executing deletions.   This is
+    /// the attachment mode of a pageserver that is the destination of a migration.
+    Multi,
+    /// Our generation number is superseded, or about to be superseded.  We are advised
+    /// to avoid remote storage writes if possible, and to avoid sending billing data.  This
+    /// is the attachment mode of a pageserver that is the origin of a migration.
+    Stale,
+}
+
+/// Location-specific settings carried by `LocationMode::Attached`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) struct AttachedLocationConfig {
+    /// Generation this attachment operates in.
+    pub(crate) generation: Generation,
+    /// Advisory mode that drives the upload/deletion hints on this type.
+    pub(crate) attach_mode: AttachmentMode,
+    // TODO: add a flag to override AttachmentMode's policies under
+    // disk pressure (i.e. unblock uploads under disk pressure in Stale
+    // state, unblock deletions after timeout in Multi state)
+}
+
+/// Location-specific settings carried by `LocationMode::Secondary`.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) struct SecondaryLocationConfig {
+    /// If true, keep the local cache warm by polling remote storage
+    pub(crate) warm: bool,
+}
+
+/// How this pageserver holds a tenant, together with the settings that
+/// apply to that mode.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+pub(crate) enum LocationMode {
+    /// The tenant is attached here; carries the generation and attachment mode.
+    Attached(AttachedLocationConfig),
+    /// The tenant is held as a secondary location; carries the `warm` flag.
+    Secondary(SecondaryLocationConfig),
+}
+
+/// Per-tenant, per-pageserver configuration.  All pageservers use the same TenantConf,
+/// but have distinct LocationConf.
+///
+/// `Debug` is implemented by hand for this type and prints only `mode`;
+/// `tenant_conf` is left out of the debug output.
+#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub(crate) struct LocationConf {
+    /// The location-specific part of the configuration, describes the operating
+    /// mode of this pageserver for this tenant.
+    pub(crate) mode: LocationMode,
+    /// The pan-cluster tenant configuration, the same on all locations
+    pub(crate) tenant_conf: TenantConfOpt,
+}
+
+/// Compact debug rendering: only the location mode is shown, the tenant
+/// configuration is deliberately not printed.
+impl std::fmt::Debug for LocationConf {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let LocationConf { mode, .. } = self;
+        match mode {
+            LocationMode::Attached(AttachedLocationConfig {
+                attach_mode,
+                generation,
+                ..
+            }) => write!(f, "Attached {:?}, gen={:?}", attach_mode, generation),
+            LocationMode::Secondary(SecondaryLocationConfig { warm, .. }) => {
+                write!(f, "Secondary, warm={}", warm)
+            }
+        }
+    }
+}
+
+impl AttachedLocationConfig {
+    /// Advisory check: does the current attachment mode permit deleting layers?
+    /// Data safety does not depend on the answer.
+    /// See [`AttachmentMode`] for what each mode means.
+    pub(crate) fn may_delete_layers_hint(&self) -> bool {
+        // TODO: add an override for disk pressure in AttachedLocationConfig,
+        // and respect it here.
+        match self.attach_mode {
+            // Some other attached pageserver might get 404 while trying to read
+            // a layer we delete which is still referenced in their metadata.
+            AttachmentMode::Multi => false,
+            // Deletions would be expected to fail validation in the deletion
+            // queue anyway, because our generation is stale.
+            AttachmentMode::Stale => false,
+            AttachmentMode::Single => true,
+        }
+    }
+
+    /// Advisory check: is it currently considered worthwhile to upload layers?
+    /// Data safety does not depend on the answer.
+    /// See [`AttachmentMode`] for what each mode means.
+    pub(crate) fn may_upload_layers_hint(&self) -> bool {
+        // TODO: add an override for disk pressure in AttachedLocationConfig,
+        // and respect it here.
+        match self.attach_mode {
+            // Our replacement pageserver is expected to have started its own
+            // IndexPart already, which will never reference layers we upload,
+            // so uploading would be wasteful.
+            AttachmentMode::Stale => false,
+            AttachmentMode::Single => true,
+            AttachmentMode::Multi => true,
+        }
+    }
+}
+
+impl LocationConf {
+    /// For use when loading from a legacy configuration: presence of a tenant
+    /// implies it is in AttachmentMode::Single, which used to be the only
+    /// possible state.  This function should eventually be removed.
+    pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
+        Self {
+            mode: LocationMode::Attached(AttachedLocationConfig {
+                generation,
+                attach_mode: AttachmentMode::Single,
+            }),
+            tenant_conf,
+        }
+    }
+
+    /// For use when attaching/re-attaching: update the generation stored in this
+    /// structure.  If we were in a secondary state, promote to attached (possession
+    /// of a fresh generation implies this).
+    ///
+    /// An already attached location keeps its current `attach_mode`; a secondary
+    /// location becomes `AttachmentMode::Single`.
+    pub(crate) fn attach_in_generation(&mut self, generation: Generation) {
+        match &mut self.mode {
+            LocationMode::Attached(attach_conf) => {
+                attach_conf.generation = generation;
+            }
+            LocationMode::Secondary(_) => {
+                // We are promoted to attached by the control plane's re-attach response
+                self.mode = LocationMode::Attached(AttachedLocationConfig {
+                    generation,
+                    attach_mode: AttachmentMode::Single,
+                })
+            }
+        }
+    }
+
+    /// Decode an API-level `models::LocationConfig` into a `LocationConf`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when:
+    /// - the embedded tenant configuration cannot be converted,
+    /// - an attached mode is requested without a generation,
+    /// - a secondary mode is requested together with a generation,
+    /// - the mode is `Detached`, which has no `LocationConf` representation.
+    pub(crate) fn try_from(conf: &'_ models::LocationConfig) -> anyhow::Result<Self> {
+        let tenant_conf = TenantConfOpt::try_from(&conf.tenant_conf)?;
+
+        // All attached modes are built the same way and all require a generation;
+        // only the attachment mode differs.
+        let attached = |attach_mode: AttachmentMode| -> anyhow::Result<LocationMode> {
+            let generation = conf
+                .generation
+                .ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))?;
+            Ok(LocationMode::Attached(AttachedLocationConfig {
+                generation,
+                attach_mode,
+            }))
+        };
+
+        let mode = match &conf.mode {
+            models::LocationConfigMode::AttachedMulti => attached(AttachmentMode::Multi)?,
+            models::LocationConfigMode::AttachedSingle => attached(AttachmentMode::Single)?,
+            models::LocationConfigMode::AttachedStale => attached(AttachmentMode::Stale)?,
+            models::LocationConfigMode::Secondary => {
+                // Give the caller an actionable message rather than a bare
+                // stringified condition.
+                anyhow::ensure!(
+                    conf.generation.is_none(),
+                    "Generation must not be set for a secondary location"
+                );
+
+                let warm = conf
+                    .secondary_conf
+                    .as_ref()
+                    .map(|c| c.warm)
+                    .unwrap_or(false);
+                LocationMode::Secondary(SecondaryLocationConfig { warm })
+            }
+            models::LocationConfigMode::Detached => {
+                // Should not have been called: API code should translate this mode
+                // into a detach rather than trying to decode it as a LocationConf
+                return Err(anyhow::anyhow!("Cannot decode a Detached configuration"));
+            }
+        };
+
+        Ok(Self { mode, tenant_conf })
+    }
+}
+
+impl Default for LocationConf {
+    // TODO: this should be removed once tenant loading can guarantee that we are never
+    // loading from a directory without a configuration.
+    // => tech debt since https://github.com/neondatabase/neon/issues/1555
+    /// Default location: attached in `AttachmentMode::Single` with no generation
+    /// and an all-default tenant configuration.
+    fn default() -> Self {
+        Self::attached_single(TenantConfOpt::default(), Generation::none())
+    }
+}
+
+/// A tenant's calculated configuration, which is the result of merging a
+/// tenant's TenantConfOpt with the global TenantConf from PageServerConf.
+///
+/// For storing and transmitting individual tenant's configuration, see
+/// TenantConfOpt.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct TenantConf {
    // Flush out an inmemory layer, if it's holding WAL older than this
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -1,9 +1,7 @@
-use std::{
-    path::{Path, PathBuf},
-    sync::Arc,
-};
+use std::sync::Arc;

 use anyhow::Context;
+use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::TenantState;
 use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
@@ -33,7 +31,7 @@ use super::{
 const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u32 = 3;

 #[derive(Debug, thiserror::Error)]
-pub enum DeleteTenantError {
+pub(crate) enum DeleteTenantError {
    #[error("GetTenant {0}")]
    Get(#[from] GetTenantError),

@@ -62,7 +60,7 @@ fn remote_tenant_delete_mark_path(
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Path::new("deleted")))
+    Ok(tenant_remote_path.join(Utf8Path::new("deleted")))
 }

 async fn create_remote_delete_mark(
@@ -148,7 +146,7 @@ async fn schedule_ordered_timeline_deletions(
    Ok(already_running_deletions)
 }

-async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
+async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), DeleteTenantError> {
    // Assert timelines dir is empty.
    if !fs_ext::is_directory_empty(timelines_path).await? {
        // Display first 10 items in directory
@@ -188,20 +186,18 @@ async fn cleanup_remaining_fs_traces(
    conf: &PageServerConf,
    tenant_id: &TenantId,
 ) -> Result<(), DeleteTenantError> {
-    let rm = |p: PathBuf, is_dir: bool| async move {
+    let rm = |p: Utf8PathBuf, is_dir: bool| async move {
        if is_dir {
            tokio::fs::remove_dir(&p).await
        } else {
            tokio::fs::remove_file(&p).await
        }
        .or_else(fs_ext::ignore_not_found)
-        .with_context(|| {
-            let to_display = p.display();
-            format!("failed to delete {to_display}")
-        })
+        .with_context(|| format!("failed to delete {p}"))
    };

    rm(conf.tenant_config_path(tenant_id), false).await?;
+    rm(conf.tenant_location_config_path(tenant_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
        Err(anyhow::anyhow!(
@@ -380,7 +376,7 @@ impl DeleteTenantFlow {
        Ok(())
    }

-    pub async fn should_resume_deletion(
+    pub(crate) async fn should_resume_deletion(
        conf: &'static PageServerConf,
        remote_storage: Option<&GenericRemoteStorage>,
        tenant: &Tenant,
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -6,11 +6,11 @@ use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
+use camino::Utf8PathBuf;
 use std::cmp::min;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
-use std::path::PathBuf;
 use std::sync::atomic::AtomicU64;
 use tracing::*;
 use utils::id::{TenantId, TimelineId};
@@ -40,7 +40,9 @@ impl EphemeralFile {

        let filename = conf
            .timeline_path(&tenant_id, &timeline_id)
-            .join(PathBuf::from(format!("ephemeral-{filename_disambiguator}")));
+            .join(Utf8PathBuf::from(format!(
+                "ephemeral-{filename_disambiguator}"
+            )));

        let file = VirtualFile::open_with_options(
            &filename,
@@ -70,38 +72,32 @@ impl EphemeralFile {
        let flushed_blknums = 0..self.len / PAGE_SZ as u64;
        if flushed_blknums.contains(&(blknum as u64)) {
            let cache = page_cache::get();
-            loop {
-                match cache
-                    .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
-                    .await
-                    .map_err(|e| {
-                        std::io::Error::new(
-                            std::io::ErrorKind::Other,
-                            // order path before error because error is anyhow::Error => might have many contexts
-                            format!(
-                                "ephemeral file: read immutable page #{}: {}: {:#}",
-                                blknum,
-                                self.file.path.display(),
-                                e,
-                            ),
-                        )
-                    })? {
-                    page_cache::ReadBufResult::Found(guard) => {
-                        return Ok(BlockLease::PageReadGuard(guard))
-                    }
-                    page_cache::ReadBufResult::NotFound(mut write_guard) => {
-                        let buf: &mut [u8] = write_guard.deref_mut();
-                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                        self.file
-                            .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
-                            .await?;
-                        write_guard.mark_valid();
-
-                        // Swap for read lock
-                        continue;
-                    }
-                };
-            }
+            match cache
+                .read_immutable_buf(self.page_cache_file_id, blknum, ctx)
+                .await
+                .map_err(|e| {
+                    std::io::Error::new(
+                        std::io::ErrorKind::Other,
+                        // order path before error because error is anyhow::Error => might have many contexts
+                        format!(
+                            "ephemeral file: read immutable page #{}: {}: {:#}",
+                            blknum, self.file.path, e,
+                        ),
+                    )
+                })? {
+                page_cache::ReadBufResult::Found(guard) => {
+                    return Ok(BlockLease::PageReadGuard(guard))
+                }
+                page_cache::ReadBufResult::NotFound(mut write_guard) => {
+                    let buf: &mut [u8] = write_guard.deref_mut();
+                    debug_assert_eq!(buf.len(), PAGE_SZ);
+                    self.file
+                        .read_exact_at(&mut buf[..], blknum as u64 * PAGE_SZ as u64)
+                        .await?;
+                    let read_guard = write_guard.mark_valid();
+                    return Ok(BlockLease::PageReadGuard(read_guard));
+                }
+            };
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
@@ -171,7 +167,7 @@ impl EphemeralFile {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
-                                        write_guard.mark_valid();
+                                        let _ = write_guard.mark_valid();
                                        // pre-warm successful
                                    }
                                    Err(e) => {
@@ -195,7 +191,7 @@ impl EphemeralFile {
                                        "ephemeral_file: write_blob: write-back full tail blk #{}: {:#}: {}",
                                        self.blknum,
                                        e,
-                                        self.ephemeral_file.file.path.display(),
+                                        self.ephemeral_file.file.path,
                                    ),
                                ));
                            }
@@ -258,8 +254,7 @@ impl Drop for EphemeralFile {
                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
                error!(
                    "could not remove ephemeral file '{}': {}",
-                    self.file.path.display(),
-                    e
+                    self.file.path, e
                );
            }
        }
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
--- a/pageserver/src/tenant/par_fsync.rs
+++ b/pageserver/src/tenant/par_fsync.rs
@@ -1,16 +1,17 @@
 use std::{
    io,
-    path::{Path, PathBuf},
    sync::atomic::{AtomicUsize, Ordering},
 };

-fn fsync_path(path: &Path) -> io::Result<()> {
+use camino::{Utf8Path, Utf8PathBuf};
+
+fn fsync_path(path: &Utf8Path) -> io::Result<()> {
    // TODO use VirtualFile::fsync_all once we fully go async.
    let file = std::fs::File::open(path)?;
    file.sync_all()
 }

-fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> {
+fn parallel_worker(paths: &[Utf8PathBuf], next_path_idx: &AtomicUsize) -> io::Result<()> {
    while let Some(path) = paths.get(next_path_idx.fetch_add(1, Ordering::Relaxed)) {
        fsync_path(path)?;
    }
@@ -18,7 +19,7 @@ fn parallel_worker(paths: &[PathBuf], next_path_idx: &AtomicUsize) -> io::Result
    Ok(())
 }

-fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
+fn fsync_in_thread_pool(paths: &[Utf8PathBuf]) -> io::Result<()> {
    // TODO: remove this function in favor of `par_fsync_async` once we asyncify everything.

    /// Use at most this number of threads.
@@ -47,7 +48,7 @@ fn fsync_in_thread_pool(paths: &[PathBuf]) -> io::Result<()> {
 }

 /// Parallel fsync all files. Can be used in non-async context as it is using rayon thread pool.
-pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {
+pub fn par_fsync(paths: &[Utf8PathBuf]) -> io::Result<()> {
    if paths.len() == 1 {
        fsync_path(&paths[0])?;
        return Ok(());
@@ -58,7 +59,7 @@ pub fn par_fsync(paths: &[PathBuf]) -> io::Result<()> {

 /// Parallel fsync asynchronously. If number of files are less than PARALLEL_PATH_THRESHOLD, fsync is done in the current
 /// execution thread. Otherwise, we will spawn_blocking and run it in tokio.
-pub async fn par_fsync_async(paths: &[PathBuf]) -> io::Result<()> {
+pub async fn par_fsync_async(paths: &[Utf8PathBuf]) -> io::Result<()> {
    const MAX_CONCURRENT_FSYNC: usize = 64;
    let mut next = paths.iter().peekable();
    let mut js = tokio::task::JoinSet::new();
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -209,6 +209,7 @@ pub mod index;
 mod upload;

 use anyhow::Context;
+use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
@@ -219,7 +220,6 @@ use utils::backoff::{
 };

 use std::collections::{HashMap, VecDeque};
-use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};

@@ -453,11 +453,11 @@ impl RemoteTimelineClient {
        } else {
            0
        };
-        self.metrics.remote_physical_size_gauge().set(size);
+        self.metrics.remote_physical_size_set(size);
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-        self.metrics.remote_physical_size_gauge().get()
+        self.metrics.remote_physical_size_get()
    }

    //
@@ -901,9 +901,27 @@ impl RemoteTimelineClient {
        .await
        .context("list prefixes")?;

-        let remaining: Vec<RemotePath> = remaining
+        // We will delete the current index_part object last, since it acts as a deletion
+        // marker via its deleted_at attribute
+        let latest_index = remaining
+            .iter()
+            .filter(|p| {
+                p.object_name()
+                    .map(|n| n.starts_with(IndexPart::FILE_NAME))
+                    .unwrap_or(false)
+            })
+            .filter_map(|path| parse_remote_index_path(path.clone()).map(|gen| (path, gen)))
+            .max_by_key(|i| i.1)
+            .map(|i| i.0.clone())
+            .unwrap_or(
+                // No generation-suffixed indices, assume we are dealing with
+                // a legacy index.
+                remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
+            );
+
+        let remaining_layers: Vec<RemotePath> = remaining
            .into_iter()
-            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
+            .filter(|p| p!= &latest_index)
            .inspect(|path| {
                if let Some(name) = path.object_name() {
                    info!(%name, "deleting a file not referenced from index_part.json");
@@ -913,9 +931,11 @@ impl RemoteTimelineClient {
            })
            .collect();

-        let not_referenced_count = remaining.len();
-        if !remaining.is_empty() {
-            self.deletion_queue_client.push_immediate(remaining).await?;
+        let not_referenced_count = remaining_layers.len();
+        if !remaining_layers.is_empty() {
+            self.deletion_queue_client
+                .push_immediate(remaining_layers)
+                .await?;
        }

        fail::fail_point!("timeline-delete-before-index-delete", |_| {
@@ -924,11 +944,9 @@ impl RemoteTimelineClient {
            ))?
        });

-        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
-
        debug!("enqueuing index part deletion");
        self.deletion_queue_client
-            .push_immediate([index_file_path].to_vec())
+            .push_immediate([latest_index].to_vec())
            .await?;

        // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait
@@ -1161,11 +1179,7 @@ impl RemoteTimelineClient {
                    // at info level at first, and only WARN if the operation fails repeatedly.
                    //
                    // (See similar logic for downloads in `download::download_retry`)
-
-                    let is_simulated = cfg!(feature = "testing")
-                        && e.root_cause().is::<remote_storage::SimulatedError>();
-
-                    if retries < FAILED_UPLOAD_WARN_THRESHOLD || is_simulated {
+                    if retries < FAILED_UPLOAD_WARN_THRESHOLD {
                        info!(
                            "failed to perform remote task {}, will retry (attempt {}): {:#}",
                            task.op, retries, e
@@ -1413,7 +1427,7 @@ pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
 }

 pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
-    remote_timelines_path(tenant_id).join(&PathBuf::from(timeline_id.to_string()))
+    remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
 }

 pub fn remote_layer_path(
@@ -1456,14 +1470,7 @@ pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
        }
    };

-    let file_name_str = match file_name.to_str() {
-        Some(s) => s,
-        None => {
-            tracing::warn!("Malformed index key {:?}", path);
-            return None;
-        }
-    };
-    match file_name_str.split_once('-') {
+    match file_name.split_once('-') {
        Some((_, gen_suffix)) => Generation::parse_suffix(gen_suffix),
        None => None,
    }
@@ -1475,20 +1482,16 @@ pub(crate) fn parse_remote_index_path(path: RemotePath) -> Option<Generation> {
 /// Errors if the path provided does not start from pageserver's workdir.
 pub fn remote_path(
    conf: &PageServerConf,
-    local_path: &Path,
+    local_path: &Utf8Path,
    generation: Generation,
 ) -> anyhow::Result<RemotePath> {
    let stripped = local_path
        .strip_prefix(&conf.workdir)
        .context("Failed to strip workdir prefix")?;

-    let suffixed = format!(
-        "{0}{1}",
-        stripped.to_string_lossy(),
-        generation.get_suffix()
-    );
+    let suffixed = format!("{0}{1}", stripped, generation.get_suffix());

-    RemotePath::new(&PathBuf::from(suffixed)).with_context(|| {
+    RemotePath::new(Utf8Path::new(&suffixed)).with_context(|| {
        format!(
            "to resolve remote part of path {:?} for base {:?}",
            local_path, conf.workdir
@@ -1508,7 +1511,7 @@ mod tests {
        DEFAULT_PG_VERSION,
    };

-    use std::{collections::HashSet, path::Path};
+    use std::collections::HashSet;
    use utils::lsn::Lsn;

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
@@ -1542,7 +1545,7 @@ mod tests {
        assert_eq!(avec, bvec);
    }

-    fn assert_remote_files(expected: &[&str], remote_path: &Path, generation: Generation) {
+    fn assert_remote_files(expected: &[&str], remote_path: &Utf8Path, generation: Generation) {
        let mut expected: Vec<String> = expected
            .iter()
            .map(|x| format!("{}{}", x, generation.get_suffix()))
@@ -1661,12 +1664,12 @@ mod tests {

        let timeline_path = harness.timeline_path(&TIMELINE_ID);

-        println!("workdir: {}", harness.conf.workdir.display());
+        println!("workdir: {}", harness.conf.workdir);

        let remote_timeline_dir = harness
            .remote_fs_dir
            .join(timeline_path.strip_prefix(&harness.conf.workdir).unwrap());
-        println!("remote_timeline_dir: {}", remote_timeline_dir.display());
+        println!("remote_timeline_dir: {remote_timeline_dir}");

        let generation = harness.generation;

@@ -1913,7 +1916,7 @@ mod tests {
        let index_path = test_state.harness.remote_fs_dir.join(
            remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
        );
-        eprintln!("Writing {}", index_path.display());
+        eprintln!("Writing {index_path}");
        std::fs::write(&index_path, index_part_bytes).unwrap();
        example_index_part
    }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -5,10 +5,10 @@

 use std::collections::HashSet;
 use std::future::Future;
-use std::path::Path;
 use std::time::Duration;

 use anyhow::{anyhow, Context};
+use camino::Utf8Path;
 use tokio::fs;
 use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
@@ -74,12 +74,7 @@ pub async fn download_layer_file<'a>(
            // TODO: this doesn't use the cached fd for some reason?
            let mut destination_file = fs::File::create(&temp_file_path)
                .await
-                .with_context(|| {
-                    format!(
-                        "create a destination file for layer '{}'",
-                        temp_file_path.display()
-                    )
-                })
+                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;
            let mut download = storage
                .download(&remote_path)
@@ -121,7 +116,7 @@ pub async fn download_layer_file<'a>(
    destination_file
        .flush()
        .await
-        .with_context(|| format!("flush source file at {}", temp_file_path.display()))
+        .with_context(|| format!("flush source file at {temp_file_path}"))
        .map_err(DownloadError::Other)?;

    let expected = layer_metadata.file_size();
@@ -135,12 +130,7 @@ pub async fn download_layer_file<'a>(
    destination_file
        .sync_all()
        .await
-        .with_context(|| {
-            format!(
-                "failed to fsync source file at {}",
-                temp_file_path.display()
-            )
-        })
+        .with_context(|| format!("failed to fsync source file at {temp_file_path}"))
        .map_err(DownloadError::Other)?;
    drop(destination_file);

@@ -152,27 +142,23 @@ pub async fn download_layer_file<'a>(

    fs::rename(&temp_file_path, &local_path)
        .await
-        .with_context(|| format!("rename download layer file to {}", local_path.display(),))
+        .with_context(|| format!("rename download layer file to {local_path}"))
        .map_err(DownloadError::Other)?;

    crashsafe::fsync_async(&local_path)
        .await
-        .with_context(|| format!("fsync layer file {}", local_path.display(),))
+        .with_context(|| format!("fsync layer file {local_path}"))
        .map_err(DownloadError::Other)?;

-    tracing::debug!("download complete: {}", local_path.display());
+    tracing::debug!("download complete: {local_path}");

    Ok(bytes_amount)
 }

 const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";

-pub fn is_temp_download_file(path: &Path) -> bool {
-    let extension = path.extension().map(|pname| {
-        pname
-            .to_str()
-            .expect("paths passed to this function must be valid Rust strings")
-    });
+pub fn is_temp_download_file(path: &Utf8Path) -> bool {
+    let extension = path.extension();
    match extension {
        Some(TEMP_DOWNLOAD_EXTENSION) => true,
        Some(_) => false,
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -1,8 +1,9 @@
 //! Helper functions to upload files to remote storage with a RemoteStorage

 use anyhow::{bail, Context};
+use camino::Utf8Path;
 use fail::fail_point;
-use std::{io::ErrorKind, path::Path};
+use std::io::ErrorKind;
 use tokio::fs;

 use super::Generation;
@@ -30,6 +31,7 @@ pub(super) async fn upload_index_part<'a>(
    fail_point!("before-upload-index", |_| {
        bail!("failpoint before-upload-index")
    });
+    pausable_failpoint!("before-upload-index-pausable");

    let index_part_bytes =
        serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
@@ -50,7 +52,7 @@ pub(super) async fn upload_index_part<'a>(
 pub(super) async fn upload_timeline_layer<'a>(
    conf: &'static PageServerConf,
    storage: &'a GenericRemoteStorage,
-    source_path: &'a Path,
+    source_path: &'a Utf8Path,
    known_metadata: &'a LayerFileMetadata,
    generation: Generation,
 ) -> anyhow::Result<()> {
@@ -68,7 +70,7 @@ pub(super) async fn upload_timeline_layer<'a>(
            // upload. However, a nonexistent file can also be indicative of
            // something worse, like when a file is scheduled for upload before
            // it has been written to disk yet.
-            info!(path = %source_path.display(), "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
+            info!(path = %source_path, "File to upload doesn't exist. Likely the file has been deleted and an upload is not required any more.");
            return Ok(());
        }
        Err(e) => {
@@ -93,7 +95,7 @@ pub(super) async fn upload_timeline_layer<'a>(
    storage
        .upload(source_file, fs_size, &storage_path, None)
        .await
-        .with_context(|| format!("upload layer from local path '{}'", source_path.display()))?;
+        .with_context(|| format!("upload layer from local path '{source_path}'"))?;

    Ok(())
 }
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -14,6 +14,7 @@ use crate::task_mgr::TaskKind;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Result;
 use bytes::Bytes;
+use camino::Utf8PathBuf;
 use enum_map::EnumMap;
 use enumset::EnumSet;
 use once_cell::sync::Lazy;
@@ -22,7 +23,6 @@ use pageserver_api::models::{
    HistoricLayerInfo, LayerResidenceEvent, LayerResidenceEventReason, LayerResidenceStatus,
 };
 use std::ops::Range;
-use std::path::PathBuf;
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, SystemTime, UNIX_EPOCH};
 use tracing::warn;
@@ -378,7 +378,7 @@ pub trait PersistentLayer: Layer + AsLayerDesc {

    // Path to the layer file in the local filesystem.
    // `None` for `RemoteLayer`.
-    fn local_path(&self) -> Option<PathBuf>;
+    fn local_path(&self) -> Option<Utf8PathBuf>;

    /// Permanently remove this layer from disk.
    fn delete_resident_layer_file(&self) -> Result<()>;
@@ -456,7 +456,7 @@ pub mod tests {
 /// config. In that case, we use the Path variant to hold the full path to the file on
 /// disk.
 enum PathOrConf {
-    Path(PathBuf),
+    Path(Utf8PathBuf),
    Conf(&'static PageServerConf),
 }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -41,6 +41,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{walrecord, TEMP_FILE_SUFFIX};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
+use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
@@ -48,7 +49,6 @@ use std::fs::{self, File};
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::fs::FileExt;
-use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use tokio::sync::OnceCell;
 use tracing::*;
@@ -267,7 +267,7 @@ impl PersistentLayer for DeltaLayer {
        Some(self)
    }

-    fn local_path(&self) -> Option<PathBuf> {
+    fn local_path(&self) -> Option<Utf8PathBuf> {
        self.local_path()
    }

@@ -374,7 +374,7 @@ impl DeltaLayer {
            .await
    }

-    pub(crate) fn local_path(&self) -> Option<PathBuf> {
+    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
        Some(self.path())
    }

@@ -409,7 +409,7 @@ impl DeltaLayer {
        tenant_id: &TenantId,
        timeline_id: &TimelineId,
        fname: &DeltaFileName,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        match path_or_conf {
            PathOrConf::Path(path) => path.clone(),
            PathOrConf::Conf(conf) => conf
@@ -424,7 +424,7 @@ impl DeltaLayer {
        timeline_id: &TimelineId,
        key_start: Key,
        lsn_range: &Range<Lsn>,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        let rand_string: String = rand::thread_rng()
            .sample_iter(&Alphanumeric)
            .take(8)
@@ -455,7 +455,7 @@ impl DeltaLayer {
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
-            .with_context(|| format!("Failed to load delta layer {}", self.path().display()))
+            .with_context(|| format!("Failed to load delta layer {}", self.path()))
    }

    async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
@@ -471,7 +471,7 @@ impl DeltaLayer {
        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code

-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+            let actual_filename = path.file_name().unwrap().to_owned();
            let expected_filename = self.filename().file_name();

            if actual_filename != expected_filename {
@@ -510,9 +510,8 @@ impl DeltaLayer {
    /// Create a DeltaLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
-    pub fn new_for_path(path: &Path, file: File) -> Result<Self> {
-        let mut summary_buf = Vec::new();
-        summary_buf.resize(PAGE_SZ, 0);
+    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<Self> {
+        let mut summary_buf = vec![0; PAGE_SZ];
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;

@@ -538,7 +537,7 @@ impl DeltaLayer {
        self.desc.delta_file_name()
    }
    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
+    pub fn path(&self) -> Utf8PathBuf {
        Self::path_for(
            &self.path_or_conf,
            &self.desc.tenant_id,
@@ -573,7 +572,7 @@ impl DeltaLayer {
 ///
 struct DeltaLayerWriterInner {
    conf: &'static PageServerConf,
-    pub path: PathBuf,
+    pub path: Utf8PathBuf,
    timeline_id: TimelineId,
    tenant_id: TenantId,

@@ -711,7 +710,7 @@ impl DeltaLayerWriterInner {
        ensure!(
            metadata.len() <= S3_UPLOAD_LIMIT,
            "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!",
-            file.path.display(),
+            file.path,
            metadata.len()
        );

@@ -748,7 +747,7 @@ impl DeltaLayerWriterInner {
        );
        std::fs::rename(self.path, &final_path)?;

-        trace!("created delta layer {}", final_path.display());
+        trace!("created delta layer {final_path}");

        Ok(layer)
    }
@@ -847,13 +846,13 @@ impl Drop for DeltaLayerWriter {

 impl DeltaLayerInner {
    pub(super) async fn load(
-        path: &std::path::Path,
+        path: &Utf8Path,
        summary: Option<Summary>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .await
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+            .with_context(|| format!("Failed to open file '{path}'"))?;
        let file = FileBlockReader::new(file);

        let summary_blk = file.read_blk(0, ctx).await?;
@@ -933,15 +932,12 @@ impl DeltaLayerInner {
                .read_blob_into_buf(pos, &mut buf, ctx)
                .await
                .with_context(|| {
-                    format!(
-                        "Failed to read blob from virtual file {}",
-                        file.file.path.display()
-                    )
+                    format!("Failed to read blob from virtual file {}", file.file.path)
                })?;
            let val = Value::des(&buf).with_context(|| {
                format!(
                    "Failed to deserialize file blob from virtual file {}",
-                    file.file.path.display()
+                    file.file.path
                )
            })?;
            match val {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -37,6 +37,7 @@ use crate::virtual_file::VirtualFile;
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
 use anyhow::{bail, ensure, Context, Result};
 use bytes::Bytes;
+use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind};
 use rand::{distributions::Alphanumeric, Rng};
@@ -45,7 +46,6 @@ use std::fs::{self, File};
 use std::io::SeekFrom;
 use std::ops::Range;
 use std::os::unix::prelude::FileExt;
-use std::path::{Path, PathBuf};
 use tokio::sync::OnceCell;
 use tracing::*;

@@ -195,7 +195,7 @@ impl AsLayerDesc for ImageLayer {
 }

 impl PersistentLayer for ImageLayer {
-    fn local_path(&self) -> Option<PathBuf> {
+    fn local_path(&self) -> Option<Utf8PathBuf> {
        self.local_path()
    }

@@ -269,10 +269,10 @@ impl ImageLayer {
            .get_value_reconstruct_data(key, reconstruct_state, ctx)
            .await
            // FIXME: makes no sense to dump paths
-            .with_context(|| format!("read {}", self.path().display()))
+            .with_context(|| format!("read {}", self.path()))
    }

-    pub(crate) fn local_path(&self) -> Option<PathBuf> {
+    pub(crate) fn local_path(&self) -> Option<Utf8PathBuf> {
        Some(self.path())
    }

@@ -304,7 +304,7 @@ impl ImageLayer {
        timeline_id: TimelineId,
        tenant_id: TenantId,
        fname: &ImageFileName,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        match path_or_conf {
            PathOrConf::Path(path) => path.to_path_buf(),
            PathOrConf::Conf(conf) => conf
@@ -318,7 +318,7 @@ impl ImageLayer {
        timeline_id: TimelineId,
        tenant_id: TenantId,
        fname: &ImageFileName,
-    ) -> PathBuf {
+    ) -> Utf8PathBuf {
        let rand_string: String = rand::thread_rng()
            .sample_iter(&Alphanumeric)
            .take(8)
@@ -342,7 +342,7 @@ impl ImageLayer {
        self.inner
            .get_or_try_init(|| self.load_inner(ctx))
            .await
-            .with_context(|| format!("Failed to load image layer {}", self.path().display()))
+            .with_context(|| format!("Failed to load image layer {}", self.path()))
    }

    async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
@@ -359,7 +359,7 @@ impl ImageLayer {

        if let PathOrConf::Path(ref path) = self.path_or_conf {
            // not production code
-            let actual_filename = path.file_name().unwrap().to_str().unwrap().to_owned();
+            let actual_filename = path.file_name().unwrap().to_owned();
            let expected_filename = self.filename().file_name();

            if actual_filename != expected_filename {
@@ -399,9 +399,8 @@ impl ImageLayer {
    /// Create an ImageLayer struct representing an existing file on disk.
    ///
    /// This variant is only used for debugging purposes, by the 'pagectl' binary.
-    pub fn new_for_path(path: &Path, file: File) -> Result<ImageLayer> {
-        let mut summary_buf = Vec::new();
-        summary_buf.resize(PAGE_SZ, 0);
+    pub fn new_for_path(path: &Utf8Path, file: File) -> Result<ImageLayer> {
+        let mut summary_buf = vec![0; PAGE_SZ];
        file.read_exact_at(&mut summary_buf, 0)?;
        let summary = Summary::des_prefix(&summary_buf)?;
        let metadata = file
@@ -427,7 +426,7 @@ impl ImageLayer {
    }

    /// Path to the layer file in pageserver workdir.
-    pub fn path(&self) -> PathBuf {
+    pub fn path(&self) -> Utf8PathBuf {
        Self::path_for(
            &self.path_or_conf,
            self.desc.timeline_id,
@@ -439,14 +438,14 @@ impl ImageLayer {

 impl ImageLayerInner {
    pub(super) async fn load(
-        path: &std::path::Path,
+        path: &Utf8Path,
        lsn: Lsn,
        summary: Option<Summary>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Self> {
        let file = VirtualFile::open(path)
            .await
-            .with_context(|| format!("Failed to open file '{}'", path.display()))?;
+            .with_context(|| format!("Failed to open file '{}'", path))?;
        let file = FileBlockReader::new(file);
        let summary_blk = file.read_blk(0, ctx).await?;
        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
@@ -526,7 +525,7 @@ impl ImageLayerInner {
 ///
 struct ImageLayerWriterInner {
    conf: &'static PageServerConf,
-    path: PathBuf,
+    path: Utf8PathBuf,
    timeline_id: TimelineId,
    tenant_id: TenantId,
    key_range: Range<Key>,
@@ -558,7 +557,7 @@ impl ImageLayerWriterInner {
                lsn,
            },
        );
-        info!("new image layer {}", path.display());
+        info!("new image layer {path}");
        let mut file = VirtualFile::open_with_options(
            &path,
            std::fs::OpenOptions::new().write(true).create_new(true),
@@ -685,7 +684,7 @@ impl ImageLayerWriterInner {
        );
        std::fs::rename(self.path, final_path)?;

-        trace!("created image layer {}", layer.path().display());
+        trace!("created image layer {}", layer.path());

        Ok(layer)
    }
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -8,9 +8,9 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use crate::tenant::storage_layer::{Layer, ValueReconstructResult, ValueReconstructState};
 use crate::tenant::timeline::layer_manager::LayerManager;
 use anyhow::{bail, Result};
+use camino::Utf8PathBuf;
 use pageserver_api::models::HistoricLayerInfo;
 use std::ops::Range;
-use std::path::PathBuf;
 use std::sync::Arc;

 use utils::{
@@ -92,7 +92,7 @@ impl AsLayerDesc for RemoteLayer {
 }

 impl PersistentLayer for RemoteLayer {
-    fn local_path(&self) -> Option<PathBuf> {
+    fn local_path(&self) -> Option<Utf8PathBuf> {
        None
    }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -9,6 +9,7 @@ mod walreceiver;

 use anyhow::{anyhow, bail, ensure, Context, Result};
 use bytes::Bytes;
+use camino::{Utf8Path, Utf8PathBuf};
 use fail::fail_point;
 use futures::StreamExt;
 use itertools::Itertools;
@@ -29,7 +30,6 @@ use utils::id::TenantTimelineId;
 use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
-use std::path::{Path, PathBuf};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
@@ -56,7 +56,7 @@ use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
 use crate::metrics::{
    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
-    RECONSTRUCT_TIME, UNEXPECTED_ONDEMAND_DOWNLOADS,
+    UNEXPECTED_ONDEMAND_DOWNLOADS,
 };
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
@@ -91,12 +91,12 @@ use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
-use super::debug_assert_current_span_has_tenant_and_timeline_id;
 use super::remote_timeline_client::index::IndexPart;
 use super::remote_timeline_client::RemoteTimelineClient;
 use super::storage_layer::{
    AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStatsReset, PersistentLayerDesc,
 };
+use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};

 #[derive(Debug, PartialEq, Eq, Clone, Copy)]
 pub(super) enum FlushLoopState {
@@ -149,7 +149,7 @@ pub struct TimelineResources {

 pub struct Timeline {
    conf: &'static PageServerConf,
-    tenant_conf: Arc<RwLock<TenantConfOpt>>,
+    tenant_conf: Arc<RwLock<AttachedTenantConf>>,

    myself: Weak<Self>,

@@ -158,6 +158,9 @@ pub struct Timeline {

    /// The generation of the tenant that instantiated us: this is used for safety when writing remote objects.
    /// Never changes for the lifetime of this [`Timeline`] object.
+    ///
+    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
+    /// this copy enforces the invariant that generation doesn't change during a Tenant's lifetime.
    generation: Generation,

    pub pg_version: u32,
@@ -496,13 +499,39 @@ impl Timeline {
        };

        let timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME.start_timer();
-        self.get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
+        let path = self
+            .get_reconstruct_data(key, lsn, &mut reconstruct_state, ctx)
            .await?;
        timer.stop_and_record();

-        RECONSTRUCT_TIME
-            .observe_closure_duration(|| self.reconstruct_value(key, lsn, reconstruct_state))
-            .await
+        let start = Instant::now();
+        let res = self.reconstruct_value(key, lsn, reconstruct_state).await;
+        let elapsed = start.elapsed();
+        crate::metrics::RECONSTRUCT_TIME
+            .for_result(&res)
+            .observe(elapsed.as_secs_f64());
+
+        if cfg!(feature = "testing") && res.is_err() {
+            // it can only be a walredo issue
+            use std::fmt::Write;
+
+            let mut msg = String::new();
+
+            path.into_iter().for_each(|(res, cont_lsn, layer)| {
+                writeln!(
+                    msg,
+                    "- layer traversal: result {res:?}, cont_lsn {cont_lsn}, layer: {}",
+                    layer(),
+                )
+                .expect("string grows")
+            });
+
+            // this is to rule out or provide evidence that we could in some cases read a duplicate
+            // walrecord
+            tracing::info!("walredo failed, path:\n{msg}");
+        }
+
+        res
    }

    /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
@@ -559,7 +588,7 @@ impl Timeline {
    }

    pub fn resident_physical_size(&self) -> u64 {
-        self.metrics.resident_physical_size_gauge.get()
+        self.metrics.resident_physical_size_get()
    }

    ///
@@ -1309,10 +1338,7 @@ impl Timeline {
        // will treat the file as a local layer again, count it towards resident size,
        // and it'll be like the layer removal never happened.
        // The bump in resident size is perhaps unexpected but overall a robust behavior.
-        self.metrics
-            .resident_physical_size_gauge
-            .sub(layer_file_size);
-
+        self.metrics.resident_physical_size_sub(layer_file_size);
        self.metrics.evictions.inc();

        if let Some(delta) = local_layer_residence_duration {
@@ -1355,42 +1381,42 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
    fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .eviction_policy
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
@@ -1406,7 +1432,7 @@ impl Timeline {
    }

    fn get_gc_feedback(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap();
+        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
        tenant_conf
            .gc_feedback
            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
@@ -1419,7 +1445,7 @@ impl Timeline {
        // The threshold is embedded in the metric. So, we need to update it.
        {
            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
-                &self.tenant_conf.read().unwrap(),
+                &self.tenant_conf.read().unwrap().tenant_conf,
                &self.conf.default_tenant_conf,
            );
            let tenant_id_str = self.tenant_id.to_string();
@@ -1438,7 +1464,7 @@ impl Timeline {
    #[allow(clippy::too_many_arguments)]
    pub(super) fn new(
        conf: &'static PageServerConf,
-        tenant_conf: Arc<RwLock<TenantConfOpt>>,
+        tenant_conf: Arc<RwLock<AttachedTenantConf>>,
        metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
@@ -1461,7 +1487,7 @@ impl Timeline {

        let evictions_low_residence_duration_metric_threshold =
            Self::get_evictions_low_residence_duration_metric_threshold(
-                &tenant_conf_guard,
+                &tenant_conf_guard.tenant_conf,
                &conf.default_tenant_conf,
            );
        drop(tenant_conf_guard);
@@ -1626,12 +1652,15 @@ impl Timeline {

        let tenant_conf_guard = self.tenant_conf.read().unwrap();
        let wal_connect_timeout = tenant_conf_guard
+            .tenant_conf
            .walreceiver_connect_timeout
            .unwrap_or(self.conf.default_tenant_conf.walreceiver_connect_timeout);
        let lagging_wal_timeout = tenant_conf_guard
+            .tenant_conf
            .lagging_wal_timeout
            .unwrap_or(self.conf.default_tenant_conf.lagging_wal_timeout);
        let max_lsn_wal_lag = tenant_conf_guard
+            .tenant_conf
            .max_lsn_wal_lag
            .unwrap_or(self.conf.default_tenant_conf.max_lsn_wal_lag);
        drop(tenant_conf_guard);
@@ -1713,7 +1742,7 @@ impl Timeline {
                        Discovered::Temporary(name) => (name, "temporary timeline file"),
                        Discovered::TemporaryDownload(name) => (name, "temporary download"),
                    };
-                    path.push(name);
+                    path.push(Utf8Path::new(&name));
                    init::cleanup(&path, kind)?;
                    path.pop();
                }
@@ -1846,9 +1875,7 @@ impl Timeline {
            "loaded layer map with {} layers at {}, total physical size: {}",
            num_layers, disk_consistent_lsn, total_physical_size
        );
-        self.metrics
-            .resident_physical_size_gauge
-            .set(total_physical_size);
+        self.metrics.resident_physical_size_set(total_physical_size);

        timer.stop_and_record();
        Ok(())
@@ -2196,10 +2223,10 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {
        let timeline_id = self.layer_desc().timeline_id;
        match self.local_path() {
            Some(local_path) => {
-                debug_assert!(local_path.to_str().unwrap().contains(&format!("{}", timeline_id)),
+                debug_assert!(local_path.to_string().contains(&format!("{}", timeline_id)),
                    "need timeline ID to uniquely identify the layer when traversal crosses ancestor boundary",
                );
-                format!("{}", local_path.display())
+                format!("{local_path}")
            }
            None => {
                format!("remote {}/{self}", timeline_id)
@@ -2229,7 +2256,7 @@ impl Timeline {
        request_lsn: Lsn,
        reconstruct_state: &mut ValueReconstructState,
        ctx: &RequestContext,
-    ) -> Result<(), PageReconstructError> {
+    ) -> Result<Vec<TraversalPathItem>, PageReconstructError> {
        // Start from the current timeline.
        let mut timeline_owned;
        let mut timeline = self;
@@ -2260,12 +2287,12 @@ impl Timeline {
            // The function should have updated 'state'
            //info!("CALLED for {} at {}: {:?} with {} records, cached {}", key, cont_lsn, result, reconstruct_state.records.len(), cached_lsn);
            match result {
-                ValueReconstructResult::Complete => return Ok(()),
+                ValueReconstructResult::Complete => return Ok(traversal_path),
                ValueReconstructResult::Continue => {
                    // If we reached an earlier cached page image, we're done.
                    if cont_lsn == cached_lsn + 1 {
                        MATERIALIZED_PAGE_CACHE_HIT.inc_by(1);
-                        return Ok(());
+                        return Ok(traversal_path);
                    }
                    if prev_lsn <= cont_lsn {
                        // Didn't make any progress in last iteration. Error out to avoid
@@ -2336,7 +2363,7 @@ impl Timeline {
                // during branch creation.
                match ancestor.wait_to_become_active(ctx).await {
                    Ok(()) => {}
-                    Err(state) if state == TimelineState::Stopping => {
+                    Err(TimelineState::Stopping) => {
                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
                    }
                    Err(state) => {
@@ -3701,6 +3728,11 @@ impl Timeline {
            });

            writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+
+            if !new_layers.is_empty() {
+                fail_point!("after-timeline-compacted-first-L1");
+            }
+
            prev_key = Some(key);
        }
        if let Some(writer) = writer {
@@ -3722,7 +3754,7 @@ impl Timeline {
                    );
                }
            }
-            let mut layer_paths: Vec<PathBuf> = new_layers.iter().map(|l| l.path()).collect();
+            let mut layer_paths: Vec<Utf8PathBuf> = new_layers.iter().map(|l| l.path()).collect();

            // Fsync all the layer files and directory using multiple threads to
            // minimize latency.
@@ -3832,10 +3864,7 @@ impl Timeline {
            let new_delta_path = l.path();

            let metadata = new_delta_path.metadata().with_context(|| {
-                format!(
-                    "read file metadata for new created layer {}",
-                    new_delta_path.display()
-                )
+                format!("read file metadata for new created layer {new_delta_path}")
            })?;

            if let Some(remote_client) = &self.remote_client {
@@ -3858,6 +3887,7 @@ impl Timeline {
            );
            let l = l as Arc<dyn PersistentLayer>;
            if guard.contains(&l) {
+                tracing::error!(layer=%l, "duplicated L1 layer");
                duplicated_layers.insert(l.layer_desc().key());
            } else {
                if LayerMap::is_l0(l.layer_desc()) {
@@ -4398,7 +4428,7 @@ impl Timeline {

                    // XXX the temp file is still around in Err() case
                    // and consumes space until we clean up upon pageserver restart.
-                    self_clone.metrics.resident_physical_size_gauge.add(*size);
+                    self_clone.metrics.resident_physical_size_add(*size);

                    // Download complete. Replace the RemoteLayer with the corresponding
                    // Delta- or ImageLayer in the layer map.
@@ -4769,11 +4799,10 @@ fn is_send() {

 /// Add a suffix to a layer file's name: .{num}.old
 /// Uses the first available num (starts at 0)
-fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
+fn rename_to_backup(path: &Utf8Path) -> anyhow::Result<()> {
    let filename = path
        .file_name()
-        .ok_or_else(|| anyhow!("Path {} don't have a file name", path.display()))?
-        .to_string_lossy();
+        .ok_or_else(|| anyhow!("Path {path} don't have a file name"))?;
    let mut new_path = path.to_owned();

    for i in 0u32.. {
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -12,7 +12,8 @@ use crate::{
    METADATA_FILE_NAME,
 };
 use anyhow::Context;
-use std::{collections::HashMap, ffi::OsString, path::Path, str::FromStr};
+use camino::Utf8Path;
+use std::{collections::HashMap, str::FromStr};
 use utils::lsn::Lsn;

 /// Identified files in the timeline directory.
@@ -20,46 +21,43 @@ pub(super) enum Discovered {
    /// The only one we care about
    Layer(LayerFileName, u64),
    /// Old ephmeral files from previous launches, should be removed
-    Ephemeral(OsString),
+    Ephemeral(String),
    /// Old temporary timeline files, unsure what these really are, should be removed
-    Temporary(OsString),
+    Temporary(String),
    /// Temporary on-demand download files, should be removed
-    TemporaryDownload(OsString),
+    TemporaryDownload(String),
    /// "metadata" file we persist locally and include in `index_part.json`
    Metadata,
    /// Backup file from previously future layers
    IgnoredBackup,
    /// Unrecognized, warn about these
-    Unknown(OsString),
+    Unknown(String),
 }

 /// Scans the timeline directory for interesting files.
-pub(super) fn scan_timeline_dir(path: &Path) -> anyhow::Result<Vec<Discovered>> {
+pub(super) fn scan_timeline_dir(path: &Utf8Path) -> anyhow::Result<Vec<Discovered>> {
    let mut ret = Vec::new();

-    for direntry in std::fs::read_dir(path)? {
+    for direntry in path.read_dir_utf8()? {
        let direntry = direntry?;
-        let direntry_path = direntry.path();
-        let file_name = direntry.file_name();
+        let file_name = direntry.file_name().to_string();

-        let fname = file_name.to_string_lossy();
-
-        let discovered = match LayerFileName::from_str(&fname) {
+        let discovered = match LayerFileName::from_str(&file_name) {
            Ok(file_name) => {
                let file_size = direntry.metadata()?.len();
                Discovered::Layer(file_name, file_size)
            }
            Err(_) => {
-                if fname == METADATA_FILE_NAME {
+                if file_name == METADATA_FILE_NAME {
                    Discovered::Metadata
-                } else if fname.ends_with(".old") {
+                } else if file_name.ends_with(".old") {
                    // ignore these
                    Discovered::IgnoredBackup
-                } else if remote_timeline_client::is_temp_download_file(&direntry_path) {
+                } else if remote_timeline_client::is_temp_download_file(direntry.path()) {
                    Discovered::TemporaryDownload(file_name)
-                } else if is_ephemeral_file(&fname) {
+                } else if is_ephemeral_file(&file_name) {
                    Discovered::Ephemeral(file_name)
-                } else if is_temporary(&direntry_path) {
+                } else if is_temporary(direntry.path()) {
                    Discovered::Temporary(file_name)
                } else {
                    Discovered::Unknown(file_name)
@@ -162,15 +160,14 @@ pub(super) fn reconcile(
        .collect::<Vec<_>>()
 }

-pub(super) fn cleanup(path: &Path, kind: &str) -> anyhow::Result<()> {
+pub(super) fn cleanup(path: &Utf8Path, kind: &str) -> anyhow::Result<()> {
    let file_name = path.file_name().expect("must be file path");
    tracing::debug!(kind, ?file_name, "cleaning up");
-    std::fs::remove_file(path)
-        .with_context(|| format!("failed to remove {kind} at {}", path.display()))
+    std::fs::remove_file(path).with_context(|| format!("failed to remove {kind} at {path}"))
 }

 pub(super) fn cleanup_local_file_for_remote(
-    path: &Path,
+    path: &Utf8Path,
    local: &LayerFileMetadata,
    remote: &LayerFileMetadata,
 ) -> anyhow::Result<()> {
@@ -182,8 +179,7 @@ pub(super) fn cleanup_local_file_for_remote(
    if let Err(err) = crate::tenant::timeline::rename_to_backup(path) {
        assert!(
            path.exists(),
-            "we would leave the local_layer without a file if this does not hold: {}",
-            path.display()
+            "we would leave the local_layer without a file if this does not hold: {path}",
        );
        Err(err)
    } else {
@@ -192,7 +188,7 @@ pub(super) fn cleanup_local_file_for_remote(
 }

 pub(super) fn cleanup_future_layer(
-    path: &Path,
+    path: &Utf8Path,
    name: &LayerFileName,
    disk_consistent_lsn: Lsn,
 ) -> anyhow::Result<()> {
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -263,7 +263,7 @@ impl LayerManager {
        let desc = layer.layer_desc();
        if !layer.is_remote_layer() {
            layer.delete_resident_layer_file()?;
-            metrics.resident_physical_size_gauge.sub(desc.file_size);
+            metrics.resident_physical_size_sub(desc.file_size);
        }

        // TODO Removing from the bottom of the layer map is expensive.
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -1,6 +1,7 @@
-use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
+use std::{collections::hash_map::Entry, fs, sync::Arc};

 use anyhow::Context;
+use camino::Utf8PathBuf;
 use tracing::{error, info, info_span, warn};
 use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn};

@@ -155,12 +156,12 @@ pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
 #[must_use]
 pub(crate) struct TimelineUninitMark {
    uninit_mark_deleted: bool,
-    uninit_mark_path: PathBuf,
-    pub(crate) timeline_path: PathBuf,
+    uninit_mark_path: Utf8PathBuf,
+    pub(crate) timeline_path: Utf8PathBuf,
 }

 impl TimelineUninitMark {
-    pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
+    pub(crate) fn new(uninit_mark_path: Utf8PathBuf, timeline_path: Utf8PathBuf) -> Self {
        Self {
            uninit_mark_deleted: false,
            uninit_mark_path,
@@ -197,14 +198,13 @@ impl Drop for TimelineUninitMark {
            if self.timeline_path.exists() {
                error!(
                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
-                    self.uninit_mark_path.display(),
-                    self.timeline_path.display()
+                    self.uninit_mark_path, self.timeline_path
                )
            } else {
                // unblock later timeline creation attempts
                warn!(
                    "Removing intermediate uninit mark file {}",
-                    self.uninit_mark_path.display()
+                    self.uninit_mark_path
                );
                if let Err(e) = self.delete_mark_file_if_present() {
                    error!("Failed to remove the uninit mark file: {e}")
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -253,7 +253,7 @@ impl std::fmt::Display for UploadOp {
                write!(f, "UploadMetadata(lsn: {})", lsn)
            }
            UploadOp::Delete(delete) => {
-                write!(f, "Delete({} layers)", delete.layers.len(),)
+                write!(f, "Delete({} layers)", delete.layers.len())
            }
            UploadOp::Barrier(_) => write!(f, "Barrier"),
        }
--- a/pageserver/src/trace.rs
+++ b/pageserver/src/trace.rs
@@ -1,8 +1,8 @@
 use bytes::Bytes;
+use camino::Utf8PathBuf;
 use std::{
    fs::{create_dir_all, File},
    io::{BufWriter, Write},
-    path::PathBuf,
 };

 pub struct Tracer {
@@ -16,7 +16,7 @@ impl Drop for Tracer {
 }

 impl Tracer {
-    pub fn new(path: PathBuf) -> Self {
+    pub fn new(path: Utf8PathBuf) -> Self {
        let parent = path.parent().expect("failed to parse parent path");
        create_dir_all(parent).expect("failed to create trace dir");

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -12,11 +12,11 @@
 //!
 use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC};
 use crate::tenant::TENANTS_SEGMENT_NAME;
+use camino::{Utf8Path, Utf8PathBuf};
 use once_cell::sync::OnceCell;
 use std::fs::{self, File, OpenOptions};
 use std::io::{Error, ErrorKind, Seek, SeekFrom};
 use std::os::unix::fs::FileExt;
-use std::path::{Path, PathBuf};
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{RwLock, RwLockWriteGuard};

@@ -51,7 +51,7 @@ pub struct VirtualFile {
    /// if a new file is created, we only pass the create flag when it's initially
    /// opened, in the VirtualFile::create() function, and strip the flag before
    /// storing it here.
-    pub path: PathBuf,
+    pub path: Utf8PathBuf,
    open_options: OpenOptions,

    // These are strings becase we only use them for metrics, and those expect strings.
@@ -177,19 +177,19 @@ impl OpenFiles {
 pub enum CrashsafeOverwriteError {
    #[error("final path has no parent dir")]
    FinalPathHasNoParentDir,
-    #[error("remove tempfile: {0}")]
+    #[error("remove tempfile")]
    RemovePreviousTempfile(#[source] std::io::Error),
-    #[error("create tempfile: {0}")]
+    #[error("create tempfile")]
    CreateTempfile(#[source] std::io::Error),
-    #[error("write tempfile: {0}")]
+    #[error("write tempfile")]
    WriteContents(#[source] std::io::Error),
-    #[error("sync tempfile: {0}")]
+    #[error("sync tempfile")]
    SyncTempfile(#[source] std::io::Error),
-    #[error("rename tempfile to final path: {0}")]
+    #[error("rename tempfile to final path")]
    RenameTempfileToFinalPath(#[source] std::io::Error),
-    #[error("open final path parent dir: {0}")]
+    #[error("open final path parent dir")]
    OpenFinalPathParentDir(#[source] std::io::Error),
-    #[error("sync final path parent dir: {0}")]
+    #[error("sync final path parent dir")]
    SyncFinalPathParentDir(#[source] std::io::Error),
 }
 impl CrashsafeOverwriteError {
@@ -210,13 +210,13 @@ impl CrashsafeOverwriteError {

 impl VirtualFile {
    /// Open a file in read-only mode. Like File::open.
-    pub async fn open(path: &Path) -> Result<VirtualFile, std::io::Error> {
+    pub async fn open(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
        Self::open_with_options(path, OpenOptions::new().read(true)).await
    }

    /// Create a new file for writing. If the file exists, it will be truncated.
    /// Like File::create.
-    pub async fn create(path: &Path) -> Result<VirtualFile, std::io::Error> {
+    pub async fn create(path: &Utf8Path) -> Result<VirtualFile, std::io::Error> {
        Self::open_with_options(
            path,
            OpenOptions::new().write(true).create(true).truncate(true),
@@ -230,10 +230,10 @@ impl VirtualFile {
    /// they will be applied also when the file is subsequently re-opened, not only
    /// on the first time. Make sure that's sane!
    pub async fn open_with_options(
-        path: &Path,
+        path: &Utf8Path,
        open_options: &OpenOptions,
    ) -> Result<VirtualFile, std::io::Error> {
-        let path_str = path.to_string_lossy();
+        let path_str = path.to_string();
        let parts = path_str.split('/').collect::<Vec<&str>>();
        let tenant_id;
        let timeline_id;
@@ -281,8 +281,8 @@ impl VirtualFile {
    /// atomic, a crash during the write operation will never leave behind a
    /// partially written file.
    pub async fn crashsafe_overwrite(
-        final_path: &Path,
-        tmp_path: &Path,
+        final_path: &Utf8Path,
+        tmp_path: &Utf8Path,
        content: &[u8],
    ) -> Result<(), CrashsafeOverwriteError> {
        let Some(final_path_parent) = final_path.parent() else {
@@ -734,7 +734,7 @@ mod tests {

    async fn test_files<OF, FT>(testname: &str, openfunc: OF) -> Result<(), Error>
    where
-        OF: Fn(PathBuf, OpenOptions) -> FT,
+        OF: Fn(Utf8PathBuf, OpenOptions) -> FT,
        FT: Future<Output = Result<MaybeVirtualFile, std::io::Error>>,
    {
        let testdir = crate::config::PageServerConf::test_repo_dir(testname);
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -38,6 +38,9 @@ use tracing::*;
 use utils::crashsafe::path_with_suffix_extension;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

+#[cfg(feature = "testing")]
+use std::sync::atomic::{AtomicUsize, Ordering};
+
 use crate::metrics::{
    WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
    WAL_REDO_WAIT_TIME,
@@ -113,6 +116,9 @@ struct ProcessOutput {
 pub struct PostgresRedoManager {
    tenant_id: TenantId,
    conf: &'static PageServerConf,
+    /// Counter to separate same sized walredo inputs failing at the same millisecond.
+    #[cfg(feature = "testing")]
+    dump_sequence: AtomicUsize,

    stdout: Mutex<Option<ProcessOutput>>,
    stdin: Mutex<Option<ProcessInput>>,
@@ -224,6 +230,8 @@ impl PostgresRedoManager {
        PostgresRedoManager {
            tenant_id,
            conf,
+            #[cfg(feature = "testing")]
+            dump_sequence: AtomicUsize::default(),
            stdin: Mutex::new(None),
            stdout: Mutex::new(None),
            stderr: Mutex::new(None),
@@ -265,6 +273,7 @@ impl PostgresRedoManager {
            if proc.is_none() {
                self.launch(&mut proc, pg_version)?;
            }
+            let stdin_fd = proc.as_mut().unwrap().stdin.as_raw_fd();
            WAL_REDO_WAIT_TIME.observe(lock_time.duration_since(start_time).as_secs_f64());

            // Relational WAL records are applied using wal-redo-postgres
@@ -290,25 +299,27 @@ impl PostgresRedoManager {
            WAL_REDO_BYTES_HISTOGRAM.observe(nbytes as f64);

            debug!(
-				"postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
-				len,
-				nbytes,
-				duration.as_micros(),
-				lsn
-			);
+                "postgres applied {} WAL records ({} bytes) in {} us to reconstruct page image at LSN {}",
+                len,
+                nbytes,
+                duration.as_micros(),
+                lsn
+            );

            // If something went wrong, don't try to reuse the process. Kill it, and
            // next request will launch a new one.
-            if result.is_err() {
+            if let Err(e) = result.as_ref() {
                error!(
-                "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}",
-                records.len(),
-				records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-				records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                nbytes,
-				base_img_lsn,
-                lsn
-            );
+                    n_attempts,
+                    "error applying {} WAL records {}..{} ({} bytes) to base image with LSN {} to reconstruct page image at LSN {}: {}",
+                    records.len(),
+                    records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                    records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                    nbytes,
+                    base_img_lsn,
+                    lsn,
+                    utils::error::report_compact_sources(e),
+                );
                // self.stdin only holds stdin & stderr as_raw_fd().
                // Dropping it as part of take() doesn't close them.
                // The owning objects (ChildStdout and ChildStderr) are stored in
@@ -323,8 +334,12 @@ impl PostgresRedoManager {
                // and hence the current `apply_wal_records()` calls will observe
                //  `output.stdout.as_raw_fd() != stdout_fd` .
                if let Some(proc) = self.stdin.lock().unwrap().take() {
-                    proc.child.kill_and_wait();
+                    if stdin_fd == proc.stdin.as_raw_fd() {
+                        proc.child.kill_and_wait();
+                    }
                }
+            } else if n_attempts != 0 {
+                info!(n_attempts, "retried walredo succeeded");
            }
            n_attempts += 1;
            if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() {
@@ -742,7 +757,7 @@ impl PostgresRedoManager {
    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, pid=%input.as_ref().unwrap().child.id()))]
    fn apply_wal_records(
        &self,
-        mut input: MutexGuard<Option<ProcessInput>>,
+        input: MutexGuard<Option<ProcessInput>>,
        tag: BufferTag,
        base_img: &Option<Bytes>,
        records: &[(Lsn, NeonWalRecord)],
@@ -779,6 +794,23 @@ impl PostgresRedoManager {
        build_get_page_msg(tag, &mut writebuf);
        WAL_REDO_RECORD_COUNTER.inc_by(records.len() as u64);

+        let res = self.apply_wal_records0(&writebuf, input, wal_redo_timeout);
+
+        if res.is_err() {
+            // not all of these can be caused by this particular input, however these are so rare
+            // in tests so capture all.
+            self.record_and_log(&writebuf);
+        }
+
+        res
+    }
+
+    fn apply_wal_records0(
+        &self,
+        writebuf: &[u8],
+        mut input: MutexGuard<Option<ProcessInput>>,
+        wal_redo_timeout: Duration,
+    ) -> Result<Bytes, std::io::Error> {
        let proc = input.as_mut().unwrap();
        let mut nwrite = 0usize;
        let stdout_fd = proc.stdout_fd;
@@ -796,7 +828,7 @@ impl PostgresRedoManager {
        while nwrite < writebuf.len() {
            let n = loop {
                match nix::poll::poll(&mut pollfds[0..2], wal_redo_timeout.as_millis() as i32) {
-                    Err(e) if e == nix::errno::Errno::EINTR => continue,
+                    Err(nix::errno::Errno::EINTR) => continue,
                    res => break res,
                }
            }?;
@@ -888,7 +920,7 @@ impl PostgresRedoManager {
                // and forward any logging information that the child writes to its stderr to the page server's log.
                let n = loop {
                    match nix::poll::poll(&mut pollfds[1..3], wal_redo_timeout.as_millis() as i32) {
-                        Err(e) if e == nix::errno::Errno::EINTR => continue,
+                        Err(nix::errno::Errno::EINTR) => continue,
                        res => break res,
                    }
                }?;
@@ -984,6 +1016,38 @@ impl PostgresRedoManager {
        }
        Ok(res)
    }
+
+    #[cfg(feature = "testing")]
+    fn record_and_log(&self, writebuf: &[u8]) {
+        let millis = std::time::SystemTime::now()
+            .duration_since(std::time::SystemTime::UNIX_EPOCH)
+            .unwrap()
+            .as_millis();
+
+        let seq = self.dump_sequence.fetch_add(1, Ordering::Relaxed);
+
+        // name is walredo-<unix millis>-<input length>-<sequence>; these files will be collected to an allure report
+        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());
+
+        let path = self.conf.tenant_path(&self.tenant_id).join(&filename);
+
+        let res = std::fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .read(true)
+            .open(path)
+            .and_then(|mut f| f.write_all(writebuf));
+
+        // both outcomes are deliberately logged at error level: trip up allowed_errors
+        if let Err(e) = res {
+            tracing::error!(target=%filename, length=writebuf.len(), "failed to write out the walredo errored input: {e}");
+        } else {
+            tracing::error!(filename, "erroring walredo input saved");
+        }
+    }
+
+    #[cfg(not(feature = "testing"))]
+    fn record_and_log(&self, _: &[u8]) {}
 }

 /// Wrapper type around `std::process::Child` which guarantees that the child
@@ -1217,13 +1281,13 @@ mod tests {

    struct RedoHarness {
        // underscored because unused, except for removal at drop
-        _repo_dir: tempfile::TempDir,
+        _repo_dir: camino_tempfile::Utf8TempDir,
        manager: PostgresRedoManager,
    }

    impl RedoHarness {
        fn new() -> anyhow::Result<Self> {
-            let repo_dir = tempfile::tempdir()?;
+            let repo_dir = camino_tempfile::tempdir()?;
            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
            let conf = Box::leak(Box::new(conf));
            let tenant_id = TenantId::generate();
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -7,12 +7,12 @@ OBJS = \
 	extension_server.o \
 	file_cache.o \
 	libpagestore.o \
-	libpqwalproposer.o \
 	neon.o \
+	neon_utils.o \
 	pagestore_smgr.o \
 	relsize_cache.o \
 	walproposer.o \
-	walproposer_utils.o \
+	walproposer_pg.o \
 	control_plane_connector.o

 PG_CPPFLAGS = -I$(libpq_srcdir)
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -741,6 +741,13 @@ NeonProcessUtility(
 			break;
 		case T_DropdbStmt:
 			HandleDropDb(castNode(DropdbStmt, parseTree));
+			/*
+			 * We do this here to hack around the fact that Postgres performs the drop
+			 * INSIDE of standard_ProcessUtility, which means that if we try to
+			 * abort the drop normally it'll be too late. DROP DATABASE can't be inside
+			 * of a transaction block anyway, so this should be fine to do.
+			 */
+			NeonXactCallback(XACT_EVENT_PRE_COMMIT, NULL);
 			break;
 		case T_CreateRoleStmt:
 			HandleCreateRole(castNode(CreateRoleStmt, parseTree));
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -14,7 +14,6 @@
 */

 #include <sys/file.h>
-#include <sys/statvfs.h>
 #include <unistd.h>
 #include <fcntl.h>

@@ -38,9 +37,6 @@
 #include "storage/fd.h"
 #include "storage/pg_shmem.h"
 #include "storage/buf_internals.h"
-#include "storage/procsignal.h"
-#include "postmaster/bgworker.h"
-#include "postmaster/interrupt.h"

 /*
 * Local file cache is used to temporary store relations pages in local file system.
@@ -66,9 +62,6 @@

 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK))

-#define MAX_MONITOR_INTERVAL_USEC 1000000 /* 1 second */
-#define MAX_DISK_WRITE_RATE       1000 /* MB/sec */
-
 typedef struct FileCacheEntry
 {
 	BufferTag	key;
@@ -91,14 +84,12 @@ static int   lfc_desc = 0;
 static LWLockId lfc_lock;
 static int   lfc_max_size;
 static int   lfc_size_limit;
-static int   lfc_free_space_watermark;
 static char* lfc_path;
 static  FileCacheControl* lfc_ctl;
 static shmem_startup_hook_type prev_shmem_startup_hook;
 #if PG_VERSION_NUM>=150000
 static shmem_request_hook_type prev_shmem_request_hook;
 #endif
-static int   lfc_shrinking_factor; /* power of two by which local cache size will be shrinked when lfc_free_space_watermark is reached */

 void FileCacheMonitorMain(Datum main_arg);

@@ -254,80 +245,6 @@ lfc_change_limit_hook(int newval, void *extra)
 	LWLockRelease(lfc_lock);
 }

-/*
- * Local file system state monitor check available free space.
- * If it is lower than lfc_free_space_watermark then we shrink size of local cache
- * but throwing away least recently accessed chunks.
- * First time low space watermark is reached cache size is divided by two,
- * second time by four,... Finally we remove all chunks from local cache.
- *
- * Please notice that we are not changing lfc_cache_size: it is used to be adjusted by autoscaler.
- * We only throw away cached chunks but do not prevent from filling cache by new chunks.
- *
- * Interval of poooling cache state is calculated as minimal time needed to consume lfc_free_space_watermark
- * disk space with maximal possible disk write speed (1Gb/sec). But not larger than 1 second.
- * Calling statvfs each second should not add any noticeable overhead.
- */
-void
-FileCacheMonitorMain(Datum main_arg)
-{
-	/*
-	 * Choose file system state monitor interval so that space can not be exosted
-	 * during this period but not longer than  MAX_MONITOR_INTERVAL (10 sec)
-	 */
-	uint64 monitor_interval = Min(MAX_MONITOR_INTERVAL_USEC, lfc_free_space_watermark*MB/MAX_DISK_WRITE_RATE);
-
-	/* Establish signal handlers. */
-	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
-	pqsignal(SIGHUP, SignalHandlerForConfigReload);
-	pqsignal(SIGTERM, SignalHandlerForShutdownRequest);
-	BackgroundWorkerUnblockSignals();
-
-	/* Periodically dump buffers until terminated. */
-	while (!ShutdownRequestPending)
-	{
-		if (lfc_size_limit != 0)
-		{
-			struct statvfs sfs;
-			if (statvfs(lfc_path, &sfs) < 0)
-			{
-				elog(WARNING, "Failed to obtain status of %s: %m", lfc_path);
-			}
-			else
-			{
-				if (sfs.f_bavail*sfs.f_bsize < lfc_free_space_watermark*MB)
-				{
-					if (lfc_shrinking_factor < 31) {
-						lfc_shrinking_factor += 1;
-					}
-					lfc_change_limit_hook(lfc_size_limit >> lfc_shrinking_factor, NULL);
-				}
-				else
-					lfc_shrinking_factor = 0; /* reset to initial value */
-			}
-		}
-		pg_usleep(monitor_interval);
-	}
-}
-
-static void
-lfc_register_free_space_monitor(void)
-{
-	BackgroundWorker bgw;
-	memset(&bgw, 0, sizeof(bgw));
-	bgw.bgw_flags = BGWORKER_SHMEM_ACCESS;
-	bgw.bgw_start_time = BgWorkerStart_RecoveryFinished;
-	snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon");
-	snprintf(bgw.bgw_function_name, BGW_MAXLEN, "FileCacheMonitorMain");
-	snprintf(bgw.bgw_name, BGW_MAXLEN, "Local free space monitor");
-	snprintf(bgw.bgw_type, BGW_MAXLEN, "Local free space monitor");
-	bgw.bgw_restart_time = 5;
-	bgw.bgw_notify_pid = 0;
-	bgw.bgw_main_arg = (Datum) 0;
-
-	RegisterBackgroundWorker(&bgw);
-}
-
 void
 lfc_init(void)
 {
@@ -364,19 +281,6 @@ lfc_init(void)
 							lfc_change_limit_hook,
 							NULL);

-	DefineCustomIntVariable("neon.free_space_watermark",
-							"Minimal free space in local file system after reaching which local file cache will be truncated",
-							NULL,
-							&lfc_free_space_watermark,
-							1024, /* 1GB */
-							0,
-							INT_MAX,
-							PGC_SIGHUP,
-							GUC_UNIT_MB,
-							NULL,
-							NULL,
-							NULL);
-
 	DefineCustomStringVariable("neon.file_cache_path",
 							   "Path to local file cache (can be raw device)",
 							   NULL,
@@ -391,9 +295,6 @@ lfc_init(void)
 	if (lfc_max_size == 0)
 		return;

-	if (lfc_free_space_watermark != 0)
-		lfc_register_free_space_monitor();
-
 	prev_shmem_startup_hook = shmem_startup_hook;
 	shmem_startup_hook = lfc_shmem_startup;
 #if PG_VERSION_NUM>=150000
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -30,7 +30,7 @@

 #include "neon.h"
 #include "walproposer.h"
-#include "walproposer_utils.h"
+#include "neon_utils.h"

 #define PageStoreTrace DEBUG5

--- a/pgxn/neon/libpqwalproposer.c
+++ b/pgxn/neon/libpqwalproposer.c
@@ -1,424 +0,0 @@
-#include "postgres.h"
-
-#include "libpq-fe.h"
-#include "neon.h"
-#include "walproposer.h"
-
-/* Header in walproposer.h -- Wrapper struct to abstract away the libpq connection */
-struct WalProposerConn
-{
-	PGconn	   *pg_conn;
-	bool		is_nonblocking; /* whether the connection is non-blocking */
-	char	   *recvbuf;		/* last received data from
-								 * walprop_async_read */
-};
-
-/* Helper function */
-static bool
-ensure_nonblocking_status(WalProposerConn *conn, bool is_nonblocking)
-{
-	/* If we're already correctly blocking or nonblocking, all good */
-	if (is_nonblocking == conn->is_nonblocking)
-		return true;
-
-	/* Otherwise, set it appropriately */
-	if (PQsetnonblocking(conn->pg_conn, is_nonblocking) == -1)
-		return false;
-
-	conn->is_nonblocking = is_nonblocking;
-	return true;
-}
-
-/* Exported function definitions */
-char *
-walprop_error_message(WalProposerConn *conn)
-{
-	return PQerrorMessage(conn->pg_conn);
-}
-
-WalProposerConnStatusType
-walprop_status(WalProposerConn *conn)
-{
-	switch (PQstatus(conn->pg_conn))
-	{
-		case CONNECTION_OK:
-			return WP_CONNECTION_OK;
-		case CONNECTION_BAD:
-			return WP_CONNECTION_BAD;
-		default:
-			return WP_CONNECTION_IN_PROGRESS;
-	}
-}
-
-WalProposerConn *
-walprop_connect_start(char *conninfo, char *password)
-{
-	WalProposerConn *conn;
-	PGconn	   *pg_conn;
-	const char *keywords[3];
-	const char *values[3];
-	int			n;
-
-	/*
-	 * Connect using the given connection string. If the
-	 * NEON_AUTH_TOKEN environment variable was set, use that as
-	 * the password.
-	 *
-	 * The connection options are parsed in the order they're given, so
-	 * when we set the password before the connection string, the
-	 * connection string can override the password from the env variable.
-	 * Seems useful, although we don't currently use that capability
-	 * anywhere.
-	 */
-	n = 0;
-	if (password)
-	{
-		keywords[n] = "password";
-		values[n] = password;
-		n++;
-	}
-	keywords[n] = "dbname";
-	values[n] = conninfo;
-	n++;
-	keywords[n] = NULL;
-	values[n] = NULL;
-	n++;
-	pg_conn = PQconnectStartParams(keywords, values, 1);
-
-	/*
-	 * Allocation of a PQconn can fail, and will return NULL. We want to fully
-	 * replicate the behavior of PQconnectStart here.
-	 */
-	if (!pg_conn)
-		return NULL;
-
-	/*
-	 * And in theory this allocation can fail as well, but it's incredibly
-	 * unlikely if we just successfully allocated a PGconn.
-	 *
-	 * palloc will exit on failure though, so there's not much we could do if
-	 * it *did* fail.
-	 */
-	conn = palloc(sizeof(WalProposerConn));
-	conn->pg_conn = pg_conn;
-	conn->is_nonblocking = false;	/* connections always start in blocking
-									 * mode */
-	conn->recvbuf = NULL;
-	return conn;
-}
-
-WalProposerConnectPollStatusType
-walprop_connect_poll(WalProposerConn *conn)
-{
-	WalProposerConnectPollStatusType return_val;
-
-	switch (PQconnectPoll(conn->pg_conn))
-	{
-		case PGRES_POLLING_FAILED:
-			return_val = WP_CONN_POLLING_FAILED;
-			break;
-		case PGRES_POLLING_READING:
-			return_val = WP_CONN_POLLING_READING;
-			break;
-		case PGRES_POLLING_WRITING:
-			return_val = WP_CONN_POLLING_WRITING;
-			break;
-		case PGRES_POLLING_OK:
-			return_val = WP_CONN_POLLING_OK;
-			break;
-
-			/*
-			 * There's a comment at its source about this constant being
-			 * unused. We'll expect it's never returned.
-			 */
-		case PGRES_POLLING_ACTIVE:
-			elog(FATAL, "Unexpected PGRES_POLLING_ACTIVE returned from PQconnectPoll");
-
-			/*
-			 * This return is never actually reached, but it's here to make
-			 * the compiler happy
-			 */
-			return WP_CONN_POLLING_FAILED;
-
-		default:
-			Assert(false);
-			return_val = WP_CONN_POLLING_FAILED;	/* keep the compiler quiet */
-	}
-
-	return return_val;
-}
-
-bool
-walprop_send_query(WalProposerConn *conn, char *query)
-{
-	/*
-	 * We need to be in blocking mode for sending the query to run without
-	 * requiring a call to PQflush
-	 */
-	if (!ensure_nonblocking_status(conn, false))
-		return false;
-
-	/* PQsendQuery returns 1 on success, 0 on failure */
-	if (!PQsendQuery(conn->pg_conn, query))
-		return false;
-
-	return true;
-}
-
-WalProposerExecStatusType
-walprop_get_query_result(WalProposerConn *conn)
-{
-	PGresult   *result;
-	WalProposerExecStatusType return_val;
-
-	/* Marker variable if we need to log an unexpected success result */
-	char	   *unexpected_success = NULL;
-
-	/* Consume any input that we might be missing */
-	if (!PQconsumeInput(conn->pg_conn))
-		return WP_EXEC_FAILED;
-
-	if (PQisBusy(conn->pg_conn))
-		return WP_EXEC_NEEDS_INPUT;
-
-
-	result = PQgetResult(conn->pg_conn);
-
-	/*
-	 * PQgetResult returns NULL only if getting the result was successful &
-	 * there's no more of the result to get.
-	 */
-	if (!result)
-	{
-		elog(WARNING, "[libpqwalproposer] Unexpected successful end of command results");
-		return WP_EXEC_UNEXPECTED_SUCCESS;
-	}
-
-	/* Helper macro to reduce boilerplate */
-#define UNEXPECTED_SUCCESS(msg) \
-		return_val = WP_EXEC_UNEXPECTED_SUCCESS; \
-		unexpected_success = msg; \
-		break;
-
-
-	switch (PQresultStatus(result))
-	{
-			/* "true" success case */
-		case PGRES_COPY_BOTH:
-			return_val = WP_EXEC_SUCCESS_COPYBOTH;
-			break;
-
-			/* Unexpected success case */
-		case PGRES_EMPTY_QUERY:
-			UNEXPECTED_SUCCESS("empty query return");
-		case PGRES_COMMAND_OK:
-			UNEXPECTED_SUCCESS("data-less command end");
-		case PGRES_TUPLES_OK:
-			UNEXPECTED_SUCCESS("tuples return");
-		case PGRES_COPY_OUT:
-			UNEXPECTED_SUCCESS("'Copy Out' response");
-		case PGRES_COPY_IN:
-			UNEXPECTED_SUCCESS("'Copy In' response");
-		case PGRES_SINGLE_TUPLE:
-			UNEXPECTED_SUCCESS("single tuple return");
-		case PGRES_PIPELINE_SYNC:
-			UNEXPECTED_SUCCESS("pipeline sync point");
-
-			/* Failure cases */
-		case PGRES_BAD_RESPONSE:
-		case PGRES_NONFATAL_ERROR:
-		case PGRES_FATAL_ERROR:
-		case PGRES_PIPELINE_ABORTED:
-			return_val = WP_EXEC_FAILED;
-			break;
-
-		default:
-			Assert(false);
-			return_val = WP_EXEC_FAILED;	/* keep the compiler quiet */
-	}
-
-	if (unexpected_success)
-		elog(WARNING, "[libpqwalproposer] Unexpected successful %s", unexpected_success);
-
-	return return_val;
-}
-
-pgsocket
-walprop_socket(WalProposerConn *conn)
-{
-	return PQsocket(conn->pg_conn);
-}
-
-int
-walprop_flush(WalProposerConn *conn)
-{
-	return (PQflush(conn->pg_conn));
-}
-
-void
-walprop_finish(WalProposerConn *conn)
-{
-	if (conn->recvbuf != NULL)
-		PQfreemem(conn->recvbuf);
-	PQfinish(conn->pg_conn);
-	pfree(conn);
-}
-
-/*
- * Receive a message from the safekeeper.
- *
- * On success, the data is placed in *buf. It is valid until the next call
- * to this function.
- */
-PGAsyncReadResult
-walprop_async_read(WalProposerConn *conn, char **buf, int *amount)
-{
-	int			result;
-
-	if (conn->recvbuf != NULL)
-	{
-		PQfreemem(conn->recvbuf);
-		conn->recvbuf = NULL;
-	}
-
-	/* Call PQconsumeInput so that we have the data we need */
-	if (!PQconsumeInput(conn->pg_conn))
-	{
-		*amount = 0;
-		*buf = NULL;
-		return PG_ASYNC_READ_FAIL;
-	}
-
-	/*
-	 * The docs for PQgetCopyData list the return values as: 0 if the copy is
-	 * still in progress, but no "complete row" is available -1 if the copy is
-	 * done -2 if an error occurred (> 0) if it was successful; that value is
-	 * the amount transferred.
-	 *
-	 * The protocol we use between walproposer and safekeeper means that we
-	 * *usually* wouldn't expect to see that the copy is done, but this can
-	 * sometimes be triggered by the server returning an ErrorResponse (which
-	 * also happens to have the effect that the copy is done).
-	 */
-	switch (result = PQgetCopyData(conn->pg_conn, &conn->recvbuf, true))
-	{
-		case 0:
-			*amount = 0;
-			*buf = NULL;
-			return PG_ASYNC_READ_TRY_AGAIN;
-		case -1:
-			{
-				/*
-				 * If we get -1, it's probably because of a server error; the
-				 * safekeeper won't normally send a CopyDone message.
-				 *
-				 * We can check PQgetResult to make sure that the server
-				 * failed; it'll always result in PGRES_FATAL_ERROR
-				 */
-				ExecStatusType status = PQresultStatus(PQgetResult(conn->pg_conn));
-
-				if (status != PGRES_FATAL_ERROR)
-					elog(FATAL, "unexpected result status %d after failed PQgetCopyData", status);
-
-				/*
-				 * If there was actually an error, it'll be properly reported
-				 * by calls to PQerrorMessage -- we don't have to do anything
-				 * else
-				 */
-				*amount = 0;
-				*buf = NULL;
-				return PG_ASYNC_READ_FAIL;
-			}
-		case -2:
-			*amount = 0;
-			*buf = NULL;
-			return PG_ASYNC_READ_FAIL;
-		default:
-			/* Positive values indicate the size of the returned result */
-			*amount = result;
-			*buf = conn->recvbuf;
-			return PG_ASYNC_READ_SUCCESS;
-	}
-}
-
-PGAsyncWriteResult
-walprop_async_write(WalProposerConn *conn, void const *buf, size_t size)
-{
-	int			result;
-
-	/* If we aren't in non-blocking mode, switch to it. */
-	if (!ensure_nonblocking_status(conn, true))
-		return PG_ASYNC_WRITE_FAIL;
-
-	/*
-	 * The docs for PQputcopyData list the return values as: 1 if the data was
-	 * queued, 0 if it was not queued because of full buffers, or -1 if an
-	 * error occurred
-	 */
-	result = PQputCopyData(conn->pg_conn, buf, size);
-
-	/*
-	 * We won't get a result of zero because walproposer always empties the
-	 * connection's buffers before sending more
-	 */
-	Assert(result != 0);
-
-	switch (result)
-	{
-		case 1:
-			/* good -- continue */
-			break;
-		case -1:
-			return PG_ASYNC_WRITE_FAIL;
-		default:
-			elog(FATAL, "invalid return %d from PQputCopyData", result);
-	}
-
-	/*
-	 * After queueing the data, we still need to flush to get it to send. This
-	 * might take multiple tries, but we don't want to wait around until it's
-	 * done.
-	 *
-	 * PQflush has the following returns (directly quoting the docs): 0 if
-	 * sucessful, 1 if it was unable to send all the data in the send queue
-	 * yet -1 if it failed for some reason
-	 */
-	switch (result = PQflush(conn->pg_conn))
-	{
-		case 0:
-			return PG_ASYNC_WRITE_SUCCESS;
-		case 1:
-			return PG_ASYNC_WRITE_TRY_FLUSH;
-		case -1:
-			return PG_ASYNC_WRITE_FAIL;
-		default:
-			elog(FATAL, "invalid return %d from PQflush", result);
-	}
-}
-
-/*
- * This function is very similar to walprop_async_write. For more
- * information, refer to the comments there.
- */
-bool
-walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size)
-{
-	int			result;
-
-	/* If we are in non-blocking mode, switch out of it. */
-	if (!ensure_nonblocking_status(conn, false))
-		return false;
-
-	if ((result = PQputCopyData(conn->pg_conn, buf, size)) == -1)
-		return false;
-
-	Assert(result == 1);
-
-	/* Because the connection is non-blocking, flushing returns 0 or -1 */
-
-	if ((result = PQflush(conn->pg_conn)) == -1)
-		return false;
-
-	Assert(result == 0);
-	return true;
-}
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -18,6 +18,10 @@ extern char *neon_auth_token;
 extern char *neon_timeline;
 extern char *neon_tenant;

+extern char *wal_acceptors_list;
+extern int	wal_acceptor_reconnect_timeout;
+extern int	wal_acceptor_connection_timeout;
+
 extern void pg_init_libpagestore(void);
 extern void pg_init_walproposer(void);

@@ -30,4 +34,10 @@ extern void pg_init_extension_server(void);
 extern bool neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id);
 extern bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id);

+extern uint64 BackpressureThrottlingTime(void);
+extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
+
+extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
+extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
+
 #endif							/* NEON_H */
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -0,0 +1,116 @@
+#include "postgres.h"
+
+#include "access/timeline.h"
+#include "access/xlogutils.h"
+#include "common/logging.h"
+#include "common/ip.h"
+#include "funcapi.h"
+#include "libpq/libpq.h"
+#include "libpq/pqformat.h"
+#include "miscadmin.h"
+#include "postmaster/interrupt.h"
+#include "replication/slot.h"
+#include "replication/walsender_private.h"
+
+#include "storage/ipc.h"
+#include "utils/builtins.h"
+#include "utils/ps_status.h"
+
+#include "libpq-fe.h"
+#include <netinet/tcp.h>
+#include <unistd.h>
+
+#if PG_VERSION_NUM >= 150000
+#include "access/xlogutils.h"
+#include "access/xlogrecovery.h"
+#endif
+#if PG_MAJORVERSION_NUM >= 16
+#include "utils/guc.h"
+#endif
+
+/*
+ * Convert a hexadecimal digit character (0-9, a-f or A-F) to its value 0..15.
+ *
+ * Returns -1 if the character is not a hexadecimal digit.
+ */
+int
+HexDecodeChar(char c)
+{
+	if (c >= '0' && c <= '9')
+		return c - '0';
+	if (c >= 'a' && c <= 'f')
+		return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F')
+		return c - 'A' + 10;
+
+	return -1;
+}
+
+/*
+ * Decode 2 * nbytes hex chars from input into nbytes bytes stored at result.
+ *
+ * Returns false at the first invalid character (result may then be partially written); otherwise true.
+ */
+bool
+HexDecodeString(uint8 *result, char *input, int nbytes)
+{
+	int			i;
+
+	for (i = 0; i < nbytes; ++i)
+	{
+		int			n1 = HexDecodeChar(input[i * 2]);
+		int			n2 = HexDecodeChar(input[i * 2 + 1]);
+
+		if (n1 < 0 || n2 < 0)
+			return false;
+		result[i] = n1 * 16 + n2;
+	}
+
+	return true;
+}
+
+/* --------------------------------
+ *		pq_getmsgint32_le	- get a binary 4-byte int from a message buffer, copied as-is in native (LE) order
+ * --------------------------------
+ */
+uint32
+pq_getmsgint32_le(StringInfo msg)
+{
+	uint32		n32;
+
+	pq_copymsgbytes(msg, (char *) &n32, sizeof(n32));
+
+	return n32;
+}
+
+/* --------------------------------
+ *		pq_getmsgint64_le	- get a binary 8-byte int from a message buffer, copied as-is in native (LE) order
+ * --------------------------------
+ */
+uint64
+pq_getmsgint64_le(StringInfo msg)
+{
+	uint64		n64;
+
+	pq_copymsgbytes(msg, (char *) &n64, sizeof(n64));
+
+	return n64;
+}
+
+/* append the raw bytes of a [u]int32, in native (LE) order, to a StringInfo buffer, enlarging it first */
+void
+pq_sendint32_le(StringInfo buf, uint32 i)
+{
+	enlargeStringInfo(buf, sizeof(uint32));
+	memcpy(buf->data + buf->len, &i, sizeof(uint32));
+	buf->len += sizeof(uint32);
+}
+
+/* append the raw bytes of a [u]int64, in native (LE) order, to a StringInfo buffer, enlarging it first */
+void
+pq_sendint64_le(StringInfo buf, uint64 i)
+{
+	enlargeStringInfo(buf, sizeof(uint64));
+	memcpy(buf->data + buf->len, &i, sizeof(uint64));
+	buf->len += sizeof(uint64);
+}
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -0,0 +1,12 @@
+#ifndef __NEON_UTILS_H__
+#define __NEON_UTILS_H__
+
+#include "postgres.h"
+
+bool		HexDecodeString(uint8 *result, char *input, int nbytes);
+uint32		pq_getmsgint32_le(StringInfo msg);
+uint64		pq_getmsgint64_le(StringInfo msg);
+void		pq_sendint32_le(StringInfo buf, uint32 i);
+void		pq_sendint64_le(StringInfo buf, uint64 i);
+
+#endif							/* __NEON_UTILS_H__ */
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -721,7 +721,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls

 	/* use an intermediate PrefetchRequest struct to ensure correct alignment */
 	req.buftag = tag;
-	
+  Retry:
 	entry = prfh_lookup(MyPState->prf_hash, (PrefetchRequest *) &req);

 	if (entry != NULL)
@@ -858,7 +858,11 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		page_server->flush();
+		if (!page_server->flush())
+		{
+			/* Prefetch set is reset in case of error, so we should try to register our request once again */
+			goto Retry;
+		}
 		MyPState->ring_flush = MyPState->ring_unused;
 	}

@@ -1790,6 +1794,14 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blocknum,
 	if (!XLogInsertAllowed())
 		return;

+	/* ensure we have enough xlog buffers to log max-sized records */
+	XLogEnsureRecordSpace(Min(remblocks, (XLR_MAX_BLOCK_ID - 1)), 0);
+
+	/*
+	 * Iterate over all the pages. They are collected into batches of
+	 * XLR_MAX_BLOCK_ID pages, and a single WAL-record is written for each
+	 * batch.
+	 */
 	while (remblocks > 0)
 	{
 		int			count = Min(remblocks, XLR_MAX_BLOCK_ID);
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -1,8 +1,8 @@
 #ifndef __NEON_WALPROPOSER_H__
 #define __NEON_WALPROPOSER_H__

-#include "access/xlogdefs.h"
 #include "postgres.h"
+#include "access/xlogdefs.h"
 #include "port.h"
 #include "access/xlog_internal.h"
 #include "access/transam.h"
@@ -16,29 +16,15 @@
 #define MAX_SAFEKEEPERS 32
 #define MAX_SEND_SIZE (XLOG_BLCKSZ * 16)	/* max size of a single* WAL
 											 * message */
-#define XLOG_HDR_SIZE (1 + 8 * 3)	/* 'w' + startPos + walEnd + timestamp */
-#define XLOG_HDR_START_POS 1	/* offset of start position in wal sender*
-								 * message header */
-#define XLOG_HDR_END_POS (1 + 8)	/* offset of end position in wal sender*
-									 * message header */
-
 /*
 * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred,
 * because all WL_* events are given flags equal to some (1 << i), starting from i = 0
 */
 #define WL_NO_EVENTS 0

-extern char *wal_acceptors_list;
-extern int	wal_acceptor_reconnect_timeout;
-extern int	wal_acceptor_connection_timeout;
-extern bool am_wal_proposer;
-
-struct WalProposerConn;			/* Defined in libpqwalproposer */
+struct WalProposerConn;			/* Defined in implementation (walproposer_pg.c) */
 typedef struct WalProposerConn WalProposerConn;

-struct WalMessage;
-typedef struct WalMessage WalMessage;
-
 /* Possible return values from ReadPGAsync */
 typedef enum
 {
@@ -52,7 +38,7 @@ typedef enum
 	PG_ASYNC_READ_TRY_AGAIN,
 	/* Reading failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_READ_FAIL,
-}			PGAsyncReadResult;
+} PGAsyncReadResult;

 /* Possible return values from WritePGAsync */
 typedef enum
@@ -71,7 +57,7 @@ typedef enum
 	PG_ASYNC_WRITE_TRY_FLUSH,
 	/* Writing failed. Check PQerrorMessage(conn) */
 	PG_ASYNC_WRITE_FAIL,
-}			PGAsyncWriteResult;
+} PGAsyncWriteResult;

 /*
 * WAL safekeeper state, which is used to wait for some event.
@@ -147,7 +133,7 @@ typedef enum
 	 * to read.
 	 */
 	SS_ACTIVE,
-}			SafekeeperState;
+} SafekeeperState;

 /* Consensus logical timestamp. */
 typedef uint64 term_t;
@@ -171,12 +157,12 @@ typedef struct ProposerGreeting
 	uint8		tenant_id[16];
 	TimeLineID	timeline;
 	uint32		walSegSize;
-}			ProposerGreeting;
+} ProposerGreeting;

 typedef struct AcceptorProposerMessage
 {
 	uint64		tag;
-}			AcceptorProposerMessage;
+} AcceptorProposerMessage;

 /*
 * Acceptor -> Proposer initial response: the highest term acceptor voted for.
@@ -186,7 +172,7 @@ typedef struct AcceptorGreeting
 	AcceptorProposerMessage apm;
 	term_t		term;
 	NNodeId		nodeId;
-}			AcceptorGreeting;
+} AcceptorGreeting;

 /*
 * Proposer -> Acceptor vote request.
@@ -196,20 +182,20 @@ typedef struct VoteRequest
 	uint64		tag;
 	term_t		term;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-}			VoteRequest;
+} VoteRequest;

 /* Element of term switching chain. */
 typedef struct TermSwitchEntry
 {
 	term_t		term;
 	XLogRecPtr	lsn;
-}			TermSwitchEntry;
+} TermSwitchEntry;

 typedef struct TermHistory
 {
 	uint32		n_entries;
 	TermSwitchEntry *entries;
-}			TermHistory;
+} TermHistory;

 /* Vote itself, sent from safekeeper to proposer */
 typedef struct VoteResponse
@@ -227,7 +213,7 @@ typedef struct VoteResponse
 								 * recovery of some safekeeper */
 	TermHistory termHistory;
 	XLogRecPtr	timelineStartLsn;	/* timeline globally starts at this LSN */
-}			VoteResponse;
+} VoteResponse;

 /*
 * Proposer -> Acceptor message announcing proposer is elected and communicating
@@ -243,7 +229,7 @@ typedef struct ProposerElected
 	TermHistory *termHistory;
 	/* timeline globally starts at this LSN */
 	XLogRecPtr	timelineStartLsn;
-}			ProposerElected;
+} ProposerElected;

 /*
 * Header of request with WAL message sent from proposer to safekeeper.
@@ -268,7 +254,7 @@ typedef struct AppendRequestHeader
 	 */
 	XLogRecPtr	truncateLsn;
 	pg_uuid_t	proposerId;		/* for monitoring/debugging */
-}			AppendRequestHeader;
+} AppendRequestHeader;

 /*
 * Hot standby feedback received from replica
@@ -278,7 +264,7 @@ typedef struct HotStandbyFeedback
 	TimestampTz ts;
 	FullTransactionId xmin;
 	FullTransactionId catalog_xmin;
-}			HotStandbyFeedback;
+} HotStandbyFeedback;

 typedef struct PageserverFeedback
 {
@@ -289,7 +275,7 @@ typedef struct PageserverFeedback
 	XLogRecPtr	disk_consistent_lsn;
 	XLogRecPtr	remote_consistent_lsn;
 	TimestampTz replytime;
-}			PageserverFeedback;
+} PageserverFeedback;

 typedef struct WalproposerShmemState
 {
@@ -297,7 +283,7 @@ typedef struct WalproposerShmemState
 	PageserverFeedback feedback;
 	term_t		mineLastElectedTerm;
 	pg_atomic_uint64 backpressureThrottlingTime;
-}			WalproposerShmemState;
+} WalproposerShmemState;

 /*
 * Report safekeeper state to proposer
@@ -321,17 +307,22 @@ typedef struct AppendResponse
 	/* and custom neon feedback. */
 	/* This part of the message is extensible. */
 	PageserverFeedback rf;
-}			AppendResponse;
+} AppendResponse;

 /*  PageserverFeedback is extensible part of the message that is parsed separately */
 /*  Other fields are fixed part */
 #define APPENDRESPONSE_FIXEDPART_SIZE offsetof(AppendResponse, rf)

+struct WalProposer;
+typedef struct WalProposer WalProposer;
+
 /*
 * Descriptor of safekeeper
 */
 typedef struct Safekeeper
 {
+	WalProposer *wp;
+
 	char const *host;
 	char const *port;

@@ -340,7 +331,7 @@ typedef struct Safekeeper
 	 *
 	 * May contain private information like password and should not be logged.
 	 */
-	char conninfo[MAXCONNINFO];
+	char		conninfo[MAXCONNINFO];

 	/*
 	 * postgres protocol connection to the WAL acceptor
@@ -373,27 +364,12 @@ typedef struct Safekeeper
 	int			eventPos;		/* position in wait event set. Equal to -1 if*
 								 * no event */
 	SafekeeperState state;		/* safekeeper state machine state */
-	TimestampTz latestMsgReceivedAt;        /* when latest msg is received */
+	TimestampTz latestMsgReceivedAt;	/* when latest msg is received */
 	AcceptorGreeting greetResponse; /* acceptor greeting */
 	VoteResponse voteResponse;	/* the vote */
 	AppendResponse appendResponse;	/* feedback for master */
 } Safekeeper;

-extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
-extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
-extern void WalProposerBroadcast(XLogRecPtr startpos, XLogRecPtr endpos);
-extern void WalProposerPoll(void);
-extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
-											PageserverFeedback *rf);
-extern void StartProposerReplication(StartReplicationCmd *cmd);
-
-extern Size WalproposerShmemSize(void);
-extern bool WalproposerShmemInit(void);
-extern void replication_feedback_set(PageserverFeedback *rf);
-extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);
-
-/* libpqwalproposer hooks & helper type */
-
 /* Re-exported PostgresPollingStatusType */
 typedef enum
 {
@@ -406,7 +382,7 @@ typedef enum
 	 * 'libpq-fe.h' still has PGRES_POLLING_ACTIVE, but says it's unused.
 	 * We've removed it here to avoid clutter.
 	 */
-}			WalProposerConnectPollStatusType;
+} WalProposerConnectPollStatusType;

 /* Re-exported and modified ExecStatusType */
 typedef enum
@@ -431,7 +407,7 @@ typedef enum
 	WP_EXEC_NEEDS_INPUT,
 	/* Catch-all failure. Check PQerrorMessage. */
 	WP_EXEC_FAILED,
-}			WalProposerExecStatusType;
+} WalProposerExecStatusType;

 /* Re-exported ConnStatusType */
 typedef enum
@@ -445,67 +421,252 @@ typedef enum
 	 * that extra functionality, so we collect them into a single tag here.
 	 */
 	WP_CONNECTION_IN_PROGRESS,
-}			WalProposerConnStatusType;
-
-/* Re-exported PQerrorMessage */
-extern char *walprop_error_message(WalProposerConn *conn);
-
-/* Re-exported PQstatus */
-extern WalProposerConnStatusType walprop_status(WalProposerConn *conn);
-
-/* Re-exported PQconnectStart */
-extern WalProposerConn * walprop_connect_start(char *conninfo, char *password);
-
-/* Re-exported PQconectPoll */
-extern WalProposerConnectPollStatusType walprop_connect_poll(WalProposerConn *conn);
-
-/* Blocking wrapper around PQsendQuery */
-extern bool walprop_send_query(WalProposerConn *conn, char *query);
-
-/* Wrapper around PQconsumeInput + PQisBusy + PQgetResult */
-extern WalProposerExecStatusType walprop_get_query_result(WalProposerConn *conn);
-
-/* Re-exported PQsocket */
-extern pgsocket walprop_socket(WalProposerConn *conn);
-
-/* Wrapper around PQconsumeInput (if socket's read-ready) + PQflush */
-extern int	walprop_flush(WalProposerConn *conn);
-
-/* Re-exported PQfinish */
-extern void walprop_finish(WalProposerConn *conn);
+} WalProposerConnStatusType;

 /*
- * Ergonomic wrapper around PGgetCopyData
- *
- * Reads a CopyData block from a safekeeper, setting *amount to the number
- * of bytes returned.
- *
- * This function is allowed to assume certain properties specific to the
- * protocol with the safekeepers, so it should not be used as-is for any
- * other purpose.
- *
- * Note: If possible, using <AsyncRead> is generally preferred, because it
- * performs a bit of extra checking work that's always required and is normally
- * somewhat verbose.
+ * Collection of hooks for walproposer, to call postgres functions,
+ * read WAL and send it over the network.
 */
-extern PGAsyncReadResult walprop_async_read(WalProposerConn *conn, char **buf, int *amount);
+typedef struct walproposer_api
+{
+	/*
+	 * Get WalproposerShmemState. This is used to store information about last
+	 * elected term.
+	 */
+	WalproposerShmemState *(*get_shmem_state) (void);
+
+	/*
+	 * Start receiving notifications about new WAL. This is an infinite loop
+	 * which calls WalProposerBroadcast() and WalProposerPoll() to send the
+	 * WAL.
+	 */
+	void		(*start_streaming) (WalProposer *wp, XLogRecPtr startpos);
+
+	/* Get pointer to the latest available WAL. */
+	XLogRecPtr	(*get_flush_rec_ptr) (void);
+
+	/* Get current time. */
+	TimestampTz (*get_current_timestamp) (void);
+
+	/* Get postgres timeline. */
+	TimeLineID	(*get_timeline_id) (void);
+
+	/* Current error message, aka PQerrorMessage. */
+	char	   *(*conn_error_message) (WalProposerConn *conn);
+
+	/* Connection status, aka PQstatus. */
+	WalProposerConnStatusType (*conn_status) (WalProposerConn *conn);
+
+	/* Start the connection, aka PQconnectStart. */
+	WalProposerConn *(*conn_connect_start) (char *conninfo);
+
+	/* Poll an asynchronous connection, aka PQconnectPoll. */
+	WalProposerConnectPollStatusType (*conn_connect_poll) (WalProposerConn *conn);
+
+	/* Send a blocking SQL query, aka PQsendQuery. */
+	bool		(*conn_send_query) (WalProposerConn *conn, char *query);
+
+	/* Read the query result, aka PQgetResult. */
+	WalProposerExecStatusType (*conn_get_query_result) (WalProposerConn *conn);
+
+	/* Flush buffer to the network, aka PQflush. */
+	int			(*conn_flush) (WalProposerConn *conn);
+
+	/* Close the connection, aka PQfinish. */
+	void		(*conn_finish) (WalProposerConn *conn);
+
+	/* Try to read CopyData message, aka PQgetCopyData. */
+	PGAsyncReadResult (*conn_async_read) (WalProposerConn *conn, char **buf, int *amount);
+
+	/* Try to write CopyData message, aka PQputCopyData. */
+	PGAsyncWriteResult (*conn_async_write) (WalProposerConn *conn, void const *buf, size_t size);
+
+	/* Blocking CopyData write, aka PQputCopyData + PQflush. */
+	bool		(*conn_blocking_write) (WalProposerConn *conn, void const *buf, size_t size);
+
+	/* Download WAL from startpos to endpos and make it available locally. */
+	bool		(*recovery_download) (Safekeeper *sk, TimeLineID timeline, XLogRecPtr startpos, XLogRecPtr endpos);
+
+	/* Read WAL from disk to buf. */
+	void		(*wal_read) (XLogReaderState *state, char *buf, XLogRecPtr startptr, Size count);
+
+	/* Allocate WAL reader. */
+	XLogReaderState *(*wal_reader_allocate) (void);
+
+	/* Deallocate event set. */
+	void		(*free_event_set) (void);
+
+	/* Initialize event set. */
+	void		(*init_event_set) (int n_safekeepers);
+
+	/* Update events for an existing safekeeper connection. */
+	void		(*update_event_set) (Safekeeper *sk, uint32 events);
+
+	/* Add a new safekeeper connection to the event set. */
+	void		(*add_safekeeper_event_set) (Safekeeper *sk, uint32 events);
+
+	/*
+	 * Wait until one of these events happens: the timeout is reached, a socket
+	 * event occurs on a safekeeper connection, or new WAL is available.
+	 *
+	 * Returns 0 if timeout is reached, 1 if some event happened. Updates
+	 * events mask to indicate events and sets sk to the safekeeper which has
+	 * an event.
+	 */
+	int			(*wait_event_set) (long timeout, Safekeeper **sk, uint32 *events);
+
+	/* Read random bytes. */
+	bool		(*strong_random) (void *buf, size_t len);
+
+	/*
+	 * Get a basebackup LSN. Used to cross-validate with the latest available
+	 * LSN on the safekeepers.
+	 */
+	XLogRecPtr	(*get_redo_start_lsn) (void);
+
+	/*
+	 * Finish sync safekeepers with the given LSN. This function should not
+	 * return and should exit the program.
+	 */
+	void		(*finish_sync_safekeepers) (XLogRecPtr lsn);
+
+	/*
+	 * Called after every new message from the safekeeper. Used to propagate
+	 * backpressure feedback and to confirm WAL persistence (has been committed
+	 * on the quorum of safekeepers).
+	 */
+	void		(*process_safekeeper_feedback) (WalProposer *wp, XLogRecPtr commitLsn);
+
+	/*
+	 * Called on peer_horizon_lsn updates. Used to advance replication slot
+	 * and to free up disk space by deleting unnecessary WAL.
+	 */
+	void		(*confirm_wal_streamed) (XLogRecPtr lsn);
+} walproposer_api;

 /*
- * Ergonomic wrapper around PQputCopyData + PQflush
- *
- * Starts to write a CopyData block to a safekeeper.
- *
- * For information on the meaning of return codes, refer to PGAsyncWriteResult.
+ * Configuration of the WAL proposer.
 */
-extern PGAsyncWriteResult walprop_async_write(WalProposerConn *conn, void const *buf, size_t size);
+typedef struct WalProposerConfig
+{
+	/* hex-encoded TenantId cstr */
+	char	   *neon_tenant;
+
+	/* hex-encoded TimelineId cstr */
+	char	   *neon_timeline;
+
+	/*
+	 * Comma-separated list of safekeepers, in the following format:
+	 * host1:port1,host2:port2,host3:port3
+	 *
+	 * This cstr should be editable.
+	 */
+	char	   *safekeepers_list;
+
+	/*
+	 * WalProposer reconnects to offline safekeepers once in this interval.
+	 * Time is in milliseconds.
+	 */
+	int			safekeeper_reconnect_timeout;
+
+	/*
+	 * WalProposer terminates the connection if it doesn't receive any message
+	 * from the safekeeper in this interval. Time is in milliseconds.
+	 */
+	int			safekeeper_connection_timeout;
+
+	/*
+	 * WAL segment size. Will be passed to safekeepers in greet request. Also
+	 * used to detect page headers.
+	 */
+	int			wal_segment_size;
+
+	/*
+	 * If safekeeper was started in sync mode, walproposer will not subscribe
+	 * for new WAL and will exit once a quorum of safekeepers has been synced to
+	 * the latest available LSN.
+	 */
+	bool		syncSafekeepers;
+
+	/* Will be passed to safekeepers in greet request. */
+	uint64		systemId;
+} WalProposerConfig;
+

 /*
- * Blocking equivalent to walprop_async_write_fn
- *
- * Returns 'true' if successful, 'false' on failure.
+ * WAL proposer state.
 */
-extern bool walprop_blocking_write(WalProposerConn *conn, void const *buf, size_t size);
+typedef struct WalProposer
+{
+	WalProposerConfig *config;
+	int			n_safekeepers;

-extern uint64 BackpressureThrottlingTime(void);
+	/* (n_safekeepers / 2) + 1 */
+	int			quorum;
+
+	Safekeeper	safekeeper[MAX_SAFEKEEPERS];
+
+	/* WAL has been generated up to this point */
+	XLogRecPtr	availableLsn;
+
+	/* last commitLsn broadcasted to safekeepers */
+	XLogRecPtr	lastSentCommitLsn;
+
+	ProposerGreeting greetRequest;
+
+	/* Vote request for safekeeper */
+	VoteRequest voteRequest;
+
+	/*
+	 * Minimal LSN which may be needed for recovery of some safekeeper,
+	 * record-aligned (first record which might not yet have been received by someone).
+	 */
+	XLogRecPtr	truncateLsn;
+
+	/*
+	 * Term of the proposer. We want our term to be the highest and unique, so
+	 * we collect terms from a quorum of safekeepers, take the maximum and add
+	 * 1. After that our term is fixed and must not change. If we observe that
+	 * some safekeeper has a higher term, it means that another compute is
+	 * running, so we must stop immediately.
+	 */
+	term_t		propTerm;
+
+	/* term history of the proposer */
+	TermHistory propTermHistory;
+
+	/* epoch start lsn of the proposer */
+	XLogRecPtr	propEpochStartLsn;
+
+	/* Most advanced acceptor epoch */
+	term_t		donorEpoch;
+
+	/* Most advanced acceptor */
+	int			donor;
+
+	/* timeline globally starts at this LSN */
+	XLogRecPtr	timelineStartLsn;
+
+	/* number of votes collected from safekeepers */
+	int			n_votes;
+
+	/* number of successful connections over the lifetime of walproposer */
+	int			n_connected;
+
+	/*
+	 * Timestamp of the last reconnection attempt. Related to
+	 * config->safekeeper_reconnect_timeout
+	 */
+	TimestampTz last_reconnect_attempt;
+
+	walproposer_api api;
+} WalProposer;
+
+extern WalProposer *WalProposerCreate(WalProposerConfig *config, walproposer_api api);
+extern void WalProposerStart(WalProposer *wp);
+extern void WalProposerBroadcast(WalProposer *wp, XLogRecPtr startpos, XLogRecPtr endpos);
+extern void WalProposerPoll(WalProposer *wp);
+extern void ParsePageserverFeedbackMessage(StringInfo reply_message,
+										   PageserverFeedback *rf);

 #endif							/* __NEON_WALPROPOSER_H__ */
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
--- a/Show More
+++ b/Show More